当前位置: 首页>>代码示例>>Python>>正文


Python pdfdocument.PDFDocument类代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument类的典型用法代码示例。如果您正苦于以下问题:Python pdfdocument.PDFDocument类的具体用法?Python pdfdocument.PDFDocument怎么用?Python pdfdocument.PDFDocument使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pdfminer.pdfdocument的用法示例。


在下文中一共展示了pdfdocument.PDFDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: read_fields

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The entries gathered by ``recursively_add_fields``, sorted with
        ``fieldsorter`` as the key function, or ``None`` when the document
        catalog has no ``AcroForm`` entry.
    """
    outfields = []
    # Everything that touches the document happens inside the ``with`` block:
    # the parser reads from ``fp``, and the handle is now closed on every exit
    # path (the original never closed it).
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page's object id to its 1-based page number.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), 1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
        return sorted(outfields, key=fieldsorter)
开发者ID:jhpyle,项目名称:docassemble,代码行数:19,代码来源:pdftk.py

示例2: __init__

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def __init__(self,
        stream,
        pages = None,
        laparams = None,
        precision = 0.001,
        password = ""
    ):
        """Open a PDF stream and prepare the pdfminer processing objects.

        Args:
            stream: Object handed to ``PDFParser``; also kept as
                ``self.stream``.
            pages: Stored unchanged as ``self.pages_to_parse``.
            laparams: ``None``, or a mapping of keyword arguments used to
                build an ``LAParams`` instance.
            precision: Stored unchanged as ``self.precision``.
            password: Forwarded to ``PDFDocument``.
        """
        self.laparams = None if laparams is None else LAParams(**laparams)
        self.stream = stream
        self.pages_to_parse = pages
        self.precision = precision
        rsrcmgr = PDFResourceManager()
        self.doc = PDFDocument(PDFParser(stream), password = password)

        # Merge every info dictionary of the document into one mapping;
        # later dictionaries override earlier ones on key collisions.
        self.metadata = {}
        for info in self.doc.info:
            self.metadata.update(info)

        # Normalise the metadata values in place. Only existing keys are
        # reassigned, so mutating the dict while iterating is safe here.
        for k, v in self.metadata.items():
            if hasattr(v, "resolve"):
                v = v.resolve()
            if isinstance(v, list):
                self.metadata[k] = list(map(decode_text, v))
            elif isinstance(v, PSLiteral):
                self.metadata[k] = decode_text(v.name)
            elif isinstance(v, bool):
                self.metadata[k] = v
            else:
                self.metadata[k] = decode_text(v)

        self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
开发者ID:jsvine,项目名称:pdfplumber,代码行数:31,代码来源:pdf.py

示例3: main

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def main():
    """Prompt for a PDF file and dump its form fields to JSON.

    Reads the ``AcroForm`` fields of the chosen PDF, builds a mapping from
    field name (``T``) to field value (``V``), prints it, and writes it to
    ``./output/pdfsheet-test.json``.

    Raises:
        Exception: If the ``AcroForm`` / ``Fields`` entries cannot be read.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        doc = PDFDocument(PDFParser(f))
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception as exc:
            # Keep the original cause attached instead of discarding it.
            raise Exception('This is not a form-fillable character sheet!') from exc

        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a name cannot be keyed; previously this
                # crashed on ``None.decode``.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif isinstance(value, (bytes, bytearray)):
                value = value.decode('iso-8859-1').strip()
            # Any other value type (or None) is stored unchanged.

            character[name.decode('iso-8859-1').strip()] = value

        print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)
开发者ID:avrae,项目名称:avrae,代码行数:29,代码来源:pdfsheet.py

示例4: analyze_pages

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily run layout analysis over every page of a PDF.

    Input: the file path to the PDF file, and the ``char_margin`` value
    passed to ``LAParams``
    Output: yields the layout object for each page in the PDF; a page whose
    processing raises ``OverflowError`` is logged and skipped
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)
    with open(resolved_path, "rb") as pdf_stream:
        # Parser -> document (empty password) -> shared resource manager.
        pdf_doc = PDFDocument(PDFParser(pdf_stream), password="")
        resources = PDFResourceManager()
        # The aggregator collects the analysed layout of each page.
        aggregator = CustomPDFPageAggregator(
            resources,
            laparams=LAParams(
                char_margin=char_margin, word_margin=0.1, detect_vertical=True
            ),
        )
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for index, pdf_page in enumerate(PDFPage.create_pages(pdf_doc)):
            try:
                page_interpreter.process_page(pdf_page)
            except OverflowError as overflow:
                logger.exception(
                    "{}, skipping page {} of {}".format(overflow, index, file_name)
                )
            else:
                yield aggregator.get_result()
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:36,代码来源:pdf_utils.py

示例5: extract_first_jpeg_in_pdf

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first embedded image whose data
    starts with the JPEG/JFIF signature ``FF D8 FF E0``.

    Only ``LTImage`` elements directly inside ``LTFigure`` elements of each
    page layout are inspected. There is no 100% guarantee for this code, yet
    it seems to work fine with most scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs, however at the moment there is not enough
    test data to try this.

    :param fstream: Readable binary stream of the PDF
    :return: the data of the first matching image, or None if none is found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        for el in device.result:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding failed; fall back to the raw stream data.
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None
开发者ID:konstantint,项目名称:PassportEye,代码行数:41,代码来源:pdf.py

示例6: parse_case

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json``.

    For every non-hidden ``*.pdf`` file in ``case_path`` the size and the
    ``parse_text`` result of each page are collected, keyed by document id
    (file name up to the first dot) and page id. The result is written to
    ``case_path + 'parsed.json'``.

    Args:
        case_path: Directory path. It is concatenated directly with file
            names and its second-to-last ``/``-separated component is used
            as the case id, so it is expected to end with ``/``.

    Returns:
        None. Any error is reported by printing ``"Error " + case_path``.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = result['docs'][doc_id] = {'pages': {}}

            # Close each PDF as soon as its pages have been processed
            # (the original leaked one file handle per document).
            with open(case_path + name, 'rb') as fp:
                doc = PDFDocument(PDFParser(fp))
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # ``with`` guarantees the JSON is flushed and the file closed.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing case and carry on.
        print("Error " + case_path)

    return None
开发者ID:thomas0809,项目名称:GraphIE,代码行数:41,代码来源:parse_pdf.py

示例7: get_pdf_metadata

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def get_pdf_metadata(self, pdf):
        """Extract author, title and year from raw PDF data.

        Args:
            pdf: The PDF content; it is written to a temporary file that is
                then parsed.

        Returns:
            A dict with the keys ``author``, ``title`` and ``year``. Each
            value falls back to an ``UNKNOWN_*`` placeholder when the
            corresponding entry is missing, empty, or cannot be read.
        """
        metadata = {'author': 'UNKNOWN_AUTHOR',
                    'title': 'UNKNOWN_TITLE',
                    'year': 'UNKNOWN_YEAR'}

        # The context manager closes the temporary file even if the parser
        # constructor raises.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)
            pdf_parser = PDFParser(temp_pdf_file)

            try:
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                year = pdf_metadata_moddate_to_year(
                    make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Deliberately best-effort: on any failure the placeholders
                # (or the values gathered so far) are returned.
                pass

        return metadata
开发者ID:leovan,项目名称:SciHubEVA,代码行数:33,代码来源:scihub_api.py

示例8: parse_pdf

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def parse_pdf(self, fp):
        """Process every page of a PDF and parse the annotations found.

        Args:
            fp: Object handed to ``PDFParser`` -- presumably an open binary
                file; confirm against callers.

        Each page is run through the interpreter, and pages with a truthy
        ``annots`` attribute are passed to ``self.parse_annotations``
        together with their 0-based page number.
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # The original first built a PDFPageAggregator-based interpreter and
        # immediately replaced it with this one; only this pair was used.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)
开发者ID:rammie,项目名称:pdfjinja,代码行数:16,代码来源:pdfjinja.py

示例9: process_pdf

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
        """Find the "List of Tables" entry of a PDF and process its page.

        Walks every object id listed in the document's xrefs. For the first
        dict object whose ``Title`` contains "List of Tables", the page
        whose ``pageid`` equals the object id of the entry's destination is
        looked up and handed to ``cls.process_table_index``; the method then
        returns. If no such object exists nothing is processed.

        Args:
            pdf: Object handed to ``pdfparser.PDFParser`` -- presumably an
                open binary file; confirm against callers.
            output: Forwarded unchanged to ``cls.process_table_index``.
            verbose: Forwarded unchanged to ``cls.process_table_index``.
            tables: Forwarded unchanged to ``cls.process_table_index``.
        """
        parser = pdfparser.PDFParser(pdf)
        document = pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfinterp.PDFResourceManager(caching=True)

        params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                                 line_overlap=0.4, boxes_flow=0.5)
        device = converter.PDFPageAggregator(rsrcmgr, laparams=params)

        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): the result was never used; the call is kept because
        # it may raise for documents without outlines -- confirm whether
        # that is intended before removing it.
        document.get_outlines()
        pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
        for xref in document.xrefs:
            for oid in xref.get_objids():
                obj = document.getobj(oid)
                if not isinstance(obj, dict):
                    continue
                # ``in obj`` / ``.items()`` work on Python 2 and 3 alike,
                # unlike the former ``iterkeys()`` / ``iteritems()``.
                if "Title" in obj and "List of Tables" in obj['Title']:
                    pageoid = obj['A'].resolve()['D'][0].objid
                    (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                                      if p.pageid == pageoid][0]
                    cls.process_table_index(parser, document, rsrcmgr, params, device,
                                            interpreter, pages, page, pageno, output,
                                            verbose, tables)
                    return
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:28,代码来源:parse_am37x_register_tables.py

示例10: _read_file

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def _read_file(self, filename):
        """Open ``filename`` in binary mode and return its ``PDFDocument``.

        The file handle is intentionally left open: the returned document
        was built on a parser that reads from it.
        """
        pdf_stream = open(filename, 'rb')
        pdf_parser = PDFParser(pdf_stream)
        return PDFDocument(pdf_parser)
开发者ID:johnlinp,项目名称:pdf-to-markdown,代码行数:6,代码来源:parser.py

示例11: xmlFromPdf

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def xmlFromPdf(pdfpath, xmlpath=None):
    '''Find the XFA data in a PDF file and return it as a parsed tree.

    The first stream object whose decoded data contains ``xfa-template`` is
    parsed with ``etree.fromstring``. When ``xmlpath`` is given, the
    pretty-printed tree is also written there. Raises ``CrypticXml`` when no
    such stream exists.
    '''
    xfa_bytes = None
    with open(pdfpath, 'rb') as pdf_file:
        pdf_doc = PDFDocument(PDFParser(pdf_file))
        candidate_ids = set(objid for xref in pdf_doc.xrefs
                            for objid in xref.get_objids())
        for candidate in candidate_ids:
            pdf_obj = pdf_doc.getobj(candidate)
            if isinstance(pdf_obj, PDFStream):
                try:
                    payload = pdf_obj.get_data()
                except PDFNotImplementedError:
                    # e.g. a JPEG image: "Unsupported filter: /DCTDecode"
                    continue
                if b'xfa-template' in payload:
                    xfa_bytes = payload
                    break
        if xfa_bytes is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # xfa_bytes now holds the XML text of the form
    tree = etree.fromstring(xfa_bytes)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as out:
        out.write(etree.tostring(tree, pretty_print=True))
    return tree
开发者ID:jsaponara,项目名称:opentaxforms,代码行数:29,代码来源:extractFillableFields.py

示例12: p2t

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Args:
        sourcefile: Path of the PDF file to read.
        outfile: Path of the text file; text is appended, UTF-8 encoded.

    If the PDF cannot be parsed a message is printed and nothing is
    written. If the document reports that it is not extractable a warning
    is printed and extraction is still attempted.
    """
    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the document allows text extraction.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator device.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout object of this page.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if texts:
                # Open once per page and let the file object do the UTF-8
                # encoding; the former ``bytes + '\n'`` raised TypeError on
                # Python 3.
                with open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(text + '\n' for text in texts)
        print(sourcefile + '  已转为 ' + outfile)

##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc 
开发者ID:startprogress,项目名称:China_stock_announcement,代码行数:37,代码来源:formatFun.py

示例13: main

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def main(argv):
    """Convert each PDF named on the command line with ``x86ManParser``.

    Args:
        argv: Argument list; ``argv[0]`` is skipped and every remaining
            entry is treated as the path of a PDF file.

    Returns:
        ``1`` as soon as a document is found not to be extractable,
        otherwise ``None``.
    """
    for arg in argv[1:]:
        # PDFs are binary data: open in 'rb'. The ``with`` block also closes
        # the file on the early return below.
        with open(arg, 'rb') as fd:
            document = PDFDocument(PDFParser(fd))
            if not document.is_extractable:
                print("Document not extractable.")
                return 1

            params = LAParams(char_margin=1)
            resMan = PDFResourceManager(caching=True)
            device = PDFPageAggregator(resMan, laparams=params)
            interpreter = PDFPageInterpreter(resMan, device)
            # Separate name: the original rebound ``parser`` to this object.
            man_parser = x86ManParser("html", params)

            page_iter = PDFPage.get_pages(fd, set(), caching=True,
                                          check_extractable=True)
            for i, page in enumerate(page_iter, 1):
                print("Processing page %i" % i)
                interpreter.process_page(page)
                man_parser.process_page(device.get_result())
            man_parser.flush()

        print("Conversion result: %i/%i" % (man_parser.success, man_parser.success + man_parser.fail))
开发者ID:zneak,项目名称:x86doc,代码行数:28,代码来源:extract.py

示例14: getToc

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def getToc(self, pdfPath):
        """Return the outline (table of contents) of a PDF.

        Args:
            pdfPath: Path of the PDF file to read.

        Returns:
            A list of ``(level, title)`` tuples, one per entry yielded by
            ``PDFDocument.get_outlines()``, in the order yielded.
        """
        # The outline is fully materialised inside the ``with`` block, so the
        # file can be closed before returning (the original leaked it).
        with open(pdfPath, 'rb') as infile:
            document = PDFDocument(PDFParser(infile))
            return [
                (level, title)
                for (level, title, _dest, _a, _structelem) in document.get_outlines()
            ]
开发者ID:wowdd1,项目名称:xlinkBook,代码行数:12,代码来源:outline.py

示例15: init

# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def init(filename,verbose=True):
    '''Initiate analysis objs

    Args:
        filename: Path of the PDF file to open.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        A ``(document, interpreter, device)`` tuple, where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.

    The file is left open on success because the returned document was
    built on a parser that reads from it.
    '''

    fp = open(filename, 'rb')
    try:
        # Create the parser and the document that stores the PDF structure.
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except Exception:
        # Do not leak the handle when no document is returned.
        fp.close()
        raise

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create the page aggregator and the interpreter that feeds it. The
    # original first built a PDFDevice-based pair that was overwritten.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device


#----------------Get the latest creation time of annos---------------- 
开发者ID:Xunius,项目名称:Menotexport,代码行数:32,代码来源:extracthl2.py


注:本文中的pdfminer.pdfdocument.PDFDocument方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。