当前位置: 首页>>代码示例>>Python>>正文


Python converter.PDFPageAggregator方法代码示例

本文整理汇总了Python中pdfminer.converter.PDFPageAggregator方法的典型用法代码示例。如果您正苦于以下问题:Python converter.PDFPageAggregator方法的具体用法?Python converter.PDFPageAggregator怎么用?Python converter.PDFPageAggregator使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.converter的用法示例。


在下文中一共展示了converter.PDFPageAggregator方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: convert_pdf_to_txt

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def convert_pdf_to_txt(path):
    fp = open(path, 'rb')
    txt = ''
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                txt += lt_obj.get_text()
    return(txt) 
开发者ID:opensourcesec,项目名称:Forager,代码行数:22,代码来源:pdfConverter.py

示例2: __init__

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def __init__(self,
        stream,
        pages = None,
        laparams = None,
        precision = 0.001,
        password = ""
    ):
        self.laparams = None if laparams == None else LAParams(**laparams)
        self.stream = stream
        self.pages_to_parse = pages
        self.precision = precision
        rsrcmgr = PDFResourceManager()
        self.doc = PDFDocument(PDFParser(stream), password = password)
        self.metadata = {}
        for info in self.doc.info:
            self.metadata.update(info)
        for k, v in self.metadata.items():
            if hasattr(v, "resolve"):
                v = v.resolve()
            if type(v) == list:
                self.metadata[k] = list(map(decode_text, v))
            elif isinstance(v, PSLiteral):
                self.metadata[k] = decode_text(v.name)
            elif isinstance(v, bool):
                self.metadata[k] = v
            else:
                self.metadata[k] = decode_text(v)
        self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device) 
开发者ID:jsvine,项目名称:pdfplumber,代码行数:31,代码来源:pdf.py

示例3: extract_first_jpeg_in_pdf

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or a string of data for the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    :param fstream: Readable binary stream of the PDF
    :return: binary stream, containing the whole contents of the JPEG image or None if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    for page in pages:
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if isinstance(el, LTFigure):
                for im in el:
                    if isinstance(im, LTImage):
                        # Found one!
                        st = None
                        try:
                            imdata = im.stream.get_data()
                        except:
                            # Failed to decode (seems to happen nearly always - there's probably a bug in PDFMiner), oh well...
                            imdata = im.stream.get_rawdata()
                        if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                            return imdata

    return None 
开发者ID:konstantint,项目名称:PassportEye,代码行数:41,代码来源:pdf.py

示例4: parse_case

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse_case(case_path):
    """Parse all the pdf files in the folder."""
    try:
        result = {
            'id': case_path.split('/')[-2], 
            'docs': {}
        }

        for name in os.listdir(case_path):
            if name[0] == '.' or name[-4:] != '.pdf':
                continue
            doc_id = name.split('.')[0]
            result['docs'][doc_id] = {'pages': {}}
            doc_obj = result['docs'][doc_id]

            path = case_path + name
            fp = open(path, 'rb')
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            laparams = LAParams(detect_vertical=True, all_texts=True)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                doc_obj['pages'][layout.pageid] = {
                    'size': (layout.width, layout.height),
                    'text': parse_text(layout)
                }
                # print(layout.width, layout.height)

        output = open(case_path + 'parsed.json', 'w')
        json.dump(result, output, indent=None)
    except:
        print("Error " + case_path)

    return None 
开发者ID:thomas0809,项目名称:GraphIE,代码行数:41,代码来源:parse_pdf.py

示例5: parse

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse():
    fp = open(path, 'rb') # 以二进制读模式打开
    #用文件对象来创建一个pdf文档分析器
    praser = PDFParser(fp)
    # 创建一个PDF文档
    doc = PDFDocument()
    # 连接分析器 与文档对象
    praser.set_document(doc)
    doc.set_parser(praser)

    # 提供初始化密码
    # 如果没有密码 就创建一个空的字符串
    doc.initialize()

    # 检测文档是否提供txt转换,不提供就忽略
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # 创建PDf 资源管理器 来管理共享资源
        rsrcmgr = PDFResourceManager()
        # 创建一个PDF设备对象
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # 创建一个PDF解释器对象
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # 循环遍历列表,每次处理一个page的内容
        for page in doc.get_pages(): # doc.get_pages() 获取page列表
            interpreter.process_page(page)
            # 接受该页面的LTPage对象
            layout = device.get_result()
            # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等 想要获取文本就获得对象的text属性,
            for x in layout:
                if (isinstance(x, LTTextBoxHorizontal)):
                    with open(r'out.txt', 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n') 
开发者ID:YinChao126,项目名称:anack,代码行数:40,代码来源:pdf_decoder.py

示例6: parse_pdf

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse_pdf(self, fp):
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            page.annots and self.parse_annotations(pgnum, page) 
开发者ID:rammie,项目名称:pdfjinja,代码行数:16,代码来源:pdfjinja.py

示例7: process_pdf

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
        parser = pdfparser.PDFParser(pdf)
        document = pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfinterp.PDFResourceManager(caching=True)

        params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                                 line_overlap=0.4, boxes_flow=0.5)
        device = converter.PDFPageAggregator(rsrcmgr, laparams=params)

        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        outlines = document.get_outlines()
        registers = {}
        pages = dict((pageno, page) for (pageno, page)
                     in enumerate(pdfpage.PDFPage.create_pages(document)))
        for xref in document.xrefs:
            for oid in xref.get_objids():
                obj = document.getobj(oid)
                if type(obj) == dict:
                    if"Title" in obj.iterkeys() and "List of Tables" in obj['Title']:
                        pageoid = obj['A'].resolve()['D'][0].objid
                        (pageno, page) = [(pn, p) for (pn, p) in pages.iteritems()
                                          if p.pageid == pageoid][0]
                        cls.process_table_index(parser, document, rsrcmgr, params, device,
                                                interpreter, pages, page, pageno, output,
                                                verbose, tables)
                        return 
开发者ID:bx,项目名称:bootloader_instrumentation_suite,代码行数:28,代码来源:parse_am37x_register_tables.py

示例8: _prepare_tools

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def _prepare_tools(self):
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        return device, interpreter 
开发者ID:johnlinp,项目名称:pdf-to-markdown,代码行数:9,代码来源:parser.py

示例9: __init__

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def __init__(self):
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # la=layout analysis
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.textPoz = None 
开发者ID:jsaponara,项目名称:opentaxforms,代码行数:10,代码来源:Form.py

示例10: p2t

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def p2t(sourcefile, outfile):
    with open(sourcefile, 'rb') as fp:
        # 来创建一个pdf文档分析器
        parser = PDFParser(fp)
        #创建一个PDF文档对象存储文档结构
        try:
            document = PDFDocument(parser)
        except:
            print(sourcefile + ' :pdf未正确下载')
        # 检查文件是否允许文本提取
        else:
            if not document.is_extractable:
                print(sourcefile + ' :不允许提取文本')
             # 创建一个PDF资源管理器对象来存储共赏资源
            rsrcmgr=PDFResourceManager()
             # 设定参数进行分析
            laparams=LAParams()
             # 创建一个PDF设备对象
             # device=PDFDevice(rsrcmgr)
            device=PDFPageAggregator(rsrcmgr,laparams=laparams)
             # 创建一个PDF解释器对象
            interpreter=PDFPageInterpreter(rsrcmgr,device)
             # 处理每一页
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
             # 接受该页面的LTPage对象
                layout=device.get_result()
                for x in layout:
                 if(isinstance(x,LTTextBoxHorizontal)):
                     with open(outfile, 'a') as f:
                         f.write(x.get_text().encode('utf-8')+'\n')
            print(sourcefile + '  已转为 ' + outfile)

##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc 
开发者ID:startprogress,项目名称:China_stock_announcement,代码行数:37,代码来源:formatFun.py

示例11: main

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def main(argv):
	for arg in argv[1:]:
		fd = open(arg)
		parser = PDFParser(fd)
		document = PDFDocument(parser)
		if not document.is_extractable:
			print "Document not extractable."
			return 1
	
		params = LAParams(char_margin=1)
		resMan = PDFResourceManager(caching=True)
		device = PDFPageAggregator(resMan, laparams=params)
		interpreter = PDFPageInterpreter(resMan, device)
		parser = x86ManParser("html", params)
	
		i = 1
		for page in PDFPage.get_pages(fd, set(), caching=True, check_extractable=True):
			print "Processing page %i" % i
			interpreter.process_page(page)
			page = device.get_result()
			parser.process_page(page)
			i += 1
		parser.flush()
		fd.close()
	
		print "Conversion result: %i/%i" % (parser.success, parser.success + parser.fail) 
开发者ID:zneak,项目名称:x86doc,代码行数:28,代码来源:extract.py

示例12: init

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def init(filename,verbose=True):
    '''Initiate analysis objs
    '''

    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Set parameters for analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device


#----------------Get the latest creation time of annos---------------- 
开发者ID:Xunius,项目名称:Menotexport,代码行数:32,代码来源:extracthl2.py

示例13: init

# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def init(filename,verbose=True):
    '''Initiate analysis objs
    '''

    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Set parameters for analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device
    



#----------------Get the latest creation time of annos---------------- 
开发者ID:Xunius,项目名称:Menotexport,代码行数:34,代码来源:extracthl.py


注:本文中的pdfminer.converter.PDFPageAggregator方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。