当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.get_pages方法代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument.get_pages方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.get_pages方法的具体用法?Python PDFDocument.get_pages怎么用?Python PDFDocument.get_pages使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfdocument.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.get_pages方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_pages [as 别名]
    def __init__(self, *args, **kwargs):
        """Initialise the object and extract the text of the PDF in ``self.doc``.

        ``self.parsed_text`` starts as empty bytes and is replaced by the
        text produced by pdfminer's ``TextConverter`` once every page has been
        processed.  It stays empty when pdfminer cannot be imported (a warning
        is logged) or when building the document raises ``PDFSyntaxError``.
        """
        super(AccountRIB, self).__init__(*args, **kwargs)

        self.parsed_text = b''

        try:
            # Newer pdfminer releases provide pdfdocument/pdfpage modules;
            # older ones expose PDFDocument from pdfparser instead.
            try:
                from pdfminer.pdfdocument import PDFDocument
                from pdfminer.pdfpage import PDFPage
                uses_new_api = True
            except ImportError:
                from pdfminer.pdfparser import PDFDocument
                uses_new_api = False
            from pdfminer.pdfparser import PDFParser, PDFSyntaxError
            from pdfminer.converter import TextConverter
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        except ImportError:
            self.logger.warning('Please install python-pdfminer to get IBANs')
            return

        pdf_parser = PDFParser(BytesIO(self.doc))
        try:
            if uses_new_api:
                document = PDFDocument(pdf_parser)
            else:
                # The old API wires parser and document together by hand.
                document = PDFDocument()
                pdf_parser.set_document(document)
                document.set_parser(pdf_parser)
        except PDFSyntaxError:
            return

        resources = PDFResourceManager()
        text_buffer = BytesIO()
        converter = TextConverter(resources, text_buffer)
        page_interpreter = PDFPageInterpreter(resources, converter)

        if not uses_new_api:
            document.initialize()
            page_iter = document.get_pages()
        else:
            page_iter = PDFPage.create_pages(document)

        for pdf_page in page_iter:
            page_interpreter.process_page(pdf_page)

        self.parsed_text = text_buffer.getvalue()
开发者ID:P4ncake,项目名称:weboob,代码行数:45,代码来源:accountlist.py

示例2: extract_text

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_pages [as 别名]
def extract_text(data):
    """Return the text of a PDF as bytes, using whichever pdfminer API exists.

    ``data`` is the raw PDF content; it is wrapped in a ``BytesIO`` before
    being handed to ``PDFParser``.  Every page is run through a
    ``TextConverter`` and the accumulated output is returned.

    Returns ``None`` when building the document raises ``PDFSyntaxError``.
    Raises ``ImportError`` when the pdfminer modules cannot be imported.
    """
    try:
        # Newer pdfminer releases provide pdfdocument/pdfpage modules;
        # older ones expose PDFDocument from pdfparser instead.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
            uses_new_api = True
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            uses_new_api = False
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if uses_new_api:
            document = PDFDocument(pdf_parser)
        else:
            # The old API wires parser and document together by hand.
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return None

    resources = PDFResourceManager()
    text_buffer = BytesIO()
    converter = TextConverter(resources, text_buffer)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if not uses_new_api:
        document.initialize()
        page_iter = document.get_pages()
    else:
        page_iter = PDFPage.create_pages(document)

    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()
开发者ID:laurentb,项目名称:weboob,代码行数:41,代码来源:pdf.py

示例3: open

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_pages [as 别名]
#coding=utf-8
# Print the text of every text-bearing layout object on every page of a PDF.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# LAParams and PDFPageAggregator are classes inside pdfminer modules, not
# importable top-level modules.
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# A PDF is binary data, so open it in 'rb' mode; the with-block closes the
# file once all pages have been processed.
with open('/home/zzq/learngit/pdf_document/php.pdf', 'rb') as fp:
    # Parser reading from the open file.
    parser = PDFParser(fp)
    # pdfminer.pdfdocument.PDFDocument takes the parser in its constructor
    # (no separate set_parser()/initialize() step); no password is supplied.
    doc = PDFDocument(parser)
    # Resource manager shared by the device and the interpreter.
    resource = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    # Aggregator device that collects each page's layout.
    device = PDFPageAggregator(resource, laparams=laparams)
    # Page interpreter feeding the device.
    interpreter = PDFPageInterpreter(resource, device)

    # With the pdfdocument/pdfpage API, pages come from PDFPage.create_pages().
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Only text-bearing layout objects provide get_text(); skip the
            # others (figures, lines, rectangles, ...).
            if hasattr(out, 'get_text'):
                print(out.get_text())
开发者ID:zhengziqiang,项目名称:mypython,代码行数:29,代码来源:read_pdf.py

示例4: pdf2csv

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_pages [as 别名]
def pdf2csv(fp):
    """Draw the grid formed by each page's rectangles and lines to a PNG file.

    For every page, the x coordinates and the top-down y coordinates
    (``layout.height - y``) of all ``LTRect``/``LTLine`` layout objects are
    collected as integers, de-duplicated, sorted and passed through
    ``filterclose``.  The resulting coordinate lists and the page size are
    printed, and every cell between consecutive coordinates that is more than
    5 units wide and high is outlined in a mode ``'1'`` image of the page's
    size, saved as ``out<pageno>.png`` in the current directory.

    :param fp: file object of the PDF, handed to ``PDFParser``.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    print(doc)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    # NOTE(review): initialize() and get_pages() look tied to a specific
    # pdfminer release -- confirm they exist in the installed version.
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()

        x_coords = set()
        y_coords = set()
        for item in layout:
            if not isinstance(item, (LTRect, LTLine)):
                continue
            x_coords.update((int(item.x0), int(item.x1)))
            # Flip y so that it grows downwards, like image coordinates.
            y_coords.update((int(layout.height - item.y0),
                             int(layout.height - item.y1)))
        hlines = filterclose(sorted(x_coords))
        vlines = filterclose(sorted(y_coords))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        # Walk consecutive coordinate pairs; gaps of 5 or less are skipped.
        for top, bottom in zip(vlines, vlines[1:]):
            if bottom - top <= 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if right - left <= 5:
                    continue
                draw.rectangle(
                    [(int(left), int(top)), (int(right), int(bottom))],
                    outline=1,
                )
        del draw

        # Use a separate name so the ``fp`` argument is not rebound, and let
        # the with-block close the file even if saving fails.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
开发者ID:abner0908,项目名称:plc,代码行数:64,代码来源:stuff2.py

示例5: get_pdf_rows

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_pages [as 别名]
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
#.........这里部分代码省略.........
开发者ID:laurentb,项目名称:weboob,代码行数:103,代码来源:pdf.py


注:本文中的pdfminer.pdfdocument.PDFDocument.get_pages方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。