当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.get_pages方法代码示例

本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.get_pages方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.get_pages方法的具体用法?Python PDFDocument.get_pages怎么用?Python PDFDocument.get_pages使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfparser.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.get_pages方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
class Pdf(object):
    """Wrap a pdfminer document built from an already-open PDF file object.

    Exposes the number of pages and a plain-text rendering of the document.
    """

    def __init__(self, pdf_file):
        """Build the parser/document pair for ``pdf_file``.

        ``pdf_file`` is handed straight to ``PDFParser``; it is not closed
        here, so the caller remains responsible for it.
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        # Link parser and document in both directions first ...
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # ... and only then initialize. ``initialize`` has to be *called*
        # (a bare attribute reference does nothing); the library's default
        # (empty) password is used.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages yielded by the document."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Render every page through a ``TextConverter`` and return the text.

        The converter output is decoded as UTF-8, ignoring undecodable bytes.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in self._doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Release the converter even if a page fails to process.
            device.close()
        return output.getvalue().decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:27,代码来源:pdf.py

示例2: pdf_to_text

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def pdf_to_text(filename):
    """Convert the PDF at ``filename`` to text with per-page markers.

    Each page's text is wrapped between ``START PAGE <i>`` and
    ``END PAGE <i>`` lines (``i`` is the 0-based page index). The page count
    is also printed to stdout.

    Returns the full contents of the output buffer.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    # ``with`` guarantees the file is closed even if parsing raises.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)

            # Collect the pages once; the same list serves both the count
            # and the conversion loop instead of asking the document twice.
            pages = list(doc.get_pages())
            print("There are: " + str(len(pages)) + " pages")

            for i, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()
开发者ID:mayhewsw,项目名称:projects,代码行数:34,代码来源:pdftotext.py

示例3: dumppdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    * ``objids``: for each id the object is fetched with ``doc.getobj``;
      streams are written via ``get_rawdata()`` when ``codec == 'raw'`` or
      ``get_data()`` when ``codec == 'binary'``, anything else goes through
      ``dumpxml``.
    * ``pagenos``: the attributes of every page whose 0-based index is in
      this collection are passed to ``dumpxml``.
    * ``dumpall``: when true, ``dumpallobjs`` is called for the document.
    * If none of the three is given, ``dumptrailers`` is called instead.

    A trailing newline is written unless ``codec`` is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        # This PDFParser signature takes the document as first argument.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Close the input file even when dumping fails part-way.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
开发者ID:frid,项目名称:PythonPool,代码行数:29,代码来源:dumppdf.py

示例4: extractContent

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def extractContent(file):
    """Extract the text of the PDF at path ``file`` and return it.

    Progress is printed to stdout ("extractContent", one "page=<i>" line per
    page, then "EOF"). The text is produced by a ``TextConverter`` configured
    with the 'UTF-8' codec.

    NOTE: ``doc.is_extractable`` is not checked, so extraction is attempted
    regardless of the document's permission flags.
    """
    print("extractContent")

    # ``with`` closes the file even if parsing or conversion raises.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # The document has to be initialized before its pages are read;
        # the library's default (empty) password is used.
        doc.initialize()

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()

        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            device.close()

    return outfp.getvalue()
开发者ID:CaliopeProject,项目名称:CaliopeServer,代码行数:31,代码来源:PDFProcessor.py

示例5: extract_text_elements_from_pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    ``j`` is a job object whose ``iter_with_progress`` wraps the page
    iteration with the message "Reading page %i of %i".

    Returns a ``(pages, all_elements)`` tuple: ``pages`` holds one ``Page``
    (built from the layout's width and height) per PDF page, and
    ``all_elements`` holds the elements of all pages, each tagged with its
    0-based ``page`` index and its ``order`` within that page.
    """
    pages = []
    all_elements = []
    # Keep the file open for the whole extraction and close it afterwards,
    # even on error (it used to be left open).
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialized up front so the progress iterator receives a list.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements.extend(elements)
    return pages, all_elements
开发者ID:hsoft,项目名称:pdfmasher,代码行数:31,代码来源:pdf.py

示例6: ParseAllPages

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
 def ParseAllPages(self, filepath):
     """Run every page of the PDF at ``filepath`` through a page interpreter.

     Stores ``filepath`` on the instance, then processes each page with a
     ``PDFPageInterpreter`` attached to a plain ``PDFDevice``.

     Raises ``PDFTextExtractionNotAllowed`` if the document reports that it
     is not extractable.
     """
     self.filepath = filepath
     # Open the PDF file; ``with`` ensures it is closed again, including
     # when extraction is refused or a page fails to process.
     with open(filepath, 'rb') as fp:
         # Create a PDF parser object associated with the file object.
         parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
         doc = PDFDocument()
         # Connect the parser and document objects.
         parser.set_document(doc)
         doc.set_parser(parser)
         # Supply the password for initialization.
         # (If no password is set, give an empty string.)
         password = ""
         doc.initialize(password)
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Create a PDF resource manager object that stores shared resources.
         rsrcmgr = PDFResourceManager()
         # Create a PDF device object.
         device = PDFDevice(rsrcmgr)
         # Create a PDF interpreter object.
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in doc.get_pages():
             interpreter.process_page(page)
开发者ID:i11uminator,项目名称:bookservice,代码行数:29,代码来源:MyPdfMiner.py

示例7: pdf_to_csv

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def pdf_to_csv(filename):
    """Convert the PDF at ``filename`` with ``CsvConverter``.

    The output of each page is wrapped between ``START PAGE <i>`` and
    ``END PAGE <i>`` lines (``i`` is the 0-based page index).

    Returns the full contents of the output buffer.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8
    # (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    # ``with`` guarantees the file is closed even if parsing raises.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()
开发者ID:jlas,项目名称:misc,代码行数:28,代码来源:statement2csv.py

示例8: process_pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Feed the pages of the PDF file object ``fp`` to ``device``.

    * ``pagenums``: optional collection of 0-based page indices; when given
      (and non-empty) all other pages are skipped.
    * ``maxpages``: when truthy, processing stops right after a processed
      page whose 1-based number is >= ``maxpages``. Pages skipped through
      ``pagenums`` never trigger the stop.
    * ``password``: passed to ``doc.initialize``.

    Returns a dict mapping every 0-based page index to its page object
    (all pages of the document, not only the processed ones).

    Raises ``PDFTextExtractionNotAllowed`` if the document reports that it
    is not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Walk the pages in document order. Iterating a dict here would leave
    # the order (and therefore the ``maxpages`` cut-off) up to the dict
    # implementation.
    page_list = list(doc.get_pages())
    for num, page in enumerate(page_list):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= num + 1:
            break
    return dict(enumerate(page_list))
开发者ID:zaim,项目名称:bukutip,代码行数:27,代码来源:pdf2text.py

示例9: load

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
    def load(self, open_file):
        """Read the PDF in ``open_file`` and populate ``self.text``.

        ``self.fields`` and ``self.text`` are reset to empty dicts first.
        For each page, annotations are handed to ``_build_annotations`` when
        the page has any, and the text obtained from ``_get_text`` is stored
        in ``self.text`` under the 1-based page number.

        Raises ``PDFTextExtractionNotAllowed`` if the document reports that
        it is not extractable.
        """
        self.fields = {}
        self.text = {}

        # Wire a parser and a document together around the file object.
        pdf_parser = PDFParser(open_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        # No password: initialize with an empty string.
        document.initialize('')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Layout-analysis pipeline: resource manager -> aggregator -> interpreter.
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        # Page numbers stored in self.text start at 1.
        for page_number, pdf_page in enumerate(document.get_pages(), 1):
            page_interpreter.process_page(pdf_page)
            if pdf_page.annots:
                self._build_annotations(pdf_page)
            self.text[page_number] = self._get_text(aggregator)
开发者ID:samdavey,项目名称:Random,代码行数:35,代码来源:PDF_Parser.py

示例10: getPageLayouts

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def getPageLayouts(f1):
    '''Take a PDF file object, f1, and return the layout of each page.

    The file object passed in is parsed directly (it is no longer replaced
    by re-opening a path from an outer scope) and is closed before
    returning, as before.

    Returns a list with one ``device.get_result()`` layout object per page;
    the list is empty when the document does not allow text extraction.

    Raises IOError if reading the file fails. Other errors now propagate to
    the caller instead of being discarded by a ``return`` inside ``finally``.

    NOTE(review): the password ``pss_wd`` is not defined in this function and
    is presumably a module-level setting -- confirm.
    '''
    # Bound up front so every exit path has a valid result.
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        parser = PDFParser(f1)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(pss_wd)

        # can we extract text?
        if doc.is_extractable:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in doc.get_pages():
                # Original author's note: pages are believed to be parsed
                # from the file on demand rather than loaded all at once,
                # so the file must still be open here.
                interpreter.process_page(page)
                # receive the LTPage object for the page
                page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    finally:
        f1.close()
    return page_layouts
开发者ID:ricardo369f,项目名称:RSS-page-ripper,代码行数:36,代码来源:pdfTextExt.py

示例11: read_invoice_pdfminer3k

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice PDF and record them.

    ``pdfFile`` is a file name inside ``invoice_path``. The text of all
    ``LTTextBox``/``LTTextLine`` layout objects is concatenated; the client
    is extracted from that text and the invoice number from the file name,
    both via ``extract_info``. Both values are printed and then passed to
    ``write_excel``.
    """
    text_parts = []
    # Let os.path.join supply the separator instead of a hard-coded
    # backslash, and make sure the file is closed when done.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    # Join once at the end rather than growing a string inside the loop.
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
开发者ID:vinovator,项目名称:Vinlab,代码行数:36,代码来源:pdfInvoiceMiner.py

示例12: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of the PDF ``fname`` to ``outfp``.

    Each line is the ``repr`` of ``(level, title, dest, pageno)``, where
    ``pageno`` is the 0-based index of the destination page, or ``None``
    when the entry has neither a destination nor a usable '/GoTo' action.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    # ``with`` closes the file even if reading the outline raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its 0-based position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Named destination: look it up, then unwrap a
                    # dictionary-style entry to its 'D' value.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
开发者ID:joshmgrant,项目名称:pdfminer,代码行数:29,代码来源:dumppdf.py

示例13: parse

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
    def parse(self, path):
        """Convert the PDF file at ``path`` to text and wrap it in a ``Sample``.

        Returns ``Sample(path, None, <extracted text>)``.

        Raises ``NotImplementedError`` when ``path`` is a directory.
        """
        # Directories are not supported.
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        # PDFs are binary: open in 'rb' mode, and close the file when done.
        with open(path, 'rb') as fp:
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
开发者ID:hcouch21,项目名称:styloproject,代码行数:30,代码来源:PdfParser.py

示例14: dumppdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    * ``objids``: each object is fetched with ``doc.getobj`` and passed to
      ``dumpxml``.
    * ``pagenos``: for every page whose 0-based index is in this collection,
      either each of its content streams (when ``codec`` is set) or its
      attributes (when it is not) is passed to ``dumpxml``.
    * ``dumpall``: when true, ``dumpallobjs`` is called for the document.
    * If none of the three is given, ``dumptrailers`` is called instead.

    A trailing newline is written unless ``codec`` is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Close the input file even when dumping fails part-way.
        fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
开发者ID:Adniel,项目名称:ComparePdf,代码行数:31,代码来源:dumppdf.py

示例15: parse_pdf_pdfminer

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_pages [as 别名]
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text page by page from the PDF file object ``f``.

        The handler's ``print_header``/``print_footer`` are called around the
        run; each page's text is encoded to UTF-8 bytes and passed to
        ``self.parse_page`` together with ``fpath`` and the 1-based page
        number. When ``self.dedup`` is set, ``self.dedup_store`` is reset to
        an empty set first.

        ``KeyboardInterrupt`` and ``SystemExit`` propagate; any other
        exception is reported through ``self.handler.print_error``.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            # Initialize the document (library-default empty password)
            # before asking it for pages.
            doc.initialize()
            for page_num, page in enumerate(doc.get_pages(), 1):
                # A fresh buffer per page keeps each page's text separate.
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
                retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
开发者ID:sebdraven,项目名称:ioc_parser,代码行数:33,代码来源:iocp.py


注:本文中的pdfminer.pdfparser.PDFDocument.get_pages方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。