当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.initialize方法代码示例

本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.initialize方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.initialize方法的具体用法?Python PDFDocument.initialize怎么用?Python PDFDocument.initialize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfparser.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.initialize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: pdf_function

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open a PDF and apply ``function`` to the parsed document.

        ``function`` is not defined in this block; it is resolved from the
        surrounding scope.

        Args:
            pdf_doc: Path of the PDF file to open.
            password: Password passed to ``doc.initialize``.
            *args: Extra positional arguments forwarded to ``function``.
            **kwargs: Extra keyword arguments forwarded to ``function``.

        Returns:
            The value returned by ``function(doc, *args, **kwargs)``, or
            ``None`` when an ``IOError`` occurs or the document reports that
            it is not extractable.
        """
        result = None
        try:
            # The context manager closes the file even when parsing or the
            # callback raises; a trailing fp.close() would be skipped then.
            with open(pdf_doc, 'rb') as fp:
                # Wire the parser and the document together before
                # initializing the document with the password.
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(password)

                if doc.is_extractable:
                    result = function(doc, *args, **kwargs)
        except IOError:
            # The file doesn't exist or a similar I/O problem occurred;
            # the caller gets None in that case.
            pass
        return result
开发者ID:ArcainOne,项目名称:anathema,代码行数:27,代码来源:book_parser.py

示例2: ParseAllPages

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
 def ParseAllPages(self, filepath):
     """Run every page of the PDF at ``filepath`` through a page interpreter.

     The path is stored on ``self.filepath`` before the file is opened.

     Args:
         filepath: Path of the PDF file to process.

     Raises:
         PDFTextExtractionNotAllowed: If the document reports that it is
             not extractable.
     """
     self.filepath = filepath
     # The context manager guarantees the file is closed, including when
     # extraction is refused or a page fails to process.
     with open(filepath, 'rb') as fp:
         # Connect the parser and the document objects.
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         # No password is set, so an empty string is supplied.
         doc.initialize("")
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Shared resource manager, a plain device and the interpreter
         # that feeds pages to that device.
         rsrcmgr = PDFResourceManager()
         device = PDFDevice(rsrcmgr)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         for page in doc.get_pages():
             interpreter.process_page(page)
开发者ID:i11uminator,项目名称:bookservice,代码行数:29,代码来源:MyPdfMiner.py

示例3: extract_text_elements_from_pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every text-based layout element.

    Args:
        path: Path of the PDF file to read.
        j: Progress helper; its ``iter_with_progress(iterable, message)``
            method is used to iterate over the enumerated pages.

    Returns:
        A ``(pages, all_elements)`` tuple: ``pages`` holds one ``Page`` per
        PDF page (built from the layout width and height) and
        ``all_elements`` holds the elements of all pages, each tagged with
        its page number and its order within that page.
    """
    pages = []
    all_elements = []
    # All page processing happens inside the ``with`` block so the file is
    # still open while pages are read, and is closed even if a page fails.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialized as a list before being handed to the progress helper.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
开发者ID:hsoft,项目名称:pdfmasher,代码行数:31,代码来源:pdf.py

示例4: initialize_pdf_miner

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out the pages of a PDF.

    Args:
        fh: Open file object for the PDF; it is handed to ``PDFParser``.

    Returns:
        A ``(doc, interpreter, device)`` tuple where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.

    Raises:
        ValueError: If the document reports that it is not extractable.
    """
    # Connect the parser and the document objects.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # No password is set, so an empty string is supplied.
    doc.initialize("")
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters; word_margin is forced to 0.0.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Only the aggregator device and its interpreter are returned, so no
    # intermediate plain PDFDevice/interpreter pair is created first.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:AymanYac,项目名称:OCP-Graduation-Project,代码行数:33,代码来源:pdftables.py

示例5: dumppdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump selected parts of a PDF file to ``outfp``.

  Args:
    outfp: Writable output stream.
    fname: Path of the PDF file to read.
    objids: Object ids to dump; streams are written raw or decoded when
      ``codec`` is ``'raw'`` or ``'binary'``, otherwise dumped as XML.
    pagenos: Zero-based page indexes whose attributes are dumped as XML.
    password: Password passed to ``doc.initialize``.
    dumpall: When true, every object is dumped via ``dumpallobjs``.
    codec: Stream output mode forwarded to the dump helpers.

  When none of ``objids``, ``pagenos`` and ``dumpall`` is set, the trailers
  are dumped instead. A newline is appended unless ``codec`` is ``'raw'``
  or ``'binary'``.
  """
  # ``open`` replaces the Python 2-only ``file`` builtin, and the context
  # manager closes the handle even when one of the dump helpers raises.
  with open(fname, 'rb') as fp:
    doc = PDFDocument()
    # The parser is constructed with the document so the two are linked.
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        if isinstance(obj, PDFStream) and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif isinstance(obj, PDFStream) and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno, page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  if codec not in ('raw','binary'):
    outfp.write('\n')
  return
开发者ID:frid,项目名称:PythonPool,代码行数:29,代码来源:dumppdf.py

示例6: pdf_isvalid

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def pdf_isvalid(filelike):
    """Tell whether a seekable file-like object holds a usable PDF.

    The stream must start with ``PDF_MAGIC`` and must then initialize as a
    document (empty password) that reports itself as extractable. A
    ``PDFException`` raised along the way is logged as a warning and
    counts as "not valid".

    @param filelike: filelike object, seekable
    @return: True if valid pdf, else False
    """
    log = logging.getLogger()
    filelike.seek(0)

    # Guard clause: a wrong header short-circuits without rewinding.
    header = filelike.read(len(PDF_MAGIC))
    if header != PDF_MAGIC:
        return False
    filelike.seek(0)

    verdict = False
    try:
        pdf_parser = PDFParser(filelike)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        verdict = bool(document.is_extractable)
    except PDFException as excobj:
        message = "pdf has valid header but, still not valid pdf, exception was %r" % (excobj)
        log.warning(message)
        verdict = False

    # Leave the stream rewound for whoever uses it next.
    filelike.seek(0)
    return verdict
开发者ID:emulbreh,项目名称:ecs,代码行数:28,代码来源:pdfutils.py

示例7: getPDFMetadata

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def getPDFMetadata(path):
    """Return the metadata of the PDF at ``path`` as a dictionary.

    The result starts as a copy of the first entry of ``doc.info`` (an
    empty dict when ``doc.info`` is empty). When the catalog has a
    ``'Metadata'`` entry, the dictionary produced by ``xmp_to_dict`` from
    that stream is merged in on a best-effort basis.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A dict of metadata entries.
    """
    # The context manager closes the file even when parsing raises.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # NOTE(review): the previous code called .update() on ``doc.info``
        # itself and returned ``doc.info[0]``. Assuming ``doc.info`` is a
        # list (as the [0] index suggests), those updates always failed
        # inside a bare ``except`` and the XMP data was never merged.
        # Merging into the first info entry is the presumed intent.
        result = dict(doc.info[0]) if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best effort: unparseable XMP leaves the info entries
                # untouched. KeyboardInterrupt/SystemExit now propagate.
                pass

    return result
开发者ID:tanmoydeb07,项目名称:ebookreader,代码行数:31,代码来源:readerUtilities.py

示例8: WithPdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, and apply the function, returning the results.

     Args:
         pdfdoc: Path of the PDF file to open.
         password: Document password; when truthy it replaces
             ``self.password`` before the document is initialized.
         fn: Callable invoked as ``fn(doc, *args)``.
         *args: Extra positional arguments forwarded to ``fn``.

     Returns:
         The value returned by ``fn``, or ``None`` when an ``IOError``
         occurs or the document reports that it is not extractable.
     """
     result = None
     try:
         # The context manager closes the file even when parsing or ``fn``
         # raises; a trailing fp.close() would be skipped in that case.
         with open(pdfdoc, 'rb') as fp:
             # Connect the parser and the document objects.
             parser = PDFParser(fp)
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
             # A non-empty password is remembered on the instance and the
             # stored value is what initializes the document.
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 result = fn(doc, *args)
     except IOError:
         # The file doesn't exist or a similar I/O problem occurred.
         pass
     return result
开发者ID:i11uminator,项目名称:bookservice,代码行数:30,代码来源:MyPdfMiner.py

示例9: read_invoice_pdfminer3k

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number from an invoice PDF and record them.

    The PDF is read from ``invoice_path`` (defined outside this block). The
    text of every ``LTTextBox``/``LTTextLine`` layout object is
    concatenated; the client is extracted from that text and the invoice
    number from the file name. Both are printed and passed to
    ``write_excel``.

    Args:
        pdfFile: File name of the invoice inside ``invoice_path``.
    """
    text_parts = []
    # os.path.join gets both components so the platform's separator is
    # used instead of a hard-coded backslash; the context manager closes
    # the file even when page processing raises.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    # Joining once avoids repeated string concatenation in the loop.
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
开发者ID:vinovator,项目名称:Vinlab,代码行数:36,代码来源:pdfInvoiceMiner.py

示例10: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))`` followed by a
    newline; ``pageno`` is the zero-based index of the target page, or
    ``None`` when the entry has neither a destination nor a usable
    ``/GoTo`` action.

    Args:
        outfp: Writable output stream.
        fname: Path of the PDF file to read.
        objids: Unused here.
        pagenos: Unused here.
        password: Password passed to ``doc.initialize``.
        dumpall: Unused here.
        codec: Unused here.
    """
    # ``open`` replaces the Python 2-only ``file`` builtin, and the context
    # manager closes the handle even when outline resolution raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # Map each page id to its index so destinations resolve to numbers.
        pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
        for (level,title,dest,a,se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1( doc.lookup_name('Dests', dest) )
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            elif a:
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = action['D']
                        pageno = pages[dest[0].objid]
            outfp.write(repr((level,title,dest,pageno))+'\n')
        parser.close()
    return
开发者ID:joshmgrant,项目名称:pdfminer,代码行数:29,代码来源:dumppdf.py

示例11: create_pages

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` per parsed page.

        Pages are numbered from 1 and attached to ``self.document``. When
        the document reports that it is not extractable, nothing is saved.
        """
        from public_project.models import Page
        # Connect the parser and the document objects.
        parser = PDFParser(self.pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password: initialize with an empty string.
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        if not doc.is_extractable:
            # Without this guard the loop below would reference an unbound
            # ``doc_pages`` and raise a NameError.
            return

        doc_pages = self._parse_pages(doc)
        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
开发者ID:atassumer,项目名称:django-public-project,代码行数:30,代码来源:doc_scanner.py

示例12: PdfSerializer

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it as ``.txt``.

    The PDF is opened and initialized (empty password) on construction.
    Call ``close()`` when done to release the underlying file handle.
    """

    def __init__(self, filename):
        """Open ``filename`` and initialize the pdfminer document for it."""
        self.__filename = filename

        # The handle is kept on the instance: getString() iterates the
        # document's pages later, presumably still reading from this file.
        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Don't leak the handle when the document cannot be set up.
            self.__fp.close()
            raise

    def close(self):
        """Close the PDF file opened by the constructor."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a ``.txt`` suffix.

        Characters that cannot be represented in ASCII are replaced by
        ``?`` before writing.
        """
        import os

        text = self.getString()
        # Swap only the final extension. A substring replace of ".pdf"
        # leaves the name unchanged when it has no lowercase ".pdf", and
        # the text would then be written over the PDF itself.
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii','replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as produced by ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
开发者ID:vsharonlynn,项目名称:cs3219-project-CViA,代码行数:28,代码来源:pdf_serializer.py

示例13: get_toc

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
    def get_toc(self):
        """Return the table-of-contents titles of ``self.pdf``.

        Side effect: when the first ``doc.info`` entry has a ``'Title'``
        key, ``self.title`` is set to its normalized value.

        Returns:
            A list of normalized titles of the outline entries whose level
            equals the level chosen by ``self.get_level1``, or ``None``
            when the outlines cannot be obtained or inspected.
        """
        # The context manager closes the file on every exit path,
        # including the early ``return None`` below.
        with open(self.pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # title
            if doc.info:
                metadict = doc.info[0]
                if 'Title' in metadict:
                    self.title = normalize_title(metadict['Title'])

            # level 1 of toc
            try:
                outlines = doc.get_outlines()
                select_level = self.get_level1(outlines)
            except Exception:
                # Any failure while obtaining or inspecting the outlines
                # means "no table of contents". Unlike a bare ``except``,
                # KeyboardInterrupt/SystemExit still propagate.
                return None

            # The outlines are requested a second time for this pass, as
            # ``get_level1`` was handed the first result.
            return [normalize_toc_item(title)
                    for (level, title, dest, a, se) in doc.get_outlines()
                    if level == select_level]
开发者ID:larscwallin,项目名称:pdfdig,代码行数:27,代码来源:pdfinfo.py

示例14: parse

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
    def parse(self, path):
        """Extract the text of the PDF at ``path`` into a ``Sample``.

        Args:
            path: Path of a PDF file. Directories are not supported.

        Returns:
            ``Sample(path, None, text)`` where ``text`` is the output that
            ``TextConverter`` wrote for all pages.

        Raises:
            NotImplementedError: If ``path`` is a directory.
        """
        # Directory: rejected up front, before any buffer or file is made.
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)

        # PDFs are binary, so the file is opened in 'rb' mode. The context
        # manager closes it even when page processing raises.
        with open(path, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)

        device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
开发者ID:hcouch21,项目名称:styloproject,代码行数:30,代码来源:PdfParser.py

示例15: dumppdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of a PDF file to ``outfp`` as XML.

    Args:
        outfp: Writable output stream.
        fname: Path of the PDF file to read.
        objids: Object ids to dump via ``dumpxml``.
        pagenos: Zero-based page indexes to dump. With a ``codec`` the
            page's content streams are dumped, otherwise its attributes.
        password: Password passed to ``doc.initialize``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Stream output mode forwarded to the dump helpers.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is set, the
    trailers are dumped instead. A newline is appended unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    # ``open`` replaces the Python 2-only ``file`` builtin, and the context
    # manager closes the handle even when one of the dump helpers raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw','binary'):
        outfp.write('\n')
    return
开发者ID:Adniel,项目名称:ComparePdf,代码行数:31,代码来源:dumppdf.py


注:本文中的pdfminer.pdfparser.PDFDocument.initialize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。