当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.set_parser方法代码示例

本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.set_parser方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.set_parser方法的具体用法?Python PDFDocument.set_parser怎么用?Python PDFDocument.set_parser使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfparser.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.set_parser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: Pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
class Pdf(object):

    def __init__(self, pdf_file):
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.initialize
        self._doc.set_parser(parser)

    @property
    def pages(self):
        return len(tuple(self._doc.get_pages()))

    def to_text(self):
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
                interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:27,代码来源:pdf.py

示例2: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    doc = PDFDocument()
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)
    pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
    for (level,title,dest,a,se) in doc.get_outlines():
        pageno = None
        if dest:
            dest = resolve1( doc.lookup_name('Dests', dest) )
            if isinstance(dest, dict):
                dest = dest['D']
            pageno = pages[dest[0].objid]
        elif a:
            action = a.resolve()
            if isinstance(action, dict):
                subtype = action.get('S')
                if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                    dest = action['D']
                    pageno = pages[dest[0].objid]
        outfp.write(repr((level,title,dest,pageno))+'\n')
    parser.close()
    fp.close()
    return
开发者ID:joshmgrant,项目名称:pdfminer,代码行数:29,代码来源:dumppdf.py

示例3: getData

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def getData(fileName):
 doc = PDFDocument()
 fp = file(fileName, 'rb')
 parser = PDFParser(fp)
 try:
  parser.set_document(doc)
  doc.set_parser(parser)
 except:
  return "error"
   
 parser.close()
 fp.close()
 try:
  for xref in doc.xrefs:
   info_ref=xref.trailer.get('Info')
   if info_ref:
    info=resolve1(info_ref)
   metadata=info
   if metadata == None:
    return "Empty metadata"
   else:
    if metadata.has_key('Author'):
     print("Author "+metadata['Author'])
    if metadata.has_key('Company'):
     print("Company "+metadata['Company'])
    if metadata.has_key('Producer'):
     print("Producer "+metadata['Producer'])
    if metadata.has_key('Creator'):
     print("Creator "+metadata['Creator'])         
 except Exception,e:
  print "\t [x] Error in PDF extractor"
  return e 
开发者ID:jmortega,项目名称:europython_ethical_hacking,代码行数:34,代码来源:metadataPDF.py

示例4: initialize_pdf_miner

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def initialize_pdf_miner(fh):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    # for page in doc.get_pages():
    #    interpreter.process_page(page)

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:AymanYac,项目名称:OCP-Graduation-Project,代码行数:33,代码来源:pdftables.py

示例5: get_metadata

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
    def get_metadata(self):
        """Returns metadata from both
    	   the info field (older PDFs) and XMP (newer PDFs).
           Return format is a .modules.metadata.Metadata object
    	"""
        file_pointer = open(self.path, 'rb')
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        metadata = Metadata()
        for i in doc.info:
            metadata.add(i)
        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            #Let's add only the most useful one
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")
        file_pointer.close()

        self.metadata = metadata
        return metadata
开发者ID:staffanm,项目名称:protokollen,代码行数:30,代码来源:pdf.py

示例6: extract_text_elements_from_pdf

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extract every element that is text based (LTText).
    """
    fp = open(path, 'rb')
    doc = PDFDocument(caching=True)
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = []
    all_elements = []
    enumerated_pages = list(enumerate(doc.get_pages()))
    progress_msg = "Reading page %i of %i"
    for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
        interpreter.process_page(page)
        page_layout = device.get_result()
        pages.append(Page(page_layout.width, page_layout.height))
        textboxes = extract_textboxes(page_layout)
        elements = [create_element(box) for box in textboxes]
        merge_oneletter_elems(elements)
        for i, elem in enumerate(elements):
            elem.page = pageno
            elem.order = i
        all_elements += elements
    return pages, all_elements
开发者ID:hsoft,项目名称:pdfmasher,代码行数:31,代码来源:pdf.py

示例7: parse_pdf_pdfminer

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
    def parse_pdf_pdfminer(self, f, fpath):
        try:
            laparams = LAParams()
            laparams.all_texts = True  
            rsrcmgr = PDFResourceManager()
            pagenos = set()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            page_num = 0
            parser= PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            for page in doc.get_pages():
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                page_num += 1
                interpreter.process_page(page)
                data = retstr.getvalue()
                self.parse_page(fpath, bytes(data,'UTF-8'), page_num)
                retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
开发者ID:sebdraven,项目名称:ioc_parser,代码行数:33,代码来源:iocp.py

示例8: create_pages

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
    def create_pages(self):
        """Apply parsing function, returning the results"""

        from public_project.models import Page
        # create a parser object associated with the file object
        parser = PDFParser(self.pdf_file)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        if doc.is_extractable:
            # apply the function and return the result
            doc_pages = self._parse_pages(doc)

        i = 1
        for doc_page in doc_pages:
            page = Page(
                document=self.document,
                number=i,
                content = smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
            i = i + 1
开发者ID:atassumer,项目名称:django-public-project,代码行数:30,代码来源:doc_scanner.py

示例9: harvest_file

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
 def harvest_file(self, path):
     with open(path, 'rb') as fp:
         # FIXME: how do we know which encoding to use? Should we
         # use 'chardet' to detect it?
         encoding = 'utf-8'
         parser = PDFParser(fp)
         if HAS_PDFMINER_3K:
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
         else:
             doc = PDFDocument(parser)
         title = doc.info[0].get('Title', '')
         if isinstance(title, PDFObjRef):
             title = title.resolve()
         if isinstance(title, bytes):
             # This may not be necessary with pdfminer3k.
             try:
                 title = title.decode(encoding)
             except UnicodeDecodeError:
                 logger.warning('Could not correctly decode title of "%s".', path)
                 title = title.decode(encoding, 'ignore')
         fp.seek(0)
         content = extract_content(fp, encoding).strip()
         try:
             content = content.decode(encoding)
         except UnicodeDecodeError:
             logger.warning('Could not correctly decode content of "%s".', path)
             content = content.decode(encoding, 'ignore')
     return {
         'title': title,
         'content': content,
         'kind': 'PDF',
     }
开发者ID:Polyconseil,项目名称:dokang_pdf,代码行数:36,代码来源:__init__.py

示例10: extractContent

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def extractContent(file):
    print "extractContent"

    fp = open(file, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    rsrcmgr = PDFResourceManager()
    codec = 'UTF-8'
    laparams = LAParams()
    outfp = StringIO.StringIO()

    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    #if not doc.is_extractable:
    #    return None

    for i, page in enumerate(doc.get_pages()):
        print "page=" + str(i)
        if page is not None:
            interpreter.process_page(page)
    print "EOF"
    device.close()
    fp.close()

    return outfp.getvalue()
开发者ID:CaliopeProject,项目名称:CaliopeServer,代码行数:31,代码来源:PDFProcessor.py

示例11: read_invoice_pdfminer3k

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    fp = open(os.path.join(invoice_path + "\\" + pdfFile), "rb")

    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    doc.initialize("")
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    invoice_text = ""
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                invoice_text += lt_obj.get_text()

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
开发者ID:vinovator,项目名称:Vinlab,代码行数:36,代码来源:pdfInvoiceMiner.py

示例12: convert_pdf_to_txt

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def convert_pdf_to_txt(path):

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)

        doc = PDFDocument(caching=True)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.

        for page in doc.get_pages():
            interpreter.process_page(page)
        text = retstr.getvalue()

    device.close()
    retstr.close()

    return text
开发者ID:suriyan,项目名称:autosum,代码行数:28,代码来源:autosumpdf.py

示例13: ParseAllPages

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
 def ParseAllPages(self, filepath):
     # Open a PDF file.
     self.filepath = filepath
     fp = open(filepath, 'rb')
     # Create a PDF parser object associated with the file object.
     parser = PDFParser(fp)
     # Create a PDF document object that stores the document structure.
     doc = PDFDocument()
     # Connect the parser and document objects.
     parser.set_document(doc)
     doc.set_parser(parser)
     # Supply the password for initialization.
     # (If no password is set, give an empty string.)
     password = ""
     doc.initialize(password)
     # Check if the document allows text extraction. If not, abort.
     if not doc.is_extractable:
         raise PDFTextExtractionNotAllowed
     # Create a PDF resource manager object that stores shared resources.
     rsrcmgr = PDFResourceManager()
     # Create a PDF device object.
     device = PDFDevice(rsrcmgr)
     # Create a PDF interpreter object.
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     # Process each page contained in the document.
     for page in doc.get_pages():
         interpreter.process_page(page)
开发者ID:i11uminator,项目名称:bookservice,代码行数:29,代码来源:MyPdfMiner.py

示例14: PdfSerializer

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
class PdfSerializer(object):
    def __init__(self, filename):
        self.__filename = filename

        fp = open(self.__filename, 'rb')
        parser = PDFParser(fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def writeToTxt(self):
        text = self.getString()
        txtFile = open(self.__filename.replace(".pdf", ".txt"), "w")
        txtFile.write(text.encode('ascii','replace').decode("utf-8"))
        txtFile.close()

    def getString(self):
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
开发者ID:vsharonlynn,项目名称:cs3219-project-CViA,代码行数:28,代码来源:pdf_serializer.py

示例15: get_pdf_metadata

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    fp = None
    if fileOrUrl.startswith('http://') or fileOrUrl.startswith('https://'):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        pdfdata = fobj.read()
        fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.iteritems():
                print '{0}:{1}={2}'.format(
                    fileOrUrl, name, val
                )
    else:
        val = doc.info
        if type(val) is list and len(val) == 1:
            val = val[0]
        print prefix + str(val)
开发者ID:kristerhedfors,项目名称:bin,代码行数:33,代码来源:pdfmeta.py


注:本文中的pdfminer.pdfparser.PDFDocument.set_parser方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。