當前位置: 首頁>>代碼示例>>Python>>正文


Python pdfparser.PDFParser方法代碼示例

本文整理匯總了Python中pdfminer.pdfparser.PDFParser方法的典型用法代碼示例。如果您正苦於以下問題:Python pdfparser.PDFParser方法的具體用法?Python pdfparser.PDFParser怎麼用?Python pdfparser.PDFParser使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在pdfminer.pdfparser的用法示例。


在下文中一共展示了pdfparser.PDFParser方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: convert_pdf_to_txt

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def convert_pdf_to_txt(path):
    """Return the text of every text box / text line found in a PDF file.

    The document is wired up with the legacy pdfminer calls
    (``PDFDocument()`` + ``set_document`` / ``set_parser`` / ``initialize``)
    and initialised with an empty password.

    :param path: path of the PDF file to read.
    :return: the ``get_text()`` output of every ``LTTextBox`` / ``LTTextLine``
        layout object, concatenated in page order.
    """
    chunks = []
    # Keep the file open for the whole extraction and close it on every exit
    # path, including when pdfminer raises.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    # Join once instead of growing a string with repeated ``+=``.
    return ''.join(chunks)
開發者ID:opensourcesec,項目名稱:Forager,代碼行數:22,代碼來源:pdfConverter.py

示例2: read_fields

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    :param pdffile: path of the PDF file to inspect.
    :return: ``None`` when the document catalog has no ``AcroForm`` entry;
        otherwise the list populated by ``recursively_add_fields``, sorted
        with ``fieldsorter`` as the key function.
    """
    outfields = []
    # The ``with`` block closes the file on every exit path, including the
    # early ``return None`` below.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page's object id to its 1-based position in the document.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
開發者ID:jhpyle,項目名稱:docassemble,代碼行數:19,代碼來源:pdftk.py

示例3: __init__

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def __init__(self,
        stream,
        pages = None,
        laparams = None,
        precision = 0.001,
        password = ""
    ):
        """Open a PDF from a stream and prepare pdfminer for page processing.

        :param stream: binary stream handed to ``PDFParser``; also stored as
            ``self.stream``.
        :param pages: stored unchanged as ``self.pages_to_parse``.
        :param laparams: ``None``, or a mapping of keyword arguments used to
            build ``LAParams(**laparams)``.
        :param precision: stored unchanged as ``self.precision``.
        :param password: password forwarded to ``PDFDocument``.
        """
        self.laparams = None if laparams is None else LAParams(**laparams)
        self.stream = stream
        self.pages_to_parse = pages
        self.precision = precision
        rsrcmgr = PDFResourceManager()
        self.doc = PDFDocument(PDFParser(stream), password=password)

        # Merge every info dictionary of the document; later ones win.
        self.metadata = {}
        for info in self.doc.info:
            self.metadata.update(info)

        # Normalise the values in place. Iterate over a snapshot so the dict
        # is never mutated while its live view is being walked.
        for k, v in list(self.metadata.items()):
            if hasattr(v, "resolve"):
                v = v.resolve()
            if isinstance(v, list):
                self.metadata[k] = list(map(decode_text, v))
            elif isinstance(v, PSLiteral):
                self.metadata[k] = decode_text(v.name)
            elif isinstance(v, bool):
                self.metadata[k] = v
            else:
                self.metadata[k] = decode_text(v)

        self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
開發者ID:jsvine,項目名稱:pdfplumber,代碼行數:31,代碼來源:pdf.py

示例4: main

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def main():
    """Prompt for a PDF file name and dump its AcroForm fields as JSON.

    The collected ``{field name: value}`` mapping is printed and then written
    to ``./output/pdfsheet-test.json``.

    :raises Exception: when the AcroForm field list cannot be read from the
        document; the underlying error is chained as the cause.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception as exc:
            # Keep the original message but preserve the real cause, and do
            # not intercept KeyboardInterrupt / SystemExit.
            raise Exception('This is not a form-fillable character sheet!') from exc
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a /T entry has no key to store it under;
                # skip it instead of crashing on ``None.decode``.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif isinstance(value, bytes):
                value = value.decode('iso-8859-1').strip()
            # Any other value type (including None) is stored unchanged.

            character[name.decode('iso-8859-1').strip()] = value

        print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)
開發者ID:avrae,項目名稱:avrae,代碼行數:29,代碼來源:pdfsheet.py

示例5: getData

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def getData(self):
		"""Read the trailer ``Info`` dictionary of the PDF at ``self.fname``.

		On success the resolved dictionary is stored in both ``self.metadata``
		and ``self.raw``.

		:return: ``"error"`` when the document cannot be initialised with
			``self.password``; ``"Empty metadata"`` when ``self.raw`` is still
			``None`` after scanning the xrefs; ``"ok"`` otherwise. If scanning
			the trailers raises, a diagnostic line is printed and the
			exception object itself is returned.
		"""
		doc = PDFDocument()
		# ``with`` closes the file on every return path, including "error".
		with open(self.fname, 'rb') as fp:
			parser = PDFParser(fp)
			try:
				try:
					parser.set_document(doc)
					doc.set_parser(parser)
					doc.initialize(self.password)
				except Exception:
					return "error"
				try:
					for xref in doc.xrefs:
						info_ref = xref.trailer.get('Info')
						if info_ref:
							# Only assign when this trailer really has an Info
							# entry, so a missing entry can neither hit an
							# unbound name nor re-store a stale value.
							info = resolve1(info_ref)
							self.metadata = info
							self.raw = info
					if self.raw is None:
						return "Empty metadata"
					return "ok"
				except Exception as e:
					# Report before returning; a statement placed after the
					# ``return`` would never run.
					print("\t [x] Error in PDF extractor, Trailer Info")
					return e
			finally:
				parser.close()
開發者ID:Yukinoshita47,項目名稱:Yuki-Chan-The-Auto-Pentest,代碼行數:34,代碼來源:metadataPDF.py

示例6: analyze_pages

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily run layout analysis over a PDF, one page at a time.

    :param file_name: path to the PDF file; it is resolved with
        ``os.path.realpath`` before being opened.
    :param char_margin: forwarded to ``LAParams(char_margin=...)``.
    :return: a generator yielding ``device.get_result()`` for each page that
        was processed. A page whose processing raises ``OverflowError`` is
        logged and skipped.
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)
    with open(resolved_path, "rb") as pdf_file:
        # Parser -> document, opened with an empty password.
        pdf_document = PDFDocument(PDFParser(pdf_file), password="")
        # Shared resources and layout-analysis parameters.
        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        # Aggregator device plus the interpreter that feeds it.
        aggregator = CustomPDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for page_index, pdf_page in enumerate(PDFPage.create_pages(pdf_document)):
            try:
                page_interpreter.process_page(pdf_page)
            except OverflowError as overflow:
                logger.exception(
                    "{}, skipping page {} of {}".format(
                        overflow, page_index, file_name
                    )
                )
            else:
                # Runs only when the page was processed without overflow.
                yield aggregator.get_result()
開發者ID:HazyResearch,項目名稱:pdftotree,代碼行數:36,代碼來源:pdf_utils.py

示例7: extract_first_jpeg_in_pdf

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data starts with the JPEG
    signature ``FF D8 FF E0`` and return that data.

    Only ``LTImage`` objects that sit directly inside an ``LTFigure`` of a
    page layout are inspected. There is no 100% guarantee for this code, yet
    it seems to work fine with most scanner-produced images around; more
    testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs, but the original author had no suitable test
    data for that.

    :param fstream: readable binary stream of the PDF.
    :return: the image data as bytes, or None if no matching image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image: prefer the decoded stream, fall back to the
                # raw bytes when decoding fails (per the original author this
                # happens nearly always).
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None
開發者ID:konstantint,項目名稱:PassportEye,代碼行數:41,代碼來源:pdf.py

示例8: parse_case

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    :param case_path: folder path. It is concatenated directly with file
        names, and its second-to-last ``/``-separated component is used as the
        case id, so it is expected to end with ``/``.
    :return: always None. Any failure is reported by printing
        ``"Error " + case_path``.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }

        for name in os.listdir(case_path):
            # Skip hidden files and anything that is not a .pdf.
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj

            # Close each PDF as soon as its pages have been processed.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # The ``with`` block flushes and closes the output deterministically.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing case and carry on, but
        # let KeyboardInterrupt / SystemExit propagate.
        print("Error " + case_path)

    return None
開發者ID:thomas0809,項目名稱:GraphIE,代碼行數:41,代碼來源:parse_pdf.py

示例9: get_pdf_metadata

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def get_pdf_metadata(self, pdf):
        """Extract author, title and year from raw PDF bytes.

        :param pdf: the PDF content; it is written to a temporary file that
            is then parsed.
        :return: a dict with the keys ``author``, ``title`` and ``year``. A
            value that cannot be read keeps its ``UNKNOWN_*`` placeholder.
        """
        metadata = {'author': 'UNKNOWN_AUTHOR',
                    'title': 'UNKNOWN_TITLE',
                    'year': 'UNKNOWN_YEAR'}

        # The ``with`` block closes the temporary file even if the parser
        # constructor raises.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)
            pdf_parser = PDFParser(temp_pdf_file)

            try:
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                year = pdf_metadata_moddate_to_year(make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Deliberately best-effort: on any parsing problem the fields
                # not yet filled in keep their UNKNOWN_* placeholders.
                pass

        return metadata
開發者ID:leovan,項目名稱:SciHubEVA,代碼行數:33,代碼來源:scihub_api.py

示例10: parse

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse():
    """Print and save every horizontal text box of the PDF at ``path``.

    ``path`` is read from the enclosing (module) scope. The text of each
    ``LTTextBoxHorizontal`` is printed and appended to ``out.txt`` followed by
    a newline.

    :raises PDFTextExtractionNotAllowed: when the document reports that it is
        not extractable.
    """
    # Open in binary read mode; the ``with`` block closes the file on every
    # exit path, including the raise below.
    with open(path, 'rb') as fp:
        # Build the parser and the document and connect them to each other.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without supplying a password.
        doc.initialize()

        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregator device and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Handle one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The layout is the LTPage for this page; only its horizontal
            # text boxes are kept.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if not texts:
                # Do not touch out.txt for pages without text boxes.
                continue
            # Open the output once per page instead of once per text box.
            with open(r'out.txt', 'a') as f:
                for results in texts:
                    print(results)
                    f.write(results + '\n')
開發者ID:YinChao126,項目名稱:anack,代碼行數:40,代碼來源:pdf_decoder.py

示例11: parse_pdf

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse_pdf(self, fp):
        """Interpret every page of a PDF and parse the annotations it carries.

        :param fp: file object handed to ``PDFParser``.

        For each page (numbered from 0) that has a truthy ``annots``
        attribute, ``self.parse_annotations(pgnum, page)`` is called.
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Only the plain PDFDevice pair is used; an aggregator/interpreter
        # pair built before it would be overwritten immediately.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)
開發者ID:rammie,項目名稱:pdfjinja,代碼行數:16,代碼來源:pdfjinja.py

示例12: process_pdf

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
        """Locate the "List of Tables" entry of a PDF and process its page.

        Every object id listed in the document's xrefs is fetched. The first
        dict whose ``Title`` contains ``"List of Tables"`` has its ``A`` /
        ``D`` destination resolved to a page object id; the matching page is
        passed to ``cls.process_table_index`` and the method returns.

        :param pdf: object handed to ``pdfparser.PDFParser``.
        :param output: forwarded to ``cls.process_table_index``.
        :param verbose: forwarded to ``cls.process_table_index``.
        :param tables: forwarded to ``cls.process_table_index``.
        :return: None, whether or not a matching entry was found.
        """
        parser = pdfparser.PDFParser(pdf)
        document = pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfinterp.PDFResourceManager(caching=True)

        params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                                 line_overlap=0.4, boxes_flow=0.5)
        device = converter.PDFPageAggregator(rsrcmgr, laparams=params)

        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): the result is unused. The call is kept because it may
        # raise for documents without outlines -- confirm whether that check
        # is intended before removing it.
        document.get_outlines()
        # Page number (0-based) -> page object.
        pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
        for xref in document.xrefs:
            for oid in xref.get_objids():
                obj = document.getobj(oid)
                if not isinstance(obj, dict):
                    continue
                # ``in obj`` and ``.items()`` exist on both the Python 2 and
                # Python 3 dict APIs, unlike ``iterkeys`` / ``iteritems``.
                if "Title" in obj and "List of Tables" in obj['Title']:
                    pageoid = obj['A'].resolve()['D'][0].objid
                    (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                                      if p.pageid == pageoid][0]
                    cls.process_table_index(parser, document, rsrcmgr, params, device,
                                            interpreter, pages, page, pageno, output,
                                            verbose, tables)
                    return
開發者ID:bx,項目名稱:bootloader_instrumentation_suite,代碼行數:28,代碼來源:parse_am37x_register_tables.py

示例13: _read_file

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def _read_file(self, filename):
        """Open ``filename`` in binary mode and return a ``PDFDocument`` for it.

        The file object is not closed here.
        NOTE(review): presumably the returned document still reads from the
        open handle -- confirm before adding a ``with`` block.
        """
        pdf_stream = open(filename, 'rb')
        return PDFDocument(PDFParser(pdf_stream))
開發者ID:johnlinp,項目名稱:pdf-to-markdown,代碼行數:6,代碼來源:parser.py

示例14: xmlFromPdf

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def xmlFromPdf(pdfpath, xmlpath=None):
    '''Find the XFA form data in a PDF file and return it as a parsed tree.

    Each object id listed in the document's xrefs is fetched; the first
    stream whose decoded data contains ``b'xfa-template'`` is parsed with
    ``etree.fromstring``. Streams whose filter is not implemented (for
    example JPEG images using /DCTDecode) are skipped.

    :param pdfpath: path of the PDF file to read.
    :param xmlpath: optional path; when given, the pretty-printed tree is
        also written there in binary mode.
    :return: the parsed tree.
    :raises CrypticXml: when no stream contains ``b'xfa-template'``.
    '''
    with open(pdfpath, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        object_ids = {
            object_id
            for xref in document.xrefs
            for object_id in xref.get_objids()
        }
        for object_id in object_ids:
            candidate = document.getobj(object_id)
            if isinstance(candidate, PDFStream):
                try:
                    data = candidate.get_data()
                except PDFNotImplementedError:
                    # Unsupported filter: this stream cannot be decoded.
                    continue
                if b'xfa-template' in data:
                    break
        else:
            # The loop ended without ``break``: nothing matched.
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # ``data`` now holds the form XML text.
    tree = etree.fromstring(data)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as out:
        out.write(etree.tostring(tree, pretty_print=True))
    return tree
開發者ID:jsaponara,項目名稱:opentaxforms,代碼行數:29,代碼來源:extractFillableFields.py

示例15: p2t

# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def p2t(sourcefile, outfile):
    """Append the text of every horizontal text box of a PDF to ``outfile``.

    :param sourcefile: path of the PDF file to read.
    :param outfile: path of the text file to append to, written as UTF-8.
    :return: None. A message is printed when the PDF cannot be opened as a
        document, when it reports that text extraction is not allowed, and
        when the conversion has finished.
    """
    # Local import: only the encoding-aware open() below needs it.
    import io

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create the document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正確下載')
            return
        # Check whether the file allows text extraction. As in the original,
        # this only reports the condition; extraction is still attempted.
        if not document.is_extractable:
            print(sourcefile + ' :不允許提取文本')
        # Resource manager, layout parameters, aggregator and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if not texts:
                # Do not touch the output for pages without text boxes.
                continue
            # Let the file object do the UTF-8 encoding: adding ``'\n'`` to
            # encoded bytes fails on Python 3. Open once per page rather than
            # once per text box.
            with io.open(outfile, 'a', encoding='utf-8') as f:
                for text in texts:
                    f.write(text + u'\n')
        print(sourcefile + '  已轉為 ' + outfile)

##############################################把doc轉為txt##############################################
# 調用之前要確保你在linux 下裝了catdoc 
開發者ID:startprogress,項目名稱:China_stock_announcement,代碼行數:37,代碼來源:formatFun.py


注:本文中的pdfminer.pdfparser.PDFParser方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。