本文整理匯總了Python中pdfminer.pdfparser.PDFParser方法的典型用法代碼示例。如果您正苦於以下問題:Python pdfparser.PDFParser方法的具體用法?Python pdfparser.PDFParser怎麼用?Python pdfparser.PDFParser使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類pdfminer.pdfparser的用法示例。
在下文中一共展示了pdfparser.PDFParser方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: convert_pdf_to_txt
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def convert_pdf_to_txt(path):
    """Return the text of every text box / text line in a PDF as one string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The concatenation of ``get_text()`` for each ``LTTextBox`` or
        ``LTTextLine`` layout object, in page order.
    """
    chunks = []
    # The handle stays open for the whole extraction (as in the original
    # flow) and is now reliably closed afterwards, even on error.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            chunks.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    # Join once instead of repeated string concatenation.
    return ''.join(chunks)
示例2: read_fields
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The list populated by ``recursively_add_fields``, sorted with
        ``fieldsorter`` as the key, or ``None`` when the document catalog
        has no ``AcroForm`` entry.
    """
    outfields = []
    # Everything that touches the document happens inside the ``with`` so the
    # file is open while objects are resolved and is closed on every exit
    # path, including the early ``return None``.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page id to its 1-based page number.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
        return sorted(outfields, key=fieldsorter)
示例3: __init__
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def __init__(self,
             stream,
             pages=None,
             laparams=None,
             precision=0.001,
             password=""):
    """Open a PDF from ``stream`` and prepare its metadata and interpreter.

    Args:
        stream: Binary stream handed to ``PDFParser``.
        pages: Stored unchanged as ``self.pages_to_parse``.
        laparams: Mapping of keyword arguments for ``LAParams``, or ``None``
            to leave ``self.laparams`` as ``None``.
        precision: Stored unchanged as ``self.precision``.
        password: Password passed to ``PDFDocument``.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision
    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream), password=password)

    # Merge every info dictionary of the document into one mapping.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    # Normalise the values in place; only existing keys are reassigned, so
    # the dict does not change size during iteration.
    for key, value in self.metadata.items():
        if hasattr(value, "resolve"):
            value = value.resolve()
        if isinstance(value, list):
            self.metadata[key] = list(map(decode_text, value))
        elif isinstance(value, PSLiteral):
            self.metadata[key] = decode_text(value.name)
        elif isinstance(value, bool):
            self.metadata[key] = value
        else:
            self.metadata[key] = decode_text(value)

    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
示例4: main
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def main():
    """Dump the form fields of a fillable PDF to a JSON file.

    Prompts for a PDF filename, reads the ``Fields`` of its ``AcroForm``,
    builds a ``{field name: value}`` dict, prints it and writes it to
    ``./output/pdfsheet-test.json``.

    Raises:
        Exception: If the ``AcroForm`` / ``Fields`` entries cannot be read;
            the underlying error is chained as the cause.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception as err:
            # Same exception type and message as before, but the original
            # cause is kept and KeyboardInterrupt/SystemExit are not masked.
            raise Exception('This is not a form-fillable character sheet!') from err
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a 'T' entry has no key to store it under;
                # previously this crashed on ``None.decode``.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif value is not None:
                try:
                    value = value.decode('iso-8859-1').strip()
                except (AttributeError, UnicodeDecodeError):
                    # Value is not a byte string; keep it as it is.
                    pass
            character[name.decode('iso-8859-1').strip()] = value
    print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)
示例5: getData
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def getData(self):
    """Read the trailer ``Info`` dictionary of the PDF at ``self.fname``.

    On success the resolved info object is stored in ``self.metadata`` and
    ``self.raw``.

    Returns:
        ``"error"`` if the document cannot be initialised with
        ``self.password``, ``"Empty metadata"`` if no info was found,
        ``"ok"`` on success, or the caught exception object if reading the
        trailer information fails.
    """
    doc = PDFDocument()
    fp = open(self.fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(self.password)
        except Exception:
            return "error"
        parser.close()
    finally:
        # Close the file on the "error" path too (it used to leak there).
        fp.close()

    # Pre-set so a trailer without 'Info' yields "Empty metadata" instead of
    # an unbound-name error.
    info = None
    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
            self.metadata = info
            self.raw = info
        if self.raw is None:
            return "Empty metadata"
        return "ok"
    except Exception as e:
        # The message used to sit after ``return`` and was never printed.
        print("\t [x] Error in PDF extractor, Trailer Info")
        return e
示例6: analyze_pages
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily run layout analysis over the pages of a PDF file.

    Input: the path of the PDF file; ``char_margin`` is forwarded to LAParams
    Output: a generator yielding the layout object of each page that was
    processed; a page raising OverflowError is logged and skipped
    """
    logger = logging.getLogger(__name__)
    with open(os.path.realpath(file_name), "rb") as pdf_file:
        # Parser and document, opened with an empty password.
        document = PDFDocument(PDFParser(pdf_file), password="")
        # Shared resources, analysis parameters, aggregator and interpreter.
        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        aggregator = CustomPDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for index, page in enumerate(PDFPage.create_pages(document)):
            try:
                page_interpreter.process_page(page)
            except OverflowError as overflow:
                logger.exception(
                    "{}, skipping page {} of {}".format(overflow, index, file_name)
                )
            else:
                yield aggregator.get_result()
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data starts with the
    JPEG/JFIF signature and return that data.

    Only ``LTImage`` objects directly inside an ``LTFigure`` are inspected.
    There is no 100% guarantee for this code, yet the original author reports
    it works with most scanner-produced files.

    :param fstream: Readable binary stream of the PDF
    :return: the image data as returned by the stream (it must start with
        ``b'\\xff\\xd8\\xff\\xe0'``), or None if no such image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        for el in device.result:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding failed; fall back to the undecoded stream
                    # contents, which is what is wanted for a JPEG anyway.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
示例8: parse_case
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    Args:
        case_path: Directory path ending with ``'/'``; the path component
            before the trailing slash is used as the case id, and file names
            are appended to it directly.

    Returns:
        Always ``None``. On any error ``"Error <case_path>"`` is printed and
        no exception propagates.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            # Skip hidden files and anything that is not a .pdf.
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj
            # Close each PDF once its pages are processed (was leaked).
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }
        # Close the output so the JSON is flushed to disk deterministically.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best effort per case: report and carry on.
        print("Error " + case_path)
    return None
示例9: get_pdf_metadata
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def get_pdf_metadata(self, pdf):
    """Extract author, title and year from raw PDF bytes.

    Args:
        pdf: The PDF content, written to a temporary file for parsing.

    Returns:
        A dict with keys ``'author'``, ``'title'`` and ``'year'``. Each keeps
        its ``'UNKNOWN_*'`` placeholder unless a non-empty value is obtained
        from the first info dictionary (``Author``, ``Title``, ``ModDate``).
    """
    metadata = {'author': 'UNKNOWN_AUTHOR',
                'title': 'UNKNOWN_TITLE',
                'year': 'UNKNOWN_YEAR'}
    # The context manager closes the temp file even if writing or parser
    # construction fails.
    with tempfile.TemporaryFile() as temp_pdf_file:
        temp_pdf_file.write(pdf)
        pdf_parser = PDFParser(temp_pdf_file)
        try:
            pdf_doc = PDFDocument(pdf_parser)
            pdf_metadata = pdf_doc.info[0]
            author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
            if author:
                metadata['author'] = author
            title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
            if title:
                metadata['title'] = title
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
            if year:
                metadata['year'] = year
        except Exception:
            # Deliberate best effort: on any parsing problem keep whatever
            # was collected so far plus the placeholders.
            pass
    return metadata
示例10: parse
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse():
    """Print the text of a PDF and append it to ``out.txt``.

    Reads the file named by the module-level ``path``. For every
    ``LTTextBoxHorizontal`` on every page, the text is printed and appended
    (followed by a newline) to ``out.txt``.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    # Open in binary read mode; closed automatically, including when the
    # extraction-not-allowed error is raised.
    with open(path, 'rb') as fp:
        # Create the parser from the file object and an empty document,
        # then link the two together.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise without supplying a password.
        doc.initialize()
        # Refuse documents that do not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager, layout parameters, aggregator device and
        # interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Handle one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The layout is the LTPage for this page; only horizontal text
            # boxes are used here.
            layout = device.get_result()
            page_lines = []
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    print(results)
                    page_lines.append(results + '\n')
            # One append per page instead of reopening the file for every
            # text box; nothing is opened when the page has no text.
            if page_lines:
                with open(r'out.txt', 'a') as f:
                    f.writelines(page_lines)
示例11: parse_pdf
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def parse_pdf(self, fp):
    """Interpret every page of a PDF and hand annotated pages on.

    Args:
        fp: Binary file object of the PDF, passed to ``PDFParser``.

    For each page (numbered from 0) the page is processed with a
    ``PDFPageInterpreter`` driving a plain ``PDFDevice``; when
    ``page.annots`` is truthy, ``self.parse_annotations(pgnum, page)`` is
    called.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The original first built a PDFPageAggregator/interpreter pair and then
    # immediately replaced both; only this pair was ever used.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
示例12: process_pdf
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
    """Find the "List of Tables" entry of a PDF and process its page.

    Every object listed in the document's xrefs is inspected. For the first
    dict whose ``'Title'`` contains ``"List of Tables"``, the target page is
    looked up through ``obj['A']`` -> ``'D'[0].objid`` and passed, together
    with the parsing machinery, to ``cls.process_table_index``.

    Args:
        pdf: Binary file object of the PDF, passed to ``PDFParser``.
        output: Forwarded to ``process_table_index``.
        verbose: Forwarded to ``process_table_index``.
        tables: Forwarded to ``process_table_index``.
    """
    parser = pdfparser.PDFParser(pdf)
    document = pdfdocument.PDFDocument(parser)
    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
    params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                             line_overlap=0.4, boxes_flow=0.5)
    device = converter.PDFPageAggregator(rsrcmgr, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    # The result was never used. NOTE(review): the call is kept in case it
    # raises for documents without an outline and callers rely on that;
    # confirm and drop it if not.
    document.get_outlines()
    # Page number (0-based) -> page object.
    pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
    for xref in document.xrefs:
        for oid in xref.get_objids():
            obj = document.getobj(oid)
            if not isinstance(obj, dict):
                continue
            # ``in obj`` / ``.items()`` replace the Python 2-only
            # ``iterkeys()`` / ``iteritems()``.
            if "Title" in obj and "List of Tables" in obj['Title']:
                pageoid = obj['A'].resolve()['D'][0].objid
                (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                                  if p.pageid == pageoid][0]
                cls.process_table_index(parser, document, rsrcmgr, params, device,
                                        interpreter, pages, page, pageno, output,
                                        verbose, tables)
                # NOTE(review): assumed to stop after the first match; the
                # original indentation was lost, so confirm.
                return
示例13: _read_file
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def _read_file(self, filename):
    """Open ``filename`` in binary mode and return a ``PDFDocument`` for it.

    The file object is handed to the parser and is not closed here.
    """
    pdf_file = open(filename, 'rb')
    return PDFDocument(PDFParser(pdf_file))
示例14: xmlFromPdf
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def xmlFromPdf(pdfpath, xmlpath=None):
    '''Locate the XFA template stream in a PDF and parse it as XML.

    The first stream whose data contains ``b'xfa-template'`` is parsed with
    ``etree.fromstring``. When ``xmlpath`` is given, the pretty-printed tree
    is also written there. Raises ``CrypticXml`` if no such stream exists.
    Returns the parsed tree.
    '''
    with open(pdfpath, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        candidate_ids = {
            oid
            for table in document.xrefs
            for oid in table.get_objids()
        }
        xfa_bytes = None
        for oid in candidate_ids:
            candidate = document.getobj(oid)
            if isinstance(candidate, PDFStream):
                try:
                    payload = candidate.get_data()
                except PDFNotImplementedError:
                    # e.g. a JPEG image stream using an unsupported filter
                    continue
                if b'xfa-template' in payload:
                    xfa_bytes = payload
                    break
        if xfa_bytes is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # xfa_bytes now holds the form's XML text
    root = etree.fromstring(xfa_bytes)
    if xmlpath is not None:
        with open(xmlpath, 'wb') as sink:
            sink.write(etree.tostring(root, pretty_print=True))
    return root
示例15: p2t
# 需要導入模塊: from pdfminer import pdfparser [as 別名]
# 或者: from pdfminer.pdfparser import PDFParser [as 別名]
def p2t(sourcefile, outfile):
    """Append the text of a PDF to a UTF-8 text file.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file; the text of each
            ``LTTextBoxHorizontal`` plus a newline is appended to it.

    A message is printed when the PDF cannot be opened as a document (nothing
    is written in that case), when the document reports that extraction is
    not allowed, and when the conversion has finished.
    """
    with open(sourcefile, 'rb') as fp:
        # Create the parser for the file.
        parser = PDFParser(fp)
        # Build the document object holding the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正確下載')
            return
        # Check whether the file allows text extraction.
        # NOTE(review): extraction still proceeds after this warning, exactly
        # as in the original; confirm whether it should stop instead.
        if not document.is_extractable:
            print(sourcefile + ' :不允許提取文本')
        # Resource manager for shared resources, analysis parameters,
        # aggregator device and interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The layout is the LTPage object of this page.
            layout = device.get_result()
            page_lines = [
                x.get_text() + '\n'
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Let the file object do the UTF-8 encoding: the old
            # ``.encode('utf-8') + '\n'`` mixed bytes and str, which raises
            # TypeError on Python 3. One append per page, none if empty.
            if page_lines:
                with open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(page_lines)
        print(sourcefile + ' 已轉為 ' + outfile)
##############################################把doc轉為txt##############################################
# 調用之前要確保你在linux 下裝了catdoc