本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument类的典型用法代码示例。如果您正苦于以下问题:Python pdfdocument.PDFDocument类的具体用法?Python pdfdocument.PDFDocument怎么用?Python pdfdocument.PDFDocument使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pdfminer.pdfdocument的用法示例。
在下文中一共展示了pdfdocument.PDFDocument方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: read_fields
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file, sorted with ``fieldsorter``.

    Args:
        pdffile: Path of the PDF file; it is opened in binary mode.

    Returns:
        ``None`` when the document catalog has no ``AcroForm`` entry,
        otherwise the list populated by ``recursively_add_fields`` sorted
        with ``fieldsorter`` as the key function.
    """
    outfields = []
    # Everything that may touch the document happens inside the ``with`` so
    # the handle is released on every exit path, including the early return.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map every page's object id to its 1-based position in the document.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
        return sorted(outfields, key=fieldsorter)
示例2: __init__
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def __init__(self,
        stream,
        pages = None,
        laparams = None,
        precision = 0.001,
        password = ""
    ):
    """Open a PDF from ``stream`` and prepare the layout-analysis objects.

    Args:
        stream: Binary stream handed to ``PDFParser``.
        pages: Stored unchanged as ``self.pages_to_parse``.
        laparams: Optional mapping of keyword arguments for ``LAParams``;
            ``None`` disables building an ``LAParams`` instance.
        precision: Stored unchanged as ``self.precision``.
        password: Password forwarded to ``PDFDocument``.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision
    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream), password = password)

    # Merge all info dictionaries; later entries override earlier ones.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    # Normalise every metadata value. A snapshot of the items is iterated so
    # that rewriting the values never interferes with the iteration.
    for k, v in list(self.metadata.items()):
        if hasattr(v, "resolve"):
            v = v.resolve()
        if isinstance(v, list):
            self.metadata[k] = list(map(decode_text, v))
        elif isinstance(v, PSLiteral):
            self.metadata[k] = decode_text(v.name)
        elif isinstance(v, bool):
            # Booleans are stored as-is rather than passed to decode_text.
            self.metadata[k] = v
        else:
            self.metadata[k] = decode_text(v)

    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
示例3: main
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def main():
    """Prompt for a form-fillable PDF and dump its field values as JSON.

    The field names (``T``) and values (``V``) of the document's AcroForm are
    collected into a dict, printed, and written to
    ``./output/pdfsheet-test.json``.

    Raises:
        Exception: If the AcroForm field list cannot be read from the PDF.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception as err:
            # Keep the original exception type/message for callers, but chain
            # the cause and let KeyboardInterrupt/SystemExit propagate.
            raise Exception('This is not a form-fillable character sheet!') from err
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a name cannot be used as a dictionary key.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif isinstance(value, (bytes, bytearray)):
                value = value.decode('iso-8859-1').strip()
            # Any other value type is stored unchanged.
            character[name.decode('iso-8859-1').strip()] = value
    print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)
示例4: analyze_pages
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily yield the layout result of each page of a PDF file.

    Input: ``file_name`` is the path to the PDF; ``char_margin`` is passed
    to ``LAParams``.
    Output: one layout object per successfully processed page. A page whose
    processing raises ``OverflowError`` is logged and skipped.
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)
    with open(resolved_path, "rb") as pdf_handle:
        # Parser and document; an empty password is supplied explicitly.
        pdf_doc = PDFDocument(PDFParser(pdf_handle), password="")
        # Resource manager shared by the aggregator and the interpreter.
        resources = PDFResourceManager()
        analysis_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        aggregator = CustomPDFPageAggregator(resources, laparams=analysis_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for page_num, page in enumerate(PDFPage.create_pages(pdf_doc)):
            try:
                page_interpreter.process_page(page)
            except OverflowError as oe:
                logger.exception(
                    "{}, skipping page {} of {}".format(oe, page_num, file_name)
                )
            else:
                yield aggregator.get_result()
示例5: extract_first_jpeg_in_pdf
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data starts with the JFIF
    JPEG signature and return that data.

    Pages are processed in order; within each page only ``LTImage`` objects
    that sit directly inside an ``LTFigure`` are inspected. If decoding the
    image stream fails, the raw (undecoded) stream data is used instead.

    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images. More testing might be needed.

    :param fstream: Readable binary stream of the PDF
    :return: bytes holding the whole JPEG image, or None if none was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        for el in device.result:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding failed; fall back to the undecoded bytes.
                    imdata = im.stream.get_rawdata()
                # NOTE(review): only the JFIF marker (FF D8 FF E0) is accepted;
                # JPEGs with other APPn markers are skipped - confirm intent.
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
示例6: parse_case
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    Args:
        case_path: Directory path; it must end with ``/`` because file names
            are appended to it directly, and its second-to-last ``/``-separated
            component is used as the case id.

    Returns:
        Always ``None``. Any error is reported with a printed
        ``"Error <case_path>"`` line instead of being raised.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            # Skip hidden files and anything that is not a ``.pdf``.
            if name[0] == '.' or name[-4:] != '.pdf':
                continue
            doc_id = name.split('.')[0]
            doc_obj = result['docs'][doc_id] = {'pages': {}}
            # The context manager closes each PDF once its pages are parsed.
            with open(case_path + name, 'rb') as fp:
                doc = PDFDocument(PDFParser(fp))
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }
        # Closing the output file guarantees the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort by design: report the failing case and carry on.
        print("Error " + case_path)
    return None
示例7: get_pdf_metadata
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def get_pdf_metadata(self, pdf):
    """Extract author, title and year from raw PDF bytes (best effort).

    Args:
        pdf: The PDF content as bytes; it is written to a temporary file
            that is handed to ``PDFParser``.

    Returns:
        A dict with the keys ``author``, ``title`` and ``year``. A key keeps
        its ``UNKNOWN_*`` placeholder when the value is missing or empty, or
        when reading the document fails before that value is extracted.
    """
    metadata = {'author': 'UNKNOWN_AUTHOR',
                'title': 'UNKNOWN_TITLE',
                'year': 'UNKNOWN_YEAR'}
    # The context manager releases the temporary file on every exit path.
    with tempfile.TemporaryFile() as temp_pdf_file:
        temp_pdf_file.write(pdf)
        pdf_parser = PDFParser(temp_pdf_file)
        try:
            pdf_doc = PDFDocument(pdf_parser)
            pdf_metadata = pdf_doc.info[0]
            author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
            if author:
                metadata['author'] = author
            title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
            if title:
                metadata['title'] = title
            year = pdf_metadata_moddate_to_year(make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
            if year:
                metadata['year'] = year
        except Exception:
            # Deliberately best-effort: unreadable or incomplete metadata
            # leaves the remaining placeholders in place.
            pass
    return metadata
示例8: parse_pdf
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def parse_pdf(self, fp):
    """Process every page of a PDF and parse the annotations of each page.

    Args:
        fp: Binary file object of the PDF, handed to ``PDFParser``.

    For each page that has a truthy ``annots`` attribute,
    ``self.parse_annotations`` is called with the 0-based page number and the
    page object.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # Only a plain PDFDevice is needed here; the layout aggregator that used
    # to be built first was overwritten before it was ever used.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
示例9: process_pdf
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
    """Locate the "List of Tables" entry of a PDF and process the page it targets.

    Every object id listed in the document's xref tables is fetched. The
    first dict object whose ``Title`` contains ``"List of Tables"`` has its
    destination page looked up, and ``cls.process_table_index`` is called
    once for that page.

    Args:
        pdf: Binary file object of the PDF, handed to ``PDFParser``.
        output, verbose, tables: Forwarded unchanged to
            ``cls.process_table_index``.

    Returns:
        ``None``, whether or not a matching entry was found.

    Raises:
        IndexError: If no page matches the destination object id.
    """
    parser = pdfparser.PDFParser(pdf)
    document = pdfdocument.PDFDocument(parser)
    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
    params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                             line_overlap=0.4, boxes_flow=0.5)
    device = converter.PDFPageAggregator(rsrcmgr, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    # The result is unused; the call is kept because it may raise for
    # documents without outlines. NOTE(review): confirm whether that early
    # failure is intended.
    document.get_outlines()
    pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
    for xref in document.xrefs:
        for oid in xref.get_objids():
            obj = document.getobj(oid)
            if not isinstance(obj, dict) or "Title" not in obj:
                continue
            title = obj['Title']
            if isinstance(title, bytes):
                # latin-1 maps bytes 1:1, so the substring test below gives
                # the same answer for byte strings on Python 2 and 3.
                title = title.decode('latin-1')
            if "List of Tables" not in title:
                continue
            pageoid = obj['A'].resolve()['D'][0].objid
            (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                              if p.pageid == pageoid][0]
            cls.process_table_index(parser, document, rsrcmgr, params, device,
                                    interpreter, pages, page, pageno, output,
                                    verbose, tables)
            return
示例10: _read_file
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def _read_file(self, filename):
    """Open ``filename`` and return a ``PDFDocument`` built from it.

    The file is intentionally left open on success, because the parser
    behind the returned document holds the file object. It is closed only
    if building the document fails.

    Args:
        filename: Path of the PDF file; opened in binary mode.

    Returns:
        The ``PDFDocument`` created from the file.
    """
    fp = open(filename, 'rb')
    try:
        return PDFDocument(PDFParser(fp))
    except BaseException:
        # Do not leak the handle when the document cannot be constructed.
        fp.close()
        raise
示例11: xmlFromPdf
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def xmlFromPdf(pdfpath, xmlpath=None):
    """Find the XFA data stream in a PDF file and return it as a parsed tree.

    Every object listed in the document's xref tables is inspected; the first
    ``PDFStream`` whose data contains ``b'xfa-template'`` is parsed with
    ``etree.fromstring``. When ``xmlpath`` is given, the pretty-printed tree
    is also written to that path.

    Raises ``CrypticXml`` when no stream contains the marker.
    """
    xfa_bytes = None
    with open(pdfpath, 'rb') as handle:
        document = PDFDocument(PDFParser(handle))
        object_ids = set()
        for table in document.xrefs:
            for oid in table.get_objids():
                object_ids.add(oid)
        for oid in object_ids:
            candidate = document.getobj(oid)
            if isinstance(candidate, PDFStream):
                try:
                    payload = candidate.get_data()
                except PDFNotImplementedError:
                    # e.g. a JPEG image: "Unsupported filter: /DCTDecode"
                    continue
                if b'xfa-template' in payload:
                    xfa_bytes = payload
                    break
        if xfa_bytes is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # xfa_bytes now holds the XML text of the form
    tree = etree.fromstring(xfa_bytes)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as out:
        out.write(etree.tostring(tree, pretty_print=True))
    return tree
示例12: p2t
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def p2t(sourcefile, outfile):
    """Append the text of every horizontal text box of a PDF to a text file.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file; it is opened in append mode and
            written as UTF-8. It is only opened for pages that contain at
            least one ``LTTextBoxHorizontal``.

    A message is printed when the PDF cannot be loaded (nothing is written
    in that case), when the document reports that it is not extractable, and
    when the conversion has finished.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create the document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the file allows text extraction.
        # NOTE(review): extraction still proceeds after this message, as it
        # always has - confirm whether it should stop here instead.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Parameters for layout analysis.
        laparams = LAParams()
        # Page aggregator device.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout result for this page.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if texts:
                # Write text (not bytes) and let the file object encode it;
                # concatenating encoded bytes with '\n' fails on Python 3.
                with io.open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(text + '\n' for text in texts)
        print(sourcefile + ' 已转为 ' + outfile)
##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc
示例13: main
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def main(argv):
    """Convert every PDF named on the command line with ``x86ManParser``.

    Args:
        argv: Argument vector; ``argv[0]`` is skipped and every remaining
            item is treated as the path of a PDF file.

    Returns:
        ``1`` as soon as a document reports that it is not extractable,
        otherwise ``None`` once all files have been processed.
    """
    for arg in argv[1:]:
        # PDFs are binary data; the context manager also closes the file on
        # the early return and on errors.
        with open(arg, 'rb') as fd:
            document = PDFDocument(PDFParser(fd))
            if not document.is_extractable:
                print("Document not extractable.")
                return 1
            params = LAParams(char_margin=1)
            resMan = PDFResourceManager(caching=True)
            device = PDFPageAggregator(resMan, laparams=params)
            interpreter = PDFPageInterpreter(resMan, device)
            man_parser = x86ManParser("html", params)
            for i, page in enumerate(
                    PDFPage.get_pages(fd, set(), caching=True, check_extractable=True),
                    start=1):
                print("Processing page %i" % i)
                interpreter.process_page(page)
                man_parser.process_page(device.get_result())
            man_parser.flush()
        print("Conversion result: %i/%i" % (man_parser.success, man_parser.success + man_parser.fail))
示例14: getToc
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def getToc(self, pdfPath):
    """Return the outline (table of contents) entries of a PDF file.

    Args:
        pdfPath: Path of the PDF file; opened in binary mode.

    Returns:
        A list of ``(level, title)`` tuples, one per item produced by
        ``document.get_outlines()``, in the order they are produced.
    """
    # The outline is consumed inside the ``with`` block so the file is still
    # open while entries are read, and is closed afterwards.
    with open(pdfPath, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        return [
            (level, title)
            for (level, title, _dest, _action, _structelem) in document.get_outlines()
        ]
示例15: init
# 需要导入模块: from pdfminer import pdfdocument [as 别名]
# 或者: from pdfminer.pdfdocument import PDFDocument [as 别名]
def init(filename,verbose=True):
    '''Initiate analysis objs

    Args:
        filename: Path of the PDF file; opened in binary mode.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        A ``(document, interpreter, device)`` tuple where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.

    The file is left open on success because the parser behind the returned
    document holds the file object; it is closed only when setup fails.
    '''
    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object and the interpreter driving it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
    except BaseException:
        # Do not leak the handle when setup fails.
        fp.close()
        raise
    return document, interpreter, device
#----------------Get the latest creation time of annos----------------