本文整理汇总了Python中pdfminer.converter.PDFPageAggregator类的典型用法代码示例。如果您正苦于以下问题:Python converter.PDFPageAggregator类的具体用法?Python converter.PDFPageAggregator怎么用?Python converter.PDFPageAggregator使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块pdfminer.converter的用法示例。
在下文中一共展示了converter.PDFPageAggregator类的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: convert_pdf_to_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def convert_pdf_to_txt(path):
    """Return the concatenated text of the PDF file at ``path``.

    Every page is run through a ``PDFPageAggregator`` and the text of each
    top-level ``LTTextBox`` / ``LTTextLine`` layout object is collected in
    page order.

    :param path: filesystem path of the PDF to read.
    :return: a single string with the text of all matching layout objects.
    """
    chunks = []
    # The context manager guarantees the file is closed even when parsing fails.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with an empty password.
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(chunks)
示例2: __init__
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def __init__(self,
             stream,
             pages=None,
             laparams=None,
             precision=0.001,
             password=""):
    """Parse ``stream`` as a PDF and set up the page-layout machinery.

    :param stream: stream handed to ``PDFParser``; also kept as ``self.stream``.
    :param pages: stored unchanged as ``self.pages_to_parse``.
    :param laparams: ``None``, or a mapping of keyword arguments used to
        build an ``LAParams`` instance.
    :param precision: stored unchanged as ``self.precision``.
    :param password: password forwarded to ``PDFDocument``.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision

    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream), password=password)

    # Merge every info dictionary of the document into one flat mapping;
    # later dictionaries override earlier ones on key clashes.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    # Normalise the raw values. Iterate over a snapshot because the
    # dictionary's values are reassigned inside the loop.
    for key, value in list(self.metadata.items()):
        if hasattr(value, "resolve"):
            value = value.resolve()
        if isinstance(value, list):
            self.metadata[key] = list(map(decode_text, value))
        elif isinstance(value, PSLiteral):
            self.metadata[key] = decode_text(value.name)
        elif isinstance(value, bool):
            self.metadata[key] = value
        else:
            self.metadata[key] = decode_text(value)

    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
示例3: extract_first_jpeg_in_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data starts with the
    JPEG/JFIF signature ``FF D8 FF E0`` and return that data.

    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images around. More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs; they are simply not handled here.

    :param fstream: Readable binary stream of the PDF
    :return: the image data of the first matching image, or None if no
        matching image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image: prefer the decoded stream and fall back to
                # the raw bytes when decoding fails.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Deliberate best-effort fallback; `Exception` (rather
                    # than a bare except) lets KeyboardInterrupt/SystemExit
                    # propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
示例4: parse_case
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    ``case_path`` is used as a plain string prefix (``case_path + name``),
    so it has to end with a path separator. The case id is taken from the
    second-to-last ``/``-separated component of ``case_path``.

    Any error is reported by printing ``"Error <case_path>"``; nothing is
    raised to the caller.

    :param case_path: directory containing the ``*.pdf`` files.
    :return: always ``None``.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            # Skip hidden files and anything that is not a .pdf file.
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj
            # Keep the PDF open only while its pages are being processed.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }
        # Closing the output explicitly makes sure the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch processing: report the failing case and move on.
        print("Error " + case_path)
    return None
示例5: parse
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse():
    """Append the text of every horizontal text box of a PDF to ``out.txt``.

    The PDF location is read from the module-level name ``path``. Each
    extracted text is also printed.

    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    # Open in binary read mode; the context manager closes the file on exit.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise the document; no password is supplied.
        doc.initialize()
        # Refuse to continue when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager to manage shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            # Only LTTextBoxHorizontal objects of the page are written out.
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    with open(r'out.txt', 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
示例6: parse_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def parse_pdf(self, fp):
    """Interpret every page of a PDF and hand annotated pages on.

    Each page is processed by a ``PDFPageInterpreter`` backed by a plain
    ``PDFDevice``. Pages with a truthy ``annots`` attribute are passed to
    ``self.parse_annotations`` together with their zero-based page number.

    :param fp: file object given to ``PDFParser``.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The original built a PDFPageAggregator-based device/interpreter first
    # and immediately replaced both; only the PDFDevice pair was ever used.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
示例7: process_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def process_pdf(cls, pdf, output, verbose=False, tables=None):
    """Locate the "List of Tables" entry of a PDF and process its page.

    All objects referenced by the document's xrefs are scanned for a dict
    whose ``'Title'`` contains ``"List of Tables"``. The page that entry
    points to is then passed to ``cls.process_table_index``.

    :param pdf: file object given to ``pdfparser.PDFParser``.
    :param output: forwarded to ``cls.process_table_index``.
    :param verbose: forwarded to ``cls.process_table_index``.
    :param tables: forwarded to ``cls.process_table_index``.
    """
    parser = pdfparser.PDFParser(pdf)
    document = pdfdocument.PDFDocument(parser)
    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
    params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                             line_overlap=0.4, boxes_flow=0.5)
    device = converter.PDFPageAggregator(rsrcmgr, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    # The result is unused; the call is kept because the original made it.
    outlines = document.get_outlines()
    # Map zero-based page numbers to page objects.
    pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
    for xref in document.xrefs:
        for oid in xref.get_objids():
            obj = document.getobj(oid)
            if not isinstance(obj, dict):
                continue
            # `in obj` / `.items()` work on Python 2 and 3 alike, unlike the
            # Python-2-only `iterkeys()` / `iteritems()`.
            if "Title" in obj and "List of Tables" in obj['Title']:
                pageoid = obj['A'].resolve()['D'][0].objid
                (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                                  if p.pageid == pageoid][0]
                cls.process_table_index(parser, document, rsrcmgr, params, device,
                                        interpreter, pages, page, pageno, output,
                                        verbose, tables)
                # NOTE(review): the source this was recovered from had lost its
                # indentation; returning after the first match is assumed.
                return
示例8: _prepare_tools
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def _prepare_tools(self):
    """Build a fresh layout-aggregating device and its page interpreter.

    :return: tuple ``(device, interpreter)`` sharing one resource manager.
    """
    layout_params = LAParams()
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    return aggregator, PDFPageInterpreter(resource_manager, aggregator)
示例9: __init__
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def __init__(self):
    """Create the aggregator device and interpreter used by this object."""
    # The resource manager stores resources shared by device and interpreter.
    resource_manager = PDFResourceManager()
    # Default layout-analysis parameters.
    layout_params = LAParams()
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    self.device = aggregator
    self.interpreter = PDFPageInterpreter(resource_manager, aggregator)
    self.textPoz = None
示例10: p2t
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def p2t(sourcefile, outfile):
    """Append the text of every horizontal text box of a PDF to ``outfile``.

    Progress and problems are reported with ``print``. If the PDF document
    cannot be constructed, a message is printed and nothing is written.

    :param sourcefile: path of the PDF to read.
    :param outfile: path of the text file to append to (written as UTF-8).
    """
    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the file allows text extraction.
        # NOTE(review): only a message is printed here; extraction is still
        # attempted afterwards, exactly as before -- confirm this is intended.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Create a PDF resource manager object to store shared resources.
        rsrcmgr = PDFResourceManager()
        # Parameters for the layout analysis.
        laparams = LAParams()
        # Create a PDF device object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Let the file object do the UTF-8 encoding: on Python 3,
                    # `text.encode('utf-8') + '\n'` mixes bytes and str and
                    # raises TypeError.
                    with open(outfile, 'a', encoding='utf-8') as f:
                        f.write(x.get_text() + '\n')
        print(sourcefile + ' 已转为 ' + outfile)
##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc
示例11: main
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def main(argv):
    """Convert every PDF named in ``argv[1:]`` with ``x86ManParser``.

    For each file, all pages are laid out with a ``PDFPageAggregator`` and
    fed to an ``x86ManParser("html", ...)``; a success/total summary is
    printed per file.

    :param argv: argument list; ``argv[0]`` is skipped.
    :return: ``1`` as soon as a document is not extractable, else ``None``.
    """
    for arg in argv[1:]:
        # PDFs are binary; `with` also closes the file on the early return.
        with open(arg, 'rb') as fd:
            pdf_parser = PDFParser(fd)
            document = PDFDocument(pdf_parser)
            if not document.is_extractable:
                print("Document not extractable.")
                return 1
            params = LAParams(char_margin=1)
            resMan = PDFResourceManager(caching=True)
            device = PDFPageAggregator(resMan, laparams=params)
            interpreter = PDFPageInterpreter(resMan, device)
            man_parser = x86ManParser("html", params)
            pages = PDFPage.get_pages(fd, set(), caching=True,
                                      check_extractable=True)
            for i, page in enumerate(pages, 1):
                print("Processing page %i" % i)
                interpreter.process_page(page)
                man_parser.process_page(device.get_result())
            man_parser.flush()
        print("Conversion result: %i/%i" % (
            man_parser.success, man_parser.success + man_parser.fail))
示例12: init
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def init(filename, verbose=True):
    '''Initiate analysis objs

    Opens ``filename`` and builds the objects needed for layout analysis.

    :param filename: path of the PDF file to open.
    :param verbose: accepted but not used by this function.
    :return: tuple ``(document, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.
    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    '''
    # The file is intentionally left open: the returned document was built
    # from a parser reading this file object.
    # NOTE(review): presumably the caller must keep it alive -- confirm.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # No password is supplied.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object and an interpreter bound to it.
    # (The original first built a PDFDevice-based pair and then overwrote it.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return document, interpreter, device
#----------------Get the latest creation time of annos----------------
示例13: init
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import PDFPageAggregator [as 别名]
def init(filename, verbose=True):
    '''Initiate analysis objs

    Opens ``filename`` and builds the objects needed for layout analysis.

    :param filename: path of the PDF file to open.
    :param verbose: accepted but not used by this function.
    :return: tuple ``(document, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.
    :raises PDFTextExtractionNotAllowed: if the document is not extractable.
    '''
    # The file is intentionally left open: the returned document was built
    # from a parser reading this file object.
    # NOTE(review): presumably the caller must keep it alive -- confirm.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # No password is supplied.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object and an interpreter bound to it.
    # (The original first built a PDFDevice-based pair and then overwrote it.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return document, interpreter, device
#----------------Get the latest creation time of annos----------------