本文整理汇总了 Python 中 pdfminer.pdfdocument.PDFDocument.set_parser 方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.set_parser 方法的具体用法?Python PDFDocument.set_parser 怎么用?Python PDFDocument.set_parser 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pdfminer.pdfdocument.PDFDocument 的用法示例。
在下文中一共展示了PDFDocument.set_parser方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: with_pdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def with_pdf(pdf_doc, pdf_pwd, fn, *args):
    """Open a PDF file, apply ``fn`` to the parsed document, return the result.

    Args:
        pdf_doc: path of the PDF file to open.
        pdf_pwd: password handed to ``doc.initialize``.
        fn: callable invoked as ``fn(doc, *args)``.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        The value returned by ``fn``, or ``None`` when the document is not
        extractable or an ``IOError`` is raised along the way.
    """
    result = None
    try:
        # The context manager guarantees the file is closed even when parsing
        # or ``fn`` raises, so no handle is leaked on the error paths.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument()
            # connect the parser and document objects
            parser.set_document(doc)
            doc.set_parser(parser)
            # supply the password for initialization
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                # apply the function and keep its result
                result = fn(doc, *args)
    except IOError:
        # Best effort: a missing or unreadable file yields None.
        pass
    return result
示例2: load_document
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def load_document(self, _file, password=""):
    """Build a PDFMiner document from ``_file`` and return it.

    The parser is obtained from ``module_parser(_file)`` and the document is
    initialised with ``password``.

    Raises:
        ValueError: if the document reports that it is not extractable.
    """
    log.info("loading document...")
    pdf_parser = module_parser(_file)
    document = PDFDocument()
    # Link parser and document in both directions before initialising.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)
    if document.is_extractable:
        return document
    raise ValueError("PDF text extraction not allowed")
示例3: pdf_from_resource
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def pdf_from_resource(resource):
    """
    Create an initialised ``PDFDocument`` from ``resource``.

    ``resource`` is handed directly to ``PDFParser``; the parser and the
    document are linked to each other and the document is initialised
    without a password.
    """
    pdf_parser = PDFParser(resource)
    pdf_document = PDFDocument()
    # Link parser and document in both directions before initialising.
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)
    pdf_document.initialize()
    return pdf_document
示例4: __init__
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def __init__(self, *args, **kwargs):
    """Initialise the instance and extract the text of ``self.doc``.

    ``self.parsed_text`` starts as ``b''`` and is replaced by the text
    converter's output only when pdfminer can be imported and the document
    is built without a ``PDFSyntaxError``.
    """
    super(AccountRIB, self).__init__(*args, **kwargs)
    self.parsed_text = b''

    try:
        # Prefer the newer module layout; fall back to the older one where
        # PDFDocument lives in pdfminer.pdfparser.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        else:
            newapi = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        self.logger.warning('Please install python-pdfminer to get IBANs')
        return

    pdf_parser = PDFParser(BytesIO(self.doc))
    try:
        if newapi:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        # Leave parsed_text empty when the document cannot be built.
        return

    resource_manager = PDFResourceManager()
    buffer = BytesIO()
    converter = TextConverter(resource_manager, buffer)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    if not newapi:
        document.initialize()
        pages = document.get_pages()
    else:
        pages = PDFPage.create_pages(document)

    for current_page in pages:
        page_interpreter.process_page(current_page)

    self.parsed_text = buffer.getvalue()
示例5: extract_text
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def extract_text(data):
    """Return the text that pdfminer's ``TextConverter`` produces for ``data``.

    ``data`` is wrapped in a ``BytesIO`` and parsed as a PDF. Returns ``None``
    when building the document raises ``PDFSyntaxError``.

    Raises:
        ImportError: if the pdfminer modules cannot be imported.
    """
    try:
        # Prefer the newer module layout; fall back to the older one where
        # PDFDocument lives in pdfminer.pdfparser.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        else:
            newapi = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            document = PDFDocument(pdf_parser)
        else:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return

    resource_manager = PDFResourceManager()
    buffer = BytesIO()
    converter = TextConverter(resource_manager, buffer)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    if not newapi:
        document.initialize()
        pages = document.get_pages()
    else:
        pages = PDFPage.create_pages(document)

    for current_page in pages:
        page_interpreter.process_page(current_page)

    return buffer.getvalue()
示例6: open
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
#coding=utf-8
# Example script: print the text of every layout object on every page of a PDF.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# LAParams and PDFPageAggregator are classes inside pdfminer modules, not
# importable top-level modules.
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# PDF data is binary, so open in 'rb'; the context manager closes the file
# once all pages have been processed.
with open('/home/zzq/learngit/pdf_document/php.pdf', 'rb') as fp:
    # Parser bound to the open file.
    parser = PDFParser(fp)
    # The imports above target the pdfminer layout in which the document is
    # built directly from the parser and password (empty password here).
    doc = PDFDocument(parser, "")
    # Abort when the document does not permit text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Resource manager shared by the device and the interpreter.
    resource = PDFResourceManager()
    # Layout analysis parameters.
    laparams = LAParams()
    # Aggregator device that collects the layout of each processed page.
    device = PDFPageAggregator(resource, laparams=laparams)
    # Page interpreter.
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Not every layout object carries text (e.g. lines, rectangles).
            if hasattr(out, 'get_text'):
                print(out.get_text())
示例7: get_pdf_rows
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import set_parser [as 别名]
def get_pdf_rows(data, miner_layout=True):
"""
Takes PDF file content as string and yield table row data for each page.
For each page in the PDF, the function yields a list of rows.
Each row is a list of cells. Each cell is a list of strings present in the cell.
Note that the rows may belong to different tables.
There are no logic tables in PDF format, so this parses PDF drawing instructions
and tries to find rectangles and arrange them in rows, then arrange text in
the rectangles.
External dependencies:
PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
"""
try:
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
except ImportError:
raise ImportError('Please install python-pdfminer')
try:
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
newapi = True
except ImportError:
from pdfminer.pdfparser import PDFDocument
newapi = False
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve
parser = PDFParser(BytesIO(data))
try:
if newapi:
doc = PDFDocument(parser)
else:
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
except PDFSyntaxError:
return
rsrcmgr = PDFResourceManager()
if miner_layout:
device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
else:
device = PDFPageAggregator(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)
if newapi:
pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
else:
doc.initialize()
pages = doc.get_pages()
if LOGGER.isEnabledFor(DEBUGFILES):
import tempfile
import PIL.Image as Image
import PIL.ImageDraw as ImageDraw
import random
path = tempfile.mkdtemp(prefix='pdf')
for npage, page in enumerate(pages):
LOGGER.debug('processing page %s', npage)
interpreter.process_page(page)
page_layout = device.get_result()
texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
LOGGER.debug('found %d text objects', len(texts))
if LOGGER.isEnabledFor(DEBUGFILES):
img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
draw = ImageDraw.Draw(img)
for t in texts:
color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
fpath = '%s/1text-%03d.png' % (path, npage)
img.save(fpath)
LOGGER.log(DEBUGFILES, 'saved %r', fpath)
if not miner_layout:
texts.sort(key=lambda t: (t.y0, t.x0))
# TODO filter ltcurves that are not lines?
# TODO convert rects to 4 lines?
lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
LOGGER.debug('found %d lines', len(lines))
if LOGGER.isEnabledFor(DEBUGFILES):
img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
draw = ImageDraw.Draw(img)
for l in lines:
color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
fpath = '%s/2lines-%03d.png' % (path, npage)
img.save(fpath)
LOGGER.log(DEBUGFILES, 'saved %r', fpath)
lines = list(uniq_lines(lines))
#.........这里部分代码省略.........