本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.set_parser方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.set_parser方法的具体用法?Python PDFDocument.set_parser怎么用?Python PDFDocument.set_parser使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.set_parser方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
class Pdf(object):
    """Wrapper around a pdfminer ``PDFDocument`` built from a file object.

    Exposes the page count and a plain-text rendering of the document.
    """

    def __init__(self, pdf_file):
        """Parse ``pdf_file`` and keep the resulting document.

        Args:
            pdf_file: File-like object handed to ``PDFParser``. It is not
                closed here; presumably the caller keeps it open while this
                object is in use -- TODO confirm against callers.
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # The original only *referenced* ``initialize`` (no parentheses) and
        # did so before the parser was attached, so it never ran. Call it for
        # real, after ``set_parser``, as the other users of this API do.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages yielded by ``get_pages()``."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Render every page through a ``TextConverter`` and return the text.

        Returns:
            The converter output decoded as UTF-8, with undecodable bytes
            ignored.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
示例2: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr`` line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))`` followed by a
    newline, where ``pageno`` is the zero-based index of the destination
    page in ``doc.get_pages()`` or ``None`` when it could not be determined.

    Args:
        outfp: Object with a ``write`` method that receives the lines.
        fname: Path of the PDF file to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``doc.initialize``.
        dumpall: Not used by this function.
        codec: Not used by this function.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin, and the
    # ``with``/``finally`` pair releases the handle and the parser even when
    # parsing or outline traversal raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(doc.get_pages())
            }
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Named destination: look it up, then unwrap the
                    # dictionary form if that is what came back.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    # Only GoTo actions that carry a destination are mapped.
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if (subtype and repr(subtype) == '/GoTo'
                                and action.get('D')):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
示例3: getData
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def _print_info_fields(doc):
    """Print selected Info-dictionary fields of ``doc``; see ``getData``."""
    try:
        metadata = None
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                # If several trailers carry an Info entry, the last one wins.
                metadata = resolve1(info_ref)
        if metadata is None:
            return "Empty metadata"
        for key in ('Author', 'Company', 'Producer', 'Creator'):
            if key in metadata:
                print(key + " " + metadata[key])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e


def getData(fileName):
    """Print the Author/Company/Producer/Creator Info fields of a PDF.

    Args:
        fileName: Path of the PDF file to read.

    Returns:
        ``"error"`` if attaching the parser to the document failed,
        ``"Empty metadata"`` if no trailer provided an Info entry, the caught
        exception object if reading or printing the fields failed, and
        ``None`` otherwise.
    """
    doc = PDFDocument()
    # The file stays open while the Info dictionary is resolved, and is
    # released on every exit path (the original leaked it on "error").
    with open(fileName, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
            except Exception:
                # Narrowed from a bare ``except`` so that KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                return "error"
            return _print_info_fields(doc)
        finally:
            parser.close()
示例4: initialize_pdf_miner
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed for layout analysis of a PDF.

    Args:
        fh: File-like object passed to ``PDFParser``. It is not closed here.

    Returns:
        A ``(doc, interpreter, device)`` tuple, where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is a
        ``PDFPageInterpreter`` bound to that device.

    Raises:
        ValueError: If ``doc.is_extractable`` is false after initialization.
    """
    # Connect the parser and the document in both directions.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # An empty string is supplied as the password.
    doc.initialize("")
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    rsrcmgr = PDFResourceManager()
    # The original first built a plain PDFDevice and an interpreter for it,
    # then overwrote both; only the aggregator-based pair is ever returned.
    laparams = LAParams()
    laparams.word_margin = 0.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
示例5: get_metadata
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def get_metadata(self):
    """Collect metadata from the Info field and, if present, from XMP.

    Reads the PDF at ``self.path``. Every entry of ``doc.info`` is added to
    the result; when the catalog has a ``Metadata`` entry, the ``xap``,
    ``pdf`` and ``dc`` sections of the parsed XMP are added as well.

    Returns:
        The populated ``Metadata`` object (a ``.modules.metadata.Metadata``
        per the original docstring), which is also stored on
        ``self.metadata``.
    """
    # ``with`` closes the file even if parsing or XMP handling raises; the
    # original only closed it on the success path.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        metadata = Metadata()
        for i in doc.info:
            metadata.add(i)
        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Only the most useful XMP sections are kept.
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")
    self.metadata = metadata
    return metadata
示例6: extract_text_elements_from_pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: Path of the PDF file to read.
        j: Job object whose ``iter_with_progress`` wraps the page iteration.

    Returns:
        A ``(pages, all_elements)`` tuple: one ``Page`` per processed page
        (built from the layout width and height) and the flat list of
        elements, each tagged with its ``page`` index and its ``order``
        within that page.
    """
    pages = []
    all_elements = []
    # The original never closed the file; ``with`` releases it once all
    # pages have been processed, or if processing raises.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5,
                            heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages,
                                                 progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
示例7: parse_pdf_pdfminer
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def parse_pdf_pdfminer(self, f, fpath):
    """Convert each page of a PDF to text and pass it to ``parse_page``.

    Args:
        f: File-like object handed to ``PDFParser``.
        fpath: Path passed through to the handler and to ``parse_page``.

    ``KeyboardInterrupt`` and ``SystemExit`` propagate; any other exception
    is reported through ``self.handler.print_error``.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        resource_manager = PDFResourceManager()
        # NOTE(review): this set is never read anywhere in the method.
        pagenos = set()
        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pdf_parser = PDFParser(f)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)

        # Pages are numbered from 1; each page gets its own buffer/converter.
        for page_num, page in enumerate(document.get_pages(), 1):
            page_buffer = StringIO()
            converter = TextConverter(resource_manager, page_buffer,
                                      laparams=layout_params)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)
            page_text = page_buffer.getvalue()
            self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
            page_buffer.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例8: create_pages
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def create_pages(self):
    """Parse ``self.pdf_file`` and save one ``Page`` row per parsed page.

    Nothing is saved when the document reports that it is not extractable.
    Page numbers start at 1.
    """
    from public_project.models import Page

    # Link parser and document both ways, then initialize with an empty
    # password.
    pdf_parser = PDFParser(self.pdf_file)
    pdf_document = PDFDocument()
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)
    pdf_document.initialize('')

    if not pdf_document.is_extractable:
        return

    parsed_pages = self._parse_pages(pdf_document)
    for number, raw_page in enumerate(parsed_pages, 1):
        text = smart_unicode(raw_page, encoding='utf-8', strings_only=False,
                             errors='strict')
        Page(document=self.document, number=number, content=text).save()
示例9: harvest_file
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def harvest_file(self, path):
    """Extract the title and text content of the PDF at ``path``.

    Returns:
        A dict with the keys ``'title'``, ``'content'`` and ``'kind'``
        (always ``'PDF'``).
    """
    # FIXME: how do we know which encoding to use? Should we
    # use 'chardet' to detect it?
    encoding = 'utf-8'

    def decode_leniently(raw, warning_template):
        # Strict decode first; on failure log a warning and drop bad bytes.
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            logger.warning(warning_template, path)
            return raw.decode(encoding, 'ignore')

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        if HAS_PDFMINER_3K:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
        else:
            document = PDFDocument(pdf_parser)

        title = document.info[0].get('Title', '')
        if isinstance(title, PDFObjRef):
            title = title.resolve()
        if isinstance(title, bytes):
            # This may not be necessary with pdfminer3k.
            title = decode_leniently(
                title, 'Could not correctly decode title of "%s".')

        # Rewind so that the content extractor reads from the beginning.
        pdf_file.seek(0)
        content = decode_leniently(
            extract_content(pdf_file, encoding).strip(),
            'Could not correctly decode content of "%s".')

    result = {}
    result['title'] = title
    result['content'] = content
    result['kind'] = 'PDF'
    return result
示例10: extractContent
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def extractContent(file):
    """Convert every page of a PDF to text, printing progress as it goes.

    Args:
        file: Path of the PDF file to read. The name shadows a Python 2
            builtin but is kept for interface compatibility.

    Returns:
        The accumulated ``TextConverter`` output (codec ``UTF-8``).
    """
    # Single-argument ``print(...)`` behaves the same under Python 2 and 3.
    print("extractContent")
    # ``with`` and ``finally`` release the file and the device even when
    # page processing raises; the original leaked both on error.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): the original had an ``is_extractable`` check
            # commented out here; it remains disabled.
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            device.close()
    return outfp.getvalue()
示例11: read_invoice_pdfminer3k
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice PDF and record them.

    The text of every ``LTTextBox``/``LTTextLine`` on every page is
    concatenated; the client is extracted from that text and the invoice
    number from the file name, and both are passed to ``write_excel``.

    Args:
        pdfFile: File name of the invoice, relative to ``invoice_path``.
    """
    # ``os.path.join`` with separate components replaces the hard-coded
    # backslash, which only formed a valid path on Windows.
    pdf_path = os.path.join(invoice_path, pdfFile)
    text_parts = []
    # The original never closed the file.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    # Join once instead of growing a string with ``+=`` in the loop.
    invoice_text = "".join(text_parts)
    # Extract client info from the string extracted from the PDF.
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # Extract the invoice number from the PDF file name.
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Pass both values to the function that writes the Excel file.
    write_excel(client, invoice_no)
示例12: convert_pdf_to_txt
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as produced by ``TextConverter``.

    The document is initialized with an empty password.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer,
                              laparams=LAParams())
    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Feed every page of the document through the converter.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
    # Grab the text before the buffer is closed.
    extracted = text_buffer.getvalue()
    converter.close()
    text_buffer.close()
    return extracted
示例13: ParseAllPages
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def ParseAllPages(self, filepath):
    """Run every page of the PDF at ``filepath`` through a page interpreter.

    ``filepath`` is also stored on ``self.filepath``.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    self.filepath = filepath
    # The original never closed the file; ``with`` releases it when
    # processing finishes or raises.
    with open(filepath, 'rb') as fp:
        # Create the parser and the document and connect them both ways.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # An empty string is supplied as the password.
        password = ""
        doc.initialize(password)
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
示例14: PdfSerializer
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally write it to a .txt file."""

    def __init__(self, filename):
        """Open ``filename`` and build an initialized ``PDFDocument`` for it.

        Args:
            filename: Path of the PDF file to read.
        """
        self.__filename = filename
        # NOTE(review): the handle is deliberately left open here, because
        # ``getString`` iterates ``get_pages()`` later and presumably still
        # reads from it -- confirm before adding a close.
        fp = open(self.__filename, 'rb')
        parser = PDFParser(fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a ``.txt`` extension.

        Characters that cannot be encoded as ASCII are replaced with ``?``.
        """
        import os  # local import: the file's import block is not visible here

        text = self.getString()
        # ``str.replace(".pdf", ".txt")`` rewrote every occurrence in the
        # path and returned the path unchanged when the extension was not
        # exactly ".pdf", which made the write below truncate the source
        # PDF. Swapping only the final extension avoids both problems.
        root, _ext = os.path.splitext(self.__filename)
        with open(root + ".txt", "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as produced by ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
示例15: get_pdf_metadata
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import set_parser [as 别名]
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the Info metadata of a PDF given by local path or HTTP(S) URL.

    Args:
        fileOrUrl: Local path, or a URL starting with ``http://`` or
            ``https://`` whose body is downloaded into memory first.
        textmode: If true, print one ``source:name=value`` line per Info
            entry; otherwise print ``prefix`` followed by ``str`` of the
            Info list (unwrapped when it has exactly one element).
        prefix: Text printed before the value in non-text mode. It is
            replaced by ``fileOrUrl + ':'`` when ``len(args) > 1``.
        basicauth: Optional credential string sent as
            ``Authorization: Basic <basicauth>`` for URL sources.
    """
    # NOTE(review): ``args`` is not defined in this function; presumably a
    # module-level list of command-line arguments -- confirm.
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            # Release the connection even if the read fails.
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # As in the original, the source is closed before ``doc.info`` is
        # read; now it is also closed when parsing raises.
        fp.close()
    # Single-argument ``print(...)`` and ``items()`` work on Python 2 and 3.
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))