本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.initialize方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.initialize方法的具体用法？Python PDFDocument.initialize怎么用？Python PDFDocument.initialize使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.initialize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: pdf_function
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def pdf_function(pdf_doc, password='', *args, **kwargs):
    """Open a PDF, initialize it and apply the module-level ``function`` to it.

    Args:
        pdf_doc: Path of the PDF file to open.
        password: Password handed to ``PDFDocument.initialize``.
        *args: Extra positional arguments forwarded to ``function``.
        **kwargs: Extra keyword arguments forwarded to ``function``.

    Returns:
        The value of ``function(doc, *args, **kwargs)``, or ``None`` when the
        document is not extractable or an ``IOError`` was raised (for example
        because the file does not exist).
    """
    result = None
    try:
        # The context manager guarantees the file is closed even when parsing
        # or ``function`` raises; previously the handle leaked in that case.
        with open(pdf_doc, 'rb') as fp:
            # Create the parser / document pair and connect them.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the password for initialization.
            doc.initialize(password)
            if doc.is_extractable:
                result = function(doc, *args, **kwargs)
    except IOError:
        # Best effort: a missing or unreadable file yields ``None``.
        pass
    return result
示例2: ParseAllPages
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def ParseAllPages(self, filepath):
    """Run every page of the PDF at ``filepath`` through a page interpreter.

    The path is remembered on ``self.filepath``. The document is initialized
    with an empty password.

    Args:
        filepath: Path of the PDF file to process.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    self.filepath = filepath
    # Keep the file open only while pages are being processed; the context
    # manager closes it on every exit path (the handle used to leak).
    with open(filepath, 'rb') as fp:
        # Create the parser / document pair and connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is set, so an empty string is supplied.
        doc.initialize("")
        # Abort when the document forbids text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resources, output device and interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
示例3: extract_text_elements_from_pdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: Path of the PDF file to read.
        j: Job object whose ``iter_with_progress`` wraps the page iteration
            for progress reporting.

    Returns:
        A ``(pages, all_elements)`` tuple: one ``Page`` per PDF page (built
        from the layout width and height) and a flat list of the elements
        created from each page's text boxes. Each element carries its
        zero-based ``page`` number and its ``order`` within that page.
    """
    pages = []
    all_elements = []
    progress_msg = "Reading page %i of %i"
    # All page processing happens inside the ``with`` so the file stays open
    # while it is read and is closed afterwards (it used to be left open).
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialize the pages so the progress iterator receives a sequence.
        enumerated_pages = list(enumerate(doc.get_pages()))
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
示例4: initialize_pdf_miner
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out pages from ``fh``.

    Args:
        fh: Open file object positioned on a PDF; it is handed to
            ``PDFParser`` and is not closed here.

    Returns:
        A ``(doc, interpreter, device)`` tuple where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is the ``PDFPageInterpreter`` bound to it.

    Raises:
        ValueError: If the document does not allow text extraction.
    """
    # Create the parser / document pair and connect them.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # No password is set, so an empty string is supplied.
    doc.initialize("")
    # Abort when the document forbids text extraction.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Resource manager that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis parameters. The plain PDFDevice / interpreter pair that
    # used to be built here was overwritten before use and has been removed.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Page aggregator device and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
示例5: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp``.

    Args:
        outfp: Writable file-like object receiving the dump.
        fname: Path of the PDF file to read.
        objids: Object ids to dump. Streams are written raw or decoded when
            ``codec`` is ``'raw'`` or ``'binary'``; everything else goes
            through ``dumpxml``.
        pagenos: Zero-based page indexes whose attributes are dumped.
        password: Password handed to ``PDFDocument.initialize``.
        dumpall: When true, dump every object via ``dumpallobjs``.
        codec: Stream codec selector forwarded to the dump helpers.

    When none of ``objids``, ``pagenos`` or ``dumpall`` is given, only the
    trailers are dumped. A trailing newline is written unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin, and the context
    # manager closes the handle even when a dump helper raises.
    with open(fname, 'rb') as fp:
        # This call form passes the document to the parser constructor.
        # NOTE(review): the name is kept bound in case the document does not
        # hold its own reference to the parser - confirm against the
        # installed pdfminer version.
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                if isinstance(obj, PDFStream) and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif isinstance(obj, PDFStream) and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例6: pdf_isvalid
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def pdf_isvalid(filelike):
    """Return True if ``filelike`` holds a valid, extractable PDF, else False.

    The stream must start with ``PDF_MAGIC`` and must be accepted by
    pdfminer's parser with an empty password. A ``PDFException`` during
    parsing is logged as a warning and reported as "not valid".

    The stream is rewound to offset 0 on every exit path, including the
    early "bad magic" return and unexpected exceptions.

    @param filelike: filelike object, seekable
    """
    logger = logging.getLogger()
    filelike.seek(0)
    try:
        # Cheap header check before handing the stream to the parser.
        if filelike.read(len(PDF_MAGIC)) != PDF_MAGIC:
            return False
        filelike.seek(0)
        try:
            parser = PDFParser(filelike)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            return bool(doc.is_extractable)
        except PDFException as excobj:
            # Lazy %-args: the message is only formatted if it is emitted.
            logger.warning("pdf has valid header but, still not valid pdf, exception was %r", excobj)
            return False
    finally:
        # Leave the stream where the caller can re-read it from the start.
        filelike.seek(0)
示例7: getPDFMetadata
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def getPDFMetadata(path):
    """Return the metadata of the PDF at ``path`` as a dictionary.

    The result starts from the first entry of ``doc.info`` (an empty dict
    when the document has no info entries). If the catalog has a
    ``Metadata`` stream, the dictionary produced by ``xmp_to_dict`` for it
    is merged in on a best-effort basis.

    Previously ``result`` was the ``doc.info`` list itself, so every
    ``update`` call failed silently and XMP data was never merged; the final
    ``result[0]`` also raised ``IndexError`` for documents without info.

    Args:
        path: Path of the PDF file to read.

    Returns:
        dict: The merged metadata.
    """
    # The context manager closes the file on every exit path.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        info = doc.info
        # Copy so that merging XMP values does not mutate the document's data.
        result = dict(info[0]) if info else {}
        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best effort: malformed XMP must not hide the Info entries.
                pass
    return result
示例8: WithPdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def WithPdf(self, pdfdoc, password, fn, *args):
    """Open the pdf document, and apply the function, returning the results.

    Args:
        pdfdoc: Path of the PDF file to open.
        password: Document password; when truthy it is also stored on
            ``self.password``, which is the value used for initialization.
        fn: Callable invoked as ``fn(doc, *args)`` for extractable documents.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        The value returned by ``fn``, or ``None`` when the document is not
        extractable or an ``IOError`` was raised (e.g. the file is missing).
    """
    result = None
    try:
        # The context manager closes the file even when parsing or ``fn``
        # raises; previously the handle leaked in that case.
        with open(pdfdoc, 'rb') as fp:
            # Create the parser / document pair and connect them.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the password for initialization.
            if password:
                self.password = password
            doc.initialize(self.password)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Best effort: a missing or unreadable file yields ``None``.
        pass
    return result
示例9: read_invoice_pdfminer3k
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice PDF and record them.

    The text of every ``LTTextBox`` / ``LTTextLine`` layout object is
    collected; the client is extracted from that text and the invoice number
    from the file name, both via ``extract_info``. Both values are printed
    and passed to ``write_excel``.

    Args:
        pdfFile: File name of the invoice, relative to ``invoice_path``.
    """
    text_parts = []
    # ``os.path.join`` with two arguments picks the platform separator; the
    # previous hard-coded backslash only produced a valid path on Windows.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            text_parts.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    # Join once instead of growing a string inside the loop.
    invoice_text = "".join(text_parts)
    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)
    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)
    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
示例10: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    ``pageno`` is the zero-based index of the page the entry points to,
    resolved either from a named destination or from a ``/GoTo`` action; it
    stays ``None`` when neither yields a page.

    Args:
        outfp: Writable file-like object receiving the output.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Not used by this function.
        password: Password handed to ``PDFDocument.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; ``with`` and
    # ``finally`` make sure parser and file are closed even on errors.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
示例11: create_pages
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def create_pages(self):
    """Parse ``self.pdf_file`` and save one ``Page`` row per parsed page.

    Pages are numbered from 1 in the order returned by ``self._parse_pages``.
    Nothing is saved when the document does not allow text extraction. The
    method returns ``None``.
    """
    from public_project.models import Page
    # Create the parser / document pair and connect them.
    parser = PDFParser(self.pdf_file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the (empty) password for initialization.
    pdf_pwd = ''
    doc.initialize(pdf_pwd)
    if not doc.is_extractable:
        return
    # ``enumerate`` replaces the hand-maintained page counter.
    for number, doc_page in enumerate(self._parse_pages(doc), 1):
        page = Page(
            document=self.document,
            number=number,
            content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
        )
        page.save()
示例12: PdfSerializer
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it as a text file."""

    def __init__(self, filename):
        """Open ``filename`` and initialize a pdfminer document for it.

        The file is kept open because pages are read later in ``getString``;
        call ``close()`` when the serializer is no longer needed.
        """
        self.__filename = filename
        # Keep a reference so the handle can be released in ``close()``.
        self.__fp = open(self.__filename, 'rb')
        parser = PDFParser(self.__fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def close(self):
        """Close the underlying PDF file."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with ``.pdf`` -> ``.txt``.

        Characters that cannot be encoded as ASCII are replaced by ``?``.
        """
        text = self.getString()
        # NOTE(review): ``replace`` rewrites every ".pdf" occurrence in the
        # path, not only the extension - kept as is for compatibility.
        # ``with`` closes the output file even if the write fails.
        with open(self.__filename.replace(".pdf", ".txt"), "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as produced by ``TextConverter``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
示例13: get_toc
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def get_toc(self):
    """Read title and first-level table of contents from ``self.pdf``.

    When the document info has a ``Title`` entry, ``self.title`` is set to
    its normalized form. The outline level to report is chosen by
    ``self.get_level1``.

    Returns:
        A list with the normalized titles of all outline entries on the
        selected level, or ``None`` when the outlines cannot be read or the
        level cannot be determined.
    """
    # The document is read lazily, so all work happens inside the ``with``;
    # the file is closed on every exit path (it used to be left open).
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        # title
        if doc.info:
            metadict = doc.info[0]
            if 'Title' in metadict:
                self.title = normalize_title(metadict['Title'])
        # level 1 of toc
        try:
            outlines = doc.get_outlines()
            select_level = self.get_level1(outlines)
        except Exception:
            # Narrower than the former bare ``except`` so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            return None
        # The outlines are requested again for the actual collection pass.
        return [
            normalize_toc_item(title)
            for (level, title, dest, a, se) in doc.get_outlines()
            if level == select_level
        ]
示例14: parse
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def parse(self, path):
    """Extract the text of the PDF at ``path`` into a ``Sample``.

    Args:
        path: Path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is the UTF-8 output of
        ``TextConverter`` for all pages.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    # Directory: reject before allocating any resources.
    if os.path.isdir(path):
        raise NotImplementedError()
    # File
    out = StringIO.StringIO()
    try:
        # PDFs are binary: open in 'rb' (the former text-mode ``file(path)``
        # could corrupt the data) and let ``with`` close the handle.
        with open(path, 'rb') as fp:
            rsrc = PDFResourceManager()
            codec = 'utf-8'
            laparams = LAParams()
            laparams.char_margin = 2.0
            laparams.line_margin = 2.0
            laparams.word_margin = 0.0
            device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                device.close()
        # Read the buffer before the ``finally`` below closes it.
        return Sample(path, None, out.getvalue())
    finally:
        out.close()
示例15: dumppdf
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp`` via ``dumpxml``.

    Args:
        outfp: Writable file-like object receiving the dump.
        fname: Path of the PDF file to read.
        objids: Object ids to dump.
        pagenos: Zero-based page indexes to dump. With a ``codec`` the page's
            content streams are dumped, otherwise the page attributes.
        password: Password handed to ``PDFDocument.initialize``.
        dumpall: When true, dump every object via ``dumpallobjs``.
        codec: Stream codec selector forwarded to the dump helpers.

    When none of ``objids``, ``pagenos`` or ``dumpall`` is given, only the
    trailers are dumped. A trailing newline is written unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin, and the context
    # manager closes the handle even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return