本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument.initialize方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.initialize方法的具体用法？Python PDFDocument.initialize怎么用？Python PDFDocument.initialize使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfdocument.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.initialize方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: with_pdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open a PDF file, apply ``fn`` to the parsed document, return the result.

    Args:
        pdf_doc: Path of the PDF file to open.
        fn: Callable invoked as ``fn(doc, *args)`` when the document reports
            itself as extractable.
        pdf_pwd: Password passed to ``doc.initialize``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when an ``IOError`` occurs
        (e.g. the file does not exist) or the document is not extractable.
    """
    result = None
    try:
        # The context manager closes the file even when parsing or ``fn``
        # raises, so the handle cannot leak.
        with open(pdf_doc, "rb") as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
            # supply the password for initialization
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem: report "no result"
        pass
    return result
示例2: extractembedded
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Write every embedded file found in a PDF into ``extractdir``.

    Walks all object ids listed in the document's xrefs and, for each
    dictionary whose ``Type`` is ``LITERAL_FILESPEC``, saves the referenced
    stream under its base file name.

    Args:
        outfp: Not used by this function.
        fname: Path of the PDF file to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``doc.initialize``.
        dumpall: Not used by this function.
        codec: Not used by this function.
        extractdir: Directory that receives the extracted files.

    Raises:
        PDFValueError: A file specification carries no file name, or its
            reference is not an embedded-file stream.
        IOError: The PDF cannot be opened, or the target file already exists.
    """
    def extract1(obj):
        """Save the embedded file described by file-specification ``obj``."""
        # 'UF' is not always present; fall back to 'F' instead of raising
        # KeyError on file specifications that only carry 'F'.
        filename = obj.get('UF') or obj.get('F')
        if not filename:
            raise PDFValueError(
                'unable to process PDF: file specification has no file name')
        # Keep only the base name so the file lands inside extractdir.
        filename = os.path.basename(filename)
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The context manager closes the PDF even if extraction fails part-way.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
示例3: extract_pdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def extract_pdf(file):
    """Return the text content of the PDF read from the file object ``file``.

    Returns ``-1`` instead of text when the document reports that it is not
    extractable.
    """
    pdf_doc = PDFDocument(PDFParser(file))
    pdf_doc.initialize("")
    if not pdf_doc.is_extractable:
        return -1

    resources = PDFResourceManager()
    text_sink = StringIO()
    converter = TextConverter(
        resources,
        text_sink,
        codec='utf-8',
        showpageno=False,
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resources, converter)

    # An empty page-number set is passed through unchanged to get_pages.
    page_iter = PDFPage.get_pages(
        file,
        set(),
        maxpages=0,
        password="",
        caching=True,
        check_extractable=True,
    )
    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)
    return text_sink.getvalue()
示例4: dumppdf
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF file to ``outfp`` as XML.

    Args:
        outfp: Writable output stream.
        fname: Path of the PDF file to read.
        objids: Object ids to dump; each is fetched with ``doc.getobj``.
        pagenos: Zero-based page numbers to dump.
        password: Password passed to ``doc.initialize``.
        dumpall: When true, dump every object via ``dumpallobjs``.
        codec: Passed to the dump helpers; when set, page content streams
            are dumped instead of page attributes.
        extractdir: Not used by this function.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped instead.

    Raises:
        IOError: The PDF file cannot be opened.
    """
    # The context manager closes the file even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    # A trailing newline is written for every codec except 'raw' and 'binary'.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
示例5: pdf_to_text
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf_to_text(page_object):
    """Collect the text of every text box and text line in a PDF.

    ``page_object`` is handed to ``PDFParser``. The result is a list holding
    one string per ``LTTextBox`` / ``LTTextLine`` found while iterating each
    page layout, in iteration order.
    """
    pdf_parser = PDFParser(page_object)
    # The document object stores the parsed structure; link it to the parser.
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize('')

    # Shared resources, layout aggregator, and the interpreter driving both.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)
        # The aggregator yields the LTPage layout of the page just processed.
        page_layout = aggregator.get_result()
        collected.extend(
            element.get_text()
            for element in page_layout
            if isinstance(element, (LTTextBox, LTTextLine))
        )
    return collected
示例6: pdf2metadata
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf2metadata(fp):
    """Return the "Info" metadata of the PDF read from ``fp``."""
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    catalog = document.catalog
    if 'Metadata' in catalog:
        # The raw XMP metadata is fetched here but not returned.
        raw_xmp = resolve1(catalog['Metadata']).get_data()
    return document.info
示例7: loadPDF
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def loadPDF(library, file_name):
"""adds a paper to the library"""
# NOTE(review): indentation was lost in this copy of the file. The nesting
# of the statements below cannot be recovered with certainty -- in particular
# it is unclear which `if` the last `else:` pairs with and which loop the
# final `break` leaves. Restore the indentation from the original source
# before running or refactoring this function.
#
# Visible behavior: opens `file_name`, parses it with pdfminer, prints the
# text of the first few layout objects per page, then fetches a paper from
# `library` keyed by the file name and passes it the (empty) author and
# citation lists.
# `pattern_ignore` and `bcolors` are not defined in this block.
# The file handle `fp` is never closed in the visible code.
fp = open(file_name, 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Supply the password for initialization.
# (If no password is set, give an empty string.)
password = ""
document.initialize(password)
# Check if the document allows text extraction. If not, abort.
# (The raise below is commented out, so only "CANT" is printed and
# processing continues.)
if not document.is_extractable:
print "CANT"
# raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# text_content is assigned but never used in the visible code.
text_content = []
# authors and citations are never appended to, so they are still empty when
# handed to the paper at the end of the function.
authors = [] #list of authors
citations = [] #list of authors that have been cited
#pages_length = sum(1 for page in document.get_pages())
for ii, page in enumerate(PDFPage.create_pages(document)):
print '---------------------------------------------------------------------------------------------------'
print "page number {}".format(ii)
interpreter.process_page(page)
# receive the LTPage object for the page.
layout = device.get_result()
# Iterates the layout's private `_objs` list directly.
for jj, lt_obj in enumerate(layout._objs):
# Stop after the objects with index 0..3.
if jj>3:
break
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
# Non-ASCII characters are dropped from the text.
cur_line = lt_obj.get_text().encode('ascii', 'ignore')
match = pattern_ignore.match(cur_line)
# Lines that do not match pattern_ignore and are shorter than 200
# characters are printed in full; otherwise the first 150 characters
# are printed in the FAIL color.
if match is None and len(cur_line)<200:
print bcolors.OKGREEN +" "+cur_line+bcolors.ENDC
else:
print bcolors.FAIL+" "+cur_line[0:150]+bcolors.ENDC
else:
print "PICTURE"
break
# The file name doubles as the paper title used for the library lookup.
paper_title = file_name
paper = library.getPaper(paper_title)
paper.addAuthorIds(authors)
paper.addCitationIds(citations)
示例8: proc
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def proc(self, pdfFp):
    """Read the meta-data available in a PDF and store it on the instance.

    Sets ``self.info`` from the document, ``self.metadata`` when the catalog
    has a 'Metadata' entry, and ``self.raw_doc`` from ``pdfFp.getvalue()``.
    """
    pdf_parser = PDFParser(pdfFp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    self.info = document.info
    catalog = document.catalog
    if 'Metadata' in catalog:
        xmp_stream = resolve1(catalog['Metadata'])
        self.metadata = xmp_to_dict(xmp_stream.get_data())
    self.raw_doc = pdfFp.getvalue()
示例9: getDocumentInfoAndAnnotations
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def getDocumentInfoAndAnnotations(pdfFile):
    """Scan a PDF's page annotations for a document note and text comments.

    The contents of the first 'Sticky Note' annotation are kept as the
    document info; every 'Comment on Text' annotation is recorded as
    ``"<page number>:<contents>"`` (pages counted from 1). When reading a
    page's annotations fails, the error is logged and the file is moved to
    the error directory.

    Args:
        pdfFile: Path of the PDF file to parse.

    Raises:
        IOError: The PDF file cannot be opened.
    """
    logger.info("Parsing pdf file " + pdfFile)
    # NOTE(review): docInfo and docAnnotations are collected but never
    # returned in the visible code -- confirm whether a trailing return
    # statement was lost from this copy.
    docInfo = None
    docAnnotations = []
    # The context manager closes the file on every exit path.
    with open(pdfFile, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        document.initialize('')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            if not page.annots:
                continue
            try:
                # page.annots is either a list or a reference to one.
                if isinstance(page.annots, list):
                    annots = page.annots
                else:
                    annots = page.annots.resolve()
                for annot in annots:
                    if isinstance(annot, PDFObjRef):
                        annot = annot.resolve()
                    if 'Subj' not in annot:
                        continue
                    subject = annot['Subj']
                    if subject == 'Sticky Note' and docInfo is None:
                        logger.debug('DOC INFO ' + subject + ' Contents=' + annot['Contents'])
                        docInfo = annot['Contents']
                    elif subject == 'Comment on Text':
                        logger.debug('COMMENT ON TEXT ' + subject + ' Contents=' + annot['Contents'])
                        contents = annot['Contents']
                        docAnnotations.append(str(pageNum) + ':' + contents)
                    else:
                        logger.debug('UNKNOWN ANNOTATION: ' + subject + ' Contents=' + annot['Contents'])
            except Exception as e:
                logger.error("error getting annotation")
                logger.exception(e)
                # move file to error
                # (uses pdfFile: the builtin name `file` is not the path and
                # made this rename fail)
                # NOTE(review): the page loop continues after the move, so a
                # second failing page would try to rename a file that is no
                # longer at pdfFile -- confirm the intended behavior.
                os.rename(pdfFile, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(pdfFile))
示例10: load_document
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def load_document(self, _file, password=""):
    """Build a PDFMiner document from ``_file``.

    The parser and the document are linked to each other before the
    document is initialized with ``password``.

    Raises:
        ValueError: The document does not allow text extraction.
    """
    log.info("loading document...")
    pdf_parser = module_parser(_file)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)
    if document.is_extractable:
        return document
    raise ValueError("PDF text extraction not allowed")
示例11: pdf_from_resource
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def pdf_from_resource(resource):
    """Return an initialized PDF document built from ``resource``.

    ``resource`` is handed to ``PDFParser``; the parser and a fresh
    ``PDFDocument`` are linked in both directions and the document is
    initialized without a password argument.
    """
    pdf_parser = PDFParser(resource)
    pdf_document = PDFDocument()
    # Link both directions before initializing.
    pdf_parser.set_document(pdf_document)
    pdf_document.set_parser(pdf_parser)
    pdf_document.initialize()
    return pdf_document
示例12: Parse_PDF
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def Parse_PDF(self):
    """Extract the text of each page of ``self.filePath``.

    One stripped string per page is appended to ``self.text_content``, which
    must already exist as a list on the instance.

    Returns:
        ``self.text_content``.

    Raises:
        IOError: The file at ``self.filePath`` cannot be opened.
    """
    def parse_lt_objs(lt_objs, page_number, text=None):
        """Return the text found in ``lt_objs``, joined with newlines.

        ``page_number`` and ``text`` are accepted but not used, apart from
        ``page_number`` being passed along on recursion.
        """
        pieces = []
        for lt_obj in lt_objs:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                pieces.append(lt_obj.get_text())
            elif isinstance(lt_obj, LTFigure):
                # LTFigure objects are containers for other LT* objects,
                # so recurse through the children.
                pieces.append(parse_lt_objs(lt_obj, page_number, pieces))
        return '\n'.join(pieces)

    # The context manager closes the file on every exit path.
    with open(self.filePath, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        try:
            document.initialize('')
        except Exception:
            # Best effort: keep going even when initialization fails, but
            # let KeyboardInterrupt / SystemExit propagate.
            pass
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            # Page numbers passed to the helper start at 1.
            self.text_content.append(
                parse_lt_objs(layout, page_index + 1).strip())
    return self.text_content
示例13: check_pdf_password
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def check_pdf_password(pdf, password):
    """Report whether ``password`` opens the PDF at path ``pdf``.

    Prints the password when initialization succeeds and the document is
    extractable.

    Args:
        pdf: Path of the PDF file.
        password: Candidate password passed to ``doc.initialize``.

    Returns:
        True when the document initializes and is extractable, else False.

    Raises:
        IOError: The PDF file cannot be opened.
    """
    import sys  # local import: only needed for the carriage-return write

    # The context manager closes the file on every exit path.
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        try:
            doc.initialize(password)
        except Exception:
            # Initialization failed (presumably a wrong password). Catching
            # Exception instead of everything keeps Ctrl-C working.
            sys.stdout.write('\r')
            return False
        if doc.is_extractable:
            print('')
            print('The PDF Password Is:' + password)
            return True
        print('exception')
        return False
示例14: convert_file
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def convert_file(pdf_file, file_name):
    """Return the text of the PDF read from the file object ``pdf_file``.

    ``file_name`` is only used in the message of the error raised when the
    document does not allow text extraction.
    """
    document = PDFDocument(PDFParser(pdf_file))
    document.initialize("")
    if not document.is_extractable:
        raise PDFPage.PDFTextExtractionNotAllowed("Document does not allow text extraction: " + file_name)

    manager = PDFResourceManager()
    text_buffer = StringIO.StringIO()
    converter = TextConverter(
        manager,
        text_buffer,
        codec="utf-8",
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(manager, converter)
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
示例15: dumpoutline
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import initialize [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of a PDF to ``outfp`` as XML.

    Each outline entry is written as an ``<outline>`` element with its level
    and title, plus ``<dest>`` and ``<pageno>`` children when a destination
    and page number could be determined. Nothing is written when the
    document has no outlines.

    Args:
        outfp: Writable output stream.
        fname: Path of the PDF file to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``doc.initialize``.
        dumpall: Not used by this function.
        codec: Not used by this function.
        extractdir: Not used by this function.

    Raises:
        IOError: The PDF file cannot be opened.
    """
    # The context manager and the finally clause release the file and the
    # parser even when an unexpected exception escapes.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map each page's id to its zero-based page number.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                """Resolve a named destination; unwrap the 'D' entry of a dict."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: look for a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
    return