本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument类的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument类的具体用法?Python PDFDocument怎么用?Python PDFDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了PDFDocument类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of a PDF file to ``outfp``.

    Args:
        outfp: Writable file-like object handed to the dump helpers.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump; falsy to skip.
        pagenos: Container of page indexes to dump, counted from 0 in the
            order produced by ``PDFPage.create_pages``; falsy to skip.
        password: Password passed to ``doc.initialize``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Forwarded to the dump helpers. For selected pages a truthy
            codec dumps the page content streams, a falsy one dumps the
            page attributes.
        extractdir: Not used by this function.
    """
    # The context manager closes the file even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        # With nothing selected, fall back to dumping the trailers.
        if not (objids or pagenos or dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
示例2: parse_paragraphs
def parse_paragraphs(self, text):
    """Collect section headlines for ``text`` and count their paragraphs.

    Headlines come from the level-1 entries of the PDF outline when
    ``self.is_pdf`` is true, otherwise from the lines of ``text`` that
    start with '## ' (markdown).

    Args:
        text: Full text of the paper.

    Side effects:
        Calls ``self.count_paragraphs(text, lines, headlines)`` when at
        least one headline was found. A PDF without an outline is logged
        and skipped.
    """
    lines = text.split('\n')
    headlines = []
    if self.is_pdf:
        with open(self.paper_filename, 'rb') as pdf:
            document = PDFDocument(PDFParser(pdf))
            try:
                # Only top-level outline entries count as headlines.
                headlines.extend(
                    title
                    for (level, title, _, _, _) in document.get_outlines()
                    if level == 1
                )
            except PDFNoOutlines:
                logging.info(
                    "No outline found -> skipping paragraph search..."
                )
    else:  # check markdown headlines
        headlines.extend(line for line in lines if line.startswith('## '))
    if headlines:
        self.count_paragraphs(text, lines, headlines)
示例3: with_pdf
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open a PDF, apply ``fn`` to the parsed document, return the result.

    Args:
        pdf_doc: Path of the PDF file.
        fn: Callable invoked as ``fn(doc, *args)``.
        pdf_pwd: Password supplied to ``doc.initialize``.
        *args: Extra positional arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns, or ``None`` when an ``IOError`` occurs
        (e.g. the file does not exist) or ``doc.is_extractable`` is false.
    """
    result = None
    try:
        # ``with`` releases the handle even if parsing or ``fn`` raises.
        with open(pdf_doc, "rb") as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
            # supply the password for initialization
            doc.initialize(pdf_pwd)
            if doc.is_extractable:
                result = fn(doc, *args)
    except IOError:
        # Deliberate best effort: the file doesn't exist or a similar
        # problem occurred, so the caller simply gets None.
        pass
    return result
示例4: extract_pdf
def extract_pdf(file):
    """Return the text content of the PDF readable from ``file``.

    ``file`` is handed to both ``PDFParser`` and ``PDFPage.get_pages``.
    Returns ``-1`` when the document reports it is not extractable,
    otherwise the value accumulated in the ``StringIO`` buffer.
    """
    doc = PDFDocument(PDFParser(file))
    doc.initialize("")
    if not doc.is_extractable:
        return -1

    resources = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(
        resources,
        buf,
        codec='utf-8',
        showpageno=False,
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resources, converter)
    # An empty page-number set and maxpages=0 are passed through unchanged.
    pages = PDFPage.get_pages(
        file,
        set(),
        maxpages=0,
        password="",
        caching=True,
        check_extractable=True,
    )
    for page in pages:
        page_interpreter.process_page(page)
    return buf.getvalue()
示例5: parse
def parse(self):
    """Parse ``self.pdf`` and store an XML-like dump of its objects.

    Nothing is returned; results are stored on the instance:

    * ``self.xml``: the dump, one ``<object>`` element per object id plus
      the output of ``self.dumptrailers``.
    * ``self.errors``: ``doc.errors``.
    * ``self.bytes_read``: ``parser.BYTES``.
    * ``self.bin_blob``: set only when ``doc.found_eof`` is true and
      ``doc.eof_distance`` exceeds 3.

    Objects that cannot be found are emitted with ``type="malformed"`` and
    recorded through ``self.takenote``; any other failure is emitted with
    ``type="exception"``.
    """
    chunks = ['<pdf>']
    # The context manager closes the file even if parsing raises.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp, dbg=self.debug)
        doc = PDFDocument(parser, dbg=self.debug)
        # extract blob of data after EOF (if it exists)
        if doc.found_eof and doc.eof_distance > 3:
            self.bin_blob = parser.read_from_end(doc.eof_distance)
        visited = set()  # keep track of the objects already visited
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited:
                    continue
                visited.add(objid)
                try:
                    # Render the body before emitting the opening tag so a
                    # failure in dump() cannot leave an unclosed <object>.
                    body = self.dump(doc.getobj(objid))
                    chunks.append(
                        '<object id="' + str(objid) + '">\n'
                        + body
                        + '\n</object>\n\n'
                    )
                except PDFObjectNotFound:
                    # Keep up to 4096 raw bytes of the broken object, with
                    # '<' neutralised so it cannot open a tag in the dump.
                    mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                    mal_obj = mal_obj.replace('<', '0x3C')
                    chunks.append(
                        '<object id="%d" type="malformed">\n%s\n</object>\n\n'
                        % (objid, mal_obj)
                    )
                    self.takenote(self.malformed, 'objects', objid)
                except Exception as e:
                    chunks.append(
                        '<object id="%d" type="exception">\n%s\n</object>\n\n'
                        % (objid, e)
                    )
    # Trailers are dumped after the file has been closed.
    chunks.append(self.dumptrailers(doc))
    chunks.append('</pdf>')
    self.xml = ''.join(chunks)
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
示例6: extractembedded
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract the embedded files of a PDF into ``extractdir``.

    Args:
        outfp: Not used by this function.
        fname: Path of the PDF file to read.
        objids: Not used by this function.
        pagenos: Not used by this function.
        password: Password passed to ``PDFDocument``.
        dumpall: Not used by this function.
        codec: Not used by this function.
        extractdir: Directory that receives the extracted files.

    Raises:
        PDFValueError: A file specification has no file name, or does not
            reference an ``EmbeddedFile`` stream.
        IOError: The target file already exists.
    """
    def extract1(obj):
        # Write the file referenced by one Filespec dictionary to disk.
        # Either name key may be absent, so look both up without KeyError.
        filename = obj.get('UF') or obj.get('F')
        if not filename:
            raise PDFValueError(
                'unable to process PDF: file specification has no file name')
        # basename() keeps the output inside extractdir.
        filename = os.path.basename(filename)
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        # Fetch the data first so a failure does not leave an empty file.
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)

    # The context manager closes the PDF even if extraction raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
示例7: dumppdf
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return a dump of selected parts of a PDF file as one string.

    Args:
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump; falsy to skip.
        pagenos: Container of page indexes to dump, counted from 0 in the
            order produced by ``PDFPage.create_pages``; falsy to skip.
        password: Password passed to ``PDFDocument``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Forwarded to the dump helpers. For selected pages a truthy
            codec dumps the page content streams, a falsy one dumps the
            page attributes.
        extractdir: Not used by this function.

    Returns:
        The concatenated results of the dump helpers, followed by a
        newline unless ``codec`` is 'raw' or 'binary'.
    """
    parts = []
    # The context manager closes the file even if parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                parts.append(dumpxml(doc.getobj(objid), codec=codec))
        if pagenos:
            for pageno, page in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        parts.append(dumpxml(stream_value(obj), codec=codec))
                else:
                    parts.append(dumpxml(page.attrs))
        if dumpall:
            parts.append(dumpallobjs(doc, codec=codec))
        # With nothing selected, fall back to dumping the trailers.
        if not (objids or pagenos or dumpall):
            parts.append(dumptrailers(doc))
    if codec not in ('raw', 'binary'):
        parts.append('\n')
    return ''.join(parts)
示例8: main
def main(pdf_path='/home/chris/Documents/Literature/DFT Primer.pdf'):
    """Print the resource manager, per-page results and outline of a PDF.

    Args:
        pdf_path: Path of the PDF to open. The default is the path this
            function used before it took a parameter, so ``main()`` is
            unchanged for existing callers.

    Returns:
        0 once every page and outline entry has been processed.

    Raises:
        PDFTextExtractionNotAllowed: ``document.is_extractable`` is false.
    """
    # Open a PDF file.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))
        for (level, title, _dest, _a, _se) in document.get_outlines():
            # Printed as a single tuple object.
            print((level, title))
    return 0
示例9: pdf_to_text
def pdf_to_text(page_object):
    """Return the text of every text box / text line found in a PDF.

    Args:
        page_object: Object handed to ``PDFParser`` (presumably an open
            binary file -- confirm against callers).

    Returns:
        list: The ``get_text()`` result of each ``LTTextBox`` or
        ``LTTextLine`` layout item, in the order pages and items are
        iterated.
    """
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text_content.extend(
            item.get_text()
            for item in layout
            if isinstance(item, (LTTextBox, LTTextLine))
        )
    return text_content
示例10: pdf2metadata
def pdf2metadata(fp):
    """Return the "Info" metadata of the PDF read from ``fp``.

    When the catalog has a ``Metadata`` entry, that entry is resolved and
    its data fetched, but the raw XMP data is not part of the result.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()
    if 'Metadata' in document.catalog:
        # The raw XMP metadata; fetched but not used any further.
        xmp_stream = resolve1(document.catalog['Metadata'])
        xmp_stream.get_data()
    return document.info
示例11: get_toc
def get_toc(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        list: One ``(level, title)`` tuple per entry produced by
        ``document.get_outlines()``, in iteration order.
    """
    # The list is built inside the ``with`` block so the outline is fully
    # consumed while the file is open; the handle is closed afterwards,
    # including when parsing raises.
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        return [
            (level, title)
            for (level, title, _dest, _a, _structelem)
            in document.get_outlines()
        ]
示例12: loadPDF
def loadPDF(library, file_name):
"""adds a paper to the library"""
# Lays out the pages of the PDF at file_name, prints a preview of the first
# layout objects, then fetches the paper via library.getPaper(file_name) and
# calls addAuthorIds / addCitationIds on it.
# NOTE(review): the handle opened here is never closed in this function.
fp = open(file_name, 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Supply the password for initialization.
# (If no password is set, give an empty string.)
password = ""
document.initialize(password)
# Check if the document allows text extraction. If not, abort.
# (The raise is commented out, so only a message is printed and processing
# continues.)
if not document.is_extractable:
print "CANT"
# raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# text_content is not used below. Nothing is appended to authors or
# citations either, so the empty lists are what reaches addAuthorIds and
# addCitationIds at the end.
text_content = []
authors = [] #list of authors
citations = [] #list of authors that have been cited
#pages_length = sum(1 for page in document.get_pages())
for ii, page in enumerate(PDFPage.create_pages(document)):
print '---------------------------------------------------------------------------------------------------'
print "page number {}".format(ii)
interpreter.process_page(page)
# receive the LTPage object for the page.
layout = device.get_result()
# Only layout objects with index 0..3 are inspected.
for jj, lt_obj in enumerate(layout._objs):
if jj>3:
break
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
# Non-ASCII characters are dropped before matching and printing.
cur_line = lt_obj.get_text().encode('ascii', 'ignore')
match = pattern_ignore.match(cur_line)
# Text not matched by pattern_ignore and shorter than 200 characters is
# printed with bcolors.OKGREEN; anything else is cut to 150 characters and
# printed with bcolors.FAIL.
if match is None and len(cur_line)<200:
print bcolors.OKGREEN +" "+cur_line+bcolors.ENDC
else:
print bcolors.FAIL+" "+cur_line[0:150]+bcolors.ENDC
else:
print "PICTURE"
# NOTE(review): indentation is missing in this listing, so it cannot be
# told from here whether this break leaves the layout-object loop or the
# page loop -- confirm against the original source.
break
# The file name doubles as the paper title used for the library lookup.
paper_title = file_name
paper = library.getPaper(paper_title)
paper.addAuthorIds(authors)
paper.addCitationIds(citations)
示例13: print_all_obj
def print_all_obj(filename):
    """Print the id and type of every distinct object in a PDF.

    Each object id listed by the cross-reference tables in ``doc.xrefs``
    is printed once, followed by the result of ``get_obj_type`` for the
    object with that id.

    Args:
        filename: Path of the PDF file.
    """
    # open() instead of the Python 2-only file() builtin.
    with open(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                # The same id can appear in several xref tables.
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                print('%s %s' % (objid, get_obj_type(doc.getobj(objid))))
示例14: proc
def proc(self, pdfFp):
    """Collect the metadata available in a PDF document.

    Stores ``doc.info`` in ``self.info``; when the catalog has a
    ``Metadata`` entry, stores its data converted by ``xmp_to_dict`` in
    ``self.metadata``; finally stores ``pdfFp.getvalue()`` in
    ``self.raw_doc``.
    """
    pdf_parser = PDFParser(pdfFp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    self.info = document.info
    if 'Metadata' in document.catalog:
        xmp_stream = resolve1(document.catalog['Metadata'])
        self.metadata = xmp_to_dict(xmp_stream.get_data())
    # pdfFp must offer getvalue(); the whole buffer is kept as-is.
    self.raw_doc = pdfFp.getvalue()
示例15: getDocumentInfoAndAnnotations
def getDocumentInfoAndAnnotations(pdfFile):
# Walks every page of the PDF at pdfFile and inspects its annotations:
# the Contents of the first 'Sticky Note' annotation is kept in docInfo, and
# each 'Comment on Text' annotation is appended to docAnnotations as
# "<page number>:<contents>" (pages counted from 1).
# NOTE(review): within the lines shown here nothing is returned and fp is
# never closed -- confirm against the full function.
logger.info("Parsing pdf file " + pdfFile);
# Open PDF file.
fp = open(pdfFile, 'rb');
docInfo = None;
docAnnotations = [];
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp);
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser);
# Supply the password for initialization.
# (If no password is set, give an empty string.)
document.initialize('');
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager();
# Create a PDF device object.
device = PDFDevice(rsrcmgr);
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device);
# Process each page contained in the document.
pageNum = 0;
for page in PDFPage.create_pages(document):
pageNum += 1;
interpreter.process_page(page);
if(page.annots):
try:
# page.annots is used directly when it is a list, otherwise it is resolved
# first.
if isinstance( page.annots, list ):
annots = page.annots;
else:
annots = page.annots.resolve();
for annot in annots:
# Individual annotations may themselves be references.
if isinstance( annot, PDFObjRef ):
annot = annot.resolve();
# Annotations without a 'Subj' key are ignored.
if(annot.has_key('Subj')):
# Only the first sticky note is kept (docInfo is still None at that point).
if(annot['Subj'] == 'Sticky Note' and docInfo == None):
logger.debug('DOC INFO ' + annot['Subj'] + ' Contents=' + annot['Contents']);
docInfo = annot['Contents'];
elif(annot['Subj'] == 'Comment on Text'):
logger.debug('COMMENT ON TEXT ' + annot['Subj'] + ' Contents=' + annot['Contents']);
contents = annot['Contents'];
docAnnotations.append(str(pageNum) + ':' + contents);
else:
logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] + ' Contents=' + annot['Contents']);
except Exception, e:
logger.error("error getting annotation");
logger.exception(e);
# move file to error
# NOTE(review): `file` is not a variable defined in this function (in
# Python 2 it names the builtin type); this presumably should be pdfFile --
# confirm before relying on the move.
os.rename(file, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(file));