本文整理汇总了 Python 中 pdfminer.pdfdocument.PDFDocument.get_outlines 方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.get_outlines 方法的具体用法？Python PDFDocument.get_outlines 怎么用？Python PDFDocument.get_outlines 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pdfminer.pdfdocument.PDFDocument 的用法示例。
在下文中一共展示了PDFDocument.get_outlines方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_paragraphs
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def parse_paragraphs(self, text):
    """Collect the document's headlines and hand them to ``count_paragraphs``.

    For a PDF (``self.is_pdf`` is true) the headlines are the titles of the
    level-1 outline entries read from ``self.paper_filename``. Otherwise
    they are the lines of ``text`` that start with the markdown marker
    ``'## '``. ``self.count_paragraphs(text, lines, headlines)`` is only
    called when at least one headline was found.

    :param text: full text of the paper; it is split on ``'\\n'``.
    """
    lines = text.split('\n')
    headlines = []
    if self.is_pdf:
        with open(self.paper_filename, 'rb') as pdf:
            document = PDFDocument(PDFParser(pdf))
            try:
                for (level, title, _, _, _) in document.get_outlines():
                    if level == 1:
                        headlines.append(title)
            except PDFNoOutlines:
                # A PDF without an outline is not an error: there is simply
                # nothing to search for.
                logging.info(
                    "No outline found -> skipping paragraph search..."
                )
    else:
        # Markdown input: second-level headings mark the paragraphs.
        headlines = [line for line in lines if line.startswith('## ')]
    if headlines:
        self.count_paragraphs(text, lines, headlines)
示例2: main
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def main():
    """Process every page of a hard-coded PDF and print its outline.

    Prints the resource manager, the result of ``process_page`` for each
    page, and one ``(level, title)`` tuple per outline entry.

    Every ``print`` uses the single-argument call form, which produces the
    same output on Python 2 and Python 3 (the original ``print x``
    statements are a syntax error on Python 3).

    :returns: ``0`` on completion.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # Open a PDF file.
    with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # No password is passed here.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            # Print the pair as one tuple, as the original statement did.
            print((level, title))
    return 0
示例3: get_toc
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def get_toc(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    The file is opened in a ``with`` block so the handle is closed even
    when parsing fails; the original never closed it.

    :param pdf_path: path of the PDF file to read.
    :returns: list of ``(level, title)`` tuples, in the order yielded by
        ``document.get_outlines()``.

    Exceptions raised by the parser or by ``get_outlines()`` are not caught
    here and propagate to the caller.
    """
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        # Build the list while the file is still open.
        return [(level, title)
                for (level, title, _dest, _a, _structelem)
                in document.get_outlines()]
示例4: parse
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def parse(filename, maxlevel):
    """Print the PDF outline as HTML heading tags.

    Each outline entry whose level is at most ``maxlevel`` is printed as
    ``<hN>title</hN>``. Newlines are removed from the title and runs of
    whitespace are collapsed to single spaces.

    :param filename: path of the PDF file to read.
    :param maxlevel: deepest outline level to print (inclusive).

    Exceptions raised by ``get_outlines()`` are not caught here.
    """
    # ``with`` closes the file even if parsing raises; the original leaked it.
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, dest, a, se) in doc.get_outlines():
            if level > maxlevel:
                continue
            # Normalise on text first. Encoding before ``replace('\n', '')``
            # fails on Python 3, where bytes.replace rejects str arguments.
            title = ' '.join(title.replace('\n', '').split())
            if not isinstance(title, str):
                # Python 2 ``unicode``: encode so that str.format does not
                # fall back to the ASCII codec for non-ASCII titles.
                title = title.encode('utf8')
            print('<h{level}>{title}</h{level}>'
                  .format(level=level, title=title))
示例5: dumpoutline
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of the PDF ``fname`` to ``outfp`` as XML-like text.

    Output is an ``<outlines>`` element with one ``<outline level=...
    title=...>`` child per entry. Each child may contain a ``<dest>`` dump
    and a ``<pageno>`` (index of the target page as enumerated by
    ``PDFPage.create_pages``). Nothing is written when the document has no
    outline.

    :param outfp: writable file-like object receiving the output.
    :param fname: path of the PDF file to read.
    :param password: password passed to ``doc.initialize``.
    :param objids, pagenos, dumpall, codec, extractdir: accepted but not
        used by this function.

    The parser and the file are released in ``finally``/``with`` so they
    are closed on any exception, not only on the normal path and on
    ``PDFNoOutlines`` as in the original.
    """
    # ``open`` instead of the Python-2-only ``file`` constructor.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                # Named destinations (string or literal) are looked up in
                # the document; a dict result carries the target under 'D'.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: fall back to a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without an outline produces no output.
                pass
        finally:
            parser.close()
    return
示例6: get_headings
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def get_headings(filename):
    """Return the outline headings and page count of the PDF matching ``filename``.

    Changes to the parent directory, calls ``rd.open_location("/PDF", True)``
    and scans the current directory for the first entry whose name, minus
    its last 4 characters, equals ``filename`` minus its last 14 characters.
    NOTE(review): the 14- and 4-character suffixes are presumably a fixed
    generated suffix and a ``.pdf`` extension; confirm against the caller.

    :param filename: name from which the PDF's base name is derived.
    :returns: ``(headings, pages)``, where ``headings`` is a list of
        ``(level, title)`` tuples, or ``None`` when the outline cannot be
        read, and ``pages`` is the number of pages. Returns ``None``
        implicitly when no matching file exists.

    Differences from the original: the PDF is closed via ``with``, the
    local no longer shadows the ``file`` builtin, and the bare ``except:``
    is narrowed to ``Exception`` so ``KeyboardInterrupt``/``SystemExit``
    are not swallowed.
    """
    os.chdir('..')
    rd.open_location("/PDF", True)
    base_name = filename[:-14]
    for compare_filename in os.listdir(os.getcwd()):
        if base_name != compare_filename[:-4]:
            continue
        with open(compare_filename, 'rb') as in_file:
            document = PDFDocument(PDFParser(in_file))
            pages = sum(1 for _ in PDFPage.get_pages(in_file))
            try:
                headings_list = [(level, title)
                                 for (level, title, dest, a, structelem)
                                 in document.get_outlines()]
            except Exception:
                # Best effort: any failure to read the outline is reported
                # as "no headings" rather than raised.
                headings_list = None
            # Switch back to the program location on both paths.
            rd.open_location("/program", True)
            return headings_list, pages
示例7: valid_toc
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def valid_toc(self, toc):
    """Check that the outline stored in the PDF ``self._doc`` matches ``toc``.

    Each expected entry is indexed as ``(level, page, title)``. It matches
    an outline entry when ``level + 1`` equals the outline level, the
    entry's destination refers to page ``page - 1`` according to
    ``self._is_reference_to_ith_page``, and the titles are equal.

    :param toc: sequence of expected entries.
    :returns: ``True`` when every entry matches, ``False`` otherwise. For a
        PDF without an outline, ``True`` only when ``toc`` is empty.
    """
    with open(str(self._doc), "rb") as pdffile:
        document = PDFDocument(PDFParser(pdffile))
        try:
            real_toc = list(document.get_outlines())
        except PDFNoOutlines:
            return len(toc) == 0
        print("TOC from PDF file:", real_toc)
        if len(real_toc) != len(toc):
            print("Incorrect TOC length")
            return False
        for expected, actual in zip(toc, real_toc):
            print("Checking", expected)
            # The checks run in the original order: level, destination, title.
            level_ok = expected[0] + 1 == actual[0]
            if not level_ok:
                return False
            page_ok = self._is_reference_to_ith_page(actual[2][0],
                                                     expected[1] - 1)
            if not page_ok:
                return False
            title_ok = expected[2] == actual[1]
            if not title_ok:
                return False
        return True
示例8: extract_contents
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def extract_contents(self):
    """Fill ``self.outlines`` with ``(title, page number)`` pairs from the PDF outline.

    Also sets ``self.total_pages`` (from ``self.get_pages_total()``) and
    ``self.pages``, a list of ``(page, 1-based page number)`` tuples.

    Fixes relative to the original:

    * ``self.pages`` is a real list. On Python 3 the original ``zip(...)``
      was a one-shot iterator, so only the first lookup saw all the pages.
    * Every page is numbered. The original paired pages with
      ``range(1, self.total_pages)``, which is one element short and
      dropped the last page (assuming ``get_pages_total()`` returns the
      page count).

    :returns: ``None``; also returns early with ``None`` when the document
        has no built-in outline.
    """
    parser = PDFParser(self.fd)
    doc = PDFDocument(parser)
    self.total_pages = self.get_pages_total()
    self.pages = [(page, pagenum) for pagenum, page
                  in enumerate(PDFPage.get_pages(self.fd), 1)]
    try:
        outlines = doc.get_outlines()
    except PDFNoOutlines:
        # No built-in outlines
        return None

    # built-in outlines exist: index the pages once instead of scanning
    # the page list for every outline entry.
    pagenum_by_id = {}
    for page, pagenum in self.pages:
        # Keep the first page for an id, like the original linear search.
        pagenum_by_id.setdefault(page.pageid, pagenum)

    for (level, title, dest, a, se) in outlines:
        if dest is None:
            continue
        pagenum = pagenum_by_id.get(dest[0].objid)
        if pagenum is not None:
            self.outlines.append((title, pagenum))
示例9: open
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
# Open a PDF document; the ``with`` block closes it when done.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Get the outlines of the document.
    outlines = document.get_outlines()
    # Each outline entry is a 5-tuple (level, title, dest, a, se). The
    # original unpacked eight unrelated names and then printed the undefined
    # ``level``/``title``, which fails on the first entry.
    for (level, title, dest, a, se) in outlines:
        print(level, title)
示例10: pdf_cover
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def pdf_cover(self, pdf, images):
    '''Use the outline embedded in a PDF to choose the cover image of a
    volume.

    At most the first 15 outline entries are inspected. An entry titled
    "cover" or "title page" (case-insensitive) is a candidate when it has
    a destination, an image exists at its page index, and that image is
    not blank according to ``self.is_blank_page``.

    :param pdf: path to the pdf file for this volume
    :param images: list of image file paths for this volume
    :returns: the lowest zero-based page number among the candidates, or
        None when the PDF has no outline or no candidate was found
    '''
    with open(pdf, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        try:
            outlines = document.get_outlines()
            logger.debug('PDF %s includes outline information, using for cover identification',
                         pdf)
        except PDFNoOutlines:
            logger.debug('PDF %s does not include outline information', pdf)
            return None

        # page object id -> zero-based page number
        page_index = {page.pageid: number for (number, page)
                      in enumerate(PDFPage.create_pages(document))}

        candidates = []
        for seen, (level, title, dest, a, se) in enumerate(outlines, 1):
            # NOTE: some LSDI PDFs trigger a maximum recursion error in
            # pdfminer; bail out after a fixed number of outline items.
            # Caveat: outline entries are not necessarily returned in order.
            if seen > 15:
                break

            # dest is the target page object; it can be None in some cases,
            # and such entries are skipped.
            if dest is None:
                continue

            # Either Cover or Title Page will do; there may be several
            # Covers (e.g. the back cover).
            if title.lower() not in ('cover', 'title page'):
                continue

            page_num = page_index[dest[0].objid]
            try:
                img = images[page_num]
            except IndexError:
                logger.error('Not enough images for requested page number %s',
                             page_num)
                continue

            # What is labelled as the cover is sometimes a blank page; blank
            # pages are logged but never offered as a cover.
            if self.is_blank_page(img):
                logger.debug('PDF outline places %s at page %s but it is blank', title, page_num)
            else:
                logger.debug('PDF outline places %s at page %s', title, page_num)
                candidates.append(page_num)

        if candidates:
            # The lowest page number should be the first cover, or the
            # title page if the cover is blank.
            return min(candidates)
示例11: createFromPdfminer
# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 或者: from pdfminer.pdfdocument.PDFDocument import get_outlines [as 别名]
def createFromPdfminer(filename):
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import PDFObjRef
fp = open(filename, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
doc.initialize()
assert doc.is_extractable
result = PDFInfos()
result._metaInfo = dict((key, str.decode(value, 'utf-16') if value.startswith('\xfe\xff') else value)
for key, value in doc.info[0].items()
if isinstance(value, basestring))
pageids = [page.pageid for page in PDFPage.create_pages(doc)]
result._pageCount = len(pageids)
def get(obj, attr = None):
"""Resolve PDFObjRefs, otherwise a no-op. May also perform
dict lookup, i.e. get(obj, 'A') is roughly the same as
get(obj)['A']."""
while isinstance(obj, PDFObjRef):
obj = obj.resolve()
if attr is not None:
return get(obj[attr])
return obj
def actionToPageIndex(action):
assert get(action, 'S').name == 'GoTo'
name = get(action, 'D')
# resolve "named destination":
dest = get(doc.get_dest(name))
return destToPageIndex(dest)
def destToPageIndex(dest):
dest = get(dest)
if isinstance(dest, dict):
assert dest.keys() == ['D'], repr(dest)
dest = get(dest, 'D')
# destinations contain the page as first element,
# the rest concerns the ROI / zoom state (various modes there):
return pageids.index(dest[0].objid)
try:
result._outline = [(level, title, actionToPageIndex(a) if a else destToPageIndex(dest))
for level, title, dest, a, se in doc.get_outlines()]
except PDFNoOutlines:
result._outline = None
result._pageInfos = []
# get annotations (links):
for page in PDFPage.create_pages(doc):
pageLinks = []
for anno in get(page.annots) or []:
anno = get(anno)
rect = numpy.array(get(anno, 'Rect'), float).reshape((2, 2))
if 'Dest' in anno:
# 'Dest' is the older (more compatible) way to
# specify links
dest = get(anno, 'Dest')
pageLinks.append((rect, destToPageIndex(dest)))
elif 'A' in anno:
# actions are much more general and include 'GoTo'
# (with viewport spec.) with variants for remote
# and embedded documents
action = get(anno, 'A')
subType = get(action, 'S').name
if subType == 'GoTo':
pageLinks.append((rect, actionToPageIndex(action)))
elif subType == 'URI':
#assert sorted(action.keys()) == ['S', 'Type', 'URI']
link = get(action, 'URI')
if link.startswith('file:'):
# resolve relative pathname w.r.t. PDF filename:
link = 'file:' + os.path.join(os.path.dirname(filename),
link[5:])
pageLinks.append((rect, link))
pageBox = numpy.array([page.mediabox], float).reshape((2, 2))
result._pageInfos.append(PDFPageInfos(links = pageLinks, pageBox = pageBox))
# extract all named destinations:
def extract_names(dests, result = None):
if result is None:
result = {}
if 'Names' in dests:
it = iter(get(dests, 'Names'))
for name, ref in zip(it, it):
result[name] = destToPageIndex(ref)
if 'Kids' in dests:
for kid in get(dests, 'Kids'):
extract_names(get(kid), result)
return result
#.........这里部分代码省略.........