本文整理汇总了Python中pdfminer.pdfparser.PDFDocument.get_outlines方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.get_outlines方法的具体用法?Python PDFDocument.get_outlines怎么用?Python PDFDocument.get_outlines使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.pdfparser.PDFDocument的用法示例。
在下文中一共展示了PDFDocument.get_outlines方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_toc
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def get_toc(self):
    """Return the normalized titles of the selected outline level.

    Opens ``self.pdf``, stores the normalized ``Title`` metadata (when the
    document has one) in ``self.title``, and collects every outline entry
    whose level equals the level chosen by ``self.get_level1``.

    Returns:
        A list of normalized outline titles, or ``None`` when the outline
        cannot be read or the level cannot be determined.
    """
    # Keep the file open until the outline has been fully consumed; the
    # ``with`` block guarantees the handle is released on every exit path.
    with open(self.pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        # title
        if doc.info:
            metadict = doc.info[0]
            if 'Title' in metadict:
                self.title = normalize_title(metadict['Title'])

        # level 1 of toc
        try:
            # Materialize once so the same entries can be inspected by
            # get_level1 and then filtered, instead of walking the outline
            # tree a second time.
            outlines = list(doc.get_outlines())
            select_level = self.get_level1(outlines)
        except Exception:
            # Best-effort: a document without a usable outline has no TOC.
            return None

        return [normalize_toc_item(title)
                for (level, title, dest, a, se) in outlines
                if level == select_level]
示例2: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of the PDF ``fname`` to ``outfp``.

    Each line is the ``repr`` of ``(level, title, dest, pageno)`` where
    ``pageno`` is the zero-based index of the destination page, or ``None``
    when the entry has neither a destination nor a ``/GoTo`` action.

    Args:
        outfp: Writable text stream receiving the dump.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the ``with`` and
    # ``finally`` release the file and parser even if the outline walk fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Named destination: look it up in the 'Dests' name tree.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
示例3: GetTOC
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def GetTOC(self, doc, *args):
    """Open ``self.filepath`` and return the result of ``get_outlines()``.

    The incoming ``doc`` argument and ``*args`` are not used: ``doc`` is
    rebound to a freshly created ``PDFDocument`` initialized with
    ``self.password``.
    """
    pdf_file = open(self.filepath, 'rb')
    pdf_parser = PDFParser(pdf_file)
    doc = PDFDocument()
    pdf_parser.set_document(doc)
    doc.set_parser(pdf_parser)
    doc.initialize(self.password)
    # NOTE(review): the file handle is never closed here. Presumably the
    # returned outline object still reads from it when iterated -- confirm
    # before adding a close().
    return doc.get_outlines()
示例4: PrintTOC
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def PrintTOC(self):
    """Print the ``(level, title)`` pair of every outline entry in ``self.filepath``.

    The document is initialized with ``self.password``.
    """
    # The outline is consumed entirely inside the ``with`` block, so the
    # file can be closed safely when the loop finishes (or fails).
    with open(self.filepath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(self.password)
        for (level, title, dest, a, se) in doc.get_outlines():
            # Single-argument form prints the tuple on Python 2 and 3 alike.
            print((level, title))
示例5: test1
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def test1(path='FL00000M26.pdf'):
    """Print the ``(level, title)`` pair of every outline entry of a PDF.

    Args:
        path: PDF file to read. Defaults to the file the original example
            hard-coded, so existing ``test1()`` calls behave as before.
    """
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")
        # Get the outlines of the document.
        for (level, title, dest, a, se) in doc.get_outlines():
            # Single-argument form prints the tuple on Python 2 and 3 alike.
            print((level, title))
示例6: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline of the PDF ``fname`` to ``outfp`` as XML.

    Emits an ``<outlines>`` element containing one ``<outline>`` per entry,
    each with an optional ``<dest>`` dump and an optional ``<pageno>``
    (zero-based index of the destination page). Nothing beyond what was
    already written is emitted when ``PDFNoOutlines`` is raised.

    Args:
        outfp: Writable stream receiving the XML.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the ``with`` and
    # ``finally`` release the file and parser even if the dump fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))

            def resolve_dest(dest):
                """Resolve a named destination (str or PSLiteral) to its target."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without an outline is not an error here.
                pass
        finally:
            parser.close()
    return
示例7: get_toc
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def get_toc(extra=False):
    """Build a table of contents for ``Tanakh-JPS1917.pdf`` from its outline.

    Each TOC entry is a list of up to three outline titles (the path down to
    the entry's level) followed by the 1-based page number of its target.
    Outline nodes at level 4 or deeper, and nodes whose title is all digits,
    are skipped.

    The function also reads ``sys.argv``: with no command-line argument the
    full TOC is returned (plus a trailing ``[number_of_pages]`` entry when
    ``extra`` is true); otherwise ``sys.argv[1]`` is parsed as ``FROM-TO``
    and only entries whose page lies in that inclusive range are kept.

    Args:
        extra: Append ``[number_of_pages]`` when no page range is given.

    Returns:
        The list of TOC entries.
    """
    # NOTE(review): doc.initialize() is not called here (it is commented out
    # in the original) -- confirm that this is intended.
    with open('Tanakh-JPS1917.pdf', 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Get the page numbers (1-based) for the page object ID's.
        page_numbers = {}
        for number, page in enumerate(doc.get_pages(), 1):
            page_numbers[page.pageid] = number
        number_of_pages = len(page_numbers)

        # Get the outlines of the document.
        toc = []
        location = [''] * 3
        for (level, title, dest, a, se) in doc.get_outlines():
            title = title.replace(chr(10), '').replace(chr(13), '').strip()
            # skip the individual chapter number nodes
            if level < 4 and not title.isdigit():
                location[level - 1] = title
                # Get the destination page number from the action.
                # Thanks to https://groups.google.com/d/topic/pdfminer-users/KwMJHZTCKbE/discussion
                pageid = a.resolve()['D'][0].objid
                entry = location[:level]
                entry.append(page_numbers[pageid])
                toc.append(entry)

    if len(sys.argv) == 1:
        if extra:
            toc.append([number_of_pages])
    else:
        # If a specific list of pages from the original PDF were specified,
        # create a custom TOC.
        from_page, to_page = [int(pnum) for pnum in sys.argv[1].split('-')]
        toc = [entry for entry in toc if from_page <= entry[-1] <= to_page]
    return toc
示例8: dumpoutline
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of the PDF ``fname`` to ``outfp``.

    Each line is the ``repr`` of ``(level, title, dest, pageno)`` where
    ``pageno`` is the zero-based index of the destination page, or ``None``
    when the entry has no destination.

    Args:
        outfp: Writable text stream receiving the dump.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Accepted for signature
            compatibility; not used by this function.
        password: Password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the ``with`` and
    # ``finally`` release the file and parser even if the outline walk fails.
    with open(fname, 'rb') as fp:
        # This example targets the parser API that takes the document as
        # its first constructor argument.
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # Map each page's object id to its zero-based position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Named destination: look it up in the 'Dests' name tree.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
示例9: get_toc
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def get_toc(extra=False):
    """Build a table of contents for ``Tanakh-JPS1917.pdf`` from its outline.

    Each TOC entry is a list of outline titles (the path down to the entry's
    level) followed by a page number. Outline nodes at level 3 or deeper,
    and nodes whose title is all digits, are skipped.

    The function also reads ``sys.argv``:

    * With no command-line argument, page numbers are 1-based positions in
      the whole PDF and the full TOC is returned (plus a trailing
      ``[number_of_pages]`` entry when ``extra`` is true).
    * Otherwise ``sys.argv[1]`` is parsed as ``FROM-TO`` and a custom TOC is
      returned whose page numbers are 1-based positions within that
      selected range.

    Args:
        extra: Append ``[number_of_pages]`` when no page range is given.

    Returns:
        The list of TOC entries.
    """
    # NOTE(review): doc.initialize() is not called here (it is commented out
    # in the original) -- confirm that this is intended.
    with open('Tanakh-JPS1917.pdf', 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Get the page numbers (1-based) for the page object ID's.
        page_numbers = {}
        for number, page in enumerate(doc.get_pages(), 1):
            page_numbers[page.pageid] = number
        number_of_pages = len(page_numbers)

        # Get the outlines of the document.
        toc = []
        location = [''] * 3
        for (level, title, dest, a, se) in doc.get_outlines():
            title = title.replace(chr(10), '').replace(chr(13), '').strip()
            # skip the individual chapter number nodes
            # TODO: Figure out 'THE TWELVE' which are books at sub-book level.
            if level < 3 and not title.isdigit():
                location[level - 1] = title
                # Get the destination page number from the action.
                # Thanks to https://groups.google.com/d/topic/pdfminer-users/KwMJHZTCKbE/discussion
                pageid = a.resolve()['D'][0].objid
                entry = location[:level]
                entry.append(page_numbers[pageid])
                toc.append(entry)

    if len(sys.argv) == 1:
        if extra:
            toc.append([number_of_pages])
        return toc

    # TODO: Still need this?
    # If a specific list of pages from the original PDF were specified,
    # create a custom TOC where the page number is the 1-based index of this
    # page in the set of pages; i.e., the same as the page id in the XML
    # produced when that list of pages is passed in to pdf2txt.py.
    from_page, to_page = [int(pnum) for pnum in sys.argv[1].split('-')]
    custom_toc = []
    for pn, pnum in enumerate(range(from_page, to_page + 1)):
        # walk backward to see what section we're in
        for entry in reversed(toc):
            if pnum >= entry[-1]:
                # Rewrite the entry's page in place to the position of this
                # page within the selection.
                entry[-1] = pn + 1
                # Append only when the section differs from the previous
                # one. Checking for an empty list (rather than pn == 0)
                # avoids an IndexError when the first selected pages come
                # before every TOC entry.
                if not custom_toc or entry[:-1] != custom_toc[-1][:-1]:
                    custom_toc.append(entry[:])
                break
    return custom_toc
示例10: open
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
# Script: print the number of outline entries, the first title, and every
# level-1 outline title of a sample PDF.
#
# NOTE(review): sys.setdefaultencoding is a Python 2 facility that is
# normally only reachable after reload(sys); the lines preceding this
# excerpt are not visible here -- confirm.
sys.setdefaultencoding('utf-8')
pdf = '../example/demo/1297-9716-42-107.pdf'
fp = open(pdf, 'rb')
# The original ``try:`` had no visible handler (the excerpt is cut off), so
# errors propagate here; the ``finally`` only guarantees the file is closed.
try:
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    # level 1 of toc
    toc = list()
    # Walk the outline once and reuse the entries for both passes below.
    outlines = list(doc.get_outlines())
    count = 0
    first = ''
    print(count)
    for (level, title, dest, a, se) in outlines:
        if count == 0:
            first = title
        count += 1
    print('%s %s' % (count, first))
    for (level, title, dest, a, se) in outlines:
        if level == 1:
            print('{0}\t{1}'.format(level, title))
finally:
    fp.close()
示例11: document
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def document(self):
    """Return the parsed ``Document`` for ``self._pdfDocument``, building it on first use.

    On the first call the PDF is opened, every page is run through a
    ``TextConverter``, the extracted text is merged into paragraphs, and the
    result is stored in ``self._document``. Later calls return the stored
    object without touching the file.

    Side effect: sets the class-level ``debug`` attribute of several
    pdfminer classes to 1.

    Raises:
        pdfminer.pdfparser.PDFTextExtractionNotAllowed: if the document
            reports that it is not extractable.
    """
    def mergeSameParagraphLines(lines):
        """Concatenate consecutive lines until one looks like a paragraph end."""
        def isEndOfParagraph(line):
            # A line closes a paragraph when it ends a sentence or is short
            # (fewer than 60 characters).
            return line[-1:] in ['.', '?', '!'] or len(line) < 60

        result = []
        currentLine = ''
        for line in lines:
            currentLine += line
            if isEndOfParagraph(line):
                result.append(currentLine)
                currentLine = ''
        # Flush a trailing paragraph that never hit an end-of-paragraph line.
        if currentLine != '':
            result.append(currentLine)
        return result

    if self._document:
        return self._document

    # The ``with`` block closes the PDF on every exit path, including the
    # PDFTextExtractionNotAllowed raise below.
    with open(self._pdfDocument, 'rb') as pdfFile:
        pdfParser = PDFParser(pdfFile)
        document = PDFDocument()
        pdfParser.set_document(document)
        document.set_parser(pdfParser)
        document.initialize()
        if not document.is_extractable:
            raise pdfminer.pdfparser.PDFTextExtractionNotAllowed

        resourceManger = PDFResourceManager()
        debug = 1
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFResourceManager.debug = debug
        PDFPageInterpreter.debug = debug
        PDFDevice.debug = debug

        pdfContent = StringIO()
        try:
            laparams = LAParams()
            laparams.all_texts = True
            laparams.detect_vertical = True
            device = TextConverter(resourceManger, pdfContent,
                                   codec='utf-8', laparams=laparams)
            interpreter = PDFPageInterpreter(resourceManger, device)
            for page in document.get_pages():
                interpreter.process_page(page)
            content = mergeSameParagraphLines(pdfContent.getvalue().split('\n'))

            # NOTE(review): toc is collected but never passed on; the
            # Document below receives None for its other arguments --
            # confirm whether toc was meant to be one of them.
            toc = []
            try:
                for (level, title, destination, a, se) in document.get_outlines():
                    toc.append((level, title))
            except Exception:
                # Outline extraction is best-effort; the text content is
                # still usable without it.
                pass
        finally:
            pdfContent.close()

        self._document = Document().initWithDocumentInfo(content, None, None)
    return self._document
示例12: __init__
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
#.........这里部分代码省略.........
""" We've found a comment. If it's on top of a rect, return the
rect as the bounding box. Else return just the textbox rect """
rect=self._rect(self._intersects(layout,obj))
commenttxt={"rect":rect, "comment":txt.replace("]]","").replace("[[","")}
result.append(commenttxt)
return result
def _parse_links(self, page):
    """Collect the link annotations of ``page``.

    Args:
        page: Page object whose ``annots`` attribute (if truthy) carries the
            ``objid`` of the annotation array.

    Returns:
        A list of dicts with keys ``"rect"``, ``"type"`` (``"link"`` or
        ``"bookmark"``) and ``"dest"`` (a URI, a page number from
        ``self._find_objid_pgnum``, or ``""``).
    """
    result = []
    if not page.annots:
        return result

    annotations = self.doc.getobj(page.annots.objid)
    for ref in annotations:
        annotobj = ref.resolve()
        try:
            if annotobj["Subtype"].name == 'Link' and "A" in annotobj:
                linktype = "link"
                print("Found link")
                action = annotobj["A"].resolve()
                dest = ""
                if 'D' in action:
                    linktype = "bookmark"
                    namesobj = self.doc.catalog["Names"].resolve()
                    destsobj = namesobj["Dests"].resolve()
                    # NOTE(review): this walks every name and keeps the page
                    # of the last one that has an objid; it never compares
                    # against action['D'] -- confirm the intended lookup.
                    for name in destsobj["Names"]:
                        if hasattr(name[0], "objid"):
                            pg = name[0].resolve()
                            dest = self._find_objid_pgnum(pg)
                if 'URI' in action:
                    dest = action['URI']
                rect = self._rect(annotobj['Rect'])
                result.append({"rect": rect, "type": linktype, "dest": dest})
        except Exception:
            # Skip a malformed annotation instead of abandoning the remaining
            # ones, matching how _parse_video treats errors.
            continue
    return result
def _parse_video(self, page):
    """Collect the ``RichMedia`` annotations of ``page``.

    Args:
        page: Page object whose ``annots`` attribute (if truthy) carries the
            ``objid`` of the annotation array.

    Returns:
        A list of dicts with keys ``"rect"``, ``"type"`` (always
        ``"media"``) and ``"filename"``. Annotations that cannot be read are
        skipped.
    """
    result = []
    if not page.annots:
        return result

    annotations = self.doc.getobj(page.annots.objid)
    for ref in annotations:
        annotobj = ref.resolve()
        try:
            if annotobj["Subtype"].name == 'RichMedia':
                linktype = "media"
                rect = self._rect(annotobj['Rect'])
                print("Found video")
                data = annotobj["RichMediaContent"].resolve()
                dataobj = data["Assets"].resolve()
                # The second element of the Names array is resolved to the
                # file specification that carries the file name.
                fstream = dataobj["Names"][1].resolve()
                filename = fstream["F"]
                result.append({"rect": rect, "type": linktype, "filename": filename})
        except Exception:
            # Best-effort: ignore annotations missing any expected key.
            pass
    return result
def _intersects(self, layout, obj):
    """Find whether ``obj`` is contained within another object on the page.

    Args:
        layout: Iterable of layout objects, each exposing a 4-item ``bbox``.
        obj: The layout object to test.

    Returns:
        The ``bbox`` of the first other object in ``layout`` that fully
        contains ``obj``; otherwise ``obj``'s own ``bbox``.
    """
    origbbox = obj.bbox
    for otherobj in layout:
        if obj != otherobj:
            otherbbox = otherobj.bbox
            # Containment: the low edges (items 0, 1) must not fall below
            # the other box's and the high edges (items 2, 3) must not
            # exceed them. The original compared item 3 with ">=", which
            # rejected every box lying strictly inside another.
            if (origbbox[0] >= otherbbox[0] and origbbox[1] >= otherbbox[1]
                    and origbbox[2] <= otherbbox[2] and origbbox[3] <= otherbbox[3]):
                return otherbbox
    return origbbox
"""
We search for 'bookmarks' set in Adobe Acrobat
"""
def get_sections(self):
    """Return the document's bookmarks (outline entries) with their page numbers.

    Returns:
        A list of ``{"name": title, "page": n}`` dicts, where ``n`` is the
        1-based position of the target page in ``self.doc.get_pages()``.
        Entries whose target page is not found are omitted. If reading the
        outline fails part-way, the entries gathered so far are returned.
    """
    toc = []
    try:
        outlines = self.doc.get_outlines()
        # Number the pages once instead of re-walking every page for each
        # outline entry.
        page_numbers = {}
        for number, page in enumerate(self.doc.get_pages(), 1):
            page_numbers.setdefault(page.pageid, number)

        for (level, title, dest, a, se) in outlines:
            if dest:
                objid = dest[0].objid
            else:
                # No direct destination: take it from the entry's action.
                objid = a.resolve()["D"][0].objid
            if objid in page_numbers:
                toc.append({"name": title, "page": page_numbers[objid]})
    except Exception:
        # Best-effort: a missing or malformed outline yields what was
        # collected up to that point.
        pass
    return toc
def test(self):
    """Run a smoke test: parse the pages and print the results.

    Prints the file name, the value returned by ``self.parse_pages()``, the
    page count, and the sections from ``self.get_sections()``.
    """
    # Single-argument print calls behave the same on Python 2 and 3.
    print("Starting test on %s" % self.filename)
    result = self.parse_pages()
    print(result)
    print("Found %d pages" % (self.pagecount))
    print(self.get_sections())
示例13: u
# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
import codecs, sys
from pdfminer.pdfparser import PDFParser, PDFDocument
# Script: load a keyword list from a text file, then open a PDF and report
# the type of the object returned by get_outlines().
txtfile = "schlag.txt"
with codecs.open(txtfile, encoding='utf-8') as f:
    schlaglist = [x.split('\n') for x in f]  # list

# Debug aid kept from the original (``u`` is not defined in this excerpt):
# for x in schlaglist:
#     print(u(x))

try:
    # PDFs are binary, so open in 'rb' as the other examples do.
    with open('/home/niklasmoran/EL/skript211.pdf', 'rb') as fp:
        # File objects and outline iterators have no .type() method; the
        # builtin type() reports what the original was trying to print.
        print(type(fp))
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        # Link the document back to the parser, as every other example in
        # this file does before calling initialize().
        doc.set_parser(parser)
        doc.initialize('')
        outlines = doc.get_outlines()
        print(type(outlines))
except Exception:
    print("that didn't work %s" % (sys.exc_info()[0],))