当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.get_outlines方法代码示例

本文整理汇总了 Python 中 pdfminer.pdfparser.PDFDocument.get_outlines 方法的典型用法代码示例。如果您正苦于以下问题：Python PDFDocument.get_outlines 方法的具体用法？Python PDFDocument.get_outlines 怎么用？Python PDFDocument.get_outlines 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pdfminer.pdfparser.PDFDocument 的用法示例。


在下文中一共展示了PDFDocument.get_outlines方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_toc

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
    def get_toc(self):
        """Return the normalized titles of the selected outline level.

        Opens ``self.pdf`` with pdfminer, stores the normalized document
        title in ``self.title`` when the info dictionary provides a
        ``'Title'`` entry, and collects every outline entry whose level
        equals the level returned by ``self.get_level1``.

        Returns:
            A list of normalized outline titles, or ``None`` when reading
            the outlines or selecting the level fails.
        """
        # The context manager closes the PDF on every exit path; the handle
        # used to stay open after the method returned.
        with open(self.pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # title
            if doc.info:
                metadict = doc.info[0]
                if 'Title' in metadict:
                    self.title = normalize_title(metadict['Title'])

            # level 1 of toc
            try:
                select_level = self.get_level1(doc.get_outlines())
            except Exception:
                # Best effort: a document whose outlines cannot be read has
                # no table of contents. KeyboardInterrupt/SystemExit are no
                # longer swallowed here.
                return None

            # The outlines are requested a second time, presumably because
            # get_level1 consumes the first result -- TODO confirm.
            return [normalize_toc_item(title)
                    for (level, title, dest, a, se) in doc.get_outlines()
                    if level == select_level]
开发者ID:larscwallin,项目名称:pdfdig,代码行数:27,代码来源:pdfinfo.py

示例2: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    Args:
        outfp: Object with a ``write`` method that receives the output.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Accepted but not used by this
            function.
        password: Passed to ``doc.initialize``.

    ``pageno`` is the zero-based position of the destination page in
    ``doc.get_pages()``, or ``None`` when the entry has neither a
    destination nor a ``/GoTo`` action with a ``D`` entry.
    """
    doc = PDFDocument()
    # open() replaces the Python-2-only file() builtin; the with/finally
    # pair guarantees parser and file are closed even if parsing raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object ids to their zero-based page index.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    # Named destination: look it up, then unwrap the dict form.
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
开发者ID:joshmgrant,项目名称:pdfminer,代码行数:29,代码来源:dumppdf.py

示例3: GetTOC

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
 def GetTOC(self, doc, *args):
     """Open ``self.filepath`` and return the result of ``get_outlines()``.

     The ``doc`` argument and ``*args`` are ignored: a fresh
     ``PDFDocument`` is always created and initialized with
     ``self.password``.
     """
     # NOTE(review): the file handle is not closed here, presumably because
     # the returned outlines are read lazily from it -- confirm.
     pdf_parser = PDFParser(open(self.filepath, 'rb'))
     document = PDFDocument()
     pdf_parser.set_document(document)
     document.set_parser(pdf_parser)
     document.initialize(self.password)
     return document.get_outlines()
开发者ID:i11uminator,项目名称:bookservice,代码行数:11,代码来源:MyPdfMiner.py

示例4: PrintTOC

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
 def PrintTOC(self):
     """Print ``(level, title)`` for every outline entry of ``self.filepath``.

     The document is initialized with ``self.password``.
     """
     # The with-block closes the PDF once all entries are printed (or when
     # parsing raises); the handle used to be leaked.
     with open(self.filepath, 'rb') as fp:
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         doc.initialize(self.password)
         for (level, title, dest, a, se) in doc.get_outlines():
             print (level, title)
开发者ID:i11uminator,项目名称:bookservice,代码行数:12,代码来源:MyPdfMiner.py

示例5: test1

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def test1(path='FL00000M26.pdf'):
    """Print ``(level, title)`` for every outline entry of a PDF.

    Args:
        path: PDF file to read. Defaults to the file name that used to be
            hard-coded, so calling ``test1()`` behaves as before.
    """
    # The with-block closes the file after the outlines have been printed;
    # it used to stay open.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")

        # Get the outlines of the document.
        for (level, title, dest, a, se) in doc.get_outlines():
            print (level, title)
开发者ID:wangzhengbo1204,项目名称:Python,代码行数:15,代码来源:test1.py

示例6: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline of a PDF to *outfp* as ``<outlines>`` XML.

    Args:
        outfp: Object with a ``write`` method that receives the XML.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec: Accepted but not used by this
            function.
        password: Passed to ``doc.initialize``.

    Each entry becomes an ``<outline>`` element with optional ``<dest>``
    and ``<pageno>`` children. ``pageno`` is the zero-based position of the
    destination page in ``doc.get_pages()``. If ``PDFNoOutlines`` is raised
    the function writes whatever was produced up to that point and returns.
    """
    doc = PDFDocument()

    def resolve_dest(dest):
        """Turn a named destination (str or PSLiteral) into its target."""
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        return dest

    # open() replaces the Python-2-only file() builtin; with/finally make
    # sure parser and file are closed even when an unexpected error occurs.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object ids to their zero-based page index.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
    return
开发者ID:brechin,项目名称:pdfminer2,代码行数:51,代码来源:dumppdf.py

示例7: get_toc

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def get_toc(extra=False):
	"""Build a table of contents for 'Tanakh-JPS1917.pdf' from its outlines.

	Every outline entry with ``level < 4`` whose title is not purely digits
	yields one list: the titles of its ancestors and itself (one per level),
	followed by the 1-based number of the page its action points to.

	Args:
		extra: When true and no command-line argument is given, a final
			``[number_of_pages]`` entry is appended.

	Returns:
		The list of entries. When ``sys.argv[1]`` holds a ``FROM-TO`` page
		range, only entries whose page lies inside that range (inclusive)
		are returned.
	"""
	# The with-block closes the PDF once the outlines are read; the handle
	# used to be leaked.
	with open('Tanakh-JPS1917.pdf', 'rb') as fp:
		parser = PDFParser(fp)
		doc = PDFDocument()
		parser.set_document(doc)
		doc.set_parser(parser)
		# NOTE(review): doc.initialize(password) was already commented out
		# in the original and is still not called -- confirm that is intended.

		# Get the 1-based page numbers for the page object ID's.
		page_numbers = dict((page.pageid, p)
							for (p, page) in enumerate(doc.get_pages(), 1))
		number_of_pages = len(page_numbers)

		# Get the outlines of the document.
		toc = []
		location = [''] * 3
		for (level, title, dest, a, se) in doc.get_outlines():
			title = title.replace(chr(10), '').replace(chr(13), '').strip()
			# skip the individual chapter number nodes
			if level < 4 and not title.isdigit():
				location[level - 1] = title
				# Get the destination page number from the action.
				# Thanks to https://groups.google.com/d/topic/pdfminer-users/KwMJHZTCKbE/discussion
				pageid = a.resolve()['D'][0].objid
				entry = location[:level]
				entry.append(page_numbers[pageid])
				toc.append(entry)

	if len(sys.argv) == 1:
		if extra:
			toc.append([number_of_pages])
		return toc

	# If a specific list of pages from the original PDF were specified,
	# create a custom TOC.
	from_page, to_page = [int(pnum) for pnum in sys.argv[1].split('-')]
	return [entry for entry in toc if from_page <= entry[-1] <= to_page]
开发者ID:j-hacker,项目名称:opensiddur,代码行数:47,代码来源:outline.py

示例8: dumpoutline

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
  """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

  Args:
    outfp: Object with a ``write`` method that receives the output.
    fname: Path of the PDF file to read.
    objids, pagenos, dumpall, codec: Accepted but not used by this function.
    password: Passed to ``doc.initialize``.

  ``pageno`` is the zero-based position of the destination page in
  ``doc.get_pages()``, or ``None`` when the entry has no destination.
  """
  doc = PDFDocument()
  # open() replaces the Python-2-only file() builtin; with/finally make sure
  # parser and file are closed even when parsing raises.
  with open(fname, 'rb') as fp:
    parser = PDFParser(doc, fp)
    try:
      doc.initialize(password)
      # Map page object ids to their zero-based page index.
      pages = dict((page.pageid, pageno)
                   for (pageno, page) in enumerate(doc.get_pages()))
      for (level, title, dest, a, se) in doc.get_outlines():
        pageno = None
        if dest:
          dest = resolve1(doc.lookup_name('Dests', dest))
          if isinstance(dest, dict):
            dest = dest['D']
          pageno = pages[dest[0].objid]
        outfp.write(repr((level, title, dest, pageno)) + '\n')
    finally:
      parser.close()
  return
开发者ID:frid,项目名称:PythonPool,代码行数:20,代码来源:dumppdf.py

示例9: get_toc

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
def get_toc(extra=False):
	"""Build a table of contents for 'Tanakh-JPS1917.pdf' from its outlines.

	Every outline entry with ``level < 3`` whose title is not purely digits
	yields one list: the titles of its ancestors and itself (one per level),
	followed by the 1-based number of the page its action points to.

	Args:
		extra: When true and no command-line argument is given, a final
			``[number_of_pages]`` entry is appended.

	Returns:
		Without a command-line argument, the full list of entries. When
		``sys.argv[1]`` holds a ``FROM-TO`` page range, a custom TOC is
		returned instead: one entry per section that the selected pages
		fall into, whose page number is the 1-based index of the first
		selected page belonging to that section.
	"""
	# The with-block closes the PDF once the outlines are read; the handle
	# used to be leaked.
	with open('Tanakh-JPS1917.pdf', 'rb') as fp:
		parser = PDFParser(fp)
		doc = PDFDocument()
		parser.set_document(doc)
		doc.set_parser(parser)
		# NOTE(review): doc.initialize(password) was already commented out
		# in the original and is still not called -- confirm that is intended.

		# Get the 1-based page numbers for the page object ID's.
		page_numbers = dict((page.pageid, p)
							for (p, page) in enumerate(doc.get_pages(), 1))
		number_of_pages = len(page_numbers)

		# Get the outlines of the document.
		toc = []
		location = [''] * 3
		for (level, title, dest, a, se) in doc.get_outlines():
			title = title.replace(chr(10), '').replace(chr(13), '').strip()
			# skip the individual chapter number nodes
			# TODO: Figure out 'THE TWELVE' which are books at sub-book level.
			if level < 3 and not title.isdigit():
				location[level - 1] = title
				# Get the destination page number from the action.
				# Thanks to https://groups.google.com/d/topic/pdfminer-users/KwMJHZTCKbE/discussion
				pageid = a.resolve()['D'][0].objid
				entry = location[:level]
				entry.append(page_numbers[pageid])
				toc.append(entry)

	if len(sys.argv) == 1:
		if extra:
			toc.append([number_of_pages])
		return toc

	# TODO: Still need this?
	# If a specific list of pages from the original PDF were specified,
	# create a custom TOC where the page number is the 1-based index of this
	# page in the set of pages; i.e., the same as the page id in the XML
	# produced when that list of pages is passed in to pdf2txt.py.
	from_page, to_page = [int(pnum) for pnum in sys.argv[1].split('-')]
	customToc = []
	for pn, pnum in enumerate(range(from_page, to_page + 1)):
		# walk backward to see what section we're in
		for entry in reversed(toc):
			if pnum >= entry[-1]:
				section = entry[:-1]
				# `not customToc` also covers a selection that starts before
				# the first TOC entry; indexing customToc[-1] on the empty
				# list raised IndexError in that case. A new list is built
				# instead of overwriting entry[-1], so later comparisons
				# still see the section's real start page.
				if not customToc or section != customToc[-1][:-1]:
					customToc.append(section + [pn + 1])
				break

	return customToc
开发者ID:aharonium,项目名称:opensiddur,代码行数:68,代码来源:outline.py

示例10: open

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
# Script excerpt: opens one PDF with pdfminer, counts its outline entries and
# prints the level-1 titles. Written with Python 2 print statements.
# NOTE(review): the imports (and anything else that precedes this line in the
# original script) are not part of this excerpt.
sys.setdefaultencoding('utf-8')

pdf='../example/demo/1297-9716-42-107.pdf'

# Wire parser and document together, then initialize with an empty password.
fp = open(pdf, 'rb')
parser = PDFParser(fp)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')

# level 1 of toc
# `toc` is created here but not filled anywhere in the visible excerpt.
toc = list()

# NOTE(review): the except/finally clause matching this `try` is missing from
# the excerpt, and `fp` is not closed in the visible part.
try:
    # `outlines` is assigned but the loops below call get_outlines() again.
    outlines = doc.get_outlines()
    
    # First pass: count the outline entries and remember the first title.
    count = 0
    first = ''
    print count
    for (level,title,dest,a,se) in doc.get_outlines():
        if count == 0:
            first=title
        count += 1
    
    print count,first
    
    # Second pass: print only the entries at level 1.
    for (level,title,dest,a,se) in doc.get_outlines():
        #print '{0}\t{1}'.format(level, title)
        if level==1:
            print '{0}\t{1}'.format(level, title)
开发者ID:larscwallin,项目名称:pdfdig,代码行数:33,代码来源:pdfminertest.py

示例11: document

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
	def document (self):
		"""Return the extracted document, parsing the PDF on first access.

		When ``self._document`` is falsy, the file at ``self._pdfDocument``
		is run through pdfminer's ``TextConverter``, the extracted text is
		merged into paragraphs and wrapped in a ``Document``, which is then
		stored in ``self._document``. Later calls return the stored object.

		Raises:
			pdfminer.pdfparser.PDFTextExtractionNotAllowed: if the document
				is not extractable.
		"""

		def mergeSameParagraphLines (lines):
			"""Join consecutive lines until one looks like a paragraph end."""
			def isEndOfParagraph (line):
				# A line ends a paragraph when it ends in '.', '?' or '!'
				# or is shorter than 60 characters.
				return line[-1:] in ['.', '?', '!'] or len(line) < 60

			result = []
			currentLine = ''

			for line in lines:
				currentLine += line
				if isEndOfParagraph(line):
					result.append(currentLine)
					currentLine = ''

			# Keep a trailing paragraph that never hit an end marker.
			if currentLine != '':
				result.append(currentLine)

			return result

		if not self._document:
			# The with-block closes the PDF once extraction is finished or
			# fails; the handle used to be left open.
			with open(self._pdfDocument, 'rb') as pdfFile:
				pdfParser = PDFParser(pdfFile)
				document = PDFDocument()

				pdfParser.set_document(document)
				document.set_parser(pdfParser)
				document.initialize()

				if not document.is_extractable:
					raise pdfminer.pdfparser.PDFTextExtractionNotAllowed

				resourceManger = PDFResourceManager()

				# Sets the class-level `debug` attribute of the pdfminer
				# classes, so it affects every user of those classes.
				debug = 1
				PDFDocument.debug = debug
				PDFParser.debug = debug
				PDFResourceManager.debug = debug
				PDFPageInterpreter.debug = debug
				PDFDevice.debug = debug

				laparams = LAParams()
				laparams.all_texts = True
				laparams.detect_vertical = True

				pdfContent = StringIO()
				try:
					device = TextConverter(resourceManger, pdfContent, codec='utf-8', laparams=laparams)
					interpreter = PDFPageInterpreter(resourceManger, device)
					for page in document.get_pages():
						interpreter.process_page(page)
					content = mergeSameParagraphLines(pdfContent.getvalue().split('\n'))
				finally:
					pdfContent.close()

				# NOTE(review): `toc` is collected but not handed to
				# Document below -- confirm whether it should be.
				toc = []
				try:
					for (level, title, destination, a, se) in document.get_outlines():
						toc.append((level, title))
				except Exception:
					# Best effort: a PDF without readable outlines is fine.
					pass

			self._document = Document().initWithDocumentInfo(content, None, None)

		return self._document
开发者ID:gcsolaroli,项目名称:metadata-processor,代码行数:75,代码来源:pdf.py

示例12: __init__

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]

#.........这里部分代码省略.........
					""" We've found a comment. If it's on top of a rect, return the 
					rect as the bounding box. Else return just the textbox rect """
					rect=self._rect(self._intersects(layout,obj))
					commenttxt={"rect":rect, "comment":txt.replace("]]","").replace("[[","")}
					result.append(commenttxt)
		return result
		
	def _parse_links(self, page):
		"""Collect the link annotations found on *page*.

		Returns a list of dicts with the keys ``"rect"`` (the annotation's
		``Rect`` passed through ``self._rect``), ``"type"`` (``"link"``, or
		``"bookmark"`` when the action has a ``D`` entry) and ``"dest"``
		(the action's ``URI``, a page number from
		``self._find_objid_pgnum``, or ``""``).

		An annotation that cannot be interpreted is skipped. Previously the
		first such annotation ended the scan and discarded all links after
		it, unlike ``_parse_video`` which carries on.
		"""
		result=[]
		if not page.annots:
			return result
		for ref in self.doc.getobj(page.annots.objid):
			annotobj=ref.resolve()
			try:
				if annotobj["Subtype"].name!='Link' or "A" not in annotobj:
					continue
				linktype="link"
				print("Found link")
				action=annotobj["A"].resolve()
				dest=""
				if 'D' in action:
					linktype="bookmark"
					# NOTE(review): this walks every entry of the catalog's
					# Names/Dests array and keeps the last one that has an
					# objid; the action's own 'D' value is never consulted.
					namesobj=self.doc.catalog["Names"].resolve()
					destsobj=namesobj["Dests"].resolve()
					for name in destsobj["Names"]:
						if hasattr(name[0], "objid"):
							dest=self._find_objid_pgnum(name[0].resolve())
				# A URI, when present, takes precedence over the bookmark.
				if 'URI' in action:
					dest=action['URI']
				rect=self._rect(annotobj['Rect'])
				result.append({"rect":rect, "type":linktype, "dest":dest})
			except Exception:
				# Best effort: ignore this annotation, keep the others.
				continue
		return result
			
	def _parse_video(self, page):
		"""Collect the ``RichMedia`` annotations found on *page*.

		Returns a list of dicts with the keys ``"rect"`` (the annotation's
		``Rect`` passed through ``self._rect``), ``"type"`` (always
		``"media"``) and ``"filename"`` (the ``F`` entry of the second item
		in the asset ``Names`` array). Annotations that cannot be
		interpreted are skipped.
		"""
		result=[]
		if not page.annots:
			return result
		for ref in self.doc.getobj(page.annots.objid):
			annotobj=ref.resolve()
			try:
				if annotobj["Subtype"].name!='RichMedia':
					continue
				rect=self._rect(annotobj['Rect'])
				print("Found video")
				content=annotobj["RichMediaContent"].resolve()
				assets=content["Assets"].resolve()
				fstream=assets["Names"][1].resolve()
				result.append({"rect":rect, "type":"media", "filename":fstream["F"]})
			except Exception:
				# Best effort: skip this annotation. Unlike the former bare
				# `except:`, KeyboardInterrupt/SystemExit still propagate.
				continue
		return result
			
	def _intersects(self, layout, obj):
		"""Return the bbox of the first layout item that "holds" *obj*.

		Items are tried in layout order and *obj* itself is skipped. An
		item matches when obj's bbox[0] and bbox[1] are not smaller than
		the item's, obj's bbox[2] is not larger, and obj's bbox[3] is not
		smaller than the item's bbox[3]. Without a match obj's own bbox is
		returned.
		"""
		# NOTE(review): the bbox[3] test uses >= while bbox[2] uses <=, so
		# this is not a plain containment check -- confirm it is intended.
		own = obj.bbox
		for candidate in layout:
			if obj != candidate:
				box = candidate.bbox
				starts_inside = own[0] >= box[0] and own[1] >= box[1]
				if starts_inside and own[2] <= box[2] and own[3] >= box[3]:
					return box
		return own
	
	"""
	We search for 'bookmarks' set in Adobe Acrobat
	"""
	def get_sections(self):
		"""Return the document's outline entries with their page numbers.

		Returns a list of ``{"name": title, "page": n}`` dicts, one per
		outline entry whose target page is among ``self.doc.get_pages()``;
		``n`` is the 1-based position of that page. The target is the first
		element of the entry's destination or, when there is none, of the
		``D`` entry of its resolved action.

		Entries whose target cannot be determined are skipped; previously
		one such entry discarded every entry after it. If the outlines
		cannot be read at all, whatever was collected so far is returned.
		"""
		toc=[]
		try:
			outlines = self.doc.get_outlines()
			# Build the page-id -> page-number table once instead of
			# rescanning all pages for every outline entry.
			page_numbers = dict((page.pageid, number)
								for (number, page) in enumerate(self.doc.get_pages(), 1))
			for (level,title,dest,a,se) in outlines:
				try:
					target = dest[0] if dest else a.resolve()["D"][0]
					objid = target.objid
				except (AttributeError, KeyError, IndexError, TypeError):
					# No usable destination on this entry.
					continue
				if objid in page_numbers:
					toc.append({"name": title, "page": page_numbers[objid]})
		except Exception:
			# Best effort: a document without readable outlines has no
			# sections. KeyboardInterrupt/SystemExit are not swallowed.
			pass
		return toc
			
	def test(self):
		"""Print a short report: file name, parsed pages, page count, sections.

		Calls ``self.parse_pages()`` and ``self.get_sections()`` and prints
		their results together with ``self.filename`` and ``self.pagecount``.
		"""
		# Single-argument print(...) produces the same output under Python 2
		# and Python 3; the former print statements were Python-2-only.
		print("Starting test on %s" % self.filename)
		result=self.parse_pages()
		print(result)
		print("Found %d pages" % (self.pagecount))
		print(self.get_sections())
开发者ID:10layer,项目名称:PDF-Mine,代码行数:104,代码来源:pdfmine.py

示例13: u

# 需要导入模块: from pdfminer.pdfparser import PDFDocument [as 别名]
# 或者: from pdfminer.pdfparser.PDFDocument import get_outlines [as 别名]
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-

import codecs, sys
from pdfminer.pdfparser import PDFParser, PDFDocument

txtfile = "schlag.txt"

# Read the keyword file; every line becomes the list produced by splitting it
# on '\n'. The with-block closes the file, which used to stay open.
with codecs.open(txtfile, encoding='utf-8') as f:
    schlaglist = [x.split('\n') for x in f] # list

try:
    # PDFs are binary, so open in 'rb' (the original used text mode).
    with open('/home/niklasmoran/EL/skript211.pdf', 'rb') as fp:
        # File objects have no .type() method, so the original call always
        # raised AttributeError and fell into the handler; type() is the
        # builtin that reports the object's type.
        print(type(fp))
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        # Link the document back to its parser, as the other pdfminer
        # snippets do before calling initialize().
        doc.set_parser(parser)
        doc.initialize('')

        outlines = doc.get_outlines()
        print(type(outlines))
except Exception:
    # Top-level boundary of the script: report the exception type and go on.
    print("that didn't work %s" % (sys.exc_info()[0],))
开发者ID:mellowizz,项目名称:metastudy,代码行数:30,代码来源:pdf_analyzer.py


注:本文中的pdfminer.pdfparser.PDFDocument.get_outlines方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。