当前位置: 首页>>代码示例>>Python>>正文


Python PDFDocument.get_outlines方法代码示例

本文整理汇总了Python中pdfminer.pdfdocument.PDFDocument.get_outlines方法的典型用法代码示例。如果您正苦于以下问题:Python PDFDocument.get_outlines方法的具体用法?Python PDFDocument.get_outlines怎么用?Python PDFDocument.get_outlines使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfdocument.PDFDocument的用法示例。


在下文中一共展示了PDFDocument.get_outlines方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse_paragraphs

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
    def parse_paragraphs(self, text):
        """Collect top-level headlines and pass them to ``count_paragraphs``.

        For a PDF paper (``self.is_pdf`` is true) the headlines are the titles
        of the level-1 entries of the outline stored in
        ``self.paper_filename``.  Otherwise ``text`` is treated as markdown
        and every line starting with ``'## '`` counts as a headline.

        ``self.count_paragraphs(text, lines, headlines)`` is only called when
        at least one headline was found.
        """
        lines = text.split('\n')

        if not self.is_pdf:
            # Markdown: second-level headings mark the paragraphs.
            headlines = [row for row in lines if row.startswith('## ')]
        else:
            headlines = []
            with open(self.paper_filename, 'rb') as handle:
                document = PDFDocument(PDFParser(handle))
                try:
                    for entry in document.get_outlines():
                        (depth, heading, _, _, _) = entry
                        if depth == 1:
                            headlines.append(heading)
                except PDFNoOutlines:
                    logging.info(
                        "No outline found -> skipping paragraph search..."
                    )

        if headlines:
            self.count_paragraphs(text, lines, headlines)
开发者ID:dahoo,项目名称:paper-gamification,代码行数:30,代码来源:tracker.py

示例2: main

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
def main(pdf_path='/home/chris/Documents/Literature/DFT Primer.pdf'):
    """Process every page of a PDF and print its outline entries.

    Args:
        pdf_path: Path of the PDF file to read.  The default is the location
            that used to be hard-coded, so a bare ``main()`` call behaves as
            before.

    Returns:
        Always ``0``.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))
        # NOTE(review): get_outlines() is not guarded here; other callers in
        # this code base catch PDFNoOutlines around it -- confirm whether
        # outline-less PDFs should be tolerated.
        for (level, title, dest, a, se) in document.get_outlines():
            # Single-argument print() behaves the same under Python 2 and 3;
            # the tuple keeps the output of the former print statement.
            print((level, title))
    return 0
开发者ID:cmthompson,项目名称:weiss,代码行数:27,代码来源:Layout.py

示例3: get_toc

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
def get_toc(pdf_path):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One ``(level, title)`` tuple per outline entry, in the order
        ``PDFDocument.get_outlines()`` produces them.
    """
    # The context manager closes the file even if parsing fails; previously
    # the handle was never closed.
    with open(pdf_path, "rb") as infile:
        document = PDFDocument(PDFParser(infile))
        # Build the list before leaving the ``with`` block so the file is
        # still open while the outline is being iterated.
        return [(level, title)
                for (level, title, _dest, _action, _structelem)
                in document.get_outlines()]
开发者ID:erexhepa,项目名称:IF_COLOC_ENGINE,代码行数:12,代码来源:pdf_metadata.py

示例4: parse

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
def parse(filename, maxlevel):
    """Print the outline of a PDF as HTML headings.

    Every outline entry whose level is at most ``maxlevel`` is printed as
    ``<hN>title</hN>`` where ``N`` is the entry's level.  Newlines are removed
    from the title and runs of whitespace are collapsed to single spaces.

    Args:
        filename: Path of the PDF file to read.
        maxlevel: Deepest outline level that is still printed.
    """
    # The context manager closes the file even if parsing fails; previously
    # the handle was never closed.
    with open(filename, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        for (level, title, _dest, _action, _se) in doc.get_outlines():
            if level > maxlevel:
                continue
            if not isinstance(title, str):
                # Python 2 ``unicode`` title: encode it so that formatting
                # into a byte string cannot fail on non-ASCII characters.
                # On Python 3 the title already is ``str``; encoding it
                # there made the following ``replace('\n', '')`` raise
                # TypeError, so it is left untouched.
                title = title.encode('utf8')
            title = ' '.join(title.replace('\n', '').split())
            print('<h{level}>{title}</h{level}>'
                  .format(level=level, title=title))
开发者ID:MartinThoma,项目名称:algorithms,代码行数:16,代码来源:utils.py

示例5: dumpoutline

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline of the PDF ``fname`` to ``outfp`` as XML-like text.

    For each outline entry an ``<outline level=... title=...>`` element is
    written, followed by a ``<dest>`` element when a destination is known and
    a ``<pageno>`` element when that destination could be mapped to a page.
    If the document has no outline (``PDFNoOutlines``), the function returns
    quietly; an ``<outlines>`` wrapper is only written when
    ``get_outlines()`` succeeds.

    Args:
        outfp: Writable object receiving the output via ``write``.
        fname: Path of the PDF file to read.
        objids, pagenos, dumpall, codec, extractdir: Not used by this
            function.
        password: Passed to ``doc.initialize``.
    """
    # ``open`` replaces the ``file`` builtin (which no longer exists on
    # Python 3), and the ``with``/``finally`` pair guarantees that parser
    # and file are released even when an unexpected error interrupts the dump.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            # Map page object id -> zero-based page index.
            pages = dict((page.pageid, pageno) for (pageno, page)
                         in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                # Named destinations (given as string or literal) are looked
                # up in the document first.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                # A dictionary destination keeps the actual target under 'D'.
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: fall back to a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # NOTE(review): encoding to bytes before %-formatting is
                    # Python 2 style; on Python 3 the title would be rendered
                    # as b'...' -- confirm the targeted interpreter.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n'
                                % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
开发者ID:coolioxlr,项目名称:ziply,代码行数:48,代码来源:dumppdf.py

示例6: get_headings

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
def get_headings(filename):
    """Return the outline headings and page count of the PDF matching ``filename``.

    The working directory is moved one level up and ``rd.open_location("/PDF",
    True)`` is called (presumably switching into the PDF folder -- verify
    against ``rd``).  ``filename`` without its last 14 characters is then
    compared with every directory entry without its last 4 characters; the
    first match is parsed.

    Args:
        filename: Name whose leading part (all but the last 14 characters)
            identifies the PDF file.

    Returns:
        ``(headings, pages)`` where ``headings`` is a list of
        ``(level, title)`` tuples, or ``None`` if the outline could not be
        read, and ``pages`` is the number of pages.  Returns ``None`` when no
        directory entry matches.
    """
    os.chdir('..')
    rd.open_location("/PDF", True)
    stem = filename[:-14]

    for candidate in os.listdir(os.getcwd()):
        if stem != candidate[:-4]:
            continue
        try:
            # The context manager closes the PDF; it used to stay open.
            with open(candidate, 'rb') as in_file:
                document = PDFDocument(PDFParser(in_file))
                pages = sum(1 for _ in PDFPage.get_pages(in_file))
                try:
                    headings_list = [
                        (level, title)
                        for (level, title, _dest, _a, _structelem)
                        in document.get_outlines()
                    ]
                except Exception:
                    # Best effort, as before: a PDF without a readable outline
                    # still reports its page count.  Unlike the former bare
                    # ``except:`` this lets KeyboardInterrupt and SystemExit
                    # through.
                    return None, pages
                return headings_list, pages
        finally:
            # Always hand control back to the program location, including
            # when parsing fails unexpectedly.
            rd.open_location("/program", True)
开发者ID:robknapen,项目名称:IAAT,代码行数:26,代码来源:mainextraction.py

示例7: valid_toc

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
 def valid_toc(self, toc):
     """Check that the outline of the PDF at ``self._doc`` matches ``toc``.

     Each reference entry is indexed as ``(level, page, title)``.  It matches
     an outline entry when ``level + 1`` equals the outline level, the first
     element of the outline destination refers to page ``page - 1`` (decided
     by ``self._is_reference_to_ith_page``) and the titles are equal.

     Returns ``True`` when both lists have the same length and all entries
     match.  A PDF without an outline is valid only for an empty ``toc``.
     """
     with open(str(self._doc), "rb") as pdffile:
         document = PDFDocument(PDFParser(pdffile))
         try:
             real_toc = list(document.get_outlines())
         except PDFNoOutlines:
             return len(toc) == 0
         print("TOC from PDF file:", real_toc)
         if len(real_toc) != len(toc):
             print("Incorrect TOC length")
             return False
         for expected, actual in zip(toc, real_toc):
             print("Checking", expected)
             # Evaluated left to right with short-circuiting: level first,
             # then destination, then title.
             entry_matches = (
                 expected[0] + 1 == actual[0]
                 and self._is_reference_to_ith_page(actual[2][0],
                                                    expected[1] - 1)
                 and expected[2] == actual[1]
             )
             if not entry_matches:
                 return False
     return True
开发者ID:atrosinenko,项目名称:lecture-notes-compiler,代码行数:26,代码来源:core.py

示例8: extract_contents

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
    def extract_contents(self):
        """Fill ``self.outlines`` with ``(title, page_number)`` pairs.

        The PDF is read from ``self.fd``.  ``self.total_pages`` and
        ``self.pages`` (a list of ``(page, page_number)`` pairs, numbering
        starting at 1) are refreshed as a side effect.  Outline entries
        without a destination, or whose destination page is not among
        ``self.pages``, are skipped.

        Returns:
            ``None`` in every case; when the document has no outline the
            method returns before touching ``self.outlines``.
        """
        parser = PDFParser(self.fd)
        doc = PDFDocument(parser)
        self.total_pages = self.get_pages_total()
        # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator,
        # so repeated searches through it silently miss pages once it has
        # been partly consumed.
        # NOTE(review): range(1, total_pages) yields total_pages - 1 numbers;
        # if get_pages_total() returns the page count, the last page is never
        # paired -- confirm and switch to range(1, total_pages + 1) if so.
        self.pages = list(zip(PDFPage.get_pages(self.fd),
                              range(1, self.total_pages)))

        try:
            outlines = doc.get_outlines()
        except PDFNoOutlines:
            # No built-in outlines
            return None

        # Page object id -> page number; ``setdefault`` keeps the first
        # occurrence, matching the former linear first-match search.
        page_numbers = {}
        for page, pagenum in self.pages:
            page_numbers.setdefault(page.pageid, pagenum)

        for (level, title, dest, a, se) in outlines:
            if dest is None:
                continue
            pn = page_numbers.get(dest[0].objid, 0)
            if pn > 0:
                self.outlines.append((title, pn))
开发者ID:zuban32,项目名称:pdf2qarch,代码行数:26,代码来源:contents_parser.py

示例9: open

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Open a PDF document.  The context manager closes the file once the outline
# has been printed.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    # Get the outlines of the document.
    outlines = document.get_outlines()
    # Each outline entry is unpacked as (level, title, dest, action, se);
    # the previous eight-name unpacking could not match it, and the loop
    # body printed names that were never bound.
    for (level, title, dest, a, se) in outlines:
        print(level, title)
开发者ID:mikedasuya,项目名称:python,代码行数:14,代码来源:createDB.py

示例10: pdf_cover

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
    def pdf_cover(self, pdf, images):
        '''Pick the cover image index for a volume from the PDF outline.

        Looks at no more than the first 15 outline entries for ones titled
        "cover" or "title page" (case-insensitive), maps each to its
        zero-based page number and keeps those whose image is not blank
        according to :meth:`is_blank_page`.

        :param pdf: path to the pdf file for this volume
        :param images: list of image file paths for this volume
        :returns: the lowest candidate page number, or None when the PDF has
            no outline or no usable cover entry was found
        '''
        with open(pdf, 'rb') as pdf_file:
            document = PDFDocument(PDFParser(pdf_file))
            try:
                outlines = document.get_outlines()
            except PDFNoOutlines:
                logger.debug('PDF %s does not include outline information', pdf)
                return None
            else:
                logger.debug('PDF %s includes outline information, using for cover identification',
                             pdf)

            # page object id -> zero-based page number
            page_numbers = {
                page.pageid: index
                for index, page in enumerate(PDFPage.create_pages(document))
            }

            candidates = []
            # NOTE: some LSDI PDFs trigger a maximum recursion error in
            # pdfminer; to avoid that, stop after a fixed number of outline
            # items.  Caveat: outline entries are not necessarily returned
            # in order.
            for seen, (level, title, dest, a, se) in enumerate(outlines, 1):
                if seen > 15:
                    break

                # dest is the target page object and can apparently be None;
                # such entries are skipped.  Only Cover / Title Page entries
                # are of interest (there may be several covers, e.g. the
                # back cover).
                if dest is None or title.lower() not in ('cover', 'title page'):
                    continue

                page_num = page_numbers[dest[0].objid]

                try:
                    img = images[page_num]
                except IndexError:
                    logger.error('Not enough images for requested page number %s',
                                 page_num)
                    continue

                # What is labeled as the cover is sometimes a blank page;
                # those are not accepted as candidates.
                if self.is_blank_page(img):
                    logger.debug('PDF outline places %s at page %s but it is blank', title, page_num)
                else:
                    logger.debug('PDF outline places %s at page %s', title, page_num)
                    candidates.append(page_num)

            if candidates:
                # For now the lowest page number wins: the first cover, or
                # the title page if the cover is blank.
                return min(candidates)
开发者ID:WSULib,项目名称:readux,代码行数:70,代码来源:page_import.py

示例11: createFromPdfminer

# 需要导入模块: from pdfminer.pdfdocument import PDFDocument [as 别名]
# 注意: get_outlines 是 PDFDocument 的实例方法,无法单独导入;请先创建 PDFDocument 实例,再调用 document.get_outlines()
    def createFromPdfminer(filename):
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdftypes import PDFObjRef

        fp = open(filename, 'rb')
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize()
        assert doc.is_extractable

        result = PDFInfos()
        result._metaInfo = dict((key, str.decode(value, 'utf-16') if value.startswith('\xfe\xff') else value)
                                for key, value in doc.info[0].items()
                                if isinstance(value, basestring))

        pageids = [page.pageid for page in PDFPage.create_pages(doc)]
        result._pageCount = len(pageids)

        def get(obj, attr = None):
            """Resolve PDFObjRefs, otherwise a no-op. May also perform
            dict lookup, i.e. get(obj, 'A') is roughly the same as
            get(obj)['A']."""
            while isinstance(obj, PDFObjRef):
                obj = obj.resolve()
            if attr is not None:
                return get(obj[attr])
            return obj

        def actionToPageIndex(action):
            assert get(action, 'S').name == 'GoTo'
            name = get(action, 'D')
            # resolve "named destination":
            dest = get(doc.get_dest(name))
            return destToPageIndex(dest)

        def destToPageIndex(dest):
            dest = get(dest)
            if isinstance(dest, dict):
                assert dest.keys() == ['D'], repr(dest)
                dest = get(dest, 'D')
            # destinations contain the page as first element,
            # the rest concerns the ROI / zoom state (various modes there):
            return pageids.index(dest[0].objid)

        try:
            result._outline = [(level, title, actionToPageIndex(a) if a else destToPageIndex(dest))
                               for level, title, dest, a, se in doc.get_outlines()]
        except PDFNoOutlines:
            result._outline = None

        result._pageInfos = []

        # get annotations (links):
        for page in PDFPage.create_pages(doc):
            pageLinks = []

            for anno in get(page.annots) or []:
                anno = get(anno)
                rect = numpy.array(get(anno, 'Rect'), float).reshape((2, 2))
                if 'Dest' in anno:
                    # 'Dest' is the older (more compatible) way to
                    # specify links
                    dest = get(anno, 'Dest')
                    pageLinks.append((rect, destToPageIndex(dest)))
                elif 'A' in anno:
                    # actions are much more general and include 'GoTo'
                    # (with viewport spec.) with variants for remote
                    # and embedded documents
                    action = get(anno, 'A')
                    subType = get(action, 'S').name
                    if subType == 'GoTo':
                        pageLinks.append((rect, actionToPageIndex(action)))
                    elif subType == 'URI':
                        #assert sorted(action.keys()) == ['S', 'Type', 'URI']
                        link = get(action, 'URI')
                        if link.startswith('file:'):
                            # resolve relative pathname w.r.t. PDF filename:
                            link = 'file:' + os.path.join(os.path.dirname(filename),
                                                          link[5:])
                        pageLinks.append((rect, link))

            pageBox = numpy.array([page.mediabox], float).reshape((2, 2))

            result._pageInfos.append(PDFPageInfos(links = pageLinks, pageBox = pageBox))

        # extract all named destinations:
        def extract_names(dests, result = None):
            if result is None:
                result = {}
            if 'Names' in dests:
                it = iter(get(dests, 'Names'))
                for name, ref in zip(it, it):
                    result[name] = destToPageIndex(ref)
            if 'Kids' in dests:
                for kid in get(dests, 'Kids'):
                    extract_names(get(kid), result)
            return result

#.........这里部分代码省略.........
开发者ID:hmeine,项目名称:pdfdecanter,代码行数:103,代码来源:pdf_infos.py


注:本文中的pdfminer.pdfdocument.PDFDocument.get_outlines方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。