当前位置: 首页>>代码示例>>Python>>正文


Python PDFPage.create_pages方法代码示例

本文整理汇总了Python中pdfminer.pdfpage.PDFPage.create_pages方法的典型用法代码示例。如果您正苦于以下问题:Python PDFPage.create_pages方法的具体用法?Python PDFPage.create_pages怎么用?Python PDFPage.create_pages使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfpage.PDFPage的用法示例。


在下文中一共展示了PDFPage.create_pages方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: read_fields

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The entries gathered by ``recursively_add_fields``, sorted with
        ``key=fieldsorter``, or ``None`` when the document catalog has no
        ``AcroForm`` entry.
    """
    outfields = []
    id_to_page = {}
    # The context manager closes the file on every exit path, including the
    # early ``return None`` and any exception raised while parsing.
    # NOTE(review): the field traversal stays inside the ``with`` because
    # pdfminer presumably reads objects from the stream lazily -- confirm.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map every page's object id to its 1-based position in the document.
        for pageno, page in enumerate(PDFPage.create_pages(doc), start=1):
            id_to_page[page.pageid] = pageno
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
开发者ID:jhpyle,项目名称:docassemble,代码行数:19,代码来源:pdftk.py

示例2: pages

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def pages(self):
        """Return the list of ``Page`` wrappers, building it on first use.

        The list is stored on ``self._pages`` and returned as-is on later
        calls.  When ``self.pages_to_parse`` is not ``None``, only pages whose
        1-based number is contained in it are wrapped.  Each ``Page`` receives
        as ``initial_doctop`` the summed ``height`` of the pages kept before it.
        """
        if hasattr(self, "_pages"):
            return self._pages

        doctop = 0
        pp = self.pages_to_parse
        self._pages = []
        for page_number, page in enumerate(PDFPage.create_pages(self.doc), start=1):
            # Identity test against None: ``!= None`` would go through the
            # container's own ``__ne__`` instead of checking for "no filter".
            if pp is not None and page_number not in pp:
                continue
            p = Page(self, page, page_number=page_number, initial_doctop=doctop)
            self._pages.append(p)
            doctop += p.height
        return self._pages
开发者ID:jsvine,项目名称:pdfplumber,代码行数:15,代码来源:pdf.py

示例3: analyze_pages

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily run layout analysis over every page of a PDF.

    Input: the file path to the PDF file, plus the ``char_margin`` value
        forwarded to ``LAParams``
    Output: yields ``device.get_result()`` for each page that was processed;
        a page whose processing raises ``OverflowError`` is logged and skipped
    """
    logger = logging.getLogger(__name__)
    with open(os.path.realpath(file_name), "rb") as pdf_stream:
        # Parser -> document; the document is opened with an empty password.
        pdf_document = PDFDocument(PDFParser(pdf_stream), password="")
        # Resource manager shared by the aggregator and the interpreter.
        resources = PDFResourceManager()
        aggregator = CustomPDFPageAggregator(
            resources,
            laparams=LAParams(
                char_margin=char_margin, word_margin=0.1, detect_vertical=True
            ),
        )
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for index, pdf_page in enumerate(PDFPage.create_pages(pdf_document)):
            try:
                page_interpreter.process_page(pdf_page)
            except OverflowError as error:
                # ``index`` is the zero-based enumerate counter.
                logger.exception(
                    "{}, skipping page {} of {}".format(error, index, file_name)
                )
            else:
                yield aggregator.get_result()
开发者ID:HazyResearch,项目名称:pdftotree,代码行数:36,代码来源:pdf_utils.py

示例4: extract_first_jpeg_in_pdf

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the bytes of the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    Only image data that begins with the bytes ``FF D8 FF E0`` is accepted; images inside a figure
    that do not start with that prefix are skipped and the scan continues.

    :param fstream: Readable binary stream of the PDF
    :return: bytes containing the whole contents of the JPEG image, or None if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        for el in device.result:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image: prefer the decoded stream, fall back to the raw one.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always - there's probably
                    # a bug in PDFMiner), oh well...  ``Exception`` rather than a bare
                    # ``except`` so KeyboardInterrupt/SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None
开发者ID:konstantint,项目名称:PassportEye,代码行数:41,代码来源:pdf.py

示例5: parse_case

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def parse_case(case_path):
    """Parse all the pdf files in the folder and dump the result as JSON.

    Every non-hidden ``*.pdf`` file in ``case_path`` is laid out page by page;
    the page size and ``parse_text(layout)`` are stored under
    ``result['docs'][<name before the first dot>]['pages'][<pageid>]``.
    The collected dictionary is written to ``case_path + 'parsed.json'``.

    Args:
        case_path: Folder path.  It is joined to file names by plain string
            concatenation and the case id is the second-to-last
            ``'/'``-separated component, so it is expected to end with ``'/'``.

    Returns:
        Always ``None``.  Any failure is reported by printing
        ``"Error <case_path>"`` instead of raising.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj

            # Close each PDF as soon as its pages have been processed.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # The context manager flushes and closes the JSON file deterministically.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing case and carry on, while
        # still letting KeyboardInterrupt/SystemExit propagate.
        print("Error " + case_path)

    return None
开发者ID:thomas0809,项目名称:GraphIE,代码行数:41,代码来源:parse_pdf.py

示例6: parse_pdf

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def parse_pdf(self, fp):
        """Process every page of a PDF and hand annotated pages to ``parse_annotations``.

        Args:
            fp: File object passed to ``PDFParser``.

        For each page whose ``annots`` attribute is truthy,
        ``self.parse_annotations(pgnum, page)`` is called with the zero-based
        page index.
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # A plain PDFDevice is enough: nothing in this method reads a layout
        # result back from the device, so no aggregator/LAParams pair is built.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)
开发者ID:rammie,项目名称:pdfjinja,代码行数:16,代码来源:pdfjinja.py

示例7: extract

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def extract(self, max_page_num=None):
        """Lay out the document's pages and store them in ``self._pages``.

        Args:
            max_page_num: Optional upper bound.  Iteration stops at the first
                page whose ``layout.pageid`` is greater than this value; that
                page has already been processed but is not stored.  ``None``
                means no limit.

        Each stored layout is keyed by its ``pageid``.
        """
        for page in PDFPage.create_pages(self._document):
            self._interpreter.process_page(page)
            layout = self._device.get_result()

            # Identity test against None, as recommended for singletons.
            if max_page_num is not None and layout.pageid > max_page_num:
                break

            self._pages[layout.pageid] = layout
开发者ID:johnlinp,项目名称:pdf-to-markdown,代码行数:11,代码来源:parser.py

示例8: p2t

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file; it is opened in append mode with
            UTF-8 encoding, and only when a page yields at least one
            ``LTTextBoxHorizontal``.

    A message is printed when the document cannot be parsed (nothing is
    written in that case), when it reports ``is_extractable`` as false
    (extraction is still attempted), and after the conversion finishes.
    """
    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the file allows text extraction (report only).
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the layout object for this page.
            layout = device.get_result()
            texts = [
                x.get_text() for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            if texts:
                # Write str through a UTF-8 text file: concatenating encoded
                # bytes with '\n' raises TypeError on Python 3.  One open per
                # page instead of one per text box.
                with open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(text + '\n' for text in texts)
        print(sourcefile + '  已转为 ' + outfile)

##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc 
开发者ID:startprogress,项目名称:China_stock_announcement,代码行数:37,代码来源:formatFun.py

示例9: get_title_from_io

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def get_title_from_io(pdf_io):
    """Guess the title of a PDF from the text blocks of its first page.

    Among the blocks collected by ``TextOnlyDevice`` the ones with the largest
    value at index 1 (font size) are kept, then the ones with the largest
    value at index 3 (y coordinate, i.e. closest to the top); the first of
    those provides the title text from index 4.

    Returns the title string, or ``None`` when the document is not
    extractable.
    """
    # pylint: disable=too-many-locals
    # if pdf is protected with a pwd, 2nd param of PDFDocument is the password
    document = PDFDocument(PDFParser(pdf_io))

    # pdf may not allow extraction
    if not document.is_extractable:
        return None

    resources = PDFResourceManager()
    text_device = TextOnlyDevice(resources)
    text_interpreter = TextOnlyInterpreter(resources, text_device)

    page_buffer = StringIO()
    converter = TextConverter(resources, page_buffer, laparams=LAParams())
    layout_interpreter = PDFPageInterpreter(resources, converter)

    # Only the first page is fed to both interpreters.
    for page in PDFPage.create_pages(document):
        text_interpreter.process_page(page)
        layout_interpreter.process_page(page)
        break

    converter.close()
    first_page_text = page_buffer.getvalue()
    page_buffer.close()
    text_device.recover_last_paragraph()

    verbose('all blocks')
    for blk in text_device.blocks:
        verbose(blk)

    # largest font size among all blocks
    max_tfs = max(blk[1] for blk in text_device.blocks)
    verbose('max_tfs: ', max_tfs)
    # blocks sharing that font size
    largest = [blk for blk in text_device.blocks if blk[1] == max_tfs]
    # highest y coordinate among them, i.e. the closest to the top
    max_y = max(blk[3] for blk in largest)
    verbose('max_y: ', max_y)
    candidates = [blk for blk in largest if blk[3] == max_y]

    verbose('found blocks')
    for blk in candidates:
        verbose(blk)

    title = ''.join(candidates[0][4]).strip()

    # Retrieve missing spaces if needed
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Remove duplicate spaces if any are present
    if "  " in title:
        title = " ".join(title.split())

    return title
开发者ID:metebalci,项目名称:pdftitle,代码行数:61,代码来源:pdftitle.py

示例10: extractHighlights

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 注意: create_pages 是 PDFPage 的类方法, 不能单独导入, 请通过 PDFPage.create_pages(...) 调用
def extractHighlights(filename,anno,verbose=True):
    '''Extract highlighted texts from a PDF

    Pages whose 1-based number appears in ``anno.hlpages`` are laid out, their
    objects ordered with ``sortDiag`` and ``fineTuneOrder``, and every object
    of exact type ``LTTextBox`` or ``LTTextBoxHorizontal`` is matched against
    that page's highlights with ``findStrFromBox``.  Each match is wrapped in
    an ``Anno`` carrying metadata from ``anno.meta``.

    Returns the list of ``Anno`` objects; an empty list when ``anno.hlpages``
    is empty.  ``verbose`` is accepted but not used here.
    '''
    hlpages = anno.hlpages
    if len(hlpages) == 0:
        return []

    # pdfminer instances for this file
    document, interpreter, device = init(filename)

    collected = []

    for page_index, page in enumerate(PDFPage.create_pages(document)):
        page_no = page_index + 1

        # Only pages that carry highlights are processed.
        if page_no not in hlpages:
            continue

        expected = len(anno.highlights[page_no])
        matched = 0

        interpreter.process_page(page)
        layout = device.get_result()

        # Sort boxes diagonally, then refine the ordering.
        boxes = fineTuneOrder(sortDiag(layout))

        for box in boxes:
            # Exact type match on purpose: other subclasses are skipped.
            if type(box) not in (LTTextBox, LTTextBoxHorizontal):
                continue

            found_text, found_count = findStrFromBox(anno.highlights[page_no], box)

            if found_count > 0:
                # Attach the text together with its metadata.
                collected.append(Anno(
                    found_text,
                    ctime=getCtime(anno.highlights[page_no]),
                    title=anno.meta['title'],
                    page=page_no,
                    citationkey=anno.meta['citationkey'],
                    tags=anno.meta['tags'],
                ))

            # Stop scanning this page once every highlight has been found.
            matched += found_count
            if expected == matched:
                break

    return collected
开发者ID:Xunius,项目名称:Menotexport,代码行数:58,代码来源:extracthl.py


注:本文中的pdfminer.pdfpage.PDFPage.create_pages方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。