當前位置: 首頁>>代碼示例>>Python>>正文


Python PDFPage.create_pages方法代碼示例

本文整理匯總了Python中pdfminer.pdfpage.PDFPage.create_pages方法的典型用法代碼示例。如果您正苦於以下問題:Python PDFPage.create_pages方法的具體用法?Python PDFPage.create_pages怎麼用?Python PDFPage.create_pages使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在pdfminer.pdfpage.PDFPage的用法示例。


在下文中一共展示了PDFPage.create_pages方法的10個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: read_fields

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to open (opened in binary mode).

    Returns:
        The list populated by ``recursively_add_fields``, sorted with
        ``fieldsorter`` as the sort key, or ``None`` when the document
        catalog has no ``'AcroForm'`` entry.
    """
    outfields = []
    # The file is closed on every exit path, including the early return.
    # It stays open until the fields have been collected because the helpers
    # below presumably still read through the parser -- TODO confirm.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page's ``pageid`` to its 1-based position in the document.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
開發者ID:jhpyle,項目名稱:docassemble,代碼行數:19,代碼來源:pdftk.py

示例2: pages

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def pages(self):
    """Return the ``Page`` objects of this document, building them once.

    The list is stored on ``self._pages``; later calls return that stored
    list. When ``self.pages_to_parse`` is not ``None``, only pages whose
    1-based number is contained in it are wrapped.

    Returns:
        The list of ``Page`` objects, in document order.
    """
    if hasattr(self, "_pages"):
        return self._pages

    # Running sum of the heights of the pages kept so far; handed to each
    # new Page as ``initial_doctop``.
    doctop = 0
    pp = self.pages_to_parse
    self._pages = []
    for page_number, page in enumerate(PDFPage.create_pages(self.doc), start=1):
        # ``None`` means "no filter"; compare by identity, not equality.
        if pp is not None and page_number not in pp:
            continue
        p = Page(self, page, page_number=page_number, initial_doctop=doctop)
        self._pages.append(p)
        doctop += p.height
    return self._pages
開發者ID:jsvine,項目名稱:pdfplumber,代碼行數:15,代碼來源:pdf.py

示例3: analyze_pages

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def analyze_pages(file_name, char_margin=1.0):
    """
    Lazily run layout analysis over a PDF file, one page at a time.

    Input: the file path to the PDF file; ``char_margin`` is forwarded to
    ``LAParams``.
    Output: yields the aggregator's result for each page that was processed.
    A page whose processing raises ``OverflowError`` is logged and skipped.
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)
    with open(resolved_path, "rb") as pdf_stream:
        # Parser bound to the open stream, then the document built on top of
        # it (initialised with an empty password).
        pdf_document = PDFDocument(PDFParser(pdf_stream), password="")
        # Resource manager handed to both the device and the interpreter.
        resource_manager = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        aggregator = CustomPDFPageAggregator(
            resource_manager, laparams=layout_params
        )
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
        page_iter = PDFPage.create_pages(pdf_document)
        for page_num, page in enumerate(page_iter):
            try:
                page_interpreter.process_page(page)
            except OverflowError as oe:
                message = "{}, skipping page {} of {}".format(
                    oe, page_num, file_name
                )
                logger.exception(message)
            else:
                # Only reached when the page was processed without overflow.
                yield aggregator.get_result()
開發者ID:HazyResearch,項目名稱:pdftotree,代碼行數:36,代碼來源:pdf_utils.py

示例4: extract_first_jpeg_in_pdf

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.

    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    Only images nested directly inside an ``LTFigure`` of a page layout are
    inspected, and only data starting with the bytes ``FF D8 FF E0`` is
    accepted.

    :param fstream: Readable binary stream of the PDF
    :return: the data of the first matching image (the value returned by the
        image stream's ``get_data()`` or ``get_rawdata()``), or None if no
        such image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image; try the decoded data first.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always -
                    # there's probably a bug in PDFMiner), so fall back to
                    # the raw stream data. ``Exception`` rather than a bare
                    # ``except`` so KeyboardInterrupt/SystemExit propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None
開發者ID:konstantint,項目名稱:PassportEye,代碼行數:41,代碼來源:pdf.py

示例5: parse_case

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def parse_case(case_path):
    """Parse all the pdf files in the folder and dump the result as JSON.

    For every non-hidden ``*.pdf`` file in ``case_path`` the page size and
    the output of ``parse_text`` are recorded per page. The collected data is
    written to ``parsed.json`` inside ``case_path``.

    Args:
        case_path: Path of the case folder, with or without a trailing
            slash. The last path component is used as the case ``id``.

    Returns:
        Always ``None``. On any ``Exception`` the message
        ``"Error " + case_path`` is printed and nothing else is reported.
    """
    try:
        # normpath drops a trailing slash, so basename is the folder name
        # whether or not the caller supplied one.
        case_id = os.path.basename(os.path.normpath(case_path))
        result = {'id': case_id, 'docs': {}}

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = result['docs'][doc_id] = {'pages': {}}

            # The PDF stays open only while its pages are being processed.
            with open(os.path.join(case_path, name), 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # ``with`` guarantees the JSON is flushed and the handle released.
        with open(os.path.join(case_path, 'parsed.json'), 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing case and carry on.
        print("Error " + case_path)

    return None
開發者ID:thomas0809,項目名稱:GraphIE,代碼行數:41,代碼來源:parse_pdf.py

示例6: parse_pdf

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def parse_pdf(self, fp):
    """Process every page of a PDF and hand annotated pages to
    ``self.parse_annotations``.

    Args:
        fp: File-like object passed to ``PDFParser``.

    Each page is run through a ``PDFPageInterpreter`` backed by a plain
    ``PDFDevice``. For pages with a truthy ``annots`` attribute,
    ``self.parse_annotations`` is called with the 0-based page index and the
    page.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The original also built an LAParams/PDFPageAggregator pair here, but it
    # was overwritten before use; only the PDFDevice interpreter ever ran.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
開發者ID:rammie,項目名稱:pdfjinja,代碼行數:16,代碼來源:pdfjinja.py

示例7: extract

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extract(self, max_page_num=None):
    """Process the document's pages and store each layout in ``self._pages``.

    Args:
        max_page_num: Optional upper bound. Iteration stops at the first
            layout whose ``pageid`` is greater than this value; that layout
            is processed but not stored. ``None`` means no limit.

    Layouts are stored in ``self._pages`` keyed by ``layout.pageid``.
    """
    for page in PDFPage.create_pages(self._document):
        self._interpreter.process_page(page)
        layout = self._device.get_result()

        # The page id is read from the layout, so the page must be processed
        # before the limit can be checked.
        if max_page_num is not None and layout.pageid > max_page_num:
            break

        self._pages[layout.pageid] = layout
開發者ID:johnlinp,項目名稱:pdf-to-markdown,代碼行數:11,代碼來源:parser.py

示例8: p2t

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file to append to (written as UTF-8). It is
            only opened for pages that contain at least one horizontal text
            box.

    Progress and problems are reported with ``print``; nothing is returned.
    If the PDF document cannot be constructed, a message is printed and the
    function stops.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the open file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正確下載')
            return

        # Check whether the file allows text extraction.
        # NOTE(review): extraction still proceeds after this message, as in
        # the original code -- confirm whether it should stop here instead.
        if not document.is_extractable:
            print(sourcefile + ' :不允許提取文本')
        # Create a PDF resource manager to store shared resources.
        rsrcmgr = PDFResourceManager()
        # Set the parameters for layout analysis.
        laparams = LAParams()
        # Create a PDF page aggregator device.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the LTPage object for this page.
            layout = device.get_result()
            texts = [
                box.get_text()
                for box in layout
                if isinstance(box, LTTextBoxHorizontal)
            ]
            if texts:
                # Write text, not bytes: the file handles the UTF-8 encoding,
                # and it is opened once per page rather than once per box.
                with io.open(outfile, 'a', encoding='utf-8') as f:
                    f.writelines(text + u'\n' for text in texts)
        print(sourcefile + '  已轉為 ' + outfile)

############################################## Convert doc to txt ##############################################
# Before calling this, make sure catdoc is installed on your Linux system
開發者ID:startprogress,項目名稱:China_stock_announcement,代碼行數:37,代碼來源:formatFun.py

示例9: get_title_from_io

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def get_title_from_io(pdf_io):
    """Guess the title of a PDF from the text blocks of its first page.

    Among the blocks collected by ``TextOnlyDevice`` for the first page, the
    ones with the largest font size (block index 1) are kept; of those, the
    ones with the highest y coordinate (block index 3) are kept, and the text
    (block index 4) of the first such block becomes the title.

    :param pdf_io: file-like object handed to ``PDFParser``
    :return: the title string, or ``None`` if the document does not allow
        extraction
    """
    # pylint: disable=too-many-locals
    # If the PDF were password protected, the password would be the second
    # argument to PDFDocument.
    document = PDFDocument(PDFParser(pdf_io))

    # Nothing to do when the PDF does not allow extraction.
    if not document.is_extractable:
        return None

    resources = PDFResourceManager()
    text_device = TextOnlyDevice(resources)
    text_interpreter = TextOnlyInterpreter(resources, text_device)

    page_buffer = StringIO()
    text_converter = TextConverter(resources, page_buffer, laparams=LAParams())
    layout_interpreter = PDFPageInterpreter(resources, text_converter)

    # Only the first page is fed to both interpreters.
    for first in PDFPage.create_pages(document):
        text_interpreter.process_page(first)
        layout_interpreter.process_page(first)
        break

    text_converter.close()
    first_page_text = page_buffer.getvalue()
    page_buffer.close()
    text_device.recover_last_paragraph()
    verbose('all blocks')
    for blk in text_device.blocks:
        verbose(blk)

    def font_size(blk):
        return blk[1]

    def y_coord(blk):
        return blk[3]

    # Largest font size on the page, then every block set in that size.
    largest_size = font_size(max(text_device.blocks, key=font_size))
    verbose('max_tfs: ', largest_size)
    largest_blocks = [
        blk for blk in text_device.blocks if blk[1] == largest_size
    ]
    # Among those, the highest y coordinate is the closest to the top.
    highest_y = y_coord(max(largest_blocks, key=y_coord))
    verbose('max_y: ', highest_y)
    candidates = [blk for blk in largest_blocks if blk[3] == highest_y]
    verbose('found blocks')
    for blk in candidates:
        verbose(blk)

    title = ''.join(candidates[0][4]).strip()

    # Retrieve missing spaces if needed.
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Collapse runs of whitespace if a double space is present.
    if "  " in title:
        title = " ".join(title.split())

    return title
開發者ID:metebalci,項目名稱:pdftitle,代碼行數:61,代碼來源:pdftitle.py

示例10: extractHighlights

# 需要導入模塊: from pdfminer.pdfpage import PDFPage [as 別名]
# 或者: from pdfminer.pdfpage.PDFPage import create_pages [as 別名]
def extractHighlights(filename,anno,verbose=True):
    '''Collect highlighted texts of a PDF as a list of Anno objects

    <filename>: passed to init() to obtain the pdfminer objects.
    <anno>: object providing .hlpages (page numbers holding highlights),
            .highlights (looked up by 1-based page number) and .meta
            ('title', 'citationkey' and 'tags' are read).
    <verbose>: accepted but not used in this function.

    Return: list of Anno objects, empty if <anno>.hlpages is empty.
    '''
    highlight_pages=anno.hlpages
    if len(highlight_pages)==0:
        return []

    # pdfminer instances for this file
    document, interpreter, device=init(filename)

    results=[]

    for index,page in enumerate(PDFPage.create_pages(document)):
        page_no=index+1

        # Pages without highlights are skipped without being processed
        if page_no not in highlight_pages:
            continue

        total=len(anno.highlights[page_no])
        found=0

        interpreter.process_page(page)
        layout=device.get_result()

        # Sort the boxes diagonally, then refine that ordering
        boxes=fineTuneOrder(sortDiag(layout))

        for box in boxes:
            # Exact type match only, subclasses are not accepted
            if type(box) not in (LTTextBox,LTTextBoxHorizontal):
                continue

            text,count=findStrFromBox(anno.highlights[page_no],box)

            if count>0:
                # Attach the page's meta data to the matched text
                results.append(Anno(text,
                    ctime=getCtime(anno.highlights[page_no]),
                    title=anno.meta['title'],
                    page=page_no,citationkey=anno.meta['citationkey'],
                    tags=anno.meta['tags']))

            # Stop scanning this page once every highlight is accounted for
            found+=count
            if found==total:
                break

    return results
開發者ID:Xunius,項目名稱:Menotexport,代碼行數:58,代碼來源:extracthl.py


注:本文中的pdfminer.pdfpage.PDFPage.create_pages方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。