当前位置: 首页>>代码示例>>Python>>正文


Python PDFPage.get_pages方法代码示例

本文整理汇总了Python中pdfminer.pdfpage.PDFPage.get_pages方法的典型用法代码示例。如果您正苦于以下问题:Python PDFPage.get_pages方法的具体用法?Python PDFPage.get_pages怎么用?Python PDFPage.get_pages使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.pdfpage.PDFPage的用法示例。


在下文中一共展示了PDFPage.get_pages方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract_text_from_pdf

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    The text is produced lazily, one string per page. The per-page
    converter and buffer are released before the text is yielded, so
    nothing is leaked when the caller stops iterating early or when a
    page fails to parse.

    :param pdf_path: path to PDF file to be extracted
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    # One resource manager is shared by every page instead of being
    # rebuilt on each iteration.
    resource_manager = PDFResourceManager()
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle,
                                      codec='utf-8', laparams=LAParams())
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # close open handles even if the page could not be processed
                converter.close()
                fake_file_handle.close()

            # `text` is an independent string, so it is safe to hand it out
            # after the buffer has been closed.
            yield text
开发者ID:OmkarPathak,项目名称:ResumeParser,代码行数:26,代码来源:utils.py

示例2: pdf_do_pdf

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def pdf_do_pdf(astream, afile):
    """Extract the text of a PDF stream and pass it to ``text_do_data``.

    :param astream: file-like object handed to ``PDFPage.get_pages``
        (presumably opened in binary mode -- confirm against the caller)
    :param afile: identifier of the file; forwarded to ``log_error`` and
        ``text_do_data``
    :return: None. When the PDF forbids text extraction the error is logged
        through ``log_error`` and ``text_do_data`` is not called.
    """
    outstream = io.BytesIO()
    try:
        rsrcmgr = PDFResourceManager(caching=True)
        device = TextConverter(rsrcmgr, outstream, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(astream, set(),
                                          maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        finally:
            device.close()
        # getvalue() returns a copy, so the buffer can be closed before the
        # text is processed further.
        text = outstream.getvalue()
    finally:
        # Runs on the early return above as well, so the buffer never leaks.
        outstream.close()
    text_do_data(text, afile)
开发者ID:veorq,项目名称:blueflower,代码行数:20,代码来源:pdf.py

示例3: do_import

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def do_import(self, results, filepath):
        """Extract the text of a PDF and store it on the investigation.

        Every page is converted to text and followed by a newline; the
        concatenated result is saved via
        ``results.investigation.update(import_text=...)``.

        :param results: object exposing ``investigation.update``
        :param filepath: path of the PDF file to read
        """
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        buff = StringIO()
        try:
            # The converter and interpreter are built once and reused for
            # all pages; they always write into the same buffer.
            device = TextConverter(
                rsrcmgr, buff, codec='utf-8', laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(filepath, 'rb') as fp:
                    for page in PDFPage.get_pages(
                            fp, set(), check_extractable=True):
                        interpreter.process_page(page)
                        buff.write("\n")
            finally:
                device.close()

            results.investigation.update(import_text=buff.getvalue())
        finally:
            buff.close()
开发者ID:yeti-platform,项目名称:yeti,代码行数:26,代码来源:pdf.py

示例4: convertPdfToText

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convertPdfToText(path):
    """Convert all pages of a PDF to text and write the result out.

    The extracted text is passed to ``writeToText`` together with
    ``absolute_path_shortner(path)``.

    :param path: path of the PDF file to convert
    :return: None
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python-2-only file() builtin and the context
        # manager closes the file even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    writeToText(text, absolute_path_shortner(path))
开发者ID:avidLearnerInProgress,项目名称:python-automation-scripts,代码行数:21,代码来源:pdfToText.py

示例5: pdf_to_text

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def pdf_to_text(fname, pages=None):
    """Return the non-empty lines of text extracted from a PDF.

    :param fname: path of the PDF file to read
    :param pages: optional iterable of page numbers forwarded to
        ``PDFPage.get_pages``; ``None`` or an empty iterable results in an
        empty set being passed (presumably meaning "all pages" -- confirm
        against the pdfminer documentation)
    :return: list of lines, each stripped of surrounding whitespace, with
        blank lines removed
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the file even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        output.close()

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line]
开发者ID:chen0040,项目名称:keras-english-resume-parser-and-analyzer,代码行数:26,代码来源:pdf_utils.py

示例6: convert

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convert(fname, pages=None):
    """Extract the text of a PDF file.

    :param fname: path of the PDF file to read
    :param pages: optional iterable of page numbers forwarded to
        ``PDFPage.get_pages``; ``None`` or an empty iterable results in an
        empty set being passed (presumably meaning "all pages" -- confirm
        against the pdfminer documentation)
    :return: the extracted text
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # open() replaces the Python-2-only file() builtin; the context
        # manager closes the file even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote `output.close` without parentheses, which never
        # actually closed the buffer.
        output.close()
    return text
#Function to extract names from the string using spacy 
开发者ID:ashaywalke,项目名称:resume-parser,代码行数:22,代码来源:resumeparser.py

示例7: convert_pdf_to_txt

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convert_pdf_to_txt(path):
    """Extract the text of every page of the PDF at ``path``.

    The pages are rendered through a ``TextConverter`` that writes UTF-8
    into an in-memory ``BytesIO`` buffer; the buffer's content is returned.

    :param path: path of the PDF file to read
    :return: the value of the byte buffer after all pages were processed
    """
    resources = PDFResourceManager()
    out_buffer = BytesIO()
    converter = TextConverter(resources, out_buffer,
                              codec='utf-8', laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, converter)

    with open(path, 'rb') as pdf_file:
        page_iter = PDFPage.get_pages(pdf_file, set(),
                                      maxpages=0, password="",
                                      caching=True, check_extractable=True)
        for pdf_page in page_iter:
            page_interpreter.process_page(pdf_page)

    # Read the result first, then release the converter and the buffer.
    extracted = out_buffer.getvalue()
    converter.close()
    out_buffer.close()
    return extracted
开发者ID:soodoku,项目名称:autosum,代码行数:25,代码来源:autosumpdf.py

示例8: convert_pdf_to_txt

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convert_pdf_to_txt(r, max_pages=3):
    """Extract text from the first pages of a PDF held by a response object.

    :param r: response-like object; this function reads ``status_code`` and
        ``encoding`` (defaulting the latter to "utf-8" when unset) and calls
        ``content_big()`` to obtain the PDF data
    :param max_pages: forwarded to ``PDFPage.get_pages`` as ``maxpages``
    :return: the content of the output byte buffer, or ``None`` when the
        response status is not 200
    """
    # Check the status before allocating any pdfminer objects so that
    # nothing is left open on the early return.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"
    fp = StringIO(r.content_big())

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, set(), maxpages=max_pages, password="",
                                  caching=True, check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the buffers even when a page fails to parse.
        device.close()
        retstr.close()
        fp.close()
    return text
开发者ID:ourresearch,项目名称:oadoi,代码行数:35,代码来源:oa_pdf.py

示例9: convert_pdf

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If ``format`` is neither 'text' nor 'html'.

    """
    # Reject unknown formats up front; previously this surfaced as an
    # UnboundLocalError on `converter` after the file had been opened.
    if format not in ('text', 'html'):
        raise ValueError(
            "Unsupported format {!r}: expected 'text' or 'html'".format(format))

    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()
    converter_cls = TextConverter if format == 'text' else HTMLConverter
    converter = converter_cls(manager, output, codec=codec, laparams=laparams)

    try:
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(input_file, 'rb') as f1:
                for page in PDFPage.get_pages(f1,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter before reading the buffer (as the original
            # did) so anything it emits on close is part of the result.
            converter.close()
        text = output.getvalue()
    finally:
        output.close()

    # Decode with the codec the converter was asked to encode with; the
    # bare decode() assumed UTF-8 regardless of `codec`.
    return text.decode(codec or 'utf-8')
开发者ID:lucasayres,项目名称:python-tools,代码行数:34,代码来源:convert_pdf.py

示例10: convert_pdf_to_txt

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Returns the extracted text, or an empty string when anything goes
        wrong; in that case the failure is reported through self.logger.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # open() replaces the Python-2-only file() builtin and the
                # context manager closes the file on every path.
                with open(path, 'rb') as fp:
                    for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)
                return retstr.getvalue()
            finally:
                device.close()
                retstr.close()
        except Exception as e:
            # Log before returning: the original returned first, which made
            # this call unreachable and hid every failure.
            self.logger.error(
                "Failed to PDF to text: " + str(e))
            return ""
开发者ID:SimplySecurity,项目名称:SimplyEmail,代码行数:35,代码来源:Converter.py

示例11: pdf_2_txt

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def pdf_2_txt(pdf) :
    """Convert a PDF file to a UTF-8 text file stored next to it.

    The text of every page is written to ``pdf + '.txt'``.

    :param pdf: path of the PDF file to convert
    :return: path of the text file that was written
    """
    outfile = pdf + '.txt'

    debug = 0
    rotation = 0
    codec = 'utf-8'   # output encoding
    caching = True
    laparams = LAParams()

    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers make sure both files are closed even when a page
    # fails to parse.
    with open(outfile, 'w', encoding="utf8") as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(pdf, 'rb') as fp:
                # process the content of every page of the document
                for page in PDFPage.get_pages(fp, set(),
                                              maxpages=0, password='',
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return outfile
开发者ID:ylfeng250,项目名称:FengTools,代码行数:35,代码来源:youdao.py

示例12: get_number_of_pages

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def get_number_of_pages(file_name):
    """Count the pages of a PDF.

    :param file_name: either an ``io.BytesIO`` holding the PDF data or the
        path of a local file whose name ends with '.pdf'
    :return: the number of pages, or ``None`` when a path does not end with
        '.pdf' or pdfminer raises ``PDFSyntaxError``
    """
    def _count_pages(stream):
        # Walk the page iterator and count the items it produces.
        return sum(
            1
            for _ in PDFPage.get_pages(
                stream,
                caching=True,
                check_extractable=True
            )
        )

    try:
        # for remote pdf file
        if isinstance(file_name, io.BytesIO):
            return _count_pages(file_name)

        # for local pdf file
        if not file_name.endswith('.pdf'):
            return None
        with open(file_name, 'rb') as fh:
            return _count_pages(fh)
    except PDFSyntaxError:
        return None
开发者ID:OmkarPathak,项目名称:pyresparser,代码行数:30,代码来源:utils.py

示例13: get_number_of_pages

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def get_number_of_pages(file_name):
    """Return how many pages a PDF has.

    :param file_name: an ``io.BytesIO`` with the PDF content, or the path
        of a local file; paths that do not end with '.pdf' are rejected
    :return: page count, or ``None`` for a rejected path or when pdfminer
        raises ``PDFSyntaxError``
    """
    try:
        in_memory = isinstance(file_name, io.BytesIO)
        if not in_memory and not file_name.endswith('.pdf'):
            return None

        total = 0
        if in_memory:
            # for remote pdf file
            for total, _page in enumerate(
                PDFPage.get_pages(
                    file_name,
                    caching=True,
                    check_extractable=True
                ),
                start=1
            ):
                pass
            return total

        # for local pdf file
        with open(file_name, 'rb') as fh:
            for total, _page in enumerate(
                PDFPage.get_pages(
                    fh,
                    caching=True,
                    check_extractable=True
                ),
                start=1
            ):
                pass
        return total
    except PDFSyntaxError:
        return None
开发者ID:OmkarPathak,项目名称:pyresparser,代码行数:30,代码来源:custom_t.py

示例14: pdf_to_txt

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def pdf_to_txt(file_name):
  """Extract the text of a PDF as an ASCII-only string.

  Carriage returns are replaced by newlines, everything matching
  ``regex.bullet`` is replaced by a space, and non-ASCII characters are
  dropped.

  :param file_name: path of the PDF file to read
  :return: the cleaned text, or '' if any error occurs (the error is logged)
  """
  try:
    # The context manager closes the file even when parsing fails.
    with open(file_name, 'rb') as file_pointer:
      # Setting up pdf reader
      pdf_resource_manager = PDFResourceManager()
      return_string = StringIO()
      device = TextConverter(pdf_resource_manager, return_string,
                             codec='utf-8', laparams=LAParams())
      try:
        interpreter = PDFPageInterpreter(pdf_resource_manager, device)
        for page in PDFPage.get_pages(file_pointer, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
          interpreter.process_page(page)

        # Get full string from PDF
        pdf_txt = return_string.getvalue()
      finally:
        device.close()
        return_string.close()

    # logging.debug(pdf_txt)

    # Formatting removing and replacing special characters
    pdf_txt = pdf_txt.replace("\r", "\n")
    pdf_txt = re.sub(regex.bullet, " ", pdf_txt)

    # Byte strings (Python 2) are decoded as before; text strings, which
    # have no usable .decode(), are stripped of non-ASCII characters instead.
    if isinstance(pdf_txt, bytes):
      return pdf_txt.decode('ascii', errors='ignore')
    return pdf_txt.encode('ascii', errors='ignore').decode('ascii')

  # "except X as e" is valid on Python 2.6+ and Python 3; the comma form is
  # a syntax error on Python 3.
  except Exception as exception_instance:
    logging.error('Error converting pdf to txt: '+str(exception_instance))
    return ''
开发者ID:skcript,项目名称:cvscan,代码行数:36,代码来源:converter.py

示例15: fetch_pdf_urls

# 需要导入模块: from pdfminer.pdfpage import PDFPage [as 别名]
# 或者: from pdfminer.pdfpage.PDFPage import get_pages [as 别名]
def fetch_pdf_urls(file_name):
  """Collect the URIs of the link annotations found in a PDF.

  :param file_name: path of the PDF file to read
  :return: list of the truthy ``['A']['URI']`` values of the page
    annotations, in document order. NOTE: on any error the error is logged
    and '' (an empty string, not a list) is returned; this is kept for
    backward compatibility.
  """
  try:
    links = []
    # The context manager closes the file even when parsing fails.
    with open(file_name, 'rb') as file_pointer:
      # fetches URLs
      for page in PDFPage.get_pages(file_pointer):
        if 'Annots' not in page.attrs:
          continue
        link_object_list = page.attrs['Annots']
        # Due to implementation of pdfminer the link_object_list can either
        # be the list directly or a PDF Object reference
        if not isinstance(link_object_list, list):
          link_object_list = link_object_list.resolve()
        for link_object in link_object_list or []:
          if not isinstance(link_object, dict):
            link_object = link_object.resolve()
          if not isinstance(link_object, dict):
            continue
          # Annotations without a URI action are skipped. Indexing
          # ['A']['URI'] directly raised on them and threw away every link
          # collected so far.
          action = link_object.get('A')
          if not isinstance(action, dict) and hasattr(action, 'resolve'):
            action = action.resolve()
          if not isinstance(action, dict):
            continue
          uri = action.get('URI')
          if uri:
            links.append(uri)
    return links

  # "except X as e" is valid on Python 2.6+ and Python 3; the comma form is
  # a syntax error on Python 3.
  except Exception as exception_instance:
    logging.error('Error while fetching URLs : '+str(exception_instance))
    return ''
开发者ID:skcript,项目名称:cvscan,代码行数:29,代码来源:annotations_parser.py


注:本文中的pdfminer.pdfpage.PDFPage.get_pages方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。