当前位置: 首页>>代码示例>>Python>>正文


Python LAParams.all_texts方法代码示例

本文整理汇总了Python中pdfminer.layout.LAParams.all_texts方法的典型用法代码示例。如果您正苦于以下问题：Python LAParams.all_texts方法的具体用法？Python LAParams.all_texts怎么用？Python LAParams.all_texts使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.layout.LAParams的用法示例。


在下文中一共展示了LAParams.all_texts方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: _pdf_to_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def _pdf_to_text(path):
    """Extract the text of the PDF at ``path``, reduced to ASCII.

    Returns a ``(text, success)`` tuple.  On success ``text`` is the
    extracted text with every non-ASCII character dropped and ``success``
    is ``True``.  If anything fails while opening or converting the file
    the result is ``("", False)``; the error itself is not reported.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)

        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()

        # Drop anything that cannot be represented in ASCII.
        txt = retstr.getvalue().encode('ascii', 'ignore')
        retstr.close()

        retVal = (txt, True)
    except Exception:
        # Deliberate best effort: any failure is folded into the result tuple.
        retVal = ("", False)
    # The computed tuple was previously built but never handed back.
    return retVal
开发者ID:hhroc,项目名称:monroeminutes,代码行数:28,代码来源:decodepdf.py

示例2: parse_pdf_pdfminer

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def parse_pdf_pdfminer(self, f, fpath):
        """Run ``self.parse_page`` over the text of every page of the PDF ``f``.

        ``f`` is the open PDF stream and ``fpath`` the path used when
        reporting.  The handler's ``print_header`` is called first and
        ``print_footer`` last; each page's text is passed to ``parse_page``
        as UTF-8 bytes together with its 1-based page number.  Any error
        other than ``KeyboardInterrupt``/``SystemExit`` is handed to
        ``self.handler.print_error`` instead of propagating.
        """
        try:
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()

            if self.dedup:
                # Start every document with an empty de-duplication store.
                self.dedup_store = set()

            self.handler.print_header(fpath)
            pdf_parser = PDFParser(f)
            document = PDFDocument(caching=True)

            # Parser and document have to be wired to each other.
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)

            for page_num, page in enumerate(document.get_pages(), 1):
                buf = StringIO()
                converter = TextConverter(resources, buf, laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = buf.getvalue()
                self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
                buf.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
开发者ID:sebdraven,项目名称:ioc_parser,代码行数:33,代码来源:iocp.py

示例3: count_words

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def count_words(self):
        """Return the number of whitespace-separated words in ``self.filename``.

        The PDF is converted to text page by page; every character listed in
        ``string.punctuation`` is deleted from that text before it is split
        into words.  Raises ``PDFTextExtractionNotAllowed`` when the document
        reports that it is not extractable.

        Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
        and http://www.unixuser.org/~euske/python/pdfminer/programming.html
        """
        with open(self.filename, "rb") as pdf_file:
            resources = PDFResourceManager()
            buf = StringIO()
            layout = LAParams()
            layout.all_texts = True
            converter = TextConverter(resources, buf, codec='utf-8', laparams=layout)

            # Document object holding the structure read through the parser.
            document = PDFDocument(PDFParser(pdf_file))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Push every page through the interpreter into the text buffer.
            interpreter = PDFPageInterpreter(resources, converter)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)

            # Identity translation table; the second argument lists the
            # characters to delete.
            without_punctuation = buf.getvalue().translate(
                string.maketrans("", ""), string.punctuation)
            words = without_punctuation.split()

            return len(words)
开发者ID:dhumbert,项目名称:literable,代码行数:33,代码来源:pdf.py

示例4: parse_pdf_pdfminer

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def parse_pdf_pdfminer(self, f, fpath):
        """Run ``self.parse_page`` over the text of every page of the PDF ``f``.

        ``f`` is the open PDF stream and ``fpath`` the path used when
        reporting.  The handler's ``print_header`` is called first and
        ``print_footer`` last; each page's text is passed to ``parse_page``
        with its 1-based page number.  Any error other than
        ``KeyboardInterrupt``/``SystemExit`` is handed to
        ``self.handler.print_error`` instead of propagating.
        """
        try:
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()
            # An empty page-number set is what gets passed to get_pages.
            wanted_pages = set()

            if self.dedup:
                # Start every document with an empty de-duplication store.
                self.dedup_store = set()

            self.handler.print_header(fpath)
            pages = PDFPage.get_pages(f, wanted_pages, check_extractable=True)
            for page_num, page in enumerate(pages, 1):
                buf = StringIO()
                converter = TextConverter(resources, buf, laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = buf.getvalue()
                buf.close()

                self.parse_page(fpath, page_text, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
开发者ID:born2c0de,项目名称:ioc_parser,代码行数:30,代码来源:iocp.py

示例5: _convert_pdf_to_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def _convert_pdf_to_text(self, password=None):
        """Convert the PDF ``self.cvFile`` into a text file under ``self.scratchDir``.

        The output file name is the input file name with its last
        dot-separated part removed, followed by the current Unix timestamp and
        ``.txt``.  Its full path is stored in ``self.cvTextFile`` before the
        conversion starts.  Page numbers 0-29 and ``maxpages=30`` are passed
        to ``process_pdf``.

        :param password: when not ``None`` it is stored in
            ``self.cvFilePasswd``; ``self.cvFilePasswd`` is what is passed to
            ``process_pdf`` as the document password.
        :returns: always ``0``.
        """
        input_pdf = self.cvFile
        if password is not None:
            self.cvFilePasswd = password
        pagenos = range(0, 30)
        maxpages = len(pagenos)
        codec = 'utf-8'
        laparams = LAParams()
        laparams.all_texts = True
        laparams.showpageno = True

        # Build "<scratchDir>/<name without last extension><timestamp>.txt".
        input_parts = os.path.basename(input_pdf).split(".")
        input_parts.pop()
        timestamp = int(time.time())
        output_filename = (self.scratchDir + os.path.sep + ".".join(input_parts)
                           + str(timestamp) + r".txt")
        self.cvTextFile = output_filename

        rsrcmgr = PDFResourceManager()
        # Context managers make sure both files are closed even when the
        # conversion raises; the device is closed before the output file.
        with open(output_filename, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            try:
                with open(input_pdf, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=self.cvFilePasswd, check_extractable=True)
            finally:
                device.close()
        return 0
开发者ID:arshpreetsingh,项目名称:cv-parser,代码行数:34,代码来源:cvparser.py

示例6: dump_pdf_pdfminer

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def dump_pdf_pdfminer(self, fpath_in):
        """Dump the text of the PDF ``fpath_in`` into a sibling ``.txt`` file.

        The output path is ``fpath_in`` with its extension replaced by
        ``.txt``.  Pages are converted one at a time and appended to the
        output.  On success the total length written is printed; on any error
        other than ``KeyboardInterrupt``/``SystemExit`` a failure line is
        printed and whatever was written so far stays in the output file.
        """
        fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
        n = 0

        with open(fpath_in, 'rb') as fin:
            with open(fpath_out, 'wb') as fout:
                try:
                    laparams = LAParams()
                    laparams.all_texts = True
                    rsrcmgr = PDFResourceManager()

                    pages = PDFPage.get_pages(fin, set(), check_extractable=True)
                    for page in pages:
                        # A fresh buffer per page keeps memory bounded by the
                        # largest page rather than the whole document.
                        retstr = StringIO()
                        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        interpreter.process_page(page)
                        data = retstr.getvalue()
                        retstr.close()

                        fout.write(data)
                        n += len(data)
                    # Single-argument call form prints the same text under
                    # both the print statement and the print function.
                    print("Written %d bytes to %s" % (n, fpath_out))
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception:
                    print("Failed parsing %s" % (fpath_in))
开发者ID:RanchoIce,项目名称:apt-analysis,代码行数:32,代码来源:dump-pdf.py

示例7: pdf2str

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def pdf2str(path):
    """Return the text extracted from the PDF file at ``path``.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled
    and the converter is created with the ``utf-8`` codec.  Errors raised
    while opening or converting the file propagate to the caller; the input
    file and the converter are closed either way.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    # Set parameters
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)

    try:
        # Open the file and parse; `with` closes it even if a page fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()

    # Read the buffer only after the device is closed, then release it.
    text = retstr.getvalue()
    retstr.close()
    return text
开发者ID:acadien,项目名称:pdfAnalyse,代码行数:32,代码来源:pdfBagOfWords.py

示例8: to_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def to_text(path):
    """Extract the text of a PDF file with `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        text extracted from the pdf, passed through ``.encode('utf-8')``

    """

    try:
        # python 2: byte-oriented StringIO plus a utf8 default encoding
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        # no StringIO module: fall back to io.StringIO
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    buf = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, buf, codec='utf-8', laparams=layout)
    with open(path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resources, converter)
        page_iter = PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in page_iter:
            interpreter.process_page(page)
    converter.close()
    extracted = buf.getvalue()
    buf.close()
    return extracted.encode('utf-8')
开发者ID:m3nu,项目名称:invoice2data,代码行数:58,代码来源:pdfminer_wrapper.py

示例9: get_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def get_text(self):
        """Returns all text content from the PDF as plain text.

        The text is first extracted from ``self.path`` with pdfminer.  If that
        yields nothing but whitespace, the PDF is rendered to PNG files in
        ``temp/`` with GhostScript (``gs``) and each image is run through
        ``pytesseract`` with ``lang="swe"``; the rendered images are deleted
        as they are consumed.  The result is stored in ``self.text`` and
        returned.

        Errors from the pdfminer step are logged and re-raised.  A failure to
        start ``gs`` is logged and otherwise ignored.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        try:
            # `with` only closes the file if it was actually opened, so a
            # failed open no longer triggers a second error during cleanup.
            with open(self.path, 'rb') as file_pointer:
                process_pdf(rsrcmgr, device, file_pointer)
        except Exception as e:
            logging.error("Error processing PDF: %s" % e)
            raise
        finally:
            device.close()

        text = retstr.getvalue()
        retstr.close()
        if (text is None) or (text.strip() == ""):
            logging.info("No text found in PDF. Attempting OCR. This will take a while.")
            #FIXME this should go in a separate method
            #First, convert to image
            import subprocess
            try:
                arglist = ["gs",
                      "-dNOPAUSE",
                      "-sOutputFile=temp/page%03d.png",
                      "-sDEVICE=png16m",
                      "-r72",
                      self.path]
                # subprocess.STDOUT is only meaningful as the *stderr* target,
                # so stdout is left inherited and stderr is merged into it.
                # NOTE(review): no -dBATCH is passed; confirm gs exits on its
                # own after the last page in this setup.
                subprocess.call(
                    args=arglist,
                    stderr=subprocess.STDOUT)
            except OSError:
                logging.error("Failed to run GhostScript (using `gs`)")
            #Do OCR
            import time
            time.sleep(1) # make sure the server has time to write the files
            import Image
            import pytesseract
            import os
            page_texts = []
            # os.listdir returns names in arbitrary order; sorting the
            # zero-padded page%03d.png names restores page order.
            for file_ in sorted(os.listdir("temp")):
                if file_.endswith(".png"):
                    page_texts.append(
                        pytesseract.image_to_string(Image.open("temp/" + file_), lang="swe"))
                    os.unlink("temp/" + file_)
            text = "".join(page_texts)
        self.text = text
        return text
开发者ID:staffanm,项目名称:protokollen,代码行数:55,代码来源:pdf.py

示例10: convert_to_text_file

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """
        Parse file according to BORME PDF format and write its text to a file.

        filename_in: path of the PDF file to read.
        filename_out: path of the text file to write. If it is an existing
            directory, the output is written inside it under the base name
            of filename_in.
        rewrite: when False (the default) an already existing output file is
            left untouched and the conversion is skipped.

        Returns False when the conversion was skipped because the output
        already exists, True once the conversion has finished. Errors while
        opening or converting the files propagate to the caller.
    """

    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!' % filename_out)
        return False

    # conf
    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    caching = True
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first: a missing or unreadable PDF must not leave an
    # empty output file behind, because with rewrite=False that file would
    # make every later call skip the conversion.
    with open(filename_in, 'rb') as fp:
        with open(filename_out, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=laparams, imagewriter=None)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # https://github.com/euske/pdfminer/issues/72
                for page in PDFPage.get_pages(fp, set(),
                                              maxpages=0, password='',
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            finally:
                # Close the device before the output file it writes to.
                device.close()
    return True
开发者ID:miguelramosfdz,项目名称:bormeparser,代码行数:55,代码来源:functions.py

示例11: to_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
 def to_text(self):
     """Return the text of every page of ``self._doc`` as one string.

     The converter output is decoded as UTF-8, ignoring undecodable bytes.
     """
     layout = LAParams()
     layout.detect_vertical = True
     layout.all_texts = True
     layout.word_margin = 0.4

     resources = PDFResourceManager()
     buf = StringIO()
     converter = TextConverter(resources, buf, laparams=layout)
     interpreter = PDFPageInterpreter(resources, converter)
     for page in self._doc.get_pages():
         interpreter.process_page(page)

     raw = buf.getvalue()
     return raw.decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:14,代码来源:pdf.py

示例12: _pdf2text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def _pdf2text(self,fp):
        """Extract the ASCII text and the creation date string of the PDF ``fp``.

        ``fp`` is the open PDF stream; it is read once for the text and parsed
        a second time for the document info.

        Returns ``(created, text, True)`` on success, where ``created`` is the
        sliced ``CreationDate`` value (``''`` if it could not be read) and
        ``text`` the extracted text with non-ASCII characters dropped.  On any
        failure the error is passed to ``self._report`` and
        ``(None, "", False)`` is returned.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'ascii'
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # fix the non-utf8 string ...
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, I feel like I'm doing the conversion twice ...
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            #
            # CreationDate isn't always the same type as it comes back from
            # the PDFParser: it is either a string or an object that has to
            # be resolved first.
            #
            created = ""
            try:
                raw_date = doc.info[0]['CreationDate']
                if not isinstance(raw_date, basestring):
                    raw_date = raw_date.resolve()
                # Same slice for both shapes; the resolved value used to be
                # stored under a misspelled name and was silently lost.
                # presumably drops a "D:" prefix and a trailing timezone
                # suffix -- TODO confirm against real documents
                created = raw_date[2:-7]
            except Exception:
                self._report("CreationDate field could not be decoded within PDF, setting to ''")
            created = created.encode('ascii','ignore')
            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            self._report("Error: \n\t%s" % str(e))
            retVal = (None,"",False)
        # The result tuple was previously built but never handed back.
        return retVal
开发者ID:thequbit,项目名称:unpdfer,代码行数:48,代码来源:unpdfer.py

示例13: _pdf2text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
    def _pdf2text(self,fp):
        """Extract the ASCII text and the creation time of the PDF ``fp``.

        ``fp`` is the open PDF stream; it is read once for the text and parsed
        a second time for the document info.

        Returns ``(created, text, True)`` on success, where ``created`` is a
        ``datetime`` built from the document's ``CreationDate`` and ``text``
        the extracted text with non-ASCII characters dropped.  On any failure
        the error is reported and ``(None, "", False)`` is returned.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'ascii'
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # fix the non-utf8 string ...
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, I feel like I'm doing the conversion twice ...
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            #
            # CreationDate isn't always the same type as it comes back from
            # the PDFParser: it is either a string or an object that has to
            # be resolved first.
            #
            raw_date = doc.info[0]['CreationDate']
            if not isinstance(raw_date, basestring):
                raw_date = raw_date.resolve()
            # Keep only the part that matches the YYYYmmddHHMMSS pattern
            # parsed below.
            datestring = raw_date[2:-7]
            ts = strptime(datestring, "%Y%m%d%H%M%S")
            created = datetime.fromtimestamp(mktime(ts))

            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            # NOTE(review): `_reportstr` is not visible here; confirm the
            # reporting helper really has this name.
            self._reportstr("Error: \n\t%s" %str(e))
            retVal = (None,"",False)
        # The result tuple was previously built but never handed back.
        return retVal
开发者ID:citruspi,项目名称:unpdfer,代码行数:46,代码来源:unpdfer.py

示例14: getPdfAsText

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages as one string.

    ``pdfPages`` is an iterable of pages to process.  When it is ``None`` and
    ``fileDescriptor`` is given, the pages are obtained from
    ``getPdfPages(fileDescriptor)``.  At least one of the two must be
    supplied; otherwise iterating the pages fails.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled.
    The converter and the output buffer are closed before returning, also
    when processing raises.
    """
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Each resource is created *before* the try block that cleans it up, so
    # a failure while creating it can no longer be masked by the cleanup
    # touching a name that was never bound.
    outputStream = StringIO.StringIO()
    try:
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for pdfPage in pdfPages:
                interpreter.process_page(pdfPage)
            return outputStream.getvalue()
        finally:
            device.close()
    finally:
        outputStream.close()
开发者ID:siims,项目名称:invoice_info_from_pdf_to_csv,代码行数:21,代码来源:__init__.py

示例15: pdf

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def pdf(f):
    """Return the text extracted from the PDF file at path ``f``.

    The converter is created with the ``utf-8`` codec and ``all_texts``
    enabled.  Errors raised while opening or converting the file propagate
    to the caller; the input file and the converter are closed either way.
    """
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()
    codec = 'utf-8'

    laparams = LAParams()
    laparams.all_texts = True

    device = TextConverter(
        rsrcmgr, retstr, codec=codec, laparams=laparams
    )

    try:
        # `with` closes the input even if the conversion fails.
        with open(f, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()

    # Read the buffer only after the device is closed, then release it.
    text = retstr.getvalue()
    retstr.close()
    return text
开发者ID:DarioGT,项目名称:libreQDA,代码行数:22,代码来源:text_extraction.py


注:本文中的pdfminer.layout.LAParams.all_texts方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。