当前位置: 首页>>代码示例>>Python>>正文


Python LAParams.word_margin属性代码示例

本文整理汇总了Python中pdfminer.layout.LAParams.word_margin属性的典型用法代码示例。如果您正苦于以下问题:Python LAParams.word_margin属性的具体用法?Python LAParams.word_margin怎么用?Python LAParams.word_margin使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类pdfminer.layout.LAParams的用法示例。


在下文中一共展示了LAParams.word_margin属性的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
    def parse(self, path):
        """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

        Args:
            path: Filesystem path of a single PDF file. Directories are not
                supported.

        Returns:
            ``Sample(path, None, text)`` where ``text`` is everything the
            pdfminer ``TextConverter`` wrote for all pages of the document.

        Raises:
            NotImplementedError: if ``path`` is a directory.
        """
        # Directory input is not supported (yet).
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        # PDFs are binary data, so open in binary mode; the handle is closed
        # in the ``finally`` below even when parsing fails.
        fp = open(path, 'rb')
        try:
            rsrc = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 2.0
            laparams.line_margin = 2.0
            laparams.word_margin = 0.0
            device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
            try:
                # Old-style pdfminer wiring: document and parser are linked
                # explicitly before the document is initialised.
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                device.close()
            sample = Sample(path, None, out.getvalue())
        finally:
            fp.close()
            out.close()
        return sample
开发者ID:hcouch21,项目名称:styloproject,代码行数:30,代码来源:PdfParser.py

示例2: initialize_pdf_miner

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def initialize_pdf_miner(fh, password=None):
    """Build the pdfminer objects needed to analyse the layout of a PDF.

    Args:
        fh: Open file object positioned on the PDF data.
        password: Password for the document, or ``None`` when there is none.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the parsed ``PDFDocument``,
        a ``PDFPageInterpreter`` and the ``PDFPageAggregator`` it feeds.

    Raises:
        ValueError: if the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    # NOTE(review): pdfminer documents '' as the "no password" value, so None
    # is normalised here instead of being forwarded -- confirm against the
    # installed pdfminer version.
    doc = PDFDocument(parser, '' if password is None else password)

    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that drives it. (The
    # plain PDFDevice/interpreter pair that used to be built first was
    # discarded immediately, so it is no longer constructed.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:ziweizhou,项目名称:pdftables,代码行数:28,代码来源:pdftables.py

示例3: initialize_pdf_miner

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the layout of a PDF.

    Args:
        fh: Open file object positioned on the PDF data.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the parsed ``PDFDocument``,
        a ``PDFPageInterpreter`` and the ``PDFPageAggregator`` it feeds.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # The document's "extractable" flag is deliberately not enforced here:
    # documents that disallow extraction are still processed.

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis. word_margin is given its effective value
    # (0.0) directly instead of being passed as 0.1 and then overwritten.
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5,
                        word_margin=0.0, boxes_flow=0.1,
                        detect_vertical=False, all_texts=False)
    # Create the page aggregator and the interpreter that drives it. (The
    # plain PDFDevice/interpreter pair that used to be built first was
    # discarded immediately, so it is no longer constructed.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:metador,项目名称:tables_from_pdf,代码行数:35,代码来源:pdftables.py

示例4: parse_pdf

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
    def parse_pdf(self, test_parse=False):
        """
            Convert ``self.source_file`` to text and load it into
            ``self.file_array``.

            The PDF is rendered to ``self.text_file`` with pdfminer, that
            file is read back line by line into ``self.file_array`` (minus
            its last entry), and the text file is then deleted unless
            ``test_parse`` is true.  Nothing is returned.

            Raises DTPOFileError if either file cannot be opened or if
            pdfminer fails while processing the document.
        """

        dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

        # input options
        pagenos = set()
        maxpages = 0
        # output option
        codec = "utf-8"
        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0

        rsrcmgr = PDFResourceManager(caching=caching)

        try:
            outfp = open(self.text_file, "w")
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        fp = None
        device = None
        try:
            try:
                fp = open(self.source_file, "rb")
            except IOError as io_error:
                raise DTPOFileError(self.source_file, 0, str(io_error))

            try:
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, caching=caching, check_extractable=True)
            except PDFException as pdf_error:
                message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                raise DTPOFileError(self.source_file, 0, message)
            except Exception as exception:
                message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                raise DTPOFileError(self.source_file, 0, message)
        finally:
            # Release every handle on both the success and the error paths.
            if fp is not None:
                fp.close()
            if device is not None:
                device.close()
            outfp.close()

        #   Got the PDF converted = now get it into an array
        with open(self.text_file) as text_fp:
            self.file_array = list(text_fp)

        #   Remove the last entry - it's always '\x0c'
        if self.file_array:
            del self.file_array[-1]

        #   Remove the outfile
        if not test_parse:
            os.remove(self.text_file)
开发者ID:stubevan,项目名称:DTPO-Autoload,代码行数:58,代码来源:text_extractor.py

示例5: to_text

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
 def to_text(self):
     """Render every page of ``self._doc`` to text and return it.

     The pages are pushed through a pdfminer ``TextConverter`` into an
     in-memory buffer; the buffer contents are decoded as UTF-8 with
     undecodable bytes ignored.
     """
     text_buffer = StringIO()

     # Layout analysis: vertical text detection on, all text strings
     # analysed, and a word margin of 0.4.
     layout = LAParams()
     layout.word_margin = 0.4
     layout.all_texts = True
     layout.detect_vertical = True

     resources = PDFResourceManager()
     converter = TextConverter(resources, text_buffer, laparams=layout)
     page_interpreter = PDFPageInterpreter(resources, converter)
     for pdf_page in self._doc.get_pages():
         page_interpreter.process_page(pdf_page)

     raw_text = text_buffer.getvalue()
     return raw_text.decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:14,代码来源:pdf.py

示例6: convert_to_text_file

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """
        Convert a BORME PDF file to a text file.

        filename_in: path of the PDF to read.
        filename_out: path of the text file to write; when it is an existing
            directory, the output is written inside it under the base name
            of filename_in.
        rewrite: when False (the default) an existing output file is left
            untouched and the conversion is skipped.

        Returns True when the file was converted, False when it was skipped.
    """

    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        # Lazy %-args: the message is only formatted if INFO is enabled.
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # conf
    codec = 'utf-8'
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0
    caching = True

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)
    # Both files are closed by the ``with`` block even if pdfminer raises;
    # the output file is opened first, as before.
    with open(filename_out, 'w') as outfp, open(filename_in, 'rb') as fp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # See https://github.com/euske/pdfminer/issues/72 regarding the
            # page crop box.
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
开发者ID:miguelramosfdz,项目名称:bormeparser,代码行数:55,代码来源:functions.py

示例7: initialize_pdf_interpreter

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def initialize_pdf_interpreter():
    """Build a pdfminer interpreter wired to a layout-aggregating device.

    Returns:
        An ``(interpreter, device)`` tuple: a ``PDFPageInterpreter`` and the
        ``PDFPageAggregator`` (configured with ``word_margin = 0.0``) that
        it feeds.
    """
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that drives it. (The
    # plain PDFDevice/interpreter pair that used to be built first was
    # discarded immediately, so it is no longer constructed.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
开发者ID:vnaydionov,项目名称:pdftables,代码行数:20,代码来源:pdftables.py

示例8: output_pdf_to_table

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
    def output_pdf_to_table(self, path, config):
        """Run layout analysis on the PDF at ``path`` and collect its rows.

        Each processed page's layout is handed to ``self.getRows`` together
        with ``config``.  The line/word margins and the page limit come from
        the names ``line_margin_threshold``, ``word_margin_threshold`` and
        ``pages_to_view``, which are defined outside this method.

        Args:
            path: Filesystem path of the PDF to read.
            config: Passed through unchanged to ``self.getRows``.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = line_margin_threshold
        laparams.word_margin = word_margin_threshold
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        password = ""
        maxpages = pages_to_view
        caching = True
        pagenos = set()

        # The ``with`` block guarantees the PDF is closed, including when a
        # page fails to process.
        with open(path, "rb") as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=False):
                interpreter.process_page(page)
                layout = device.get_result()
                self.getRows(layout, config)
开发者ID:hning,项目名称:SBCParser,代码行数:23,代码来源:BlueCrossParser.py

示例9: read_file

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
    def read_file(self):
        """Extract the text of the PDF at ``self.path`` into ``self.content``.

        The text of every top-level ``LTTextBox`` / ``LTTextLine`` layout
        object on every page is collected and joined with single spaces.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 0.1
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        extracted_text = []

        # All parsing must happen while the file is still open: the parser
        # reads from ``f`` during initialisation and page iteration, so the
        # whole pipeline lives inside the ``with`` block.
        with open(self.path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        extracted_text.append(lt_obj.get_text())
        self.content = ' '.join(extracted_text)
开发者ID:s-alexey,项目名称:orange3-text,代码行数:24,代码来源:import_documents.py

示例10: initialize_pdf_miner

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the layout of a PDF.

    Args:
        fh: Open file object positioned on the PDF data.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the parsed ``PDFDocument``,
        a ``PDFPageInterpreter`` and the ``PDFPageAggregator`` (configured
        with ``word_margin = 0.0``) that it feeds.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that drives it. (The
    # plain PDFDevice/interpreter pair that used to be built first was
    # discarded immediately, so it is no longer constructed.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
开发者ID:ndanielsen,项目名称:pdftables-4,代码行数:26,代码来源:pdftables.py

示例11: main

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def main(argv):
    """Command-line driver: convert the PDF files named in ``argv``.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name, followed
            by getopt-style options and one or more PDF file names.

    Returns:
        ``100`` when the arguments are invalid (after printing the usage
        message), otherwise ``None``.

    The output type is taken from ``-t`` or, failing that, inferred from the
    extension of the ``-o`` output file, defaulting to ``'text'``.  Output
    goes to the ``-o`` file when given and to ``sys.stdout`` otherwise.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # ``-n`` disables layout analysis.  It is recorded in a flag and applied
    # after the loop so that layout flags given after ``-n`` do not fail on a
    # ``None`` laparams object.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrc = PDFResourceManager()
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown output type before the output file is created.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                fp = open(fname, 'rb')
                try:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        # Only close a file this function opened; sys.stdout stays open for
        # the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
开发者ID:joshmgrant,项目名称:pdfminer,代码行数:84,代码来源:pdf2txt.py

示例12: main

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def main(argv):
    import getopt
    def usage():
        print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout

    for fname in args:
        l = glob.glob(fname)
        count = len(l)
        print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
        for pdf in l:
#             print pdf
            d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
            ext = '.' + d[outtype]
            outfile = pdf[0:-4] + ext
            print outfile
            outfp = file(outfile, 'wb')
            if outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                      imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                       layoutmode=layoutmode, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
#.........这里部分代码省略.........
开发者ID:jamalmazrui,项目名称:pdf2tag,代码行数:103,代码来源:pdf2tag.py

示例13: PDFParser

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
    # Create a PDF parser object associated with the file object.
    #parser = PDFParser(open_file)
    # Create a PDF document object that stores the document structure.
    #doc = PDFDocument(parser)
    # Connect the parser and document objects.
    #print parser.nextline()
    #print parser.nextline()
    #print parser.nextline()


    ##ATTEMPT 2
    #Code from pdf2txt.py
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin=0.5
    laparams.word_margin=0.1
    laparams.all_texts=False

    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pdf_pages = PDFPage.get_pages(fp_in, set())
    pagenum = 0
    pagelim = 3
    for page in pdf_pages:
        pagenum += 1
        if pagenum > pagelim:
            continue
        print "Transcribing page " + str(pagenum) + " from PDF to text"
        interpreter.process_page(page)
    fp_in.close()
开发者ID:selahlynch,项目名称:phillycourtdockets,代码行数:33,代码来源:dockets.py

示例14: main

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def main(argv):

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
            description='Convert pdf to txt',
            formatter_class=ArgumentDefaultsHelpFormatter)

    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d', dest='debuglevel', action='count',
                       default = 0,
                       help='Debug (repeat for more verbose debugging)')

    parser.add_argument('-p', '--pages', dest='pagenos', action='store',
                       type=str,
                       default = '',
                       help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')

    parser.add_argument('-c', '--codec', dest='codec', action='store',
                       type=str,
                       default='utf-8',
                       help='Specifies the output codec.')

    parser.add_argument('-t', '--type', dest='outtype', action='store',
                       type=str,
                       default='shape',
                       choices = ['text', 'html', 'xml', 'tag', 'shape'],
                       help='Specifies the output format, one of: shape, text, html, xml, tag')

    parser.add_argument('-m', dest='maxpages', action='store',
                       type=int,
                       default=0,
                       help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')

    parser.add_argument('-P', '--password', dest='password', action='store',
                       type=str,
                       default='',
                       help='Provides the user password to access PDF contents.')

    parser.add_argument('-o', '--output', dest='outfile', action='store',
                       type=str,
                       default=None,
                       help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')

    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false',
                       default=True,
                       help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')

    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false',
                       default=True,
                       help='Suppress layout analysis.')

    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true',
                       default=False,
                       help='Show page numbers.')


    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true',
                       default=False,
                       help='Forces to perform layout analysis for all the text strings, including text contained in figures.')

    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true',
                       default=False,
                       help='Allows vertical writing detection.')

    parser.add_argument('-M', dest='char_margin', action='store',
                       type=float,
                       default=2.0,
                       help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')

    parser.add_argument('-L', dest='line_margin', action='store',
                       type=float,
                       default=0.5,
                       help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')

    parser.add_argument('-W', dest='word_margin', action='store',
#.........这里部分代码省略.........
开发者ID:garabik,项目名称:pdfshapeminer,代码行数:103,代码来源:pdf2txt.py

示例15: convert_pdf_To_Txt

# Required import: from pdfminer.layout import LAParams [as alias]
# Note: word_margin is an attribute of LAParams instances (laparams.word_margin = ...), not a separately importable name.
def convert_pdf_To_Txt(path, opts=None):
    """
    Extract the text of the PDF at ``path`` and return it.

    The algorithm follows the pdfminer ``pdfinterp`` module documentation
    (per the original author's note).

    Args:
        path: Filesystem path of the PDF to read.
        opts: Optional pdf2txt-style options, given either as a mapping
            (``{'-M': '2.0'}``) or as an iterable of ``(flag, value)``
            pairs.  Recognised flags: ``-d -p -m -P -o -C -n -A -V -M -L -W
            -F -Y -O -t -c -s``.  ``-o`` is only used to infer the output
            type from the file extension; no output file is written.

    Returns:
        The contents of the in-memory buffer the ``TextConverter`` wrote to.

    Raises:
        ValueError: if the requested or inferred output type is not
            ``'text'``, the only type this function can produce.
    """
    # Normalise ``opts`` to (flag, value) pairs.  Iterating a dict directly
    # would yield its keys and unpack each key's characters instead.
    if opts is None:
        options = []
    elif isinstance(opts, dict):
        options = list(opts.items())
    else:
        options = list(opts)

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    # ``-n`` disables layout analysis.  It is recorded in a flag and applied
    # after the loop so that layout flags given after ``-n`` do not fail on a
    # ``None`` laparams object.
    use_layout = True
    for (k, v) in options:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': pass  # layout mode: accepted, unused for text output
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': float(v)  # scale: validated, unused for text output
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Only a text device is ever built; fail clearly instead of hitting an
    # unbound ``device`` further down.
    if outtype != 'text':
        raise ValueError("unsupported output type %r: only 'text' is supported" % (outtype,))

    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        fp = open(path, 'rb')
        try:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        finally:
            fp.close()
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
    return txt2Pdf
开发者ID:smidaamine,项目名称:plagiarism-detector,代码行数:85,代码来源:PlagiaUtil.py


注:本文中的pdfminer.layout.LAParams.word_margin属性示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。