当前位置: 首页>>代码示例>>Python>>正文


Python LAParams.detect_vertical方法代码示例

本文整理汇总了Python中pdfminer.layout.LAParams.detect_vertical方法的典型用法代码示例。如果您正苦于以下问题:Python LAParams.detect_vertical方法的具体用法?Python LAParams.detect_vertical怎么用?Python LAParams.detect_vertical使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pdfminer.layout.LAParams的用法示例。


在下文中一共展示了LAParams.detect_vertical方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_result_from_file

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def get_result_from_file(filename):
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    fp = open(filename, "rb")
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_index = 0
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        bounding_box = get_bounding_box(layout)
        labels = get_text_labels(layout)
        result["pages"].append({"index": page_index, "bounding_box": bounding_box, "labels": labels})
        page_index += 1
    fp.close()
    return result
开发者ID:broersma,项目名称:extractpdf,代码行数:35,代码来源:extractpdf.py

示例2: pdf2str

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdf2str(path):

    #Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    
    #Set parameters
    codec = 'utf-8'
    laparams.all_texts=True
    laparams.detect_vertical = True
    caching = True
    pagenos = set()

    #Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    #Open the file and parse
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.get_pages(fp, pagenos,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    #Clean up
    fp.close()
    device.close()
    str = retstr.getvalue()
    retstr.close()
    return str
开发者ID:acadien,项目名称:pdfAnalyse,代码行数:32,代码来源:pdfBagOfWords.py

示例3: convert_to_text_file

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """
        Parse file according to BORME PDF format

        filename:
        filenameOut:
    """

    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!' % filename_out)
        return False

    # conf
    codec = 'utf-8'
    laparams = LAParams()
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(filename_out, 'w')
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
    fp = open(filename_in, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # https://github.com/euske/pdfminer/issues/72
    #page = PDFPage()
    #PDFPage.cropbox =

    # y esto?
    for page in PDFPage.get_pages(fp, pagenos,
                                  maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    fp.close()
    device.close()
    outfp.close()
    return True
开发者ID:miguelramosfdz,项目名称:bormeparser,代码行数:55,代码来源:functions.py

示例4: to_text

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
 def to_text(self):
     rsrcmgr = PDFResourceManager()
     output = StringIO()
     laparams = LAParams()
     laparams.detect_vertical = True
     laparams.all_texts = True
     laparams.word_margin = 0.4
     device = TextConverter(rsrcmgr, output, laparams=laparams)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     for page in self._doc.get_pages():
             interpreter.process_page(page)
     return output.getvalue().decode('utf-8', 'ignore')
开发者ID:MikaYuoadas,项目名称:Docbucket,代码行数:14,代码来源:pdf.py

示例5: getPdfAsText

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    try:
        outputStream = StringIO.StringIO()
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        intrepreter = PDFPageInterpreter(resourceManager, device)
        for pdfPage in pdfPages:
            intrepreter.process_page(pdfPage)
        return outputStream.getvalue()
    finally:
        device.close()
        outputStream.close()
开发者ID:siims,项目名称:invoice_info_from_pdf_to_csv,代码行数:21,代码来源:__init__.py

示例6: convert_pdf_to_txt

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_pdf_to_txt(path, txtname, buf=True):
    rsrcmgr = PDFResourceManager()
    if buf:
        outfp = StringIO()
    else:
        outfp = file(txtname, 'w')
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
#    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    device = TextConverter(rsrcmgr, outfp,  laparams=laparams)

    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()
    device.close()
    if buf:
        text = re.sub(space, "", outfp.getvalue())
        print (text)
    outfp.close()
开发者ID:rinkeigun,项目名称:linux_module,代码行数:24,代码来源:pdftest.py

示例7: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv=None):
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=sys.stdin, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = LAParams()
    if args.n:
        laparams = None
    else:
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        if args.M:
            laparams.char_margin = args.M
        if args.L:
            laparams.line_margin = args.L
        if args.W:
            laparams.word_margin = args.W
        if args.F:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    for fp in args.file:
        process_pdf(rsrcmgr, device, fp, [i-1 for i in args.p], maxpages=args.m, password=args.P,
                    caching=args.cache, check_extractable=True)
        fp.close()
    device.close()
    if args.o is not sys.stdout:
        args.o.close()
开发者ID:JoshData,项目名称:pdfminer,代码行数:71,代码来源:pdf2txt.py

示例8: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    
    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
            laparams=laparams, outdir=outdir, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
开发者ID:kyao4,项目名称:web_crawler,代码行数:86,代码来源:pdf2txt.py

示例9: pdf2txt

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdf2txt(argv):
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams, outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True)
        fp.close()
    device.close()
    outfp.close()
    return
开发者ID:tiagochst,项目名称:exato,代码行数:82,代码来源:pdf2txt.py

示例10: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    import getopt			#getopt 模块,它的功能是 获取执行命令行时附带的参数,关于getopt模块详细可参照http://www.16kan.com/post/207647.html
    def usage(): 			#usage() 函数,用于在用户输入错误命令或者命令输入不规范时,输出py文件的使用范例。当参数不足或错误时,usage()被调用
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
		'''
		getopt函数的格式是getopt.getopt ( [命令行参数列表], "短选项", [长选项列表] )
		短选项名后的冒号(:)表示该选项必须有附加的参数。p,m,P,o,M,L,W,F,Y,O,t,c,s均为必须参数
		长选项名后的等号(=)表示该选项必须有附加的参数。
		返回opts和args。
		'''
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''			#参数P
    pagenos = set()			#参数p
    maxpages = 0			#参数m
    # output option
    outfile = None			#参数o output
    outtype = None			#参数t out type
    outdir = None			#参数O output directory
    layoutmode = 'normal'	#参数Y
    codec = 'utf-8'			#参数c
    pageno = 1				
    scale = 1				#参数s,暂缺M,L,F,Y四个参数
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:				#确认输出文件格式
        outtype = 'text'
        if outfile:			
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:					
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)  #TextConverter貌似不能指定outdir参数
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams, outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True)
        fp.close()
    device.close()
    outfp.close()
    return
开发者ID:bminfo,项目名称:autoRCT,代码行数:97,代码来源:explain.py

示例11: pdfminerr

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdfminerr(argv):
    global pdfminerr, install
    import getopt
  
    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)

    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
           caching=caching, check_extractable=True)

        fp.close()
    
    device.close()
    outfp.close()
    return
开发者ID:emmanuel12,项目名称:pdf-convertor,代码行数:98,代码来源:pdf2txt.py

示例12: convert_pdf_To_Txt

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_pdf_To_Txt(path,opts={}):
    """
    this ALGO form pdfinterp modul  documentation


    """

        # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    retstr = StringIO()
    if outtype == 'text':
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
        interpreter.process_page(page)
    #print retstr.getvalue()
    txt2Pdf=retstr.getvalue()
    #print type(txt2Pdf)

    #fp.close()
    #device.close()
    #outfp.close()
    return txt2Pdf
开发者ID:smidaamine,项目名称:plagiarism-detector,代码行数:85,代码来源:PlagiaUtil.py

示例13: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(fname, k, v):

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()

    if k == '-d':
        debug += 1
    elif k == '-p':
        pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
 
    fp = file(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
#.........这里部分代码省略.........
开发者ID:lixiao89,项目名称:literature_network,代码行数:103,代码来源:pdf2txt.py

示例14: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    import getopt
    def usage():
        print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout

    for fname in args:
        l = glob.glob(fname)
        count = len(l)
        print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
        for pdf in l:
#             print pdf
            d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
            ext = '.' + d[outtype]
            outfile = pdf[0:-4] + ext
            print outfile
            outfp = file(outfile, 'wb')
            if outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                      imagewriter=imagewriter)
                device.showpageno = False
            elif outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                       layoutmode=layoutmode, laparams=laparams,
                                       imagewriter=imagewriter)
                device.showpageno = False
#.........这里部分代码省略.........
开发者ID:jamalmazrui,项目名称:pdf2tag,代码行数:103,代码来源:pdf2tag.py

示例15: main

# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
               '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if formatOutput and outtype.endswith('ml'):
        try:
            from cStringIO import StringIO
        except ImportError:
#.........这里部分代码省略.........
开发者ID:brechin,项目名称:pdfminer2,代码行数:103,代码来源:pdf2txt.py


注:本文中的pdfminer.layout.LAParams.detect_vertical方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。