本文整理汇总了Python中pdfminer.layout.LAParams.detect_vertical属性的典型用法代码示例。如果您正苦于以下问题:Python LAParams.detect_vertical的具体用法?Python LAParams.detect_vertical怎么用?Python LAParams.detect_vertical使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类pdfminer.layout.LAParams的用法示例。
在下文中一共展示了LAParams.detect_vertical方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_result_from_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page in a PDF.

    Args:
        filename: Path of the PDF file to analyse.

    Returns:
        A dict ``{"filename": filename, "pages": [...]}``. Each page entry
        holds its zero-based ``index`` plus the ``bounding_box`` and
        ``labels`` values returned by ``get_bounding_box`` and
        ``get_text_labels`` for that page's layout.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # The context manager closes the file on every exit path, including the
    # PDFTextExtractionNotAllowed raise below and any parsing error.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            # NOTE(review): get_bounding_box / get_text_labels are defined
            # outside this block; their return shapes are not visible here.
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append(
                {"index": page_index, "bounding_box": bounding_box, "labels": labels}
            )
    return result
示例2: pdf2str
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdf2str(path):
    """Return the text extracted from the PDF file at *path*.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled.
    Pages are read with ``check_extractable=True``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The accumulated contents of the in-memory output buffer.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # Set parameters
    codec = 'utf-8'
    laparams.all_texts = True
    laparams.detect_vertical = True
    caching = True
    pagenos = set()  # empty set: no page filter is supplied
    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        try:
            # open() replaces the Python-2-only file() builtin; the context
            # manager closes the PDF even if a page fails to process.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as before.
        return retstr.getvalue()
    finally:
        retstr.close()
示例3: convert_to_text_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a PDF file to a text file (settings chosen for BORME PDFs).

    Args:
        filename_in: Path of the PDF to read.
        filename_out: Path of the text file to write. If it is an existing
            directory, the output goes inside it under the base name of
            ``filename_in``.
        rewrite: When false, an already existing output file is left
            untouched and the conversion is skipped.

    Returns:
        ``False`` if the conversion was skipped because the output exists and
        ``rewrite`` is false, ``True`` after a completed conversion.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))
    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!' % filename_out)
        return False

    # conf
    codec = 'utf-8'
    rotation = 0
    caching = True
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)
    # The output is opened first, as before, so it exists even when the
    # input cannot be opened. Both handles are closed on every exit path.
    with open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=None)
        try:
            with open(filename_in, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # See https://github.com/euske/pdfminer/issues/72
                for page in PDFPage.get_pages(fp, set(),
                                              maxpages=0, password='',
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return True
示例4: to_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def to_text(self):
    """Render every page of ``self._doc`` to text and return it decoded.

    Layout analysis uses ``detect_vertical`` and ``all_texts`` with a word
    margin of 0.4. The buffer contents are decoded as UTF-8; bytes that
    cannot be decoded are ignored.
    """
    buffer = StringIO()

    layout_params = LAParams()
    layout_params.word_margin = 0.4
    layout_params.all_texts = True
    layout_params.detect_vertical = True

    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, buffer, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    for pdf_page in self._doc.get_pages():
        page_interpreter.process_page(pdf_page)

    raw_output = buffer.getvalue()
    return raw_output.decode('utf-8', 'ignore')
示例5: getPdfAsText
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def getPdfAsText(pdfPages=None, fileDescriptor=None):
    """Return the text of the given PDF pages.

    Args:
        pdfPages: Iterable of page objects to process. When ``None``, the
            pages are obtained from ``getPdfPages(fileDescriptor)``.
        fileDescriptor: Source handed to ``getPdfPages`` when ``pdfPages`` is
            not supplied.

    Returns:
        The contents of the output buffer after all pages are processed.

    Raises:
        TypeError: If both arguments are ``None`` (iterating ``None`` fails).
    """
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Each resource is created before the try block that releases it, so a
    # failing constructor can no longer hit a cleanup clause that references
    # an unbound name and hides the original error.
    outputStream = StringIO.StringIO()
    try:
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for pdfPage in pdfPages:
                interpreter.process_page(pdfPage)
            return outputStream.getvalue()
        finally:
            device.close()
    finally:
        outputStream.close()
示例6: convert_pdf_to_txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of a PDF and either print it or write it to a file.

    Args:
        path: Path of the PDF file to read.
        txtname: Path of the text file to write; only used when ``buf`` is
            false.
        buf: When true, the text is collected in memory, every match of the
            ``space`` pattern is removed, and the result is printed. When
            false, the text is written to ``txtname``.

    Returns:
        None.
    """
    rsrcmgr = PDFResourceManager()
    # open() replaces the Python-2-only file() builtin.
    outfp = StringIO() if buf else open(txtname, 'w')
    try:
        laparams = LAParams()
        laparams.detect_vertical = True
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()
        if buf:
            # NOTE(review): `space` is a pattern defined outside this block;
            # presumably it matches whitespace — confirm.
            text = re.sub(space, "", outfp.getvalue())
            print(text)
    finally:
        # Runs on error paths too, so the output handle no longer leaks.
        outfp.close()
示例7: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv=None):
    """Command-line entry point: convert PDF file(s) to text/HTML/XML/tag.

    Args:
        argv: Argument list for ``argparse``; ``None`` makes argparse read
            ``sys.argv[1:]``.

    The output type comes from ``-t`` or, failing that, from the extension of
    the ``-o`` file name; anything unrecognised falls back to plain text.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # The default must be a list: the conversion loop below iterates over
    # args.file, and a bare sys.stdin would be iterated line by line.
    # Use the binary stdin buffer where one exists (Python 3).
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=[stdin], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so an explicit 0 / 0.0 is not ignored.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith(('.htm', '.html')):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                                   laparams=laparams, imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
        try:
            for fp in args.file:
                try:
                    # -p is 1-based on the command line; process_pdf gets
                    # 0-based page numbers.
                    process_pdf(rsrcmgr, device, fp, [i-1 for i in args.p], maxpages=args.m, password=args.P,
                                caching=args.cache, check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        # Never close the interpreter's own stdout.
        if args.o is not sys.stdout:
            args.o.close()
示例8: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    """Command-line entry point of a pdf2txt-style converter.

    Args:
        argv: Full argument vector including the program name at index 0.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type); ``None`` after a completed conversion.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # -n is only recorded here and applied after the loop, so a layout
    # option following -n no longer fails with AttributeError on None.
    layout_analysis = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            # page numbers are 1-based on the command line
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Runs on the unknown-outtype return and on errors as well.
        if close_outfp:
            outfp.close()
示例9: pdf2txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdf2txt(argv):
    """Convert PDF files according to pdf2txt-style command-line options.

    Args:
        argv: Options followed by input file names. The whole sequence is
            handed to ``getopt``, so there is no leading program name.

    Returns:
        None after conversion. For an unknown output type the result of
        ``usage()`` is returned.

    Raises:
        getopt.GetoptError: On an unrecognised option (not caught here).
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # -n is applied after the loop so that a later layout option does not
    # dereference None.
    layout_analysis = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based on the command line
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # NOTE(review): usage() is not defined in this block; presumably
            # a module-level helper — confirm.
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close a file this function opened; leave sys.stdout usable.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例10: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
    """Command-line entry point of a pdf2txt-style converter.

    Args:
        argv: Full argument vector including the program name at index 0.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type); ``None`` after a completed conversion.
    """
    # getopt retrieves the options supplied on the command line; see
    # http://www.16kan.com/post/207647.html for details.
    import getopt

    def usage():
        # Prints an invocation example; called when arguments are missing
        # or malformed.
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument_list, "short options", [long options]):
        # a colon after a short option means it requires a value, so
        # p, m, P, o, M, L, W, F, Y, O, t, c and s all take one. An equals
        # sign after a long option means the same. Returns (opts, args).
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''          # option -P
    pagenos = set()        # option -p
    maxpages = 0           # option -m
    # output option
    outfile = None         # option -o: output file
    outtype = None         # option -t: output type
    outdir = None          # option -O: output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'        # option -c
    scale = 1              # option -s
    caching = True
    # -n is applied after the loop so that a later layout option does not
    # dereference None.
    layout_analysis = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based on the command line
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:  # determine the output format
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            # TextConverter is not given an outdir argument here.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close a file this function opened; leave sys.stdout usable.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例11: pdfminerr
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def pdfminerr(argv):
    """Convert PDF files according to pdf2txt-style command-line options.

    Args:
        argv: Full argument vector including the program name at index 0.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type); ``None`` after a completed conversion.
    """
    import getopt

    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # -n is applied after the loop so that a later layout option does not
    # dereference None.
    layout_analysis = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based on the command line
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # open() replaces the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close a file this function opened; leave sys.stdout usable.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例12: convert_pdf_To_Txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def convert_pdf_To_Txt(path, opts=None):
    """Extract the text of the PDF at *path* and return it.

    Args:
        path: Path of the PDF file to read.
        opts: Optional pdf2txt-style options, either as a mapping
            ``{'-V': '', '-M': '2.0'}`` or as an iterable of
            ``(option, value)`` pairs such as ``getopt`` returns.

    Returns:
        The contents of the in-memory buffer the converter wrote to.

    Raises:
        ValueError: If the selected output type is not ``'text'``, the only
            type for which this function builds a converter.

    The ``-o`` option is consulted only to infer the output type from its
    extension; nothing is written to that file.
    """
    if opts is None:
        # None replaces the former shared mutable ``{}`` default.
        opts = ()
    elif hasattr(opts, 'items'):
        # Iterating a dict directly yields its keys, which were then
        # unpacked character by character; iterate the pairs instead.
        opts = opts.items()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    codec = 'utf-8'
    caching = True
    # -n is applied after the loop so that a later layout option does not
    # dereference None.
    layout_analysis = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # page numbers are 1-based in the option value
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            # accepted for option compatibility; not used for text output
            float  # no-op placeholder keeps the branch explicit
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            # parsed for validation, as before; not used for text output
            float(v)
    if not layout_analysis:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Previously this path failed later with an unbound `device`.
        raise ValueError('unsupported output type: %r' % (outtype,))

    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        # open() replaces the Python-2-only file() builtin; the input is
        # now closed once all pages have been processed.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
示例13: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(fname, k, v):
# Build a pdfminer converter for the PDF file `fname`, configured by a single
# option: `k` is the option flag (e.g. '-V') and `v` its value.
# Only one (k, v) pair is examined; there is no loop over several options.
# The remainder of the function is omitted in this excerpt.
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
imagewriter = None
rotation = 0
stripcontrol = False
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
# Apply the single option to the defaults above.
if k == '-d':
debug += 1
elif k == '-p':
# page numbers are given 1-based and stored 0-based
pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m':
maxpages = int(v)
elif k == '-P':
password = v
elif k == '-o':
outfile = v
elif k == '-C':
caching = False
elif k == '-n':
# disables layout analysis: the converters receive laparams=None
laparams = None
elif k == '-A':
laparams.all_texts = True
elif k == '-V':
laparams.detect_vertical = True
elif k == '-M':
laparams.char_margin = float(v)
elif k == '-L':
laparams.line_margin = float(v)
elif k == '-W':
laparams.word_margin = float(v)
elif k == '-F':
laparams.boxes_flow = float(v)
elif k == '-Y':
layoutmode = v
elif k == '-O':
imagewriter = ImageWriter(v)
elif k == '-R':
rotation = int(v)
elif k == '-S':
stripcontrol = True
elif k == '-t':
outtype = v
elif k == '-c':
codec = v
elif k == '-s':
scale = float(v)
# Propagate the debug level to the pdfminer classes.
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
# Without an explicit -t, default to text unless the output file name's
# extension selects html, xml or tag.
if not outtype:
outtype = 'text'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
# Pick the converter that matches the output type.
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter,
stripcontrol=stripcontrol)
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter, debug=debug)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else:
# NOTE(review): usage() is not defined in the visible part of this function;
# presumably a module-level helper — confirm.
return usage()
fp = file(fname, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
#.........part of the code is omitted here.........
示例14: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
# Command-line entry point of a batch PDF converter (Python 2 syntax).
# argv[0] is the program name; each remaining non-option argument is a file
# name or wildcard pattern that is expanded with glob.
# Returns 100 after printing the usage text.
# The remainder of the function is omitted in this excerpt.
import getopt
def usage():
print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
# the output type defaults to 'tag' here
outtype = 'tag'
imagewriter = None
rotation = 0
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = False
laparams = LAParams()
# NOTE(review): '-n' sets laparams to None, so a layout option (-A, -V, -M,
# -L, -W, -F) appearing after '-n' would fail with AttributeError.
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-C': caching = False
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-R': rotation = int(v)
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
# Propagate the debug level to the pdfminer classes.
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
# Only reached when -t supplied an empty value, since outtype starts as 'tag'.
if not outtype:
outtype = 'tag'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
# NOTE(review): this handle is rebound for every PDF in the loop below; whether
# the first one is ever closed is not visible in this excerpt.
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
for fname in args:
# expand the wildcard pattern into matching file names
l = glob.glob(fname)
count = len(l)
print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
for pdf in l:
# print pdf
# map the output type to a file extension
d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
ext = '.' + d[outtype]
# the output name is the input name with its last four characters replaced
outfile = pdf[0:-4] + ext
print outfile
outfp = file(outfile, 'wb')
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
#.........part of the code is omitted here.........
示例15: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import detect_vertical [as 别名]
def main(argv):
# Command-line entry point of a pdf2txt-style converter that adds three
# flags: -r (sets roundCoords), -S (sets simplifyOutput), -f (sets formatOutput).
# argv[0] is the program name. Returns 100 after printing the usage text.
# The remainder of the function is omitted in this excerpt.
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
'[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
'[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
'[-S] [-f] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
imagewriter = None
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
roundCoords = False
simplifyOutput = False
formatOutput = False
laparams = LAParams()
# NOTE(review): '-n' sets laparams to None, so a layout option (-A, -V, -M,
# -L, -W, -F) appearing after '-n' would fail with AttributeError.
for (k, v) in opts:
if k == '-d':
debug += 1
elif k == '-p':
# page numbers are given 1-based and stored 0-based
pagenos.update(int(x) - 1 for x in v.split(','))
elif k == '-m':
maxpages = int(v)
elif k == '-P':
password = v
elif k == '-o':
outfile = v
elif k == '-C':
caching = False
elif k == '-n':
laparams = None
elif k == '-A':
laparams.all_texts = True
elif k == '-V':
laparams.detect_vertical = True
elif k == '-M':
laparams.char_margin = float(v)
elif k == '-L':
laparams.line_margin = float(v)
elif k == '-W':
laparams.word_margin = float(v)
elif k == '-F':
laparams.boxes_flow = float(v)
elif k == '-Y':
layoutmode = v
elif k == '-O':
imagewriter = ImageWriter(v)
elif k == '-t':
outtype = v
elif k == '-c':
codec = v
elif k == '-s':
scale = float(v)
elif k == '-r':
roundCoords = True
elif k == '-S':
simplifyOutput = True
elif k == '-f':
formatOutput = True
# Propagate the debug level to the pdfminer classes.
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
# Without an explicit -t, default to text unless the output file name's
# extension selects html, xml or tag.
if not outtype:
outtype = 'text'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
# 'ml' matches both the 'html' and 'xml' output types
if formatOutput and outtype.endswith('ml'):
try:
from cStringIO import StringIO
except ImportError:
#.........part of the code is omitted here.........