本文整理汇总了Python中pdfminer.layout.LAParams.boxes_flow属性的典型用法代码示例。如果您正苦于以下问题：Python LAParams.boxes_flow属性的具体用法？Python LAParams.boxes_flow怎么用？Python LAParams.boxes_flow使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类pdfminer.layout.LAParams的用法示例。
在下文中一共展示了LAParams.boxes_flow属性的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: GetScript
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def GetScript(filename):
    """Read a PDF script and hand each of its text blocks to ``ParseLine``.

    The first page is skipped. For every later page the layout elements that
    carry non-blank text are wrapped in ``TextBlock`` objects, ordered from
    the top of the page to the bottom, and passed to ``ParseLine`` one by one.

    Args:
        filename: Path of the PDF file to read. It is also stored in the
            module-level ``scriptName``.

    Returns:
        None. When the document does not allow text extraction a notice is
        printed and nothing is parsed.
    """
    import sys

    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # ``with`` releases the file handle on every exit path, including the
    # early return below and any exception raised while parsing.
    with open(filename, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            print("---Not translatable---")
            return
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # NOTE(review): pdf2txt-style help text describes boxes_flow as a
        # value between -1.0 and +1.0; 2 lies outside that range -- confirm
        # that this is intentional.
        laparams.boxes_flow = 2
        # Create a PDF page aggregator object and its interpreter.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for pgnum, page in enumerate(PDFPage.create_pages(document)):
            if pgnum == 0:
                continue
            interpreter.process_page(page)
            # Receive the layout object for the page.
            layout = device.get_result()
            text = []
            for element in layout:
                try:
                    content = element.get_text().strip()
                    if content:
                        text.append(TextBlock(element.x0, element.y1, content))
                except Exception:
                    # Best effort: elements that cannot provide text (or
                    # cannot be wrapped) are skipped instead of aborting the
                    # whole script.
                    continue
            # NOTE(review): the indentation of this snippet was lost; the
            # progress dot is emitted once per processed page here -- confirm.
            sys.stdout.write(". ")
            # Highest y first, i.e. largest ``row.y`` comes first.
            text.sort(key=lambda row: -row.y)
            # Parse all of the "line" objects in each page.
            for line in text:
                ParseLine(line.text, line.x)
示例2: from_pdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def from_pdf(pdfFile):
    """Extract the text of a PDF file with a custom layout configuration.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        The contents of the in-memory output buffer once every page has been
        processed, or None when any step fails. Failures are not raised: the
        exception and its traceback are printed instead.
    """
    try:
        pagenos = set()
        strfp = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.line_margin = 20
        laparams.boxes_flow = -1
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, strfp, codec=codec, laparams=laparams)
        try:
            # ``with`` closes the input file even when a page fails to parse.
            with open(pdfFile, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, check_extractable=True):
                    interpreter.process_page(page)
            # Read the buffer before the device is closed.
            extracted = strfp.getvalue()
        finally:
            device.close()
        return extracted
    except Exception as e:
        print(e)
        traceback.print_exc()
        return None
示例3: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
    """Convert PDF files to text, HTML, XML or tagged output (pdf2txt style).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; the remaining items are options followed by
            one or more input file names.

    Returns:
        100 when the command line is invalid (the usage message is printed),
        otherwise None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # ``-n`` is recorded in a flag and applied after the loop, so a layout
    # option that follows ``-n`` no longer fails on a ``None`` object.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown output type before the output file is created or
    # truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; sys.stdout stays usable.
        if outfile:
            outfp.close()
    return
示例4: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
import getopt
def usage():
print(
"usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
"[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
"[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
)
return 100
try:
(opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
except getopt.GetoptError:
return usage()
if not args:
return usage()
# debug option
debug = 0
# input option
password = ""
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
outdir = None
layoutmode = "normal"
codec = "utf-8"
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == "-d":
debug += 1
elif k == "-p":
pagenos.update(int(x) - 1 for x in v.split(","))
elif k == "-m":
maxpages = int(v)
elif k == "-P":
password = v
elif k == "-o":
outfile = v
elif k == "-C":
caching = False
elif k == "-n":
laparams = None
elif k == "-A":
laparams.all_texts = True
elif k == "-V":
laparams.detect_vertical = True
elif k == "-M":
laparams.char_margin = float(v)
elif k == "-L":
laparams.line_margin = float(v)
elif k == "-W":
laparams.word_margin = float(v)
elif k == "-F":
laparams.boxes_flow = float(v)
elif k == "-Y":
layoutmode = v
elif k == "-O":
outdir = v
elif k == "-t":
outtype = v
elif k == "-c":
codec = v
elif k == "-s":
scale = float(v)
#
# PDFDocument.debug = debug
# PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = "text"
if outfile:
if outfile.endswith(".htm") or outfile.endswith(".html"):
outtype = "html"
elif outfile.endswith(".xml"):
outtype = "xml"
elif outfile.endswith(".tag"):
outtype = "tag"
if outfile:
outfp = file(outfile, "w")
else:
outfp = sys.stdout
if outtype == "text":
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
elif outtype == "xml":
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == "html":
device = HTMLConverter(
rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, outdir=outdir
#.........这里部分代码省略.........
示例5: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
    """Convert PDF files to text, HTML, XML or tagged output (pdf2txt style).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; the remaining items are options followed by
            one or more input file names.

    Returns:
        100 when the command line is invalid (the usage message is printed),
        otherwise None.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # ``-n`` is recorded in a flag and applied after the loop, so a layout
    # option that follows ``-n`` no longer fails on a ``None`` object.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None
    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown output type before the output file is created or
    # truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                # ``with`` closes each input even when its conversion fails.
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
示例6: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv=None):
parser = argparse.ArgumentParser(description='Convert PDF into text.')
parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=sys.stdin, help='file(s) to convert')
parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
parser.add_argument('-P', metavar='password', default='', help='pdf password')
parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
help='output file name (default: stdout)')
parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
lagroup = parser.add_argument_group(title='layout analysis')
lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
lagroup.add_argument('-V', action='store_true', help='detect vertical text')
lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
args = parser.parse_args(argv)
logging.basicConfig()
logging.getLogger('pdfminer').setLevel(args.l.upper())
laparams = LAParams()
if args.n:
laparams = None
else:
laparams.all_texts = args.A
laparams.detect_vertical = args.V
if args.M:
laparams.char_margin = args.M
if args.L:
laparams.line_margin = args.L
if args.W:
laparams.word_margin = args.W
if args.F:
laparams.boxes_flow = args.F
rsrcmgr = PDFResourceManager(caching=args.cache)
outtype = args.t
if not outtype:
if args.o:
if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
outtype = 'html'
elif args.o.name.endswith('.xml'):
outtype = 'xml'
elif args.o.name.endswith('.tag'):
outtype = 'tag'
if outtype == 'xml':
device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
laparams=laparams, imagewriter=args.O)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, args.o, codec=args.c)
else:
device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
for fp in args.file:
process_pdf(rsrcmgr, device, fp, [i-1 for i in args.p], maxpages=args.m, password=args.P,
caching=args.cache, check_extractable=True)
fp.close()
device.close()
if args.o is not sys.stdout:
args.o.close()
示例7: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
    """Convert PDF files to text, HTML, XML or tagged output (pdf2txt style).

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; the remaining items are options followed by
            one or more input file names.

    Returns:
        100 when the command line is invalid (the usage message is printed),
        otherwise None.
    """
    # getopt parses the options that were supplied on the command line.
    import getopt

    def usage():
        # Printed when the arguments are missing or malformed, to show how
        # the script is meant to be called.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, "short options", [long options]).
        # A colon after a short option means the option requires a value:
        # p, m, P, o, M, L, W, F, Y, O, t, c and s all take one.
        # (For long options an equals sign plays the same role.)
        # The call returns the parsed options and the remaining arguments.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''          # option -P
    pagenos = set()        # option -p
    maxpages = 0           # option -m
    # output option
    outfile = None         # option -o: output file
    outtype = None         # option -t: output type
    outdir = None          # option -O: output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'        # option -c
    scale = 1              # option -s
    caching = True
    laparams = LAParams()
    # ``-n`` is recorded in a flag and applied after the loop, so a layout
    # option that follows ``-n`` no longer fails on a ``None`` object.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Decide the output format: explicit -t wins, otherwise the extension of
    # the output file, otherwise plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown output type before the output file is created or
    # truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            # NOTE(review): the original author observed that TextConverter
            # does not seem to accept an outdir argument -- confirm.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                # ``with`` closes each input even when its conversion fails.
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; sys.stdout stays usable.
        if outfile:
            outfp.close()
    return
示例8: convert_pdf_To_Txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def convert_pdf_To_Txt(path,opts={}):
    """Extract the text of the PDF at *path* and return it.

    The option handling mirrors the pdf2txt command line, but the result is
    always collected in memory and returned rather than written to a file.

    Args:
        path: Path of the PDF file to read.
        opts: pdf2txt-style options, either a mapping such as ``{'-M': '3'}``
            or an iterable of ``(flag, value)`` pairs. The default is never
            mutated.

    Returns:
        The contents of the in-memory output buffer after all selected pages
        have been processed.

    Raises:
        ValueError: If ``-t`` or the extension given with ``-o`` selects an
            output type other than text.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # ``-n`` is recorded in a flag and applied after the loop, so a layout
    # option that follows ``-n`` no longer fails on a ``None`` object.
    use_layout = True
    # Iterating a mapping directly yields its keys only, so take the
    # (flag, value) pairs explicitly when a mapping was passed.
    option_pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in option_pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            # Accepted for pdf2txt compatibility; has no effect on text output.
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            # Accepted for pdf2txt compatibility; has no effect on text output.
            scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Only a text converter is ever built; fail with a clear message
        # instead of an unbound ``device`` further down.
        raise ValueError('convert_pdf_To_Txt only produces text, not %r' % (outtype,))
    # The output file named by ``-o`` is deliberately not opened: the text is
    # returned to the caller, nothing is written to that file.
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before the device is closed.
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
    return txt2Pdf
示例9: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(fname, k, v):
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
imagewriter = None
rotation = 0
stripcontrol = False
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
if k == '-d':
debug += 1
elif k == '-p':
pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m':
maxpages = int(v)
elif k == '-P':
password = v
elif k == '-o':
outfile = v
elif k == '-C':
caching = False
elif k == '-n':
laparams = None
elif k == '-A':
laparams.all_texts = True
elif k == '-V':
laparams.detect_vertical = True
elif k == '-M':
laparams.char_margin = float(v)
elif k == '-L':
laparams.line_margin = float(v)
elif k == '-W':
laparams.word_margin = float(v)
elif k == '-F':
laparams.boxes_flow = float(v)
elif k == '-Y':
layoutmode = v
elif k == '-O':
imagewriter = ImageWriter(v)
elif k == '-R':
rotation = int(v)
elif k == '-S':
stripcontrol = True
elif k == '-t':
outtype = v
elif k == '-c':
codec = v
elif k == '-s':
scale = float(v)
#
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'text'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter,
stripcontrol=stripcontrol)
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter, debug=debug)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else:
return usage()
fp = file(fname, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
#.........这里部分代码省略.........
示例10: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
import getopt
def usage():
print(
"usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
" [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
" [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
" [-t text|html|xml|tag] [-c codec] [-s scale]"
" file ..." % argv[0]
)
return 100
try:
(opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
except getopt.GetoptError:
return usage()
if not args:
return usage()
# input option
password = b""
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
imagewriter = None
rotation = 0
stripcontrol = False
layoutmode = "normal"
codec = "utf-8"
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == "-d":
logging.getLogger().setLevel(logging.DEBUG)
elif k == "-p":
pagenos.update(int(x) - 1 for x in v.split(","))
elif k == "-m":
maxpages = int(v)
elif k == "-P":
password = v
elif k == "-o":
outfile = v
elif k == "-C":
caching = False
elif k == "-n":
laparams = None
elif k == "-A":
laparams.all_texts = True
elif k == "-V":
laparams.detect_vertical = True
elif k == "-M":
laparams.char_margin = float(v)
elif k == "-L":
laparams.line_margin = float(v)
elif k == "-W":
laparams.word_margin = float(v)
elif k == "-F":
laparams.boxes_flow = float(v)
elif k == "-Y":
layoutmode = v
elif k == "-O":
imagewriter = ImageWriter(v)
elif k == "-R":
rotation = int(v)
elif k == "-S":
stripcontrol = True
elif k == "-t":
outtype = v
elif k == "-c":
codec = v
elif k == "-s":
scale = float(v)
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = "text"
if outfile:
if outfile.endswith(".htm") or outfile.endswith(".html"):
outtype = "html"
elif outfile.endswith(".xml"):
outtype = "xml"
elif outfile.endswith(".tag"):
outtype = "tag"
if outfile:
outfp = open(outfile, "wb")
else:
outfp = sys.stdout
if outfp.encoding is not None:
codec = None
if outtype == "text":
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
elif outtype == "xml":
device = XMLConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter, stripcontrol=stripcontrol
)
#.........这里部分代码省略.........
示例11: readPDF2HTML
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def readPDF2HTML(pdfFile, opts={}):
    """Render a PDF, given as an open file object, to HTML markup.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF content.
        opts: pdf2txt-style options, either a mapping such as ``{'-M': '3'}``
            or an iterable of ``(flag, value)`` pairs. Honoured flags are
            ``-p``, ``-m``, ``-P``, ``-n``, ``-A``, ``-V``, ``-M``, ``-L``,
            ``-W``, ``-F``, ``-Y``, ``-c`` and ``-s``; ``-d``, ``-o``, ``-O``
            and ``-t`` are accepted and ignored. The default is never mutated.

    Returns:
        The contents of the in-memory output buffer after all selected pages
        have been processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Defaults are set before the options are read, so every flag has a
    # variable to update and no later assignment overrides the caller.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # ``-n`` is recorded in a flag and applied after the loop, so a layout
    # option that follows ``-n`` no longer fails on a ``None`` object.
    use_layout = True
    # Iterating a mapping directly yields its keys only, so take the
    # (flag, value) pairs explicitly when a mapping was passed.
    option_pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in option_pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    # open a PDF file
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # create a PDF parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDF document object, supplying the password if one was given
        document = PDFDocument(parser, password)
        # check if the document allows text extraction
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # create a PDF resource manager object that stores shared resources
        rsrcmgr = PDFResourceManager()
        # create a PDF device object
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        try:
            # create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # process each selected page contained in the document
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password):
                interpreter.process_page(page)
            # NOTE(review): the markup is read before the device is closed;
            # anything the converter writes while closing is not part of the
            # returned content -- confirm this is intended.
            content = retstr.getvalue()
        finally:
            device.close()
        return content
    finally:
        # close streams on every exit path
        fp.close()
        retstr.close()
示例12: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
#.........这里部分代码省略.........
parser.add_argument('-n', '--no-layout', dest='layout', action='store_false',
default=True,
help='Suppress layout analysis.')
parser.add_argument('--show-pageno', dest='show_pageno', action='store_true',
default=False,
help='Show page numbers.')
parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true',
default=False,
help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true',
default=False,
help='Allows vertical writing detection.')
parser.add_argument('-M', dest='char_margin', action='store',
type=float,
default=2.0,
help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
parser.add_argument('-L', dest='line_margin', action='store',
type=float,
default=0.5,
help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
parser.add_argument('-W', dest='word_margin', action='store',
type=float,
default=0.1,
help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
parser.add_argument('-F', dest='boxes_flow', action='store',
type=float,
default=0.5,
help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store',
type=str,
default='normal',
choices = ['exact', 'normal', 'loose'],
help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store',
type=str,
default=None,
help='imagewriter')
parser.add_argument('-R', '--rotation', dest='rotation', action='store',
type=int,
default=0,
help='rotation')
parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true',
default=False,
help='stripcontrol')
parser.add_argument('-s', dest='scale', action='store',
type=float,
default=1,
help='Specifies the output scale. Can be used in HTML format only.')
parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
示例13: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def main(argv):
import getopt
def usage():
print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = 'tag'
imagewriter = None
rotation = 0
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = False
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-C': caching = False
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-R': rotation = int(v)
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'tag'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
for fname in args:
l = glob.glob(fname)
count = len(l)
print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
for pdf in l:
# print pdf
d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
ext = '.' + d[outtype]
outfile = pdf[0:-4] + ext
print outfile
outfp = file(outfile, 'wb')
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
#.........这里部分代码省略.........
示例14: pdfminerr
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def pdfminerr(argv):
    """Convert the PDF files named in ``argv`` with a pdfminer converter.

    ``argv`` is parsed like a command line: ``argv[0]`` is skipped and the
    rest is split into options and input file names with ``getopt``.

    Options handled here:
        -d            increase the debug level (repeatable)
        -p pagenos    comma-separated 1-based page numbers to process
        -m maxpages   maximum number of pages
        -P password   document password
        -o outfile    output file (standard output when omitted)
        -C            disable resource caching
        -n            disable layout analysis (``laparams`` becomes None)
        -A / -V       set ``all_texts`` / ``detect_vertical`` on the LAParams
        -M / -L / -W / -F
                      set ``char_margin`` / ``line_margin`` /
                      ``word_margin`` / ``boxes_flow`` on the LAParams
        -Y layoutmode layout mode handed to the HTML converter
        -O dir        directory handed to ``ImageWriter``
        -t outtype    one of ``text``, ``xml``, ``html``, ``tag``
        -c codec      output codec
        -s scale      scale handed to the HTML converter

    Returns:
        100 when the usage message was printed (bad option, no input file,
        or unknown output type); None after a normal run.
    """
    import getopt

    def usage():
        print("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is only recorded here and applied after the loop, so a layout
    # option that follows it no longer dereferences None.
    no_layout = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            no_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if no_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Default to text unless the output file extension says otherwise.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Release everything even when conversion fails part-way.
        if device is not None:
            device.close()
        if outfile:
            outfp.close()
        else:
            # Never close the interpreter's stdout; just push the output out.
            outfp.flush()
    return
示例15: pdf2txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import boxes_flow [as 别名]
def pdf2txt(argv):
    """Convert the PDF files named in ``argv`` with a pdfminer converter.

    The whole of ``argv`` is handed to ``getopt`` (``argv[0:]``), so it is
    expected to start with the options themselves rather than a program
    name -- NOTE(review): confirm against the caller.

    Options handled here:
        -d            increase the debug level (repeatable)
        -p pagenos    comma-separated 1-based page numbers to process
        -m maxpages   maximum number of pages
        -P password   document password
        -o outfile    output file (standard output when omitted)
        -C            disable resource caching
        -n            disable layout analysis (``laparams`` becomes None)
        -A / -V       set ``all_texts`` / ``detect_vertical`` on the LAParams
        -M / -L / -W / -F
                      set ``char_margin`` / ``line_margin`` /
                      ``word_margin`` / ``boxes_flow`` on the LAParams
        -Y layoutmode layout mode handed to the HTML converter
        -O outdir     ``outdir`` handed to the XML and HTML converters
        -t outtype    one of ``text``, ``xml``, ``html``, ``tag``
        -c codec      output codec
        -s scale      scale handed to the HTML converter

    Returns:
        100 when the output type is not recognised (a usage message is
        printed); None after a normal run.

    Raises:
        getopt.GetoptError: if ``argv`` contains an unknown option or an
            option is missing its argument.
    """
    import getopt

    def usage():
        # The original body called usage() without defining it anywhere in
        # this function, so the unknown-output-type branch raised NameError.
        print('usage: pdf2txt [-d] [-p pagenos] [-m maxpages] [-P password] '
              '[-o output] [-C] [-n] [-A] [-V] [-M char_margin] '
              '[-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] '
              '[-c codec] [-s scale] file ...')
        return 100

    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is only recorded here and applied after the loop, so a layout
    # option that follows it no longer dereferences None.
    no_layout = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            no_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if no_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Default to text unless the output file extension says otherwise.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        for fname in args:
            fp = open(fname, 'rb')
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Release everything even when conversion fails part-way.
        if device is not None:
            device.close()
        if outfile:
            outfp.close()
        else:
            # Never close the interpreter's stdout; just push the output out.
            outfp.flush()
    return