本文整理汇总了Python中pdfminer.layout.LAParams.char_margin方法的典型用法代码示例。如果您正苦于以下问题:Python LAParams.char_margin方法的具体用法?Python LAParams.char_margin怎么用?Python LAParams.char_margin使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.layout.LAParams的用法示例。
在下文中一共展示了LAParams.char_margin方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_result_from_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def get_result_from_file(filename):
    """Run pdfminer layout analysis on a PDF and collect per-page results.

    Args:
        filename: Path of the PDF file to analyse.

    Returns:
        A dict ``{"filename": filename, "pages": [...]}`` where each page
        entry holds its zero-based ``index``, the ``bounding_box`` returned
        by ``get_bounding_box(layout)`` and the ``labels`` returned by
        ``get_text_labels(layout)``.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager guarantees the file is closed even when the
    # document is rejected or page processing raises.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
示例2: pdf2xml
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle.

    Uses pdfminer to do the conversion, then strips every newline from the
    produced XML. ``infile`` is always closed before returning, including
    when the conversion raises.
    '''
    try:
        outfile = StringIO()
        # Empirically determined...
        laparams = LAParams()
        laparams.char_margin = 0.4
        # See pdf2txt.py
        rsrcmgr = PDFResourceManager(caching=False)
        device = XMLConverter(rsrcmgr, outfile, codec='utf-8', laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `page_api` is a flag defined outside this function that selects
        # between the PDFPage-based API and the older process_pdf() helper.
        if page_api:
            for page in PDFPage.get_pages(infile, set()):
                interpreter.process_page(page)
        else:
            process_pdf(rsrcmgr, device, infile, set())
    finally:
        # Previously the handle was only closed on the success path.
        infile.close()
    return outfile.getvalue().replace("\n", "")
示例3: parse
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def parse(self, path):
    """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

    Args:
        path: Path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is everything the
        ``TextConverter`` wrote for all pages of the document.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    if os.path.isdir(path):
        raise NotImplementedError()

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0

    out = StringIO.StringIO()
    try:
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
        try:
            # PDFs are binary: open in 'rb' (the original used the
            # Python-2-only file() in text mode and never closed it).
            with open(path, 'rb') as fp:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
        finally:
            device.close()
        return Sample(path, None, out.getvalue())
    finally:
        out.close()
示例4: parse_pdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def parse_pdf(self, test_parse=False):
    """
    Convert ``self.source_file`` to text and load the lines into
    ``self.file_array``.

    The text is first written to ``self.text_file`` and then read back line
    by line. The last line is dropped (the original author notes it is
    always '\\x0c'). The intermediate text file is deleted unless
    ``test_parse`` is true. Nothing is returned.

    Raises:
        DTPOFileError: If either file cannot be opened or the PDF cannot be
            converted.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

    # input options
    pagenos = set()
    maxpages = 0
    # output option
    codec = "utf-8"
    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0

    rsrcmgr = PDFResourceManager(caching=caching)

    try:
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))

    # The `with` blocks close both files on every path; previously any
    # failure after the first open leaked the handles.
    with outfp:
        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            raise DTPOFileError(self.source_file, 0, str(io_error))

        with fp:
            try:
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
                try:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                caching=caching, check_extractable=True)
                finally:
                    device.close()
            except PDFException as pdf_error:
                message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                raise DTPOFileError(self.source_file, 0, message)
            except Exception as exception:
                message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                raise DTPOFileError(self.source_file, 0, message)

    # Got the PDF converted = now get it into an array
    with open(self.text_file) as text_fp:
        self.file_array = list(text_fp)

    # Remove the last entry - it's always '\x0c'
    if self.file_array:
        del self.file_array[-1]

    # Remove the outfile
    if not test_parse:
        os.remove(self.text_file)
示例5: convert_to_text_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """
    Convert a PDF (BORME format, per the original author) to a text file.

    Args:
        filename_in: Path of the PDF to read.
        filename_out: Path of the text file to write. If it is an existing
            directory, the output is written inside it using the base name
            of ``filename_in``.
        rewrite: When false (default), an existing output file is left
            untouched and the conversion is skipped.

    Returns:
        ``False`` if the conversion was skipped because the output already
        exists, ``True`` once the file has been converted.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # conf
    codec = 'utf-8'
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0
    caching = True

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first: if it is missing we must not leave behind an
    # empty output file, which would make later runs skip this document.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): the original referenced
            # https://github.com/euske/pdfminer/issues/72 (page crop box)
            # here without acting on it - confirm whether it is still needed.
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
示例6: readpdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def readpdf(pdfFile):
    """Read a PDF and return the rows collected by the layout device.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        A pandas DataFrame built from ``device.rows`` with the columns
        ``Page, x, y, c1, c2, String``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Too small and it splits the description, too big and
    # Quantity-Unit-Part number are not separated: 1.1 seems to work.
    laparams.char_margin = 1.1
    laparams.line_margin = 0.8
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Close the file once every page has been processed (it used to leak).
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # doc.initialize('password')  # leave empty for no password
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            device.get_result()

    return pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1', 'c2', 'String'])
示例7: read_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def read_file(self):
    """Load the PDF at ``self.path`` and store its text in ``self.content``.

    The text of every top-level text box or text line on each page is
    collected and joined with single spaces.
    """
    layout_params = LAParams()
    layout_params.char_margin = 0.1
    layout_params.word_margin = 1.0
    text_types = (LTTextBox, LTTextLine)

    with open(self.path, 'rb') as handle:
        pdf_parser = PDFParser(handle)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        pieces = []
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
            pieces.extend(
                item.get_text()
                for item in aggregator.get_result()
                if isinstance(item, text_types)
            )
        self.content = ' '.join(pieces)
示例8: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv=None):
    """Command-line entry point: convert PDF files into text, HTML, XML or tags.

    Args:
        argv: Argument list handed to ``argparse`` (``None`` makes argparse
            read ``sys.argv[1:]``).

    When ``-t`` is not given, the output type is guessed from the output
    file extension (``.htm``/``.html``, ``.xml``, ``.tag``); anything else
    is rendered as plain text.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=sys.stdin, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0 / 0.0 on the command
        # line is honoured instead of being silently ignored.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)

    # With no positional argument argparse hands back the bare default
    # (sys.stdin) rather than a list; wrap it so the loop below does not
    # iterate over the *lines* of stdin.
    files = args.file if isinstance(args.file, list) else [args.file]
    pagenos = [i - 1 for i in args.p]  # command line is 1-based

    try:
        for fp in files:
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
            finally:
                fp.close()
    finally:
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
示例9: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            remaining items are options followed by one or more PDF paths.

    Returns:
        ``100`` (after printing the usage line) when the options are
        malformed, no input file is given or the output type is unknown;
        ``None`` after a successful conversion.
    """
    import getopt  # parses the options that follow the program name

    def usage():
        # Shown whenever the command line is missing arguments or malformed.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(args, shortopts): a colon after a short option letter
        # means the option requires a value (p, m, P, o, M, L, W, F, Y, O,
        # t, c, s here). Returns the parsed (option, value) pairs and the
        # remaining positional arguments.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''         # -P
    pagenos = set()       # -p
    maxpages = 0          # -m
    # output options
    outfile = None        # -o output file
    outtype = None        # -t output type
    outdir = None         # -O output directory
    layoutmode = 'normal' # -Y
    codec = 'utf-8'       # -c
    scale = 1             # -s
    caching = True        # -C disables it
    laparams = LAParams()
    # -n is recorded in a flag and applied after the loop: assigning None
    # immediately made a later -A/-V/-M/-L/-W/-F crash on `None.<attr>`.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_analysis = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not layout_analysis:
        laparams = None

    # Work out the output format: explicit -t wins, otherwise guess from
    # the output file extension, defaulting to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            # TextConverter is not given outdir here (the original author
            # notes it does not seem to accept one).
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what we opened; leave sys.stdout usable for the caller.
        if outfile:
            outfp.close()
    return
示例10: open
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168846.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168968.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000510-BX425914.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb')
#fp = open(r'S:/Bhavani/Aaron/POxca-000052-R201631.pdf', 'rb')
# Script: read one hard-coded purchase-order PDF and collect the rows
# gathered by PDFPageDetailedAggregator into a DataFrame.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
# Too small and it splits the description, too big and
# Quantity-Unit-Part number are not separated.
# NOTE(review): the original comment said "1.4 seems to work" while the
# code uses 1.1 - confirm which value is intended.
laparams.char_margin = 1.1
laparams.line_margin = 0.8
device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# `with` closes the input once all pages are processed (it used to leak).
with open(r'C:/Users/ashmaro1/Documents/_Projects/Glencore/POxca-000052-R201631.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # doc.initialize('password')  # leave empty for no password
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for this page
        device.get_result()

print(device.rows)
df = pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1', 'c2', 'String'])
# create text rows from 'y' coordinate data
示例11: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            remaining items are options followed by one or more PDF paths.

    Returns:
        ``100`` (after printing the usage line) when the options are
        malformed, no input file is given or the output type is unknown;
        ``None`` after a successful conversion.
    """
    import getopt

    def usage():
        # Shown whenever the command line is missing arguments or malformed.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # -n is recorded in a flag and applied after the loop: assigning None
    # immediately made a later -A/-D/-M/-L/-W crash on `None.<attr>`.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': layout_analysis = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not layout_analysis:
        laparams = None

    # Explicit -t wins, otherwise guess from the output file extension,
    # defaulting to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrc = PDFResourceManager()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Only close what we opened; leave sys.stdout usable for the caller.
        if outfile:
            outfp.close()
    return
示例12: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv):
import getopt
def usage():
print 'Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:'
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = 'tag'
imagewriter = None
rotation = 0
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
caching = True
showpageno = False
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-C': caching = False
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-R': rotation = int(v)
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'tag'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
for fname in args:
l = glob.glob(fname)
count = len(l)
print 'Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format'
for pdf in l:
# print pdf
d = {'html' : 'htm', 'tag' : 'tag', 'text' : 'txt', 'xml' : 'xml'}
ext = '.' + d[outtype]
outfile = pdf[0:-4] + ext
print outfile
outfp = file(outfile, 'wb')
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
device.showpageno = False
#.........这里部分代码省略.........
示例13: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            remaining items are options followed by one or more PDF paths.

    Returns:
        ``100`` (after printing the usage line) when the options are
        malformed, no input file is given or the output type is unknown;
        ``None`` after a successful conversion.
    """
    def usage():
        # Shown whenever the command line is missing arguments or malformed.
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded in a flag and applied after the loop: assigning None
    # immediately made a later -A/-V/-M/-L/-W/-F crash on `None.<attr>`.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_analysis = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not layout_analysis:
        laparams = None

    # Explicit -t wins, otherwise guess from the output file extension,
    # defaulting to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Release the output file even when a conversion fails.
        if close_outfp:
            outfp.close()
示例14: pdf2txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tags from an option list.

    Args:
        argv: Options followed by one or more PDF paths. Unlike a raw
            ``sys.argv`` the whole list is parsed, so it must not start
            with the program name.

    Returns:
        ``None`` after converting every file; the result of ``usage()``
        when the output type is unknown.

    Raises:
        getopt.GetoptError: If an option is unknown or lacks its value
            (the error is not caught here).
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded in a flag and applied after the loop: assigning None
    # immediately made a later -A/-V/-M/-L/-W/-F crash on `None.<attr>`.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': layout_analysis = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not layout_analysis:
        laparams = None

    # Explicit -t wins, otherwise guess from the output file extension,
    # defaulting to plain text.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # NOTE(review): `usage` is not defined inside this function; it is
        # presumably a module-level helper - confirm it exists.
        return usage()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what we opened; leave sys.stdout usable for the caller.
        if outfile:
            outfp.close()
    return
示例15: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import char_margin [as 别名]
def main(argv):
import getopt
def usage():
print(
"usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
"[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
"[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
)
return 100
try:
(opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
except getopt.GetoptError:
return usage()
if not args:
return usage()
# debug option
debug = 0
# input option
password = ""
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
outdir = None
layoutmode = "normal"
codec = "utf-8"
pageno = 1
scale = 1
caching = True
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == "-d":
debug += 1
elif k == "-p":
pagenos.update(int(x) - 1 for x in v.split(","))
elif k == "-m":
maxpages = int(v)
elif k == "-P":
password = v
elif k == "-o":
outfile = v
elif k == "-C":
caching = False
elif k == "-n":
laparams = None
elif k == "-A":
laparams.all_texts = True
elif k == "-V":
laparams.detect_vertical = True
elif k == "-M":
laparams.char_margin = float(v)
elif k == "-L":
laparams.line_margin = float(v)
elif k == "-W":
laparams.word_margin = float(v)
elif k == "-F":
laparams.boxes_flow = float(v)
elif k == "-Y":
layoutmode = v
elif k == "-O":
outdir = v
elif k == "-t":
outtype = v
elif k == "-c":
codec = v
elif k == "-s":
scale = float(v)
#
# PDFDocument.debug = debug
# PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = "text"
if outfile:
if outfile.endswith(".htm") or outfile.endswith(".html"):
outtype = "html"
elif outfile.endswith(".xml"):
outtype = "xml"
elif outfile.endswith(".tag"):
outtype = "tag"
if outfile:
outfp = file(outfile, "w")
else:
outfp = sys.stdout
if outtype == "text":
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
elif outtype == "xml":
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == "html":
device = HTMLConverter(
rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, outdir=outdir
#.........这里部分代码省略.........