本文整理汇总了Python中pdfminer.layout.LAParams.line_margin属性的典型用法代码示例。如果您正苦于以下问题：Python LAParams.line_margin的具体用法？Python LAParams.line_margin怎么用？Python LAParams.line_margin使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类pdfminer.layout.LAParams的用法示例。
在下文中一共展示了LAParams.line_margin属性的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_result_from_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def get_result_from_file(filename):
    """Collect per-page layout information from a PDF file.

    Each page is run through a PDFPageAggregator and summarised with the
    module-level helpers ``get_bounding_box`` and ``get_text_labels``.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A dict ``{"filename": filename, "pages": [...]}`` where every page
        entry has the keys ``"index"`` (0-based page number),
        ``"bounding_box"`` and ``"labels"``.

    Raises:
        PDFTextExtractionNotAllowed: If the document is not extractable.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0

    result = {"filename": filename, "pages": []}
    # The context manager closes the file even when parsing fails or the
    # document turns out not to be extractable.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
示例2: parse
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def parse(self, path):
    """Extract the text of the PDF at ``path`` and wrap it in a Sample.

    Args:
        path: Path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is everything the
        TextConverter wrote for the document.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    if os.path.isdir(path):
        # Directory input is not supported.
        raise NotImplementedError()

    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0

    out = StringIO.StringIO()
    try:
        # PDFs are binary: open in 'rb' and let the context manager close
        # the handle even if parsing raises.
        with open(path, 'rb') as fp:
            rsrc = PDFResourceManager()
            device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
            doc = PDFDocument()
            parser = PDFParser(fp)
            # Parser and document have to be linked in both directions
            # before the document is initialised.
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
            device.close()
        sample = Sample(path, None, out.getvalue())
    finally:
        out.close()
    return sample
示例3: convert_to_text_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a BORME PDF file to a plain-text file.

    Args:
        filename_in: Path of the PDF file to read.
        filename_out: Path of the text file to write. If it is an existing
            directory, the output is written inside it under the base name
            of ``filename_in``.
        rewrite: When False (the default) an existing output file is left
            untouched and the conversion is skipped.

    Returns:
        True if the file was converted, False if it was skipped because the
        output already exists and ``rewrite`` is False.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # Layout analysis configuration.
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    # The input is opened first so a missing PDF does not leave an empty
    # output file behind; both handles are closed even when parsing fails.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams,
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # See https://github.com/euske/pdfminer/issues/72
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0, password='',
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
示例4: extractrefs
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def extractrefs(infile, outfile):
    """Run every page of the PDF ``infile`` through a RefsExtractor.

    The extractor is constructed with the handle of ``outfile`` (opened for
    writing) and layout parameters using ``line_margin = 1.4``.

    Args:
        infile: Path of the PDF file to read.
        outfile: Path of the file handed to the RefsExtractor for output.
    """
    laparams = LAParams()
    laparams.line_margin = 1.4
    rsrcmgr = PDFResourceManager()

    # Context managers guarantee both handles are closed even when a page
    # fails to process.
    with open(infile, 'rb') as infp, open(outfile, 'w') as outfp:
        device = RefsExtractor(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(infp, set(),
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
示例5: output_pdf_to_table
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def output_pdf_to_table(path):
    """Feed the layout of each page of the PDF at ``path`` to ``getRows``.

    Uses the module-level settings ``line_margin_threshold`` (layout line
    margin) and ``pages_to_view`` (passed as ``maxpages``).

    Args:
        path: Path of the PDF file to read.
    """
    laparams = LAParams()
    laparams.line_margin = line_margin_threshold
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF once all pages are processed (or
    # on error); the original left the handle open.
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=pages_to_view,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            getRows(layout)
示例6: readpdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def readpdf(pdfFile):
    """Read a PDF and return the rows collected by the aggregator as a DataFrame.

    Args:
        pdfFile: Path of the PDF file to read.

    Returns:
        A pandas DataFrame built from ``device.rows`` with the columns
        ``['Page', 'x', 'y', 'c1', 'c2', 'String']``.
    """
    laparams = LAParams()
    # Too small and it splits the description, too big and
    # Quantity-Unit-Part number are not separated: 1.1 seems to work.
    laparams.char_margin = 1.1
    laparams.line_margin = 0.8

    rsrcmgr = PDFResourceManager()
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the file when done; the original leaked it.
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # doc.initialize('password')  # leave empty for no password
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            device.get_result()

    df = pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1', 'c2', 'String'])
    return df
示例7: from_pdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def from_pdf(pdfFile):
    """Run every page of ``pdfFile`` through a TextConverter (best effort).

    Any exception is printed together with its traceback instead of being
    propagated, so the function always returns None.

    NOTE(review): the converted text accumulates in a local StringIO that the
    visible code never returns — confirm whether a ``return`` is missing.

    Args:
        pdfFile: Path of the PDF file to read.
    """
    fp = None
    try:
        pagenos = set()
        strfp = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.line_margin = 20
        laparams.boxes_flow = -1
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, strfp, codec=codec, laparams=laparams)
        fp = open(pdfFile, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, check_extractable=True):
            interpreter.process_page(page)
    except Exception as e:
        # Deliberate best effort: report the failure and carry on.
        print(e)
        traceback.print_exc()
    finally:
        # Release the file handle whether or not extraction succeeded.
        if fp is not None:
            fp.close()
示例8: open
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168968.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000510-BX425914.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb')
#fp = open(r'C:/Users/ashmaro1/Documents/GitHub/Text-Analysis/PDFnOCR/data/POxca-000078-AN168907.pdf', 'rb')
#fp = open(r'S:/Bhavani/Aaron/POxca-000052-R201631.pdf', 'rb')
# Layout analysis settings. char_margin is a trade-off: too small and the
# description gets split, too big and Quantity-Unit-Part number are no longer
# separated.
laparams = LAParams()
laparams.line_margin = float('0.8')
laparams.char_margin = float('1.1')

# Aggregating device plus the interpreter that drives it.
rsrcmgr = PDFResourceManager()
device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Open the purchase-order PDF and build its document object.
fp = open(r'C:/Users/ashmaro1/Documents/_Projects/Glencore/POxca-000052-R201631.pdf','rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
#doc.initialize('password') # leave empty for no password

for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)
    # Fetch the LTPage result for the page that was just processed.
    device.get_result()

print(device.rows)
df = pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1','c2','String'])
# create text rows from 'y' coordinate data
示例9: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def main(argv):
    """Command-line driver: convert PDF files to text, HTML, XML or tag output.

    Args:
        argv: Argument list; ``argv[0]`` is the program name, followed by
            options and one or more input PDF paths.

    Returns:
        100 (after printing the usage string) when the options cannot be
        parsed, no input file is given or the output type is unknown;
        None otherwise.
    """
    # getopt parses the options that accompany the command line.
    import getopt

    def usage():
        # Print an example invocation; called when arguments are missing
        # or malformed.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, short options[, long options]).
        # A colon after a short option letter means the option requires a
        # value: p, m, P, o, M, L, W, F, Y, O, t, c and s all take one.
        # Returns the parsed (option, value) pairs and the remaining args.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''          # -P
    pagenos = set()        # -p
    maxpages = 0           # -m
    # output options
    outfile = None         # -o: output file
    outtype = None         # -t: output type
    outdir = None          # -O: output directory
    layoutmode = 'normal'  # -Y
    codec = 'utf-8'        # -c
    scale = 1              # -s
    caching = True
    laparams = LAParams()
    # NOTE: '-n' sets laparams to None, so a later layout option
    # (-A/-V/-M/-L/-W/-F) fails on the None object.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output format from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so a bad -t value cannot truncate the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            # TextConverter does not take an outdir argument.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what was opened here; never close the process's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例10: PDFParser
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
##ATTEMPT 1
# Create a PDF parser object associated with the file object.
#parser = PDFParser(open_file)
# Create a PDF document object that stores the document structure.
#doc = PDFDocument(parser)
# Connect the parser and document objects.
#print parser.nextline()
#print parser.nextline()
#print parser.nextline()
##ATTEMPT 2
#Code from pdf2txt.py
# Layout settings taken from pdf2txt.py.
laparams = LAParams()
laparams.all_texts=False
laparams.word_margin=0.1
laparams.line_margin=0.5
laparams.char_margin = 2.0

# Converter that writes the extracted text to fp_out as UTF-8.
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pdf_pages = PDFPage.get_pages(fp_in, set())

# Transcribe only the first `pagelim` pages; later pages are still counted
# but otherwise skipped.
pagelim = 3
pagenum = 0
for page in pdf_pages:
    pagenum += 1
    if pagenum <= pagelim:
        print("Transcribing page " + str(pagenum) + " from PDF to text")
        interpreter.process_page(page)
示例11: main
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def main(argv):
    """Command-line driver: convert PDF files to text, HTML, XML or tag output.

    Args:
        argv: Argument list; ``argv[0]`` is the program name, followed by
            options and one or more input PDF paths.

    Returns:
        100 (after printing the usage string) when the options cannot be
        parsed, no input file is given or the output type is unknown;
        None otherwise.
    """
    import getopt

    def usage():
        # Print the usage string for missing or malformed arguments.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # NOTE: '-n' sets laparams to None, so a later layout option
    # (-A/-D/-M/-L/-W) fails on the None object.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrc = PDFResourceManager()
    if not outtype:
        # Infer the output format from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so a bad -t value cannot truncate the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Only close what was opened here; never close the process's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例12: LAParams
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
if m is None:
continue
if gname is not None:
logging.warn('skip %s: 2 groups in one doc: %s [%s] and %s [%s]' % (pdf_file, gname, gtype, m.groups()[2], m.groups()[0]))
return
gname = m.groups()[2].strip()
gtype = m.groups()[0].strip()
if u'五十音順' in gname:
# HACK: some docs are really borked..
return
logging.info(u'%s is a doc for %s' % (pdf_file, gname))
return gname
# Layout parameters shared by every extract_pdf_text() call.
LAPARAMS = LAParams()
LAPARAMS.line_margin = 10.0


def extract_pdf_text(fname):
    """Run the first page of the PDF ``fname`` through a TextConverter.

    Only one page is processed (``maxpages=1``) using the module-level
    ``LAPARAMS``.

    NOTE(review): the converted text accumulates in a local StringIO that the
    visible code never returns — confirm whether a ``return`` is missing.

    Args:
        fname: Path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    codec = 'utf-8'
    outfp = StringIO()
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=LAPARAMS,
                           imagewriter=None)
    try:
        # open() replaces the Python-2-only file(); the context manager
        # closes the handle even if page processing raises.
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=1,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
示例13: convert_pdf_To_Txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def convert_pdf_To_Txt(path, opts=None):
    """Return the text extracted from the PDF at ``path``.

    The option handling mirrors the pdf2txt command-line tool.

    Args:
        path: Path of the PDF file to read.
        opts: pdf2txt-style options, either an iterable of ``(flag, value)``
            pairs (as returned by getopt) or a dict mapping flag to value.
            Defaults to no options.

    Returns:
        Whatever the TextConverter wrote for the processed pages.

    Raises:
        ValueError: If the selected output type (``-t``, or inferred from the
            ``-o`` file extension) is not ``'text'``; this function only
            builds a text converter.
    """
    if opts is None:
        option_pairs = []
    elif isinstance(opts, dict):
        # Iterating a dict directly yields only its keys, which cannot be
        # unpacked into (flag, value); use the items instead.
        option_pairs = opts.items()
    else:
        option_pairs = opts

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    # layoutmode and scale are parsed for option compatibility only; the
    # text converter built below does not use them.
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in option_pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Previously this fell through to an undefined `device`.
        raise ValueError('unsupported output type: %r' % (outtype,))

    # The text is returned to the caller, so the output file named by -o is
    # deliberately not opened: the old code truncated it, never wrote to it
    # and never closed it.
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    # The context manager closes the PDF even when a page fails to process.
    with open(path, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
    txt2Pdf = retstr.getvalue()
    return txt2Pdf
示例14: pdf2txt
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def pdf2txt(argv):
    """Convert PDF files to text, HTML, XML or tag output from an option list.

    Unlike a command-line ``main``, every element of ``argv`` (including the
    first) is parsed as an option or input file.

    Args:
        argv: pdf2txt-style options followed by one or more input PDF paths.

    Returns:
        None after a successful conversion.

    Raises:
        getopt.GetoptError: If ``argv`` contains an unknown option or an
            option is missing its value.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output format from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so a bad -t value cannot truncate the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # NOTE(review): `usage` is not defined inside this function;
        # presumably a module-level helper — confirm it exists.
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what was opened here; never close the process's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
示例15: pdfminerr
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import line_margin [as 别名]
def pdfminerr(argv):
    """Command-line style driver: convert PDF files to text, HTML, XML or tag output.

    Args:
        argv: Argument list; ``argv[0]`` is skipped, the rest are
            pdf2txt-style options followed by one or more input PDF paths.

    Returns:
        100 (after printing the usage message) when the options cannot be
        parsed, no input file is given or the output type is unknown;
        None otherwise.
    """
    import getopt

    def usage():
        # Print the usage hint for missing or malformed arguments.
        print("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output format from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening so a bad -t value cannot truncate the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what was opened here; never close the process's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return