本文整理汇总了Python中pdfminer.layout.LAParams类的典型用法代码示例。如果您正苦于以下问题：Python LAParams类的具体用法？Python LAParams怎么用？Python LAParams使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LAParams类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_pdf_pdfminer
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text from an open PDF page by page and hand it to ``parse_page``.

    Parameters
    ----------
    f : file object
        Open PDF stream, passed to ``PDFParser``.
    fpath : str
        Path forwarded to the handler callbacks and to ``parse_page``.

    ``KeyboardInterrupt`` and ``SystemExit`` propagate; any other exception is
    reported through ``self.handler.print_error`` instead of being raised.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        if self.dedup:
            # Reset the de-duplication store before processing this file.
            self.dedup_store = set()

        self.handler.print_header(fpath)

        parser = PDFParser(f)
        doc = PDFDocument(caching=True)
        parser.set_document(doc)
        doc.set_parser(parser)

        # Page numbers handed to parse_page are 1-based.
        for page_num, page in enumerate(doc.get_pages(), 1):
            # A fresh buffer and converter per page keeps each page's text
            # separate.
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            data = retstr.getvalue()
            self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
            retstr.close()

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例2: initialize_pdf_miner
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on an open PDF.

    Parameters
    ----------
    fh : file object
        Open PDF stream, passed to ``PDFParser``.

    Returns
    -------
    tuple
        ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)

    # NOTE: ``doc.is_extractable`` is not enforced here. The original check
    # was a no-op (its ``raise`` was commented out), so documents that
    # disallow text extraction are still processed.

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis. word_margin is 0.0: the original passed
    # 0.1 to the constructor and immediately overwrote it with 0.0.
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5,
                        word_margin=0.0, boxes_flow=0.1,
                        detect_vertical=False, all_texts=False)

    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
示例3: initialize_pdf_miner
def initialize_pdf_miner(fh, password=None):
    """Build the pdfminer objects needed to run layout analysis on an open PDF.

    Parameters
    ----------
    fh : file object
        Open PDF stream, passed to ``PDFParser``.
    password : str or None
        Document password. ``None`` (the default) is treated as the empty
        password.

    Returns
    -------
    tuple
        ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.

    Raises
    ------
    ValueError
        If the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    # pdfminer expects a string password; passing None through breaks the
    # key derivation for encrypted files, so normalise it to "".
    doc = PDFDocument(parser, password if password is not None else "")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0

    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
示例4: _convert_pdf_to_text
def _convert_pdf_to_text(self, password=None):
    """Convert ``self.cvFile`` to a text file inside ``self.scratchDir``.

    The output name is the input file's stem followed by the current Unix
    timestamp and ``.txt``; the resulting path is stored in
    ``self.cvTextFile``.

    Parameters
    ----------
    password : str or None
        When given, replaces ``self.cvFilePasswd`` before conversion.

    Returns
    -------
    int
        Always ``0``.
    """
    input_pdf = self.cvFile
    if password is not None:
        self.cvFilePasswd = password

    # Page numbers 0-29 are handed to process_pdf, with maxpages to match.
    pagenos = range(0, 30)
    maxpages = len(pagenos)
    codec = 'utf-8'

    laparams = LAParams()
    laparams.all_texts = True
    # NOTE(review): showpageno looks like a TextConverter option rather than
    # an LAParams one; kept as in the original -- confirm whether it has any
    # effect here.
    laparams.showpageno = True

    # splitext keeps the name of a file that has no extension (the previous
    # split/pop logic produced an empty stem in that case).
    stem = os.path.splitext(os.path.basename(input_pdf))[0]
    timestamp = int(time.time())
    output_filename = os.path.join(self.scratchDir, stem + str(timestamp) + ".txt")
    self.cvTextFile = output_filename

    rsrcmgr = PDFResourceManager()
    # Context managers guarantee both handles are released even when
    # process_pdf raises.
    with open(output_filename, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(input_pdf, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=self.cvFilePasswd, check_extractable=True)
        finally:
            device.close()
    return 0
示例5: pdf2xml
def pdf2xml(infile):
    '''
    Convert an open PDF file handle to an XML string using pdfminer.

    The handle is closed before returning, and all newline characters are
    stripped from the generated XML.
    '''
    # Empirically determined...
    layout_params = LAParams()
    layout_params.char_margin = 0.4

    xml_buffer = StringIO()
    # See pdf2txt.py
    resources = PDFResourceManager(caching=False)
    converter = XMLConverter(resources, xml_buffer, codec='utf-8',
                             laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resources, converter)

    # NOTE(review): the converter is never closed here, so whatever
    # XMLConverter.close() would append is absent from the result -- confirm
    # callers expect that before changing it.
    if not page_api:
        process_pdf(resources, converter, infile, set())
    else:
        for page in PDFPage.get_pages(infile, set()):
            page_interpreter.process_page(page)

    infile.close()
    xml_text = xml_buffer.getvalue()
    return xml_text.replace("\n", "")
示例6: parse
def parse(self, path):
    """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

    Parameters
    ----------
    path : str
        Path to a PDF file. Directories are not supported.

    Returns
    -------
    Sample
        Built as ``Sample(path, None, extracted_text)``.

    Raises
    ------
    NotImplementedError
        If ``path`` is a directory.
    """
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    rsrc = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec=codec, laparams=laparams)

    # PDFs are binary: open in 'rb' (text mode corrupts data on platforms
    # that translate newlines) and close the handle when done.
    with open(path, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)

    device.close()
    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
示例7: dump_pdf_pdfminer
def dump_pdf_pdfminer(self, fpath_in):
    """Write the text of the PDF at ``fpath_in`` to a sibling ``.txt`` file.

    The output path is ``fpath_in`` with its extension replaced by ``.txt``.
    Progress and failures are reported on standard output;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate, other exceptions are
    reported and swallowed.
    """
    fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
    n = 0
    with open(fpath_in, 'rb') as fin, open(fpath_out, 'wb') as fout:
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()
            pagenos = set()
            for page in PDFPage.get_pages(fin, pagenos, check_extractable=True):
                # A fresh buffer and converter per page.
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                retstr.close()
                fout.write(data)
                n += len(data)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Written %d bytes to %s" % (n, fpath_out))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            # Report the cause instead of discarding it.
            print("Failed parsing %s: %s" % (fpath_in, e))
示例8: get_result_from_file
def get_result_from_file(filename):
    """Collect per-page bounding boxes and text labels from a PDF file.

    Parameters
    ----------
    filename : str
        Path of the PDF to analyse.

    Returns
    -------
    dict
        ``{"filename": filename, "pages": [...]}`` where each page entry has
        the keys ``"index"`` (0-based), ``"bounding_box"`` and ``"labels"``.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document does not allow text extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}

    # The context manager closes the file even when extraction is refused or
    # a page fails to process.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels,
            })
    return result
示例9: count_words
def count_words(self):
    """
    Count the whitespace-separated words in the PDF at ``self.filename``
    after removing all ``string.punctuation`` characters.

    Raises ``PDFTextExtractionNotAllowed`` when the document does not allow
    text extraction.

    Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
    and http://www.unixuser.org/~euske/python/pdfminer/programming.html
    """
    with open(self.filename, "rb") as pdf_file:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()
        text_buffer = StringIO()
        converter = TextConverter(resources, text_buffer, codec='utf-8',
                                  laparams=layout)

        # Build the document structure from the parser.
        document = PDFDocument(PDFParser(pdf_file))
        # Abort when the document forbids text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Run every page through the interpreter; text accumulates in
        # text_buffer.
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)

        raw_text = text_buffer.getvalue()
        stripped = raw_text.translate(string.maketrans("", ""), string.punctuation)
        words = stripped.split()
        return len(words)
示例10: parse_pdf_pdfminer
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text from an open PDF page by page and hand it to ``parse_page``.

    ``f`` is the open PDF stream and ``fpath`` the path forwarded to the
    handler callbacks. ``KeyboardInterrupt`` and ``SystemExit`` propagate;
    any other exception goes to ``self.handler.print_error``.
    """
    try:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()
        wanted_pages = set()

        if self.dedup:
            # Reset the de-duplication store before processing this file.
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pages = PDFPage.get_pages(f, wanted_pages, check_extractable=True)
        # Page numbers handed to parse_page are 1-based.
        for page_num, page in enumerate(pages, 1):
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer, laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            page_text = page_buffer.getvalue()
            page_buffer.close()
            self.parse_page(fpath, page_text, page_num)

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例11: _pdf_to_text
def _pdf_to_text(path):
    """Extract the ASCII text of the PDF at ``path``.

    Parameters
    ----------
    path : str
        Path of the PDF file.

    Returns
    -------
    tuple
        ``(text, True)`` on success, ``("", False)`` if anything goes wrong
        while reading or converting the file.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()
        # Drop anything that cannot be represented as ASCII.
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        retstr.close()
    except Exception:
        # Best effort: a malformed PDF yields an empty result, not a crash.
        return ("", False)
    # The original computed this tuple but never returned it.
    return (txt, True)
示例12: pdf2str
def pdf2str(path):
    """Return the text extracted from the PDF at ``path``.

    Parameters
    ----------
    path : str
        Path of the PDF file.

    Returns
    -------
    The contents of the output buffer after every page has been processed
    with ``all_texts`` and ``detect_vertical`` enabled.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # Set parameters
    codec = 'utf-8'
    laparams.all_texts = True
    laparams.detect_vertical = True
    caching = True
    pagenos = set()
    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Open the file and parse; the context manager closes it even if a page
    # fails to process.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
    # Clean up
    device.close()
    text = retstr.getvalue()  # renamed: the original shadowed builtin `str`
    retstr.close()
    return text
示例13: __init__
def __init__(self, line_overlap=0.5, header_perc=7.5, footer_perc=7.5):
    """Initialise the layout parameters plus header/footer percentages.

    ``line_overlap`` is forwarded to ``LAParams.__init__`` for
    ``line_overlap``, ``char_margin``, ``line_margin``, ``word_margin`` and
    ``boxes_flow`` alike; ``detect_vertical`` and ``all_texts`` are disabled.
    """
    # NOTE(review): one value is reused for every margin parameter -- confirm
    # this is intended rather than a copy/paste slip.
    shared_margins = dict.fromkeys(
        ('line_overlap', 'char_margin', 'line_margin', 'word_margin',
         'boxes_flow'),
        line_overlap)
    LAParams.__init__(self, detect_vertical=False, all_texts=False,
                      **shared_margins)
    # Fraction of the header (% of the page)
    self.header_perc = header_perc
    # Fraction of the footer (% of the page)
    self.footer_perc = footer_perc
示例14: to_text
def to_text(path):
    """Wrapper around `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        returns extracted text from pdf, encoded as UTF-8

    """
    try:
        # python 2
        from StringIO import StringIO
        import sys

        # NOTE(review): changes the interpreter-wide default encoding;
        # presumably needed for the final .encode() on Python 2 -- confirm.
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    layout = LAParams()
    layout.all_texts = True
    resources = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resources, text_buffer, codec='utf-8',
                              laparams=layout)

    with open(path, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        # Empty page set, maxpages=0 and an empty password, as in pdf2txt.
        page_iter = PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in page_iter:
            page_interpreter.process_page(page)

    converter.close()
    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted.encode('utf-8')
示例15: parse_pdf
def parse_pdf(self, test_parse=False):
    """
    Parse a PDF and store its text contents, line by line, in
    ``self.file_array``.

    The PDF at ``self.source_file`` is converted into ``self.text_file``,
    which is then read back into ``self.file_array`` (minus its final entry)
    and deleted unless ``test_parse`` is true.

    Raises ``DTPOFileError`` when either file cannot be opened or when the
    conversion fails.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

    # input options
    pagenos = set()
    maxpages = 0
    # output option
    codec = "utf-8"
    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0
    rsrcmgr = PDFResourceManager(caching=caching)

    try:
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))

    # ``with`` closes both handles on every path, including the error paths
    # that used to leak them.
    with outfp:
        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            raise DTPOFileError(self.source_file, 0, str(io_error))

        with fp:
            try:
                device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            caching=caching, check_extractable=True)
            except PDFException as pdf_error:
                message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                raise DTPOFileError(self.source_file, 0, message)
            except Exception as exception:
                message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                raise DTPOFileError(self.source_file, 0, message)
            else:
                device.close()

    # Got the PDF converted = now get it into an array
    with open(self.text_file) as text_in:
        self.file_array = text_in.readlines()

    # Remove the last entry - it's always '\x0c'
    if self.file_array:
        del self.file_array[-1]

    # Remove the outfile
    if not test_parse:
        os.remove(self.text_file)