本文整理汇总了Python中pdfminer.layout.LAParams.all_texts方法的典型用法代码示例。如果您正苦于以下问题：Python LAParams.all_texts方法的具体用法？Python LAParams.all_texts怎么用？Python LAParams.all_texts使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pdfminer.layout.LAParams的用法示例。
在下文中一共展示了LAParams.all_texts方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _pdf_to_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def _pdf_to_text(path):
    """Extract the text of the PDF at ``path`` as ASCII.

    The file is run through ``process_pdf`` with a ``TextConverter`` whose
    layout parameters have ``all_texts`` enabled; characters that cannot be
    encoded as ASCII are dropped from the collected output.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        A ``(text, ok)`` tuple: ``(<extracted text>, True)`` on success, or
        ``("", False)`` if anything raised while opening or converting.
    """
    retstr = None
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()
        # Drop anything that is not plain ASCII.
        txt = retstr.getvalue().encode('ascii', 'ignore')
        retVal = (txt, True)
    except Exception:
        # Deliberate best effort: callers get a failure flag, not the exception.
        retVal = ("", False)
    finally:
        if retstr is not None:
            retstr.close()
    # The flattened original built retVal but never handed it back.
    return retVal
示例2: parse_pdf_pdfminer
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from the PDF stream ``f`` and parse each page.

    Every page is rendered to text with its own ``TextConverter`` and handed
    to ``self.parse_page`` as UTF-8 bytes together with its 1-based page
    number.  ``self.handler.print_header`` is called before the first page
    and ``self.handler.print_footer`` after the last one.

    Any exception other than ``KeyboardInterrupt``/``SystemExit`` is passed to
    ``self.handler.print_error`` instead of propagating.

    Args:
        f: open file object for the PDF, given to ``PDFParser``.
        fpath: path of the PDF, forwarded to the handler and ``parse_page``.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        if self.dedup:
            # Reset the de-duplication store before parsing this file.
            self.dedup_store = set()

        self.handler.print_header(fpath)

        parser = PDFParser(f)
        doc = PDFDocument(caching=True)
        parser.set_document(doc)
        doc.set_parser(parser)

        for page_num, page in enumerate(doc.get_pages(), 1):
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
            finally:
                # Release the per-page converter and buffer even on failure.
                device.close()
                retstr.close()
            self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例3: count_words
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def count_words(self):
    """Return the number of whitespace-separated words in ``self.filename``.

    The text of every page is extracted with pdfminer, the characters in
    ``string.punctuation`` are removed, and what remains is split on
    whitespace.

    Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
    and http://www.unixuser.org/~euske/python/pdfminer/programming.html

    Returns:
        The word count as an ``int``.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    with open(self.filename, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            # Build the document object that stores the PDF structure.
            document = PDFDocument(PDFParser(fp))
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
            full_text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()

    # Character filter instead of the Python-2-only
    # ``str.translate(table, deletechars)`` call; it removes the same
    # characters and also works on Python 3 strings.
    punctuation = set(string.punctuation)
    stripped = "".join(ch for ch in full_text if ch not in punctuation)
    return len(stripped.split())
示例4: parse_pdf_pdfminer
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page from the PDF stream ``f`` and parse each page.

    Pages come from ``PDFPage.get_pages`` with ``check_extractable=True``.
    Each page is rendered to text with its own ``TextConverter`` and handed
    to ``self.parse_page`` together with its 1-based page number.
    ``self.handler.print_header`` is called before the first page and
    ``self.handler.print_footer`` after the last one.

    Any exception other than ``KeyboardInterrupt``/``SystemExit`` is passed to
    ``self.handler.print_error`` instead of propagating.

    Args:
        f: open file object for the PDF.
        fpath: path of the PDF, forwarded to the handler and ``parse_page``.
    """
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()
        # An empty page-number set is passed through to get_pages unchanged.
        pagenos = set()

        if self.dedup:
            # Reset the de-duplication store before parsing this file.
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pages = PDFPage.get_pages(f, pagenos, check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
            finally:
                # Release the per-page converter and buffer even on failure.
                device.close()
                retstr.close()
            self.parse_page(fpath, data, page_num)

        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
示例5: _convert_pdf_to_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def _convert_pdf_to_text(self, password=None):
    """Convert the PDF ``self.cvFile`` into a text file under ``self.scratchDir``.

    The output file name is the input file name up to its last dot, followed
    by the current Unix time and ``.txt``.  The full output path is stored in
    ``self.cvTextFile``.  Page numbers 0-29 are requested from
    ``process_pdf`` with ``maxpages`` set to 30.

    Args:
        password: if not ``None``, stored in ``self.cvFilePasswd`` first.
            ``self.cvFilePasswd`` is what gets passed to ``process_pdf``.

    Returns:
        ``0`` once the conversion has finished.
    """
    input_pdf = self.cvFile
    if password is not None:
        self.cvFilePasswd = password

    pagenos = range(0, 30)
    laparams = LAParams()
    laparams.all_texts = True
    # NOTE(review): ``showpageno`` is normally a TextConverter argument;
    # confirm that anything reads it from LAParams. Kept as in the original.
    laparams.showpageno = True

    # Everything before the last dot ('' when the name has no dot), which is
    # what the original split(".") / pop() / join(".") sequence produced.
    stem = os.path.basename(input_pdf).rpartition(".")[0]
    timestamp = int(time.time())
    output_filename = self.scratchDir + os.path.sep + stem + str(timestamp) + r".txt"
    self.cvTextFile = output_filename

    rsrcmgr = PDFResourceManager()
    # Open the input first so an unreadable PDF does not leave an empty
    # output file behind; ``with`` closes both handles on any exit path.
    with open(input_pdf, 'rb') as fp:
        with open(output_filename, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=len(pagenos),
                            password=self.cvFilePasswd, check_extractable=True)
            finally:
                device.close()
    return 0
示例6: dump_pdf_pdfminer
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def dump_pdf_pdfminer(self, fpath_in):
    """Write the text extracted from ``fpath_in`` to a sibling ``.txt`` file.

    The output path is ``fpath_in`` with its extension replaced by ``.txt``.
    Pages are converted one at a time and appended to the output.  On success
    the number of bytes written is printed; on any exception other than
    ``KeyboardInterrupt``/``SystemExit`` a failure message is printed and
    whatever was written so far is left in place.

    Args:
        fpath_in: path of the PDF to read.
    """
    fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
    n = 0
    with open(fpath_in, 'rb') as fin:
        with open(fpath_out, 'wb') as fout:
            try:
                laparams = LAParams()
                laparams.all_texts = True
                rsrcmgr = PDFResourceManager()
                pagenos = set()
                for page in PDFPage.get_pages(fin, pagenos, check_extractable=True):
                    retstr = StringIO()
                    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                    try:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        interpreter.process_page(page)
                        data = retstr.getvalue()
                    finally:
                        # Release the per-page converter and buffer.
                        device.close()
                        retstr.close()
                    fout.write(data)
                    n += len(data)
                # Single-argument print() emits the same text on Python 2 and 3.
                print("Written %d bytes to %s" % (n, fpath_out))
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                print("Failed parsing %s" % (fpath_in))
示例7: pdf2str
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def pdf2str(path):
    """Return the text extracted from the PDF at ``path``.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled,
    and the converter is configured with the ``utf-8`` codec.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        Whatever the converter wrote into the in-memory buffer.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # Set parameters
    laparams.all_texts = True
    laparams.detect_vertical = True
    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``with`` closes the file even if a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Local renamed so it no longer shadows the builtin ``str``.
        return retstr.getvalue()
    finally:
        retstr.close()
示例8: to_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def to_text(path):
    """Wrapper around `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    bytes
        text extracted from the PDF, encoded as UTF-8
    """
    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse.
            device.close()
        # Local renamed so it no longer shadows the builtin ``str``.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text.encode('utf-8')
示例9: get_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def get_text(self):
    """Return all text content from the PDF at ``self.path`` as plain text.

    The text layer is extracted with pdfminer first.  If that yields nothing
    but whitespace, every page is rendered to ``temp/pageNNN.png`` with
    Ghostscript (``gs``) and passed to ``pytesseract`` with ``lang="swe"``;
    each PNG is deleted once it has been read.  The resulting text is also
    stored in ``self.text``.

    Raises:
        Exception: whatever opening or parsing the PDF raised, re-raised
            after being logged.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        try:
            # ``with`` avoids touching an unbound file handle during cleanup
            # when the open itself fails.
            with open(self.path, 'rb') as file_pointer:
                process_pdf(rsrcmgr, device, file_pointer)
        except Exception as e:
            logging.error("Error processing PDF: %s" % e)
            raise
        finally:
            device.close()
        text = retstr.getvalue()
    finally:
        retstr.close()

    if (text is None) or (text.strip() == ""):
        logging.info("No text found in PDF. Attempting OCR. This will take a while.")
        # FIXME this should go in a separate method
        # First, convert to image
        import subprocess
        try:
            # ``subprocess.STDOUT`` is only valid for ``stderr``; the original
            # also passed it as ``stdout``, which made the call fail before
            # Ghostscript could run.  ``-dBATCH`` makes gs exit after the
            # last page instead of waiting at its interactive prompt.
            subprocess.call(
                args=["gs",
                      "-dNOPAUSE",
                      "-dBATCH",
                      "-sOutputFile=temp/page%03d.png",
                      "-sDEVICE=png16m",
                      "-r72",
                      self.path],
                stderr=subprocess.STDOUT)
        except OSError:
            logging.error("Failed to run GhostScript (using `gs`)")

        # Do OCR
        import time
        time.sleep(1)  # make sure the server has time to write the files
        try:
            from PIL import Image
        except ImportError:
            # Legacy PIL installs expose the module at top level.
            import Image
        import pytesseract
        import os

        page_texts = []
        # os.listdir() order is arbitrary; sort so the zero-padded page
        # files are read in page order.
        for file_ in sorted(os.listdir("temp")):
            if file_.endswith(".png"):
                page_texts.append(
                    pytesseract.image_to_string(Image.open("temp/" + file_), lang="swe"))
                os.unlink("temp/" + file_)
        text = "".join(page_texts)

    self.text = text
    return text
示例10: convert_to_text_file
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Extract the text of a BORME PDF into a text file.

    Args:
        filename_in: path of the PDF to read.
        filename_out: path of the text file to write.  If it is an existing
            directory, the output goes inside it under the base name of
            ``filename_in``.
        rewrite: when true, an existing output file is overwritten.

    Returns:
        ``True`` after a conversion, ``False`` when the output already exists
        and ``rewrite`` is false (nothing is read or written in that case).
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))
    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # conf
    caching = True
    rotation = 0
    laparams = LAParams()
    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)
    # Open the input before creating the output: otherwise a missing PDF
    # leaves an empty output file that later calls with rewrite=False skip.
    with open(filename_in, 'rb') as fp:
        with open(filename_out, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=laparams, imagewriter=None)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # See https://github.com/euske/pdfminer/issues/72
                for page in PDFPage.get_pages(fp, set(),
                                              maxpages=0, password='',
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            finally:
                device.close()
    return True
示例11: to_text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def to_text(self):
    """Return the text of every page of ``self._doc``.

    Layout analysis runs with ``detect_vertical`` and ``all_texts`` enabled
    and ``word_margin`` set to 0.4.

    Returns:
        The converter output decoded from UTF-8, with undecodable bytes
        ignored.
    """
    rsrcmgr = PDFResourceManager()
    output = StringIO()
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = True
    laparams.word_margin = 0.4
    device = TextConverter(rsrcmgr, output, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
    finally:
        # The original never released the converter or the buffer.
        device.close()
        output.close()
示例12: _pdf2text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def _pdf2text(self,fp):
    """Extract ASCII text and the creation date from the open PDF ``fp``.

    Args:
        fp: open file object for the PDF.  It is read once by ``process_pdf``
            and then handed to a ``PDFParser`` for the metadata.

    Returns:
        ``(created, text, True)`` on success, where ``created`` is the
        ``CreationDate`` info field without its first two and last seven
        characters (``''`` when the field cannot be read), or
        ``(None, "", False)`` if extraction failed.  Failures are reported
        through ``self._report`` rather than raised.
    """
    retstr = None
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Drop anything that is not plain ASCII.
        txt = retstr.getvalue().encode('ascii', 'ignore')

        # TODO: clean this up, the document is effectively processed twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # CreationDate does not always come back as the same type, so only
        # call resolve() when the value is not already a string.
        created = ""
        try:
            raw_date = doc.info[0]['CreationDate']
            if not isinstance(raw_date, basestring):
                # The original assigned this branch to a misspelled name
                # ('creatd'), so the resolved date was silently lost.
                raw_date = raw_date.resolve()
            created = raw_date[2:-7]
        except Exception:
            self._report("CreationDate field could not be decoded within PDF, setting to ''")
        created = created.encode('ascii', 'ignore')
        retVal = (created, txt, True)
    except Exception as e:
        self._report("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    finally:
        if retstr is not None:
            retstr.close()
    # The flattened original built retVal but never handed it back.
    return retVal
示例13: _pdf2text
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def _pdf2text(self,fp):
    """Extract ASCII text and the creation timestamp from the open PDF ``fp``.

    Args:
        fp: open file object for the PDF.  It is read once by ``process_pdf``
            and then handed to a ``PDFParser`` for the metadata.

    Returns:
        ``(created, text, True)`` on success, where ``created`` is a
        ``datetime`` parsed with ``%Y%m%d%H%M%S`` from the ``CreationDate``
        info field after its first two and last seven characters are
        removed.  ``(None, "", False)`` is returned if anything raised,
        including a missing or unparseable ``CreationDate``.
    """
    retstr = None
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Drop anything that is not plain ASCII.
        txt = retstr.getvalue().encode('ascii', 'ignore')

        # TODO: clean this up, the document is effectively processed twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # CreationDate does not always come back as the same type, so only
        # call resolve() when the value is not already a string.
        raw_date = doc.info[0]['CreationDate']
        if not isinstance(raw_date, basestring):
            raw_date = raw_date.resolve()
        datestring = raw_date[2:-7]
        ts = strptime(datestring, "%Y%m%d%H%M%S")
        created = datetime.fromtimestamp(mktime(ts))
        retVal = (created, txt, True)
    except Exception as e:
        # NOTE(review): a sibling variant of this method reports through
        # ``self._report``; confirm that ``_reportstr`` really exists.
        self._reportstr("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    finally:
        if retstr is not None:
            retstr.close()
    # The flattened original built retVal but never handed it back.
    return retVal
示例14: getPdfAsText
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages.

    Args:
        pdfPages: iterable of pdfminer page objects.  When ``None`` and
            ``fileDescriptor`` is given, the pages are obtained from
            ``getPdfPages(fileDescriptor)``.
        fileDescriptor: source passed to ``getPdfPages`` when ``pdfPages`` is
            not supplied.

    Returns:
        The text the converter wrote into the in-memory stream.

    Raises:
        TypeError: if both arguments are ``None``, because ``pdfPages`` is
            then iterated while still ``None``.
    """
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Each resource is created just before the ``try`` that releases it, so
    # cleanup never touches an unbound name and never masks the real error.
    outputStream = StringIO.StringIO()
    try:
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for pdfPage in pdfPages:
                interpreter.process_page(pdfPage)
            return outputStream.getvalue()
        finally:
            device.close()
    finally:
        outputStream.close()
示例15: pdf
# 需要导入模块: from pdfminer.layout import LAParams [as 别名]
# 或者: from pdfminer.layout.LAParams import all_texts [as 别名]
def pdf(f):
    """Return the text extracted from the PDF file at path ``f``.

    The file is run through ``process_pdf`` with ``all_texts`` enabled and a
    ``TextConverter`` configured with the ``utf-8`` codec.

    Args:
        f: filesystem path of the PDF to read.

    Returns:
        Whatever the converter wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(
        rsrcmgr, retstr, codec='utf-8', laparams=laparams
    )
    try:
        try:
            # ``open`` replaces the Python-2-only ``file`` builtin and the
            # ``with`` block closes the handle even when parsing fails.
            with open(f, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Returned directly so the builtin ``str`` is no longer shadowed.
        return retstr.getvalue()
    finally:
        retstr.close()