本文整理汇总了 Python 中 pdfminer.converter.TextConverter 类的典型用法代码示例。如果您正苦于以下问题:Python converter.TextConverter 的具体用法?Python converter.TextConverter 怎么用?Python converter.TextConverter 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 pdfminer.converter 的用法示例。
在下文中一共展示了 converter.TextConverter 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: extract_text_from_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    Each page is rendered through its own ``TextConverter`` into an in-memory
    buffer, and the buffer's contents are yielded one page at a time.

    :param pdf_path: path to PDF file to be extracted
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams())
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Release the per-page handles *before* yielding, so they are
                # closed even if page processing raises or the consumer
                # abandons the generator without asking for the next page.
                converter.close()
                fake_file_handle.close()
            yield text
示例2: pdf_do_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def pdf_do_pdf(astream, afile):
    """Extract the text of a PDF stream and pass it on to ``text_do_data``.

    All pages of ``astream`` are rendered through a ``TextConverter`` into an
    in-memory bytes buffer. On success the buffer's contents are handed to
    ``text_do_data(text, afile)``. If ``PDFTextExtractionNotAllowed`` is
    raised, the error is reported through ``log_error`` and nothing is
    passed on.

    :param astream: binary stream containing the PDF
    :param afile: identifier forwarded to ``log_error`` / ``text_do_data``
    :return: None
    """
    # The ``with`` block guarantees the buffer is released on every path,
    # including the early return taken when extraction is not allowed.
    with io.BytesIO() as outstream:
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        device = TextConverter(rsrcmgr, outstream, codec='utf-8', laparams=laparams,
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(astream, set(),
                                          maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Copy the bytes out while the buffer is still open.
            text = outstream.getvalue()
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        finally:
            device.close()
    text_do_data(text, afile)
示例3: getTexts
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def getTexts(self):
    """Extract the text of the PDF at ``self.fname`` into ``self.text``.

    The PDF is rendered with ``process_pdf`` through a ``TextConverter``
    that writes UTF-8 encoded output into an in-memory buffer; the buffer's
    contents are stored on ``self.text``.

    :return: the string ``"ok"`` on success; otherwise the exception
        instance that was raised (it is returned, not re-raised).
    """
    import io

    try:
        # Open the input first so a missing/unreadable file fails before any
        # converter state is set up; ``with`` closes it on every path.
        with open(self.fname, 'rb') as fp:
            password = ''
            pagenos = set()
            maxpages = 0
            codec = 'utf-8'
            caching = True
            laparams = LAParams()
            rsrcmgr = PDFResourceManager(caching=caching)
            # An in-memory buffer replaces the former fixed-name
            # 'temppdf.txt' scratch file, which could collide between
            # concurrent calls and was left on disk when extraction failed.
            outfp = io.BytesIO()
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching, check_extractable=True)
            finally:
                device.close()
            self.text = outfp.getvalue()
            outfp.close()
        return "ok"
    except Exception as e:
        # Callers receive the error object instead of an exception.
        return e
示例4: do_import
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def do_import(self, results, filepath):
    """Extract the text of a PDF and store it on ``results.investigation``.

    Every page of the file at ``filepath`` is rendered through a
    ``TextConverter`` (with ``LAParams.all_texts`` enabled) into one shared
    buffer, with a newline appended after each page. The accumulated text is
    passed to ``results.investigation.update(import_text=...)``.

    :param results: object exposing ``investigation.update(import_text=...)``
    :param filepath: path of the PDF file to read
    :return: None
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()
    buff = StringIO()
    try:
        # The converter and interpreter do not depend on the page, so they
        # are built once instead of once per loop iteration.
        device = TextConverter(
            rsrcmgr, buff, codec='utf-8', laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``with`` closes the input file even if a page fails to process.
        with open(filepath, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                interpreter.process_page(page)
                buff.write("\n")
        results.investigation.update(import_text=buff.getvalue())
    finally:
        buff.close()
示例5: convertPdfToText
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convertPdfToText(path):  # converts all pdf pages to text
    """Extract the text of every page of a PDF and write it out.

    The pages of the file at ``path`` are rendered through a
    ``TextConverter`` into a buffer; the collected text is then passed to
    ``writeToText`` together with ``absolute_path_shortner(path)``.

    :param path: path of the PDF file to convert
    :return: None
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` works on both Python 2 and 3 (``file`` is Python 2 only),
        # and ``with`` closes the input even if a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    writeToText(text, absolute_path_shortner(path))
示例6: parse_content
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def parse_content(self):
    """Fill ``self.content`` with the text of the PDF.

    If ``self.document`` is truthy, or ``self.parse()`` returns a truthy
    value, every page produced by ``WrapperPDFPage.get_pages`` is run
    through a ``TextConverter`` and the collected text is stored on
    ``self.content``. Otherwise ``self.restore_content()`` is called.
    """
    if not (self.document or self.parse()):
        # damaged pdf
        self.restore_content()
        return

    # normal pdf
    use_cache = True
    resource_manager = PDFResourceManager(caching=use_cache)
    text_buffer = io.StringIO()
    text_device = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, text_device)
    all_pages = WrapperPDFPage.get_pages(
        self.pdf, parser=self.parser, doc=self.document, caching=use_cache)
    for current_page in all_pages:
        page_interpreter.process_page(current_page)
    self.content = text_buffer.getvalue()
    text_device.close()
    text_buffer.close()
示例7: pdf_to_text
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def pdf_to_text(fname, pages=None):
    """Return the non-blank text lines of a PDF.

    :param fname: path of the PDF file to read
    :param pages: optional iterable of page numbers passed to
        ``PDFPage.get_pages``; when falsy, an empty set is passed instead
    :return: list of the extracted lines, each stripped of surrounding
        whitespace, with empty lines dropped
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # ``with`` closes the input file even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    output.close()
    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line != '']
示例8: convert
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert(fp):
    """Return the text extracted from the PDF stream ``fp``.

    As a side effect the root logger has ``propagate`` set to False and its
    level set to ``logging.ERROR``. The PDF is rendered with ``process_pdf``
    through a ``TextConverter`` into a ``StringIO`` buffer whose contents
    are returned; the converter is closed afterwards.
    """
    root_logger = logging.getLogger()
    root_logger.propagate = False
    root_logger.setLevel(logging.ERROR)

    use_cache = True
    resource_manager = PDFResourceManager(caching=use_cache)
    with StringIO() as text_buffer:
        try:
            device = TextConverter(resource_manager, text_buffer, laparams=LAParams())
            process_pdf(resource_manager, device, fp, set(), maxpages=0, password='',
                        caching=use_cache, check_extractable=True)
            return text_buffer.getvalue()
        finally:
            device.close()
示例9: convert
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert(fname, pages=None):
    """Return the text extracted from a PDF file.

    :param fname: path of the PDF file to read
    :param pages: optional iterable of page numbers passed to
        ``PDFPage.get_pages``; when falsy, an empty set is passed instead
    :return: the text collected from the processed pages
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # ``open`` works on both Python 2 and 3 (``file`` is Python 2 only),
        # and ``with`` closes the input even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    # Actually call close(); the bare attribute reference was a no-op.
    output.close()
    return text
#Function to extract names from the string using spacy
示例10: convert_pdf_to_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert_pdf_to_txt(path):
    """Extract the text of every page of the PDF at ``path``.

    Pages are rendered through a ``TextConverter`` (codec ``utf-8``) into a
    ``BytesIO`` buffer; the buffer's contents are returned after the
    converter and buffer have been closed.
    """
    resource_manager = PDFResourceManager()
    out_buffer = BytesIO()
    text_device = TextConverter(resource_manager, out_buffer, codec='utf-8', laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, text_device)
    page_options = dict(maxpages=0, password="", caching=True, check_extractable=True)
    with open(path, 'rb') as pdf_file:
        for current_page in PDFPage.get_pages(pdf_file, set(), **page_options):
            page_interpreter.process_page(current_page)
    extracted = out_buffer.getvalue()
    text_device.close()
    out_buffer.close()
    return extracted
示例11: convert_pdf_to_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert_pdf_to_txt(r, max_pages=3):
    """Extract text from the PDF carried by response object ``r``.

    :param r: response-like object; this function reads ``status_code`` and
        ``encoding`` and calls ``content_big()`` to obtain the PDF data
    :param max_pages: value passed as ``maxpages`` to ``PDFPage.get_pages``
    :return: the converter output buffer's contents, or None when
        ``r.status_code`` is not 200
    """
    # Check the response before allocating any converter resources, so the
    # early return leaves nothing open.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"

    # NOTE(review): StringIO is kept from the original; if content_big()
    # returns bytes on Python 3 this likely needs BytesIO - confirm.
    fp = StringIO(r.content_big())
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, set(), maxpages=max_pages, password="",
                                  caching=True, check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release everything even when a page fails to process.
        device.close()
        retstr.close()
        fp.close()
    return text
示例12: convert_pdf
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If ``format`` is neither ``'text'`` nor ``'html'``.
    """
    # Validate up front: previously an unknown format left ``converter``
    # unassigned and surfaced later as an UnboundLocalError.
    if format not in ('text', 'html'):
        raise ValueError(
            "Unsupported format: {!r} (expected 'text' or 'html')".format(format))

    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()
    if format == 'text':
        converter = TextConverter(manager, output, codec=codec, laparams=laparams)
    else:
        converter = HTMLConverter(manager, output, codec=codec, laparams=laparams)

    try:
        with open(input_file, 'rb') as f1:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(f1,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Close the converter before reading the buffer, as the original
        # did; presumably a converter may emit trailing output on close.
        converter.close()
        text = output.getvalue()
    finally:
        output.close()
    # Decode with the codec used for encoding, not the interpreter default.
    return text.decode(codec)
示例13: convert_pdf_to_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function
    which returns text for parsing from PDF.

    path = The path to the file

    Returns the extracted text. If anything goes wrong, the error is
    reported through ``self.logger.error`` and an empty string is returned.
    """
    try:
        # ``open`` works on both Python 2 and 3 (``file`` is Python 2 only),
        # and ``with`` closes the input on every path.
        with open(path, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
                # The value is computed before ``finally`` closes the buffer.
                return retstr.getvalue()
            finally:
                device.close()
                retstr.close()
    except Exception as e:
        # Log *before* returning; the log call used to sit after the
        # ``return`` and was therefore never executed.
        self.logger.error(
            "Failed to PDF to text: " + str(e))
        return ""
示例14: pdf_2_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def pdf_2_txt(pdf):
    """Write the text of a PDF to ``<pdf>.txt`` and return that path.

    The class attributes ``PDFResourceManager.debug`` and
    ``PDFPageInterpreter.debug`` are set to 0 as a side effect.

    :param pdf: path of the PDF file to convert
    :return: path of the text file that was written (``pdf + '.txt'``)
    """
    outfile = pdf + '.txt'
    debug = 0
    rotation = 0
    codec = 'utf-8'  # output encoding
    caching = True
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)
    # ``with`` closes both files even if a page fails to process.
    with open(outfile, 'w', encoding="utf8") as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(pdf, 'rb') as fp:
                # Process the content of every page of the document.
                for page in PDFPage.get_pages(fp, set(),
                                              maxpages=0, password='',
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return outfile
示例15: pdf_to_txt
# 需要导入模块: from pdfminer import converter [as 别名]
# 或者: from pdfminer.converter import TextConverter [as 别名]
def pdf_to_txt(file_name):
    """Extract the text of a PDF as an ASCII-only string.

    Carriage returns are replaced by newlines, matches of ``regex.bullet``
    are replaced by a space, and characters that cannot be represented in
    ASCII are dropped.

    :param file_name: path of the PDF file to read
    :return: the cleaned text, or ``''`` if any error occurs (the error is
        reported through ``logging.error``)
    """
    try:
        # ``with`` closes the input file on every path.
        with open(file_name, 'rb') as file_pointer:
            # Setting up pdf reader
            pdf_resource_manager = PDFResourceManager()
            return_string = StringIO()
            device = TextConverter(pdf_resource_manager, return_string, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(pdf_resource_manager, device)
                for page in PDFPage.get_pages(file_pointer, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
            finally:
                device.close()
            # Get full string from PDF
            pdf_txt = return_string.getvalue()
            return_string.close()

        # Formatting removing and replacing special characters
        pdf_txt = pdf_txt.replace("\r", "\n")
        pdf_txt = re.sub(regex.bullet, " ", pdf_txt)
        if isinstance(pdf_txt, bytes):
            # Python 2 byte string: decode directly, dropping non-ASCII bytes.
            return pdf_txt.decode('ascii', errors='ignore')
        # Text string: it has no decode() on Python 3, so round-trip through
        # ASCII to drop the characters that cannot be represented.
        return pdf_txt.encode('ascii', errors='ignore').decode('ascii')
    except Exception as exception_instance:
        logging.error('Error converting pdf to txt: ' + str(exception_instance))
        return ''