本文整理汇总了Python中textract.process方法的典型用法代码示例。如果您正苦于以下问题:Python textract.process方法的具体用法?Python textract.process怎么用?Python textract.process使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块textract的用法示例。
在下文中一共展示了textract.process方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: add_text
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def add_text(self, my_dir=None):
    """Extract text from every regular file in a directory.

    Each file directly inside ``my_dir`` is run through
    ``textract.process`` and the result is stored, reduced to ASCII, in
    ``self.everything['input']`` under the file's name.  An existing
    entry with the same name is overwritten.

    :param my_dir: directory to scan; falls back to ``self.my_data_dir``
        when omitted or falsy.
    """
    if not my_dir:
        my_dir = self.my_data_dir
    # Kept from the original: the defaults are re-checked here as a
    # precaution before the store is touched.
    self._check_defaults()
    # Single-argument print() emits the same text on Python 2 and 3.
    print("Extracting text from: %s" % (my_dir,))
    file_list = [f for f in listdir(my_dir) if isfile(join(my_dir, f))]
    for f in file_list:
        print("\tProcessing file: %s" % (f,))
        raw = textract.process(join(my_dir, f), encoding="utf-8")
        # Decode BEFORE replacing the no-break space.  Replacing the byte
        # 0xA0 inside UTF-8 data also hits the continuation byte of other
        # characters (e.g. "\xc3\xa0"), turning them into stray spaces.
        text = raw.decode("utf-8", "ignore").replace(u"\xa0", u" ")
        # Remaining non-ASCII characters are dropped, as before.
        self.everything['input'][f] = text.encode("ascii", "ignore")
示例2: convert_to_text_and_and_move_field
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_and_and_move_field():
    """Extract, language-detect and sort the documents listed in ``lists[2]``.

    For every entry of ``lists[2]`` (the ``.doc`` group, per the original
    comment) the file ``_dir + name`` is converted to lower-cased text,
    stored in ``_text_lists_convert2`` at the same index, and classified
    with ``langid``.  Vietnamese files are copied into ``_dir_vi`` and
    English files into ``_dir_en``; anything else only prints a notice.
    """
    for idx, name in enumerate(lists[2]):
        source_path = _dir + name
        text = textract.process(source_path).decode('utf-8').lower()
        _text_lists_convert2[idx] = text
        lang, _ = langid.classify(text)
        if lang == "vi":
            shutil.copy(source_path, _dir_vi + name)
        # ``elif`` so that Vietnamese files no longer fall through to the
        # "not english / vietnamese" notice below.
        elif lang == "en":
            shutil.copy(source_path, _dir_en + name)
        else:
            print('done or the language not of english , vietnames with doc^^')
示例3: get_features
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
    """Yield one feature mapping per file found by ``RTFAdapter``.

    :param kwargs: must contain ``directory``, the folder handed to
        ``RTFAdapter.get_file_list`` (together with the ``'txt'`` tag).
    :return: generator of the mappings produced by
        ``RTFAdapter.get_meta_features``, each extended with a ``'text'``
        entry holding the ``textract.process`` output for that file.
    """
    source_dir = kwargs['directory']
    for path in RTFAdapter.get_file_list(source_dir, 'txt'):
        record = RTFAdapter.get_meta_features(file_path=path)
        record['text'] = textract.process(path)
        yield record
示例4: extract
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract(path):
    '''
    Extract the full text of a PDF.

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.
    :return: the text exactly as returned by ``textract.process``

    Usage::

        import pyminer
        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = pyminer.fetch(url)
        out.parse()
        # search first, then pass links to fetch
        res = pyminer.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = pyminer.fetch(url)
        pyminer.extract(x.path)
    '''
    return textract.process(path)
示例5: extract
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract(path):
    '''
    Extract the full text of a PDF.

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.
    :return: the text exactly as returned by ``textract.process``

    Usage::

        from pyminer import miner
        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()
        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    '''
    return textract.process(path)
示例6: get_features
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
    """Yield one feature mapping per file found by ``DocAdapter``.

    :param kwargs: must contain ``directory``, the folder handed to
        ``DocAdapter.get_file_list`` (together with the ``'txt'`` tag).
    :return: generator of the mappings produced by
        ``DocAdapter.get_meta_features``, each extended with a ``'text'``
        entry holding the ``textract.process`` output for that file.
    """
    source_dir = kwargs['directory']
    for path in DocAdapter.get_file_list(source_dir, 'txt'):
        record = DocAdapter.get_meta_features(file_path=path)
        record['text'] = textract.process(path)
        yield record
示例7: get_features
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
    """Yield one feature mapping per file found by ``PDFAdapter``.

    :param kwargs: must contain ``directory``, the folder handed to
        ``PDFAdapter.get_file_list`` (together with the ``'txt'`` tag).
    :return: generator of the mappings produced by
        ``PDFAdapter.get_meta_features``, each extended with a ``'text'``
        entry holding the ``textract.process`` output for that file.
    """
    source_dir = kwargs['directory']
    for path in PDFAdapter.get_file_list(source_dir, 'txt'):
        record = PDFAdapter.get_meta_features(file_path=path)
        record['text'] = textract.process(path)
        yield record
示例8: parse_pdfs
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def parse_pdfs():
    """Fill in the ``text`` field of stored articles that have a PDF but no text.

    Selects ``ScienceArticle`` rows with ``got_pdf=True`` and ``text=None``,
    extracts each PDF from ``<BASE_DIR>/uploads/research/<file name>`` and
    saves the result.  A failure on one article is reported and does not
    stop the remaining ones.
    """
    pending = ScienceArticle.objects.filter(got_pdf=True).filter(text=None)
    for article in pending:
        # Only the last path component of the stored file name is used.
        basename = str(article.file).split('/')[-1]
        pdf_path = join(settings.BASE_DIR, 'uploads', 'research', basename)
        try:
            article.text = smart_text(textract.process(pdf_path))
            article.save()
            print(colored.green("Successfully saved PDF text to db."))
        except Exception as err:
            # Best effort by design: report and move on to the next article.
            print(colored.red("[ERROR] At PDF text parse: {0}".format(err)))
示例9: add_files
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def add_files(self, my_dir=None):
    """Load the text of every regular file in a directory.

    Each file directly inside ``my_dir`` is passed to ``textract.process``
    with ``encoding="ascii"`` and the result is stored in
    ``self.everything['input']`` under the file's name, replacing any
    earlier entry.  The original note suggests calling ``train()`` next.

    :param my_dir: directory to scan; falls back to ``self.my_data_dir``
        when omitted or falsy.
    """
    my_dir = my_dir or self.my_data_dir
    # Re-checked here as well as in __init__() in case the file is corrupt
    # (reason carried over from the original comment).
    self._check_defaults()
    print("Extracting text from: %s" % (my_dir,))
    regular_files = []
    for entry in listdir(my_dir):
        if isfile(join(my_dir, entry)):
            regular_files.append(entry)
    for entry in regular_files:
        print("\tProcessing file: %s" % (entry,))
        extracted = textract.process(join(my_dir, entry), encoding="ascii")
        self.everything['input'][entry] = extracted
示例10: convert_to_text
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text(self, path=None, content=None, converter=None):
    """Return single-line text for a document, or ``None`` for an unknown converter.

    :param path: file handed to ``textract.process`` when ``converter`` is
        ``"textract"``.
    :param content: bytes used directly when ``converter`` is ``"text"``.
    :param converter: ``"text"`` or ``"textract"``; any other value yields
        ``None``.
    :return: the chosen bytes with every match of ``self.remove_newline``
        replaced by a single space.
    """
    if converter == "text":
        raw = content
    elif converter == "textract":
        raw = textract.process(path)
    else:
        return None
    return self.remove_newline.sub(b' ', raw)
示例11: convert_to_text_to_process
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_to_process():
    """Replace each file name in ``lists[2]`` with that file's extracted text.

    Every entry of ``lists[2]`` (the ``.doc`` group, per the original
    comment) is read from ``_dir + name`` and overwritten in place with
    the UTF-8 decoded ``textract.process`` output.
    """
    for idx, name in enumerate(lists[2]):
        extracted = textract.process(_dir + name)
        lists[2][idx] = extracted.decode('utf-8')
    # NOTE: the original carried a disabled second pass here (kept as a
    # string literal) that would have stored the lower-cased text of the
    # ``lists[3]`` (.pdf) files into ``lists[2]`` at the following indices.
示例12: convert_to_text_to_process
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_to_process():
for _i in range(0,len(lists[2]) ,1) : # .doc
lists[2][_i] = (textract.process(_dir+lists[2][_i])).decode('utf-8')
_i+=1
print _i
示例13: build_indexes
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def build_indexes(files_list, index_file):
    """Build a token -> {file: count} index and save it as JSON.

    Each file in ``files_list`` is converted with ``textract.process``,
    split on commas, spaces, tabs, newlines, colons and semicolons, and
    every non-empty token is counted per file.  A text progress bar is
    drawn on stdout, one ``-`` per processed file.

    :param files_list: paths of the files to index.
    :param index_file: path of the JSON file to write (overwritten).
    """
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    # Move the cursor back to just after the '[' so the bar fills in place.
    sys.stdout.write("\b" * (toolbar_width + 1))

    # Same separator set as before; the duplicate tab escape is dropped.
    splitter = re.compile(r"[, \t\n:;]")
    hash_index = {}
    for item in files_list:
        text = textract.process(item)
        # textract returns encoded bytes; decode so the str pattern
        # applies and the tokens can be used as JSON keys.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        for token in splitter.split(text):
            if not token:
                continue
            per_file = hash_index.setdefault(token, {})
            per_file[item] = per_file.get(item, 0) + 1
        # One tick of the progress bar per processed file.
        sys.stdout.write("-")
        sys.stdout.flush()
    sys.stdout.write("\n")

    # ``with`` closes the file even if serialisation fails.
    with open(index_file, "w") as fp:
        json.dump(hash_index, fp)
#build_indexes(path_helper.get_files_list(), 'index')
示例14: extract_text
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract_text(file_extraction, file_pdf):
    """Extract the text of ``file_pdf`` and write it to ``file_extraction``.

    :param file_extraction: output location; must offer a
        ``pathlib.Path``-style ``open(mode, encoding=...)``.
    :param file_pdf: document to extract; converted with ``str()`` before
        being handed to ``textract.process``.
    """
    logging.info(f"Extracting text from {file_pdf} ... ( This may take a minute or two.)")
    # Extract first, then open the output: a failed extraction no longer
    # leaves behind an empty or truncated output file.
    text = textract.process(str(file_pdf)).decode('utf-8')
    with file_extraction.open("w", encoding="utf-8") as fout:
        fout.write(text)
    logging.info(f"Saved extraction to {file_extraction}")
示例15: test_text_ssns
# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def test_text_ssns(self):
    """Check that SSN-like numbers in the fixture PDF are redacted.

    Two content filters are applied: the first normalises dash-like
    characters to ``-``, the second replaces SSN-shaped digit groups
    (allowing the look-alike letters ``O o I l i``) with ``XXX-XX-XXXX``.
    The text extracted from the redacted file must contain the expected
    redacted passage.
    """
    options = pdf_redactor.RedactorOptions()
    options.content_filters = [
        # NOTE(review): the class previously read "[?–—~?]".  The two "?"
        # look like dash characters lost to a bad re-encoding, and a
        # literal "?" would turn question marks into dashes.  They are
        # restored here as U+2212 (minus sign) and U+2010 (hyphen) --
        # presumably matching the upstream example; confirm.  Escapes are
        # used so the characters survive any future re-encoding.
        (
            re.compile(u"[\u2212\u2013\u2014~\u2010]"),
            lambda m: "-"
        ),
        (
            re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)"),
            lambda m: "XXX-XX-XXXX"
        ),
    ]
    with RedactFixture(FIXTURE_PATH, options) as redacted_path:
        text = textract.process(redacted_path)
        self.assertIn(b"Here are some fake SSNs\n\nXXX-XX-XXXX\n--\n\nXXX-XX-XXXX XXX-XX-XXXX\n\nAnd some more with common OCR character substitutions:\nXXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX", text)