This article collects typical usage examples of the textract.process function in Python. If you have been wondering what textract.process does, how it is used, or what real calls look like, the curated code samples here may help.
Below, 15 code examples of the process function are shown, sorted by popularity by default.
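Before the collected examples, here is a minimal sketch of the basic call, just to orient the reader. It is not one of the 15 examples below, and the file name is only a placeholder; note that textract.process returns bytes, so the result is usually decoded before further processing.

import textract

# The parser is selected automatically from the file extension.
raw = textract.process('report.pdf')   # returns bytes
text = raw.decode('utf-8')             # decode to str for further processing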
Example 1: test_missing_filename_python
def test_missing_filename_python(self):
    """Make sure missing files raise the correct error"""
    filename = self.get_temp_filename()
    os.remove(filename)
    import textract
    from textract.exceptions import MissingFileError
    with self.assertRaises(MissingFileError):
        textract.process(filename)
Example 2: test_unsupported_extension_python
def test_unsupported_extension_python(self):
    """Make sure unsupported extensions raise the correct error"""
    filename = self.get_temp_filename(extension="extension")
    import textract
    from textract.exceptions import ExtensionNotSupported
    with self.assertRaises(ExtensionNotSupported):
        textract.process(filename)
    os.remove(filename)
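Examples 1 and 2 come from textract's own test suite and exercise its two error paths. Application code can catch the same exceptions directly; here is a minimal sketch (the safe_extract helper is illustrative, not taken from the collected examples):

import textract
from textract.exceptions import MissingFileError, ExtensionNotSupported

def safe_extract(path):
    """Return the extracted bytes, or None if the file is missing or unsupported."""
    try:
        return textract.process(path)
    except (MissingFileError, ExtensionNotSupported):
        return None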
Example 3: annotate_doc
def annotate_doc(pdf_file_path, ontologies):
    text = None
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        # read bytes so the decode below applies uniformly
        with open(pdf_file_path, 'rb') as file:
            text = file.read()
    db = DBConnect()
    # guard against unsupported extensions as well as whitespace-only extractions
    if text is None or text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in annotation process',
            'exception': '',
            'data': ''
        }
        db.insert_log(log)
        return
    ontologies = ",".join(ontologies)
    annotations = []
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
                     display_links='true', display_context='false',
                     minimum_match_length='3', exclude_numbers='true',
                     longest_only='true', ontologies=ontologies,
                     exclude_synonyms='true')
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                context_begin = annotation['from'] if annotation['from'] - 40 < 1 else annotation['from'] - 40
                context_end = annotation['to'] if annotation['to'] + 40 > len(text) else annotation['to'] + 40
                record = {
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id': result['annotatedClass']['links']['ontology'],
                    'text': annotation['text'],
                    'match_type': annotation['matchType'],
                    'context': text[context_begin:context_end]
                }
                annotations.append(record)
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError) as e:
        print(e)
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Annotator',
            'exception': str(e),
            'data': ''
        }
        db.insert_log(log)
        return
Example 4: pdftotext_any
def pdftotext_any(myfile):
    # TODO: use tempfile instead of a fixed path (a sketch follows this example)
    path = '/tmp/infile.pdf'
    with open(path, 'wb') as f:
        f.write(myfile)
    text = textract.process(path, method='pdftotext')
    if len(text) < 5:  # No text found; probably an image scan, so run OCR instead
        text = textract.process(path, method='tesseract')
    return text
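Picking up the TODO in Example 4, a tempfile-based variant might look like the following sketch (illustrative, not one of the collected examples). The suffix matters because textract selects its parser from the file extension, and reading the temp file back by name assumes a POSIX system.

import tempfile
import textract

def pdftotext_any(myfile):
    # Write the in-memory PDF bytes to a named temp file so textract can read it.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as f:
        f.write(myfile)
        f.flush()
        text = textract.process(f.name, method='pdftotext')
        if len(text) < 5:  # probably a scanned image, so fall back to OCR
            text = textract.process(f.name, method='tesseract')
    return text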
Example 5: process_text_file
def process_text_file(file_path):
    file_name, extension = os.path.splitext(file_path)
    print(file_name, extension)
    if extension == ".txt":
        return file_path
    elif extension == '.epub':
        print("Trying epub")
        try:
            text = textract.process(file_path)
            print("Processed epub: ", file_path)
            output_path = file_name + '.txt'
            # process() returns bytes, so write the output in binary mode
            with open(output_path, 'wb') as output_file:
                output_file.write(text)
            print("Converted epub: ", output_path)
            return output_path
        except Exception as error:
            # TODO: textract raises its own error, so None isn't returned on try failure
            print(error)
            print('Failed to convert epub: ', file_path)
            return None
    elif extension == "":
        text_content = None
        try:
            with open(file_path) as input_file:
                text_content = input_file.read()
            if text_content:
                print("Managed to read file: ", file_path)
                return file_path
        except IOError:
            print("Failed to read file: ", file_path)
            return None
    else:
        print('Unsupported file type: ', file_path)
        return None
Example 6: get_text_from_files
def get_text_from_files(files_to_process):
    """Extracts text from each file given a list of file names"""
    file_text_dict = {}
    for file_name in files_to_process:
        extracted_text = textract.process(file_name)
        file_text_dict[file_name] = extracted_text
    return file_text_dict
Example 7: get_text_from_file
def get_text_from_file(self, file):
    filename = file['id'] + '.pdf'
    self._download_file(file, filename)
    text = textract.process(filename)
    os.remove(filename)
    return text
Example 8: extract_text_from_lectureDocuments
def extract_text_from_lectureDocuments(self):
    # pull files from the database
    lectureDocumentsObjects = lectureDocuments.objects.filter(extracted=False)
    # loop through the documents and pull all text
    for lectureDocumentsObject in lectureDocumentsObjects:
        if lectureDocumentsObject.document:
            print(lectureDocumentsObject.document)
            path_to_file = MEDIA_ROOT + '/' + str(lectureDocumentsObject.document)
            # decode the extracted bytes so TextBlob receives a str
            document_contents = textract.process(path_to_file, encoding='ascii').decode('ascii')
            # create tags from noun_phrases; only add tags if none exist
            blobbed = TextBlob(document_contents)
            np = blobbed.noun_phrases
            np = list(set(np))
            np = [s for s in np if s]
            lectureDocumentsObject.tags.clear()
            for item in np:
                s = ''.join(ch for ch in item if ch not in exclude)
                print(s)
                lectureDocumentsObject.tags.add(s)
            # save this string
            lectureDocumentsObject.document_contents = document_contents
            lectureDocumentsObject.extracted = True
            lectureDocumentsObject.save()
Example 9: save
def save(self, *args, **kwargs):
    super(Document, self).save(*args, **kwargs)
    text = textract.process(self.source_file.url)
    filtered_stems = self.get_filtered_stems(text)
    self.total_word_count = len(filtered_stems)
    self.count_target_words(filtered_stems)
    super(Document, self).save(*args, **kwargs)
Example 10: extract
def extract(path):
    '''
    Extract full text from PDFs

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.
    :return: [bytes] the extracted text

    Usage::

        from pyminer import miner

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    '''
    text = textract.process(path)
    return text
Example 11: build_indexes
def build_indexes(files_list, index_file):
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['
    hash_index = {}
    for item in files_list:
        # textract returns bytes; decode before regex splitting
        text = textract.process(item).decode('utf-8', errors='ignore')
        details = re.split("[, \t\n\\t:;]", text)
        for i in details:
            if i == "":
                continue
            # count occurrences of each term per file
            file_counts = hash_index.setdefault(i, {})
            file_counts[item] = file_counts.get(item, 0) + 1
        # update the bar
        sys.stdout.write("-")
        sys.stdout.flush()
    sys.stdout.write("\n")
    with open(index_file, "w") as fp:
        json.dump(hash_index, fp)
Example 12: get_path_details
def get_path_details(cls, temp_path, image_path):
    """Return the byte sequence and the full text for a given path."""
    byte_sequence = ByteSequence.from_path(temp_path)
    extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
    logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                  extension, byte_sequence.sha1)
    full_text = ""
    if extension is not None:
        try:
            logging.debug("Textract for SHA1 %s, extension map val %s",
                          byte_sequence.sha1, extension)
            full_text = process(temp_path, extension=extension, encoding='ascii',
                                preserveLineBreaks=True)
        except ExtensionNotSupported:
            logging.exception("Textract extension not supported for ext %s", extension)
            logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
            full_text = "N/A"
        except LookupError:
            logging.exception("Lookup error for encoding.")
            logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
            full_text = "N/A"
        except UnicodeDecodeError:
            logging.exception("UnicodeDecodeError, problem with file encoding")
            logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
            full_text = "N/A"
        except Exception:
            logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
            logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
            full_text = "N/A"
    return byte_sequence, full_text
Example 13: indexing
def indexing():
    ana = analysis.StemmingAnalyzer()
    schema = Schema(title=TEXT(analyzer=ana, spelling=True), path=ID(stored=True), content=TEXT)
    ix = create_in("data/pdf_data", schema)
    writer = ix.writer()
    count = 0
    with open('Final_Links/doc_links.txt') as fp, open('data/pdf_data/mytemp/doc_content.txt', 'w+') as f:
        for line in fp:
            count += 1
            url = line.strip()
            doc_name = re.search('.*/(.*)', url).group(1)
            try:
                response = urllib.request.urlopen(url, timeout=3)
                if int(response.headers['content-length']) > 2475248:
                    continue
                with open("data/pdf_data/mytemp/" + doc_name, 'wb') as fil:
                    fil.write(response.read())
                # process() returns bytes; decode for the text file and the index
                content_text = textract.process('data/pdf_data/mytemp/' + doc_name,
                                                encoding='ascii').decode('ascii')
                f.write(content_text)
                writer.add_document(title=url, path=url, content=content_text)
                writer.add_document(title=url, path=url, content=url)
            except Exception as e:
                print("Caught exception at " + url + ": " + str(e))
                continue
            print(str(count) + " in URL: " + url)
    writer.commit()
    print("Indexing Completed!")
Example 14: parse_sentences
def parse_sentences(pdf):
    # decode the extracted bytes; undecodable bytes survive as literal '\xNN'
    # escapes, which the filter below then discards
    text = textract.process(pdf).decode('utf-8', errors='backslashreplace')
    reg = "[.?!]"
    sentences = re.split(reg, text)
    return [s for s in sentences if "\\x" not in s]
Example 15: compare_python_output
def compare_python_output(self, filename, expected_filename=None, **kwargs):
    if expected_filename is None:
        expected_filename = self.get_expected_filename(filename, **kwargs)
    import textract
    result = textract.process(filename, **kwargs)
    # process() returns bytes, so compare against the raw file contents
    with open(expected_filename, 'rb') as stream:
        self.assertEqual(result, stream.read())