This article collects typical usage examples of the Python method tika.parser.from_file. If you are wondering exactly how parser.from_file is used, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also browse further usage examples from the module the method belongs to, tika.parser.
Ten code examples of parser.from_file are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
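Before the individual examples, here is a minimal sketch of the call itself. The file path is a placeholder, and the exact metadata keys depend on the file and the Tika server version; the returned value is a plain dict, as the examples below also assume.
from tika import parser

# Parse a local file; tika-python starts (or reuses) a local Tika server on first use.
parsed = parser.from_file('/path/to/sample.pdf')  # hypothetical path

print(parsed['metadata'])   # dict of extracted metadata, e.g. 'Content-Type'
print(parsed['content'])    # extracted plain text (may be None for empty documents)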
Example 1: filterFiles
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses the standard-library os module.
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    # Walk the input directory, skipping hidden directories and files.
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))
    # Keep only files for which Tika returns metadata.
    filename_list = (filename for filename in filename_list if "metadata" in parser.from_file(filename))
    if acceptTypes:
        # Keep only files whose Content-Type subtype (e.g. "pdf") is listed in acceptTypes.
        filename_list = (filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes)
    else:
        print "Accepting all MIME Types....."
    return filename_list
Example 2: lazySolr
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses os; filterFiles and stringify are defined elsewhere in the original module.
def lazySolr(inputDir, accept):
    for doc in filterFiles(inputDir, accept):
        parsed = parser.from_file(doc)
        # Build a Solr document keyed by the file's absolute path.
        document = {"id": "file:" + os.path.abspath(inputDir) + "/" + str(parsed["metadata"].pop(u"resourceName", None).encode("utf-8")),
                    "content": parsed["content"]}
        # Map every remaining metadata field to a dynamic "_s_md" Solr field.
        for key in parsed["metadata"]:
            mappedField = key + "_s_md"
            value = stringify(parsed["metadata"][key])
            if value:
                document[mappedField] = value
        yield document
Example 3: filterFiles
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses the standard-library os module.
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    # Walk the input directory, skipping hidden directories and files.
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))
    # Keep only files that Tika can parse at all.
    filename_list = (filename for filename in filename_list if parser.from_file(filename))
    if acceptTypes:
        # Keep only files whose Content-Type subtype (e.g. "pdf") is listed in acceptTypes.
        filename_list = (filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes)
    else:
        print "Accepting all MIME Types....."
    return filename_list
Example 4: filterFiles
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses os; ConnectionError and sleep are imported elsewhere in the original module.
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    # Walk the input directory, skipping hidden directories and files.
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))
    # Keep only files for which Tika returns metadata; back off briefly if the Tika server is unreachable.
    try:
        filename_list = [filename for filename in filename_list if "metadata" in parser.from_file(filename)]
    except ConnectionError:
        sleep(1)
    if acceptTypes:
        # Keep only files whose Content-Type subtype (e.g. "pdf") is listed in acceptTypes.
        filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
    else:
        print "Accepting all MIME Types....."
    return filename_list
Example 5: computeScores
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses csv and itertools; Vector, ConnectionError and sleep come from elsewhere in the original module.
def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])
        # Compare every pair of accepted files.
        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]
                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)
                # Cosine similarity between the two metadata vectors.
                v1 = Vector(file1, file1_parsedData["metadata"])
                v2 = Vector(file2, file2_parsedData["metadata"])
                row_cosine_distance.append(v1.cosTheta(v2))
                a.writerow(row_cosine_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
Example 6: filterFiles
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses the standard-library os module.
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    # Walk the input directory, skipping hidden directories and files.
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))
    # Keep only files for which Tika returns metadata.
    filename_list = (filename for filename in filename_list if "metadata" in parser.from_file(filename))
    if acceptTypes:
        # Keep only files whose Content-Type subtype (e.g. "pdf") is listed in acceptTypes.
        filename_list = (filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes)
    else:
        print "Accepting all MIME Types....."
    return filename_list
Example 7: filterFiles
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses the standard-library os module.
def filterFiles(inputDir, acceptTypes):
    filename_list = []
    # Walk the input directory, skipping hidden directories and files.
    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))
    # Keep only files that Tika can parse at all.
    filename_list = [filename for filename in filename_list if parser.from_file(filename)]
    if acceptTypes:
        # Keep only files whose Content-Type subtype (e.g. "pdf") is listed in acceptTypes.
        filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes]
    else:
        print "Accepting all MIME Types....."
    return filename_list
Example 8: computeScores
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses csv and itertools; reduce is a builtin in Python 2 (functools.reduce in Python 3).
def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])
        # Compare every pair of accepted files.
        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            f1MetaData = parser.from_file(file1)["metadata"]
            f2MetaData = parser.from_file(file2)["metadata"]
            # Jaccard similarity over metadata entries: the intersection counts keys
            # present in both files with identical values.
            isCoExistant = lambda k: (k in f2MetaData) and (f1MetaData[k] == f2MetaData[k])
            intersection = reduce(lambda m, k: (m + 1) if isCoExistant(k) else m, f1MetaData.keys(), 0)
            union = len(f1MetaData.keys()) + len(f2MetaData.keys()) - intersection
            jaccard = float(intersection) / union
            a.writerow([file1, file2, jaccard])
Example 9: handler
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# Also uses tempfile and json; client (Minio), mongo (MongoDB) and ResponseError are set up elsewhere in the original module.
def handler(event, context):
    if event['data']['EventType'] == "s3:ObjectCreated:Put":
        tf = tempfile.NamedTemporaryFile(delete=False)
        bucket = event['data']['Key'].split('/')[0]
        filename = event['data']['Key'].split('/')[1]
        # Fetch the source file from Minio.
        try:
            print('Fetching file')
            client.fget_object(bucket, filename, tf.name)
        except ResponseError as err:
            print('Error fetching file')
            print err
        # OCR/text extraction performed by Tika, addressing an explicit Tika server endpoint.
        print 'Sending file to Tika'
        parsed = parser.from_file(tf.name, 'http://tika-tika-server:80/tika')
        ocrdata = json.dumps(parsed, ensure_ascii=True)
        # Insert the parsed document into MongoDB.
        db = mongo['ocr']
        result = db.processed.insert_one(parsed)
        print 'Document Saved!'
        print('Document processed: {0}'.format(result.inserted_id))
        # Move the OCR'd file to the done bucket.
        try:
            # Copy from the input bucket to the done bucket...
            fullpath = 'input/' + filename
            client.copy_object('done', filename, fullpath)
            # ...then remove it from the input bucket.
            client.remove_object('input', filename)
        except ResponseError as err:
            print err
    else:
        print "Minio file deletion event"
    return "OCR Finished"
Example 10: scan
# Required import: from tika import parser [as alias]
# Or: from tika.parser import from_file [as alias]
# NAME, TYPE and DEFAULTCONF are module-level constants defined elsewhere in the original module.
def scan(filelist, conf=DEFAULTCONF):
    results = []
    for f in filelist:
        metadata = parser.from_file(f).get('metadata', {})
        # Strip any metadata fields the configuration asks to remove.
        for field in conf['remove-entry']:
            if field in metadata:
                del metadata[field]
        results.append((f, metadata))
    # Module-level metadata describing this scanner.
    metadata = {}
    metadata["Name"] = NAME
    metadata["Type"] = TYPE
    return results, metadata