当前位置: 首页>>代码示例>>Python>>正文


Python parser.from_file方法代码示例

本文整理汇总了Python中tika.parser.from_file方法的典型用法代码示例。如果您正苦于以下问题:Python parser.from_file方法的具体用法?Python parser.from_file怎么用?Python parser.from_file使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在tika.parser的用法示例。


在下文中一共展示了parser.from_file方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: filterFiles

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the visible files under ``inputDir`` that Tika can describe.

    The directory tree is walked recursively, ignoring files and directories
    whose names start with a dot.  Each remaining file is sent to Tika exactly
    once; it is kept only when the parse result has a ``"metadata"`` entry
    and, if ``acceptTypes`` is non-empty, when the subtype of its
    ``Content-Type`` (the text after the last ``/``) is in ``acceptTypes``.
    Files without a usable ``Content-Type`` are skipped when filtering by type.

    Args:
        inputDir: Root directory to search.
        acceptTypes: Collection of accepted MIME subtypes (e.g. ``["pdf"]``).
            A falsy value accepts every file that has metadata.

    Returns:
        A generator of file paths; Tika is only contacted as it is consumed.
    """
    candidates = []
    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        candidates.extend(
            os.path.join(root, name) for name in files if not name.startswith('.')
        )

    if not acceptTypes:
        print("Accepting all MIME Types.....")

    def accepted():
        for path in candidates:
            # One request per file: both checks below share this result.
            parsed = parser.from_file(path)
            if not parsed or "metadata" not in parsed:
                continue
            if acceptTypes:
                content_type = (parsed["metadata"] or {}).get("Content-Type")
                # Multi-valued metadata may arrive as a list; use the first value.
                if isinstance(content_type, (list, tuple)):
                    content_type = content_type[0] if content_type else None
                if not content_type:
                    continue
                if content_type.split('/')[-1] not in acceptTypes:
                    continue
            yield path

    return accepted()
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:18,代码来源:ingest.py

示例2: lazySolr

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def lazySolr(inputDir, accept):
    """Yield one Solr-style document dict per accepted file under ``inputDir``.

    Every path produced by ``filterFiles(inputDir, accept)`` is parsed with
    Tika.  The resulting document has:

    * ``"id"``: ``"file:"`` + absolute ``inputDir`` + ``"/"`` + the file's
      ``resourceName`` metadata value (removed from the metadata), falling
      back to the base name of the path when Tika reports no resource name;
    * ``"content"``: the parsed content, or ``None`` when absent;
    * ``"<key>_s_md"``: the ``stringify``-ed value of each remaining metadata
      key, included only when that value is truthy.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        accept: Accepted MIME subtypes, forwarded to ``filterFiles``.

    Yields:
        dict: One document per file.
    """
    base = os.path.abspath(inputDir)

    for doc in filterFiles(inputDir, accept):
        parsed = parser.from_file(doc)
        metadata = parsed["metadata"]

        name = metadata.pop(u"resourceName", None)
        if name is None:
            # Tika did not report a name; calling .encode() on None used to crash.
            name = os.path.basename(doc)
        if not isinstance(name, str):
            # Normalise to the native str type: encode Python 2 unicode,
            # decode Python 3 bytes.
            if isinstance(name, type(u"")):
                name = name.encode("utf-8")
            else:
                name = name.decode("utf-8")

        document = {
            "id": "file:" + base + "/" + name,
            "content": parsed.get("content"),
        }

        for key, raw in metadata.items():
            value = stringify(raw)
            if value:
                document[key + "_s_md"] = value

        yield document
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:19,代码来源:ingest.py

示例3: filterFiles

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the visible files under ``inputDir`` that Tika can parse.

    Hidden files and hidden directories (names starting with ``.``) are
    ignored.  A file is kept when Tika returns a non-empty parse result and,
    if ``acceptTypes`` is non-empty, when the subtype of its ``Content-Type``
    metadata (the text after the last ``/``) is in ``acceptTypes``.  Each file
    is parsed once; files lacking metadata or a ``Content-Type`` are skipped
    when filtering by type instead of raising ``KeyError``.

    Args:
        inputDir: Root directory to search recursively.
        acceptTypes: Collection of accepted MIME subtypes; falsy accepts all
            parseable files.

    Returns:
        A generator of file paths, evaluated on demand.
    """
    paths = []
    for root, dirnames, files in os.walk(inputDir):
        # In-place slice assignment tells os.walk to skip hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for name in files:
            if not name.startswith('.'):
                paths.append(os.path.join(root, name))

    def subtype_of(parsed):
        """Return the MIME subtype from a parse result, or None if unknown."""
        content_type = (parsed.get("metadata") or {}).get("Content-Type")
        if isinstance(content_type, (list, tuple)):
            # Multi-valued metadata may arrive as a list; use the first value.
            content_type = content_type[0] if content_type else None
        if not content_type:
            return None
        return content_type.split('/')[-1]

    def accepted():
        for path in paths:
            parsed = parser.from_file(path)  # single request per file
            if not parsed:
                continue
            if acceptTypes and subtype_of(parsed) not in acceptTypes:
                continue
            yield path

    if not acceptTypes:
        print("Accepting all MIME Types.....")

    return accepted()
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:18,代码来源:argK-means.py

示例4: filterFiles

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def filterFiles(inputDir, acceptTypes):
    """Return the visible files under ``inputDir`` that Tika reports metadata for.

    Hidden files and directories (names starting with ``.``) are ignored.
    Each remaining file is parsed once.  If the request raises
    ``ConnectionError`` the function sleeps for one second and retries that
    file a single time; a file that still cannot be parsed is left out.
    When ``acceptTypes`` is non-empty, a file is kept only if the subtype of
    its ``Content-Type`` (the text after the last ``/``) is in ``acceptTypes``.

    Args:
        inputDir: Root directory to search recursively.
        acceptTypes: Collection of accepted MIME subtypes; falsy accepts every
            file that has metadata.

    Returns:
        list: Paths of the accepted files, in walk order.
    """
    candidates = []
    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        candidates.extend(
            os.path.join(root, name) for name in files if not name.startswith('.')
        )

    def parse_with_retry(path):
        """Parse ``path``, retrying once after a connection failure."""
        try:
            return parser.from_file(path)
        except ConnectionError:
            sleep(1)  # give the Tika server a moment before the retry
        try:
            return parser.from_file(path)
        except ConnectionError:
            return None

    filename_list = []
    for path in candidates:
        parsed = parse_with_retry(path)
        if not parsed or "metadata" not in parsed:
            continue
        if acceptTypes:
            content_type = (parsed["metadata"] or {}).get("Content-Type")
            # Multi-valued metadata may arrive as a list; use the first value.
            if isinstance(content_type, (list, tuple)):
                content_type = content_type[0] if content_type else None
            if not content_type or content_type.split('/')[-1] not in acceptTypes:
                continue
        filename_list.append(path)

    if not acceptTypes:
        print("Accepting all MIME Types.....")

    return filename_list
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:20,代码来源:cosine_similarity.py

示例5: computeScores

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def computeScores(inputDir, outCSV, acceptTypes):
    """Write pairwise cosine similarity scores of file metadata to a CSV file.

    A header row is written first, then one row ``[file1, file2, score]`` for
    every unordered pair of files returned by
    ``filterFiles(inputDir, acceptTypes)``.  The score is
    ``Vector(file1, metadata1).cosTheta(Vector(file2, metadata2))``.

    Each file's Tika metadata is fetched at most once per successful parse and
    reused for every pair it takes part in, instead of being re-requested for
    each pair.

    A pair is skipped when a ``KeyError`` is raised while scoring it (for
    example when a parse result has no ``"metadata"``).  On
    ``ConnectionError`` the function sleeps one second and moves on to the
    next pair.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        outCSV: Path of the CSV file to create or overwrite.
        acceptTypes: Accepted MIME subtypes, forwarded to ``filterFiles``.
    """
    metadata_by_file = {}

    def metadata_for(path):
        # Only successful parses are cached, so a transient failure is
        # retried the next time the file shows up in a pair.
        if path not in metadata_by_file:
            metadata_by_file[path] = parser.from_file(path)["metadata"]
        return metadata_by_file[path]

    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(outCSV, "wb") as outF:
        writer = csv.writer(outF, delimiter=',')
        writer.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        pairs = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in pairs:
            try:
                # NOTE(review): the cached dict is shared across pairs; this
                # assumes Vector does not mutate the metadata it is given.
                v1 = Vector(file1, metadata_for(file1))
                v2 = Vector(file2, metadata_for(file2))
                writer.writerow([file1, file2, v1.cosTheta(v2)])
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue

示例6: filterFiles

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the visible files under ``inputDir`` that have Tika metadata.

    Files and directories whose names start with ``.`` are ignored.  Each
    remaining file is parsed by Tika once, when the returned generator reaches
    it.  A file is yielded when the parse result has a ``"metadata"`` entry
    and, if ``acceptTypes`` is non-empty, the subtype of its ``Content-Type``
    (the text after the last ``/``) is in ``acceptTypes``.  A file with no
    usable ``Content-Type`` is skipped when filtering by type.

    Args:
        inputDir: Root directory to search recursively.
        acceptTypes: Collection of accepted MIME subtypes; falsy accepts every
            file that has metadata.

    Returns:
        A generator of accepted file paths.
    """
    visible = []
    for root, dirnames, files in os.walk(inputDir):
        # Rewriting dirnames in place stops os.walk entering hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        visible += [os.path.join(root, f) for f in files if not f.startswith('.')]

    def type_matches(metadata):
        """Tell whether the metadata's MIME subtype is in ``acceptTypes``."""
        content_type = (metadata or {}).get("Content-Type")
        if isinstance(content_type, (list, tuple)):
            # Multi-valued metadata may arrive as a list; use the first value.
            content_type = content_type[0] if content_type else None
        return bool(content_type) and content_type.split('/')[-1] in acceptTypes

    def accepted():
        for path in visible:
            # Single request per file, shared by both checks.
            parsed = parser.from_file(path)
            if not parsed or "metadata" not in parsed:
                continue
            if not acceptTypes or type_matches(parsed["metadata"]):
                yield path

    if not acceptTypes:
        print("Accepting all MIME Types.....")

    return accepted()
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:18,代码来源:sk_kmeans.py

示例7: filterFiles

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def filterFiles(inputDir, acceptTypes):
    """Return the visible files under ``inputDir`` that Tika can parse.

    Files and directories whose names start with ``.`` are ignored.  Each
    remaining file is parsed by Tika once.  A file is kept when the parse
    result is non-empty and, if ``acceptTypes`` is non-empty, when the subtype
    of its ``Content-Type`` metadata (the text after the last ``/``) is in
    ``acceptTypes``.  Files lacking metadata or a ``Content-Type`` are skipped
    when filtering by type instead of raising ``KeyError``.

    Args:
        inputDir: Root directory to search recursively.
        acceptTypes: Collection of accepted MIME subtypes; falsy accepts every
            parseable file.

    Returns:
        list: Paths of the accepted files, in walk order.
    """
    candidates = []
    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        candidates.extend(
            os.path.join(root, name) for name in files if not name.startswith('.')
        )

    filename_list = []
    for path in candidates:
        parsed = parser.from_file(path)  # single request per file
        if not parsed:
            continue
        if acceptTypes:
            content_type = (parsed.get("metadata") or {}).get("Content-Type")
            # Multi-valued metadata may arrive as a list; use the first value.
            if isinstance(content_type, (list, tuple)):
                content_type = content_type[0] if content_type else None
            if not content_type or content_type.split('/')[-1] not in acceptTypes:
                continue
        filename_list.append(path)

    if not acceptTypes:
        print("Accepting all MIME Types.....")

    return filename_list
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:18,代码来源:jaccard_similarity.py

示例8: computeScores

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def computeScores(inputDir, outCSV, acceptTypes):
    """Write pairwise Jaccard similarity scores of file metadata to a CSV file.

    A header row is written first, then one row ``[file1, file2, jaccard]``
    for every unordered pair of files returned by
    ``filterFiles(inputDir, acceptTypes)``.  The intersection counts the
    metadata keys present in both files with equal values; the union is the
    sum of both key counts minus that intersection.

    Each file is parsed by Tika once and its metadata reused for every pair.
    Pairs in which either file has no ``"metadata"`` entry are skipped.  When
    both files have empty metadata the score is written as ``0.0`` rather than
    dividing by zero.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        outCSV: Path of the CSV file to create or overwrite.
        acceptTypes: Accepted MIME subtypes, forwarded to ``filterFiles``.
    """
    metadata_by_file = {}

    def metadata_for(path):
        # None is cached for files whose parse result has no metadata.
        if path not in metadata_by_file:
            parsed = parser.from_file(path) or {}
            metadata_by_file[path] = parsed.get("metadata")
        return metadata_by_file[path]

    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(outCSV, "wb") as outF:
        writer = csv.writer(outF, delimiter=',')
        writer.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        pairs = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in pairs:
            f1MetaData = metadata_for(file1)
            f2MetaData = metadata_for(file2)
            if f1MetaData is None or f2MetaData is None:
                continue

            intersection = sum(
                1 for key in f1MetaData
                if key in f2MetaData and f1MetaData[key] == f2MetaData[key]
            )
            union = len(f1MetaData) + len(f2MetaData) - intersection
            # union is 0 only when both metadata dicts are empty.
            jaccard = float(intersection) / union if union else 0.0

            writer.writerow([file1, file2, jaccard])
开发者ID:chrismattmann,项目名称:tika-similarity,代码行数:22,代码来源:jaccard_similarity.py

示例9: handler

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def handler(event, context):
    """Run Tika extraction on a newly uploaded Minio object and store the result.

    For an ``s3:ObjectCreated:Put`` event the object named by
    ``event['data']['Key']`` (``"<bucket>/<filename>"``) is downloaded to a
    temporary file, parsed by the Tika server, inserted into the
    ``ocr.processed`` MongoDB collection, and then copied to the ``done``
    bucket and removed from the ``input`` bucket.  The temporary file is
    always deleted afterwards.  Any other event type is only logged.

    Args:
        event: Event payload; ``event['data']`` must provide ``EventType``
            and, for put events, ``Key``.
        context: Invocation context (unused).

    Returns:
        str: ``"OCR Finished"`` on success or for ignored events,
        ``"OCR Failed"`` when the source object could not be fetched.
    """
    import os  # local import: needed only to clean up the temporary file

    if event['data']['EventType'] != "s3:ObjectCreated:Put":
        print("Minio file deletion event")
        return "OCR Finished"

    key_parts = event['data']['Key'].split('/')
    bucket = key_parts[0]
    filename = key_parts[1]

    tf = tempfile.NamedTemporaryFile(delete=False)
    # Only the path is needed; close our own handle so it is not leaked.
    tf.close()
    try:
        # Fetching source file from Minio
        try:
            print('Fetching file')
            client.fget_object(bucket, filename, tf.name)
        except ResponseError as err:
            print('Error fetching file')
            print(err)
            # Nothing was downloaded, so do not parse, store or move anything.
            return "OCR Failed"

        # OCR text extraction performed by Tika
        print('Sending file to Tika')
        parsed = parser.from_file(tf.name, 'http://tika-tika-server:80/tika')
    finally:
        # delete=False means the temporary file must be removed explicitly.
        if os.path.exists(tf.name):
            os.remove(tf.name)

    # MongoDB document insertion
    db = mongo['ocr']
    result = db.processed.insert_one(parsed)
    print('Document Saved!')
    print('Document proccessed: {0}'.format(result.inserted_id))

    # Move OCRd file to done bucket
    try:
        # Copy from input bucket to done bucket
        fullpath = 'input/' + filename
        client.copy_object('done', filename, fullpath)
        # Remove from input bucket
        client.remove_object('input', filename)
    except ResponseError as err:
        print(err)

    return "OCR Finished"
开发者ID:kubeless,项目名称:functions,代码行数:40,代码来源:ocr.py

示例10: scan

# 需要导入模块: from tika import parser [as 别名]
# 或者: from tika.parser import from_file [as 别名]
def scan(filelist, conf=DEFAULTCONF):
    """Collect Tika metadata for each file, minus the configured fields.

    Args:
        filelist: Iterable of file paths to parse.
        conf: Configuration mapping; ``conf['remove-entry']`` lists the
            metadata fields to drop from every result.

    Returns:
        tuple: ``(results, metadata)`` where ``results`` is a list of
        ``(path, file_metadata)`` pairs and ``metadata`` is a dict holding
        the module's ``NAME`` and ``TYPE`` under ``"Name"`` and ``"Type"``.
    """
    results = []

    for path in filelist:
        # A parse result without metadata yields an empty dict.
        file_metadata = parser.from_file(path).get('metadata', {})
        for unwanted in conf['remove-entry']:
            file_metadata.pop(unwanted, None)
        results.append((path, file_metadata))

    module_metadata = {"Name": NAME, "Type": TYPE}
    return results, module_metadata
开发者ID:mitre,项目名称:multiscanner,代码行数:16,代码来源:Tika.py


注:本文中的tika.parser.from_file方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。