當前位置: 首頁>>代碼示例>>Python>>正文


Python parser.from_file方法代碼示例

本文整理匯總了Python中tika.parser.from_file方法的典型用法代碼示例。如果您正苦於以下問題:Python parser.from_file方法的具體用法?Python parser.from_file怎麼用?Python parser.from_file使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在tika.parser的用法示例。


在下文中一共展示了parser.from_file方法的10個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: filterFiles

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the non-hidden files under ``inputDir`` that pass the filters.

    The directory tree is walked eagerly, skipping every file and directory
    whose name starts with ``'.'``. The returned generator then parses each
    candidate with ``parser.from_file`` and keeps it only if the parse result
    contains a ``"metadata"`` entry and, when ``acceptTypes`` is non-empty,
    the last ``/``-separated part of its ``Content-Type`` is in
    ``acceptTypes``.

    Compared with stacking two generator expressions, each file is parsed
    once instead of twice, and a file whose metadata has no ``Content-Type``
    is skipped instead of raising ``KeyError``.

    Args:
        inputDir: Root directory to walk.
        acceptTypes: Collection of accepted MIME subtypes (the part after the
            last ``/``). A falsy value accepts every MIME type.

    Returns:
        A generator of file paths; parsing happens while it is consumed.
    """
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    if not acceptTypes:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Accepting all MIME Types.....")

    def accepted():
        for filename in filename_list:
            # One parse per file serves both the metadata and the type check.
            parsed = parser.from_file(filename)
            if "metadata" not in parsed:
                continue
            if acceptTypes:
                content_type = (parsed['metadata'] or {}).get('Content-Type')
                if content_type is None:
                    continue
                if content_type.split('/')[-1] not in acceptTypes:
                    continue
            yield filename

    return accepted()
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:18,代碼來源:ingest.py

示例2: lazySolr

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def lazySolr(inputDir, accept):
    """Lazily build one indexable document per accepted file under ``inputDir``.

    Every path produced by ``filterFiles(inputDir, accept)`` is parsed with
    ``parser.from_file`` and turned into a dict containing:

    * ``"id"``: ``"file:" + <absolute inputDir> + "/" + <resourceName>``
    * ``"content"``: the parse result's ``"content"`` entry
    * ``"<key>_s_md"``: one entry per remaining metadata key whose
      ``stringify``-ed value is truthy

    ``resourceName`` is removed from the metadata before the per-key fields
    are produced. If the metadata has no ``resourceName``, the base name of
    the file path is used for the id instead of failing on ``None.encode``.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        accept: Accepted MIME subtypes handed to ``filterFiles``.

    Yields:
        dict: One document per accepted file.
    """
    base_dir = os.path.abspath(inputDir)

    for doc in filterFiles(inputDir, accept):
        parsed = parser.from_file(doc)
        metadata = parsed["metadata"]

        resourceName = metadata.pop(u"resourceName", None)
        if resourceName is None:
            resourceName = os.path.basename(doc)

        # Normalise to the native ``str`` type so the concatenation below
        # gives a plain string on Python 2 and never "b'...'" on Python 3.
        if not isinstance(resourceName, str):
            if isinstance(resourceName, bytes):
                resourceName = resourceName.decode("utf-8")
            else:
                resourceName = resourceName.encode("utf-8")

        # NOTE(review): the id joins inputDir with resourceName rather than
        # using ``doc``; if resourceName is only a base name, files found in
        # subdirectories get ids without the subdirectory -- confirm intent.
        document = {
            "id": "file:" + base_dir + "/" + resourceName,
            "content": parsed["content"],
        }

        for key in metadata:
            mappedField = key + "_s_md"

            value = stringify(metadata[key])
            if value:
                document[mappedField] = value

        yield document
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:19,代碼來源:ingest.py

示例3: filterFiles

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the non-hidden files under ``inputDir`` that pass the filters.

    The directory tree is walked eagerly, skipping every file and directory
    whose name starts with ``'.'``. The returned generator then parses each
    candidate with ``parser.from_file`` and keeps it only if the parse result
    is truthy and, when ``acceptTypes`` is non-empty, the last
    ``/``-separated part of its ``Content-Type`` is in ``acceptTypes``.

    Each file is parsed once rather than once per filter stage. A file whose
    parse result lacks ``metadata`` or ``Content-Type`` is skipped when a
    type filter is active, instead of raising ``KeyError``.

    Args:
        inputDir: Root directory to walk.
        acceptTypes: Collection of accepted MIME subtypes (the part after the
            last ``/``). A falsy value accepts every MIME type.

    Returns:
        A generator of file paths; parsing happens while it is consumed.
    """
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    if not acceptTypes:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Accepting all MIME Types.....")

    def accepted():
        for filename in filename_list:
            parsed = parser.from_file(filename)
            if not parsed:
                continue
            if acceptTypes:
                metadata = parsed.get('metadata') or {}
                content_type = metadata.get('Content-Type')
                if content_type is None:
                    continue
                if content_type.split('/')[-1] not in acceptTypes:
                    continue
            yield filename

    return accepted()
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:18,代碼來源:argK-means.py

示例4: filterFiles

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def filterFiles(inputDir, acceptTypes):
    """Return the non-hidden files under ``inputDir`` that pass the filters.

    The directory tree is walked, skipping every file and directory whose
    name starts with ``'.'``. Each candidate is parsed with
    ``parser.from_file`` and kept only if the parse result contains a
    ``"metadata"`` entry and, when ``acceptTypes`` is non-empty, the last
    ``/``-separated part of its ``Content-Type`` is in ``acceptTypes``.

    ``ConnectionError`` is handled per file: the parse is retried once after
    a one-second pause, and the file is skipped if the retry also fails.
    Wrapping the whole filtering pass in a single ``try`` would instead let
    one error discard the metadata filter for every file. Each file is
    parsed once, and a missing ``Content-Type`` skips the file instead of
    raising ``KeyError``.

    Args:
        inputDir: Root directory to walk.
        acceptTypes: Collection of accepted MIME subtypes (the part after the
            last ``/``). A falsy value accepts every MIME type.

    Returns:
        list: Paths of the accepted files.
    """
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    accepted = []
    for filename in filename_list:
        try:
            parsed = parser.from_file(filename)
        except ConnectionError:
            # Give the connection a moment to recover, then try once more.
            sleep(1)
            try:
                parsed = parser.from_file(filename)
            except ConnectionError:
                continue

        if "metadata" not in parsed:
            continue
        if acceptTypes:
            content_type = (parsed['metadata'] or {}).get('Content-Type')
            if content_type is None:
                continue
            if content_type.split('/')[-1] not in acceptTypes:
                continue
        accepted.append(filename)

    if not acceptTypes:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Accepting all MIME Types.....")

    return accepted
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:20,代碼來源:cosine_similarity.py

示例5: computeScores

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def computeScores(inputDir, outCSV, acceptTypes):
    """Write one cosine-similarity row per pair of accepted files to ``outCSV``.

    A header row is written first, then one row
    ``[file1, file2, v1.cosTheta(v2)]`` for every 2-combination of the paths
    returned by ``filterFiles(inputDir, acceptTypes)``, where each vector is
    ``Vector(path, metadata)`` built from that file's parsed metadata.

    Metadata is parsed once per file and cached, rather than re-parsed for
    every pair the file takes part in. A pair is skipped when parsing raises
    ``ConnectionError`` (after a one-second pause) or when a ``KeyError`` is
    raised while building or comparing its vectors; failed parses are not
    cached, so later pairs try that file again.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        outCSV: Path of the CSV file to create or overwrite.
        acceptTypes: Accepted MIME subtypes handed to ``filterFiles``.
    """
    metadata_cache = {}

    def metadata_for(path):
        if path not in metadata_cache:
            metadata_cache[path] = parser.from_file(path)["metadata"]
        cached = metadata_cache[path]
        # Hand out a shallow copy so anything Vector does to the mapping
        # cannot leak into the cached entry used for later pairs.
        return dict(cached) if isinstance(cached, dict) else cached

    # NOTE(review): "wb" is the Python 2 idiom for csv files; Python 3's
    # csv.writer needs a text-mode file opened with newline=''.
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                file1_metadata = metadata_for(file1)
                file2_metadata = metadata_for(file2)

                v1 = Vector(file1, file1_metadata)
                v2 = Vector(file2, file2_metadata)

                a.writerow([file1, file2, v1.cosTheta(v2)])
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:27,代碼來源:cosine_similarity.py

示例6: filterFiles

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def filterFiles(inputDir, acceptTypes):
    """Lazily yield the non-hidden files under ``inputDir`` that pass the filters.

    Files and directories whose name starts with ``'.'`` are ignored while
    walking the tree. The returned generator parses each remaining file with
    ``parser.from_file`` and yields its path only if the parse result has a
    ``"metadata"`` entry and, when ``acceptTypes`` is non-empty, the last
    ``/``-separated part of its ``Content-Type`` is in ``acceptTypes``.

    A single parse per file feeds both checks, and a file without a
    ``Content-Type`` is skipped rather than raising ``KeyError``.

    Args:
        inputDir: Root directory to walk.
        acceptTypes: Collection of accepted MIME subtypes (the part after the
            last ``/``). A falsy value accepts every MIME type.

    Returns:
        A generator of file paths; parsing happens while it is consumed.
    """
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    if not acceptTypes:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Accepting all MIME Types.....")

    def accepted():
        for filename in filename_list:
            parsed = parser.from_file(filename)
            if "metadata" not in parsed:
                continue
            if acceptTypes:
                content_type = (parsed['metadata'] or {}).get('Content-Type')
                if content_type is None:
                    continue
                if content_type.split('/')[-1] not in acceptTypes:
                    continue
            yield filename

    return accepted()
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:18,代碼來源:sk_kmeans.py

示例7: filterFiles

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def filterFiles(inputDir, acceptTypes):
    """Return the non-hidden files under ``inputDir`` that pass the filters.

    Files and directories whose name starts with ``'.'`` are ignored while
    walking the tree. Each remaining file is parsed with
    ``parser.from_file`` and kept only if the parse result is truthy and,
    when ``acceptTypes`` is non-empty, the last ``/``-separated part of its
    ``Content-Type`` is in ``acceptTypes``.

    Each file is parsed once rather than once per filtering pass. A file
    whose parse result lacks ``metadata`` or ``Content-Type`` is skipped
    when a type filter is active, instead of raising ``KeyError``.

    Args:
        inputDir: Root directory to walk.
        acceptTypes: Collection of accepted MIME subtypes (the part after the
            last ``/``). A falsy value accepts every MIME type.

    Returns:
        list: Paths of the accepted files.
    """
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        # Prune in place so os.walk never descends into hidden directories.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    accepted = []
    for filename in filename_list:
        parsed = parser.from_file(filename)
        if not parsed:
            continue
        if acceptTypes:
            metadata = parsed.get('metadata') or {}
            content_type = metadata.get('Content-Type')
            if content_type is None:
                continue
            if content_type.split('/')[-1] not in acceptTypes:
                continue
        accepted.append(filename)

    if not acceptTypes:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Accepting all MIME Types.....")

    return accepted
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:18,代碼來源:jaccard_similarity.py

示例8: computeScores

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def computeScores(inputDir, outCSV, acceptTypes):
    """Write one Jaccard-similarity row per pair of accepted files to ``outCSV``.

    A header row is written first, then one row ``[file1, file2, score]`` for
    every 2-combination of the paths returned by
    ``filterFiles(inputDir, acceptTypes)``.

    The score treats each file's parsed metadata as a set of entries:

    * intersection: keys of file1 that also exist in file2 with an equal value
    * union: ``len(file1 metadata) + len(file2 metadata) - intersection``
    * score: ``intersection / union``

    When both metadata mappings are empty the union is zero; the score is
    then reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    Metadata is parsed once per file and cached, rather than re-parsed for
    every pair the file takes part in.

    Args:
        inputDir: Root directory handed to ``filterFiles``.
        outCSV: Path of the CSV file to create or overwrite.
        acceptTypes: Accepted MIME subtypes handed to ``filterFiles``.
    """
    metadata_cache = {}

    def metadata_for(path):
        if path not in metadata_cache:
            metadata_cache[path] = parser.from_file(path)["metadata"]
        return metadata_cache[path]

    # NOTE(review): "wb" is the Python 2 idiom for csv files; Python 3's
    # csv.writer needs a text-mode file opened with newline=''.
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)

        for file1, file2 in files_tuple:
            f1MetaData = metadata_for(file1)
            f2MetaData = metadata_for(file2)

            # A plain count; avoids the Python-2-only builtin ``reduce``.
            intersection = sum(
                1 for k in f1MetaData
                if k in f2MetaData and f1MetaData[k] == f2MetaData[k]
            )

            union = len(f1MetaData) + len(f2MetaData) - intersection
            jaccard = float(intersection) / union if union else 0.0

            a.writerow([file1, file2, jaccard])
開發者ID:chrismattmann,項目名稱:tika-similarity,代碼行數:22,代碼來源:jaccard_similarity.py

示例9: handler

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def handler(event, context):
    """Process an object-created event: fetch, parse with Tika, store, archive.

    For an event whose ``event['data']['EventType']`` is
    ``"s3:ObjectCreated:Put"`` the handler:

    1. splits ``event['data']['Key']`` on ``/`` into bucket and file name,
    2. downloads the object into a temporary file via ``client.fget_object``,
    3. parses it with ``parser.from_file`` against the Tika server URL,
    4. inserts the parse result into the ``processed`` collection of the
       ``ocr`` database,
    5. copies the object to the ``done`` bucket and removes it from ``input``.

    If the download fails, the error is printed and steps 3-5 are skipped,
    so an empty temporary file is never parsed, stored, or used as grounds
    for moving the object to ``done``. The temporary file is always removed.
    Any other event type only prints a message.

    Args:
        event: Event payload; ``event['data']`` must provide ``EventType``
            and, for Put events, ``Key``.
        context: Unused.

    Returns:
        str: Always ``"OCR Finished"``.
    """
    import os  # local import: only needed for temp-file cleanup

    if event['data']['EventType'] == "s3:ObjectCreated:Put":
        bucket = event['data']['Key'].split('/')[0]
        filename = event['data']['Key'].split('/')[1]

        # Only the path is needed; close our own handle immediately so the
        # descriptor is not kept open for the lifetime of the process.
        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.close()

        try:
            # Fetching source file from Minio
            try:
                print('Fetching file')
                client.fget_object(bucket, filename, tf.name)
            except ResponseError as err:
                print('Error fetching file')
                print(err)
            else:
                # OCR text extraction performed by Tika
                print('Sending file to Tika')
                parsed = parser.from_file(tf.name, 'http://tika-tika-server:80/tika')

                # MongoDB document insertion
                db = mongo['ocr']
                result = db.processed.insert_one(parsed)
                print('Document Saved!')
                print('Document proccessed: {0}'.format(result.inserted_id))

                # move OCRd file to done bucket
                try:
                    # Copy from input bucket to done bucket
                    fullpath = 'input/' + filename
                    client.copy_object('done', filename, fullpath)
                    # Remove from input bucket
                    client.remove_object('input', filename)
                except ResponseError as err:
                    print(err)
        finally:
            # delete=False means nobody else will clean this file up.
            try:
                os.unlink(tf.name)
            except OSError:
                pass
    else:
        print("Minio file deletion event")

    return "OCR Finished"
開發者ID:kubeless,項目名稱:functions,代碼行數:40,代碼來源:ocr.py

示例10: scan

# 需要導入模塊: from tika import parser [as 別名]
# 或者: from tika.parser import from_file [as 別名]
def scan(filelist, conf=DEFAULTCONF):
    """Collect Tika metadata for each file, minus the configured entries.

    Every path in ``filelist`` is parsed with ``parser.from_file``; the
    ``'metadata'`` entry of the result (an empty dict when the key is
    absent) has each key listed in ``conf['remove-entry']`` deleted if
    present, and is paired with the path.

    Args:
        filelist: Iterable of file paths to parse.
        conf: Configuration mapping; ``conf['remove-entry']`` lists the
            metadata keys to drop. Defaults to ``DEFAULTCONF``.

    Returns:
        tuple: ``(results, info)`` where ``results`` is a list of
        ``(path, metadata)`` tuples and ``info`` is
        ``{"Name": NAME, "Type": TYPE}``.
    """
    scanned = []

    for path in filelist:
        file_meta = parser.from_file(path).get('metadata', {})
        for unwanted in conf['remove-entry']:
            if unwanted in file_meta:
                del file_meta[unwanted]
        scanned.append((path, file_meta))

    module_info = {"Name": NAME, "Type": TYPE}
    return scanned, module_info
開發者ID:mitre,項目名稱:multiscanner,代碼行數:16,代碼來源:Tika.py


注:本文中的tika.parser.from_file方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。