Python fuzz.partial_ratio Method Code Examples

This article collects typical usage examples of the fuzz.partial_ratio method from Python's fuzzywuzzy.fuzz module. If you are wondering what fuzz.partial_ratio does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore other usage examples of the fuzzywuzzy.fuzz module.


The sections below present 15 code examples of the fuzz.partial_ratio method, ordered by popularity by default.
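Before the project-level examples, here is a minimal standalone sketch of how the method is typically called (assuming only that the fuzzywuzzy package is installed; the sample strings are illustrative and not taken from any of the projects below):

from fuzzywuzzy import fuzz

# partial_ratio compares the shorter string against the best-matching
# substring of the longer one, so a query embedded in a longer sentence
# can still score 100, whereas fuzz.ratio compares the full strings.
print(fuzz.ratio("fuzzy wuzzy", "fuzzy wuzzy was a bear"))          # lower score: whole-string comparison
print(fuzz.partial_ratio("fuzzy wuzzy", "fuzzy wuzzy was a bear"))  # 100: exact substring match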

Example 1: extract_features

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df 
Author: aerdem4, Project: kaggle-quora-dup, Lines: 26, Source: nlp_feature_extraction.py

Example 2: inquiry

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def inquiry(self):
		sentence = self.line_edit.text()
		matched = []
		score_thresh = self.getScoreThresh()
		if not sentence:
			QMessageBox.warning(self, "Warning", '請先輸入需要查詢的魯迅名言')
		else:
			for p in self.paragraphs:
				score = fuzz.partial_ratio(p, sentence)
				if score >= score_thresh and len(sentence) <= len(p):
					matched.append([score, p])
			infos = []
			for match in matched:
				infos.append('[匹配度]: %d\n[內容]: %s\n' % (match[0], match[1]))
			if not infos:
				infos.append('未匹配到任何相似度大於%d的句子.\n' % score_thresh)
			self.text.setText('\n\n\n'.join(infos)[:-1]) 
Author: CharlesPikachu, Project: Tools, Lines: 19, Source: main.py

Example 3: add_dup_simhash_caches

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    if not dup_obj_ids:
        return
    old_dup_obj_ids = set(dup_obj_ids)
    start_time = time.time()
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            try:
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            except Exception as e:
                print(e)
                continue
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info(simhashcache.text)
            logging.info('--' * 20)
            logging.info(dup_simhash.text)
            logging.info("%d %s %s" % (sim_ratio, simhashcache.obj_id, dup_simhash.obj_id))

            # compare by obj_id: old_dup_obj_ids holds ids, not SimHashCache objects
            if dup_obj_id not in old_dup_obj_ids:
                if sim_ratio > 50:
                    old_dup_obj_ids.add(dup_obj_id)
            else:
                if sim_ratio <= 50:
                    old_dup_obj_ids.remove(dup_obj_id) 
Author: likaiguo, Project: simhashpy, Lines: 27, Source: __init__.py

Example 4: search

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def search(self, targets, partial=True, fuzzy=False):
        allInstances = self.instances()
        matchedInstances = set()

        for host in targets:
            for instance in allInstances:
                names = [instance.name]
                if instance.aliases is not None:
                    names += list(instance.aliases)
                for name in names:
                    if host.lower() == name.lower():
                        matchedInstances.add((100, instance))
                    elif partial and host.lower() in name.lower():
                        matchedInstances.add((99, instance))

                    if fuzzy:
                        score = fuzz.partial_ratio(host.lower(), name.lower())
                        if score > 85 or host.lower() in name.lower():
                            matchedInstances.add((score, instance))

        # it is possible for the same instance to be matched, if so, it should only
        # appear on the return list once (still ordered by the most probable match)
        return list(collections.OrderedDict([(v, None) for k, v in sorted(list(matchedInstances))]).keys()) 
Author: wagoodman, Project: bridgy, Lines: 25, Source: source.py

Example 5: _create_fuzzy_wuzzy_features

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def _create_fuzzy_wuzzy_features(self, df):
        df['fuzzy_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_set_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_set_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_partial_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.partial_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_token_sort_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_sort_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_qratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.QRatio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_WRatio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.WRatio(row['spn_1'], row['spn_2']), axis=1)
   
        def _get_longest_substr_ratio(a, b):
            strs = list(distance.lcsubstrings(a, b))
            if len(strs) == 0:
                return 0
            else:
                return len(strs[0]) / (min(len(a), len(b)) + 1)

        df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1) 
Author: zake7749, Project: CIKM-AnalytiCup-2018, Lines: 18, Source: feature_engineering.py

Example 6: find_near_matches

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def find_near_matches(w, sentence):
    ret = []
    max_ratio = 0
    t = 0
    for word in sentence.split():
        while sentence[t] != word[0]:
            t += 1
        score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2
        if score > max_ratio:
            max_ratio = score
            ret = [(t, t + len(word))]
        elif score == max_ratio:
            ret.append((t, t + len(word)))
        else:
            pass
        t += len(word)
    return ret if max_ratio > 85 else [] 
Author: THUDM, Project: CogQA, Lines: 19, Source: process_train.py

Example 7: fuzz_list

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def fuzz_list(node1_list,node2_list,score_baseline=66,proposal_num=10,string_map=None):
    node_dict = { }
    for i,node1 in enumerate(node1_list):
        match_score_dict = { }
        for node2 in list(node2_list):  # iterate over a copy so in-place removals don't skip items
            if node1 != node2:
                if string_map is not None:
                    n1 = string_map(node1)
                    n2 = string_map(node2)
                    score = fuzz.partial_ratio(n1,n2)
                    if n1 == n2:
                        node2_list.remove(node2)
                else:
                    score = fuzz.partial_ratio(node1,node2)
                if score > score_baseline:
                    match_score_dict[node2] = score
            else:
                node2_list.remove(node2)
        node2_sort = sorted(match_score_dict.keys(), key=lambda k:match_score_dict[k],reverse=True)
        node_dict[node1] = [[n,match_score_dict[n]] for n in node2_sort[:proposal_num]]
        print(i, len(node1_list))
    return node_dict, node2_list 
Author: HLIG, Project: HUAWEIOCR-2019, Lines: 24, Source: py_op.py

Example 8: extract_stat_features

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def extract_stat_features(self,df):
        df["title1_zh"] = df["title1_zh"].fillna("").apply(self.__preprocess__)
        df["title2_zh"] = df["title2_zh"].fillna("").apply(self.__preprocess__)

        print("token features...")
        token_features = df.apply(lambda x: self.__get_token_features__(x["title1_zh"], x["title2_zh"]), axis=1)
        df["cwc_min"]       = list(map(lambda x: x[0], token_features))
        df["cwc_max"]       = list(map(lambda x: x[1], token_features))
        df["csc_min"]       = list(map(lambda x: x[2], token_features))
        df["csc_max"]       = list(map(lambda x: x[3], token_features))
        df["ctc_min"]       = list(map(lambda x: x[4], token_features))
        df["ctc_max"]       = list(map(lambda x: x[5], token_features))
        df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
        df["first_word_eq"] = list(map(lambda x: x[7], token_features))
        df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
        df["mean_len"]      = list(map(lambda x: x[9], token_features))

        print("fuzzy features..")
        df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["longest_substr_ratio"]  = df.apply(lambda x: self.__get_longest_substr_ratio__(x["title1_zh"], x["title2_zh"]), axis=1)
        
        if 'label' in df.columns.tolist():
            return df.drop(["title1_zh", "title2_zh", "label"], axis=1).values
        else:
            return df.drop(["title1_zh", "title2_zh"], axis=1).values 
Author: lampts, Project: wsdm19cup, Lines: 30, Source: make_handcrafted_33_features.py

Example 9: search

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def search(self, query):
        def processor(x):
            if isinstance(x, Issue):
                x = x.title
            return x.strip().lower()

        # We don't care about the score, so return first element
        # This must not happen while updating the self.issues dict so acquire the lock
        with self.issues_lock:
            return [result[0] for result in process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                                            processor=processor, limit=5)] 
Author: python-telegram-bot, Project: rules-bot, Lines: 13, Source: util.py

Example 10: partial_ratio

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def partial_ratio(str1: str, str2: str) -> int:
    """Get partial fuzzy ratio with korean text"""

    return fuzz.partial_ratio(
        normalize_korean_nfc_to_nfd(str1), normalize_korean_nfc_to_nfd(str2),
    ) 
Author: item4, Project: yui, Lines: 8, Source: fuzz.py

Example 11: pdf_annotate

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def pdf_annotate(self, data):

        title_text = data.get('title')
        if not title_text:
            log.error('Unable to run pubmed matching since we have no title')
            # unable to do pubmed unless we have a title, so just return the original data
            return data

        vec_q = self.vectorizer.transform([title_text])
        token_overlap = vec_q.dot(self.vec_ti.T)
        self.to = token_overlap
        best_ind = token_overlap.indices[token_overlap.data.argmax()]
        pmid = int(self.pmid_ind[best_ind])

        # checking both the overall similarity, and overlap similarity

        pubmed_data = self.query_pubmed(pmid)

        match_pc = fuzz.ratio(title_text.lower(), pubmed_data['title'].lower())
        match_pc_overlap = fuzz.partial_ratio(title_text.lower(), pubmed_data['title'].lower())

        # seems like a reasonable heuristic but not checked
        # (given that sometimes our query is a partial title
        # retrieved by Grobid)
        pubmed_data['pubmed_match_quality'] = sum([match_pc, match_pc_overlap])

        var_map = [('abstract', pubmed_data['abstract']),
                   ('pmid', pubmed_data['pmid']),
                   ('mesh', pubmed_data['mesh'])]


        if pubmed_data['pubmed_match_quality'] > 180:
            data.data['pubmed'] = pubmed_data # until setattr is worked out
        else:
            # keep it just in case, but don't replace better quality match
            data.data['dubious'] = pubmed_data # until setattr is worked out

        return data 
Author: ijmarshall, Project: robotreviewer, Lines: 40, Source: pubmed_robot.py

Example 12: fuzzy

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def fuzzy(tokens):
    averages = []
    for token in tokens:
        sameTokenRemoved = False
        result = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        for each in result:
            score = each[1]
            if score == 100 and not sameTokenRemoved:
                sameTokenRemoved = True
                continue
            scores.append(score)
        average = statistics.mean(scores)
        averages.append(average)
    return statistics.mean(averages) 
Author: s0md3v, Project: Bolt, Lines: 17, Source: bolt.py

Example 13: check_names_fuzzy_match

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def check_names_fuzzy_match(row):
    row["name_match"] = fuzz.partial_ratio(row["Player"], row["PLAYER_NAME"]) > 60
    return row 
Author: rd11490, Project: NBA_Tutorials, Lines: 5, Source: deduplicate_sources.py

Example 14: write_cluster

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def write_cluster(cluster_topic, filepath):
    with open(filepath, "w") as w:
        for key, value in cluster_topic.items():
            cluster = [(text, sim) for text, sim in value if text != key]
            cluster = json.dumps(cluster, ensure_ascii=False)
            w.write("{}\t{}\n".format(key, cluster))

#def fuzz_sim(text1, text2, lower=True):
#    if lower:
#        text1, text2 = text1.lower(), text2.lower()
#    partial_ratio = fuzz.partial_ratio(text1, text2)/100
#    simple_ratio = fuzz.ratio(text1, text2)/100
#    return 0.8*partial_ratio + 0.2*simple_ratio 
Author: HaowenHOU, Project: single-pass-clustering-for-chinese-text, Lines: 15, Source: cluster_for_short_text.py

Example 15: adjust_score

# Required import: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def adjust_score(partial_ratio, text1, text2):
    min_len = min(len(text1), len(text2))
    if partial_ratio >= 0.8 and min_len >= 4:
        partial_ratio += 0.15
    if partial_ratio >= 0.8 and min_len <= 2:
        partial_ratio -= 0.15
        head_hit = hit_head(text1, text2)
        if head_hit:
            partial_ratio += 0.15
        else:
            partial_ratio -= 0.15
    if min_len == 1:
        partial_ratio = 0.0
    return partial_ratio 
Author: HaowenHOU, Project: single-pass-clustering-for-chinese-text, Lines: 16, Source: cluster_for_short_text.py


Note: The fuzzywuzzy.fuzz.partial_ratio examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, who retain copyright over the source code; consult each project's license before redistributing or reusing it, and do not republish this compilation without permission.