

Python fuzz.partial_ratio Method Code Examples

This article collects typical usage examples of the fuzzywuzzy.fuzz.partial_ratio method in Python. If you have been wondering what fuzz.partial_ratio does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore the wider fuzzywuzzy.fuzz module, to which this method belongs.


The following presents 15 code examples of the fuzz.partial_ratio method, sorted by popularity by default.
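As a quick orientation before the per-project examples, here is a minimal sketch of calling fuzz.partial_ratio directly. The strings are made up for illustration, and the exact scores can vary slightly between fuzzywuzzy versions.

from fuzzywuzzy import fuzz

# partial_ratio compares the shorter string against the best-matching
# substring of the longer one, so full containment scores 100.
print(fuzz.partial_ratio("this is a test", "this is a test!"))  # 100
print(fuzz.partial_ratio("New York Mets", "New York Meats"))    # lower score: the shorter string is not contained verbatim

All of the examples below build on this single call, either invoking it directly or passing it as scorer=fuzz.partial_ratio to fuzzywuzzy.process.extract.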

Example 1: extract_features

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df 
Author: aerdem4, Project: kaggle-quora-dup, Lines: 26, Source: nlp_feature_extraction.py

Example 2: inquiry

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def inquiry(self):
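		# Note: the UI strings in this method are Chinese -- the warning asks the user to
		# first enter the Lu Xun quotation to look up, each result line reads
		# "[match score]: %d / [content]: %s", and the fallback message says that no
		# sentence with similarity above %d was matched.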
		sentence = self.line_edit.text()
		matched = []
		score_thresh = self.getScoreThresh()
		if not sentence:
			QMessageBox.warning(self, "Warning", '请先输入需要查询的鲁迅名言')
		else:
			for p in self.paragraphs:
				score = fuzz.partial_ratio(p, sentence)
				if score >= score_thresh and len(sentence) <= len(p):
					matched.append([score, p])
			infos = []
			for match in matched:
				infos.append('[匹配度]: %d\n[内容]: %s\n' % (match[0], match[1]))
			if not infos:
				infos.append('未匹配到任何相似度大于%d的句子.\n' % score_thresh)
			self.text.setText('\n\n\n'.join(infos)[:-1]) 
Author: CharlesPikachu, Project: Tools, Lines: 19, Source: main.py

Example 3: add_dup_simhash_caches

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    if not dup_obj_ids:
        return
    old_dup_obj_ids = set(dup_obj_ids)
    start_time = time.time()
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            try:
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            except Exception as e:
                print(e)
                continue
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info(simhashcache.text)
            logging.info('--' * 20)
            logging.info(dup_simhash.text)
            logging.info("%d %s %s" % (sim_ratio, simhashcache.obj_id, dup_simhash.obj_id))

            if dup_obj_id not in old_dup_obj_ids:  # compare the id, not the SimHashCache object
                if sim_ratio > 50:
                    old_dup_obj_ids.add(dup_obj_id)
            else:
                if sim_ratio <= 50:
                    old_dup_obj_ids.remove(dup_obj_id) 
Author: likaiguo, Project: simhashpy, Lines: 27, Source: __init__.py

Example 4: search

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def search(self, targets, partial=True, fuzzy=False):
        allInstances = self.instances()
        matchedInstances = set()

        for host in targets:
            for instance in allInstances:
                names = [instance.name]
                if instance.aliases is not None:
                    names += list(instance.aliases)
                for name in names:
                    if host.lower() == name.lower():
                        matchedInstances.add((100, instance))
                    elif partial and host.lower() in name.lower():
                        matchedInstances.add((99, instance))

                    if fuzzy:
                        score = fuzz.partial_ratio(host.lower(), name.lower())
                        if score > 85 or host.lower() in name.lower():
                            matchedInstances.add((score, instance))

        # it is possible for the same instance to be matched, if so, it should only
        # appear on the return list once (still ordered by the most probable match)
        return list(collections.OrderedDict([(v, None) for k, v in sorted(list(matchedInstances))]).keys()) 
Author: wagoodman, Project: bridgy, Lines: 25, Source: source.py

Example 5: _create_fuzzy_wuzzy_features

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def _create_fuzzy_wuzzy_features(self, df):
        df['fuzzy_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_set_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_set_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_partial_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.partial_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_token_sort_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_sort_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_qratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.QRatio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_WRatio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.WRatio(row['spn_1'], row['spn_2']), axis=1)
   
        def _get_longest_substr_ratio(a, b):
            strs = list(distance.lcsubstrings(a, b))
            if len(strs) == 0:
                return 0
            else:
                return len(strs[0]) / (min(len(a), len(b)) + 1)

        df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1) 
Author: zake7749, Project: CIKM-AnalytiCup-2018, Lines: 18, Source: feature_engineering.py

Example 6: find_near_matches

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def find_near_matches(w, sentence):
    ret = []
    max_ratio = 0
    t = 0
    for word in sentence.split():
        while sentence[t] != word[0]:
            t += 1
        score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2
        if score > max_ratio:
            max_ratio = score
            ret = [(t, t + len(word))]
        elif score == max_ratio:
            ret.append((t, t + len(word)))
        else:
            pass
        t += len(word)
    return ret if max_ratio > 85 else [] 
Author: THUDM, Project: CogQA, Lines: 19, Source: process_train.py

Example 7: fuzz_list

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def fuzz_list(node1_list,node2_list,score_baseline=66,proposal_num=10,string_map=None):
    node_dict = { }
    for i,node1 in enumerate(node1_list):
        match_score_dict = { }
        for node2 in list(node2_list):  # iterate over a copy; node2_list is modified below
            if node1 != node2:
                if string_map is not None:
                    n1 = string_map(node1)
                    n2 = string_map(node2)
                    score = fuzz.partial_ratio(n1,n2)
                    if n1 == n2:
                        node2_list.remove(node2)
                else:
                    score = fuzz.partial_ratio(node1,node2)
                if score > score_baseline:
                    match_score_dict[node2] = score
            else:
                node2_list.remove(node2)
        node2_sort = sorted(match_score_dict.keys(), key=lambda k:match_score_dict[k],reverse=True)
        node_dict[node1] = [[n,match_score_dict[n]] for n in node2_sort[:proposal_num]]
        print(i, len(node1_list))
    return node_dict, node2_list 
Author: HLIG, Project: HUAWEIOCR-2019, Lines: 24, Source: py_op.py

Example 8: extract_stat_features

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def extract_stat_features(self,df):
        df["title1_zh"] = df["title1_zh"].fillna("").apply(self.__preprocess__)
        df["title2_zh"] = df["title2_zh"].fillna("").apply(self.__preprocess__)

        print("token features...")
        token_features = df.apply(lambda x: self.__get_token_features__(x["title1_zh"], x["title2_zh"]), axis=1)
        df["cwc_min"]       = list(map(lambda x: x[0], token_features))
        df["cwc_max"]       = list(map(lambda x: x[1], token_features))
        df["csc_min"]       = list(map(lambda x: x[2], token_features))
        df["csc_max"]       = list(map(lambda x: x[3], token_features))
        df["ctc_min"]       = list(map(lambda x: x[4], token_features))
        df["ctc_max"]       = list(map(lambda x: x[5], token_features))
        df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
        df["first_word_eq"] = list(map(lambda x: x[7], token_features))
        df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
        df["mean_len"]      = list(map(lambda x: x[9], token_features))

        print("fuzzy features..")
        df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["longest_substr_ratio"]  = df.apply(lambda x: self.__get_longest_substr_ratio__(x["title1_zh"], x["title2_zh"]), axis=1)
        
        if 'label' in df.columns.tolist():
            return df.drop(["title1_zh", "title2_zh", "label"], axis=1).values
        else:
            return df.drop(["title1_zh", "title2_zh"], axis=1).values 
Author: lampts, Project: wsdm19cup, Lines: 30, Source: make_handcrafted_33_features.py

Example 9: search

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def search(self, query):
        def processor(x):
            if isinstance(x, Issue):
                x = x.title
            return x.strip().lower()

        # We don't care about the score, so return first element
        # This must not happen while updating the self.issues dict so acquire the lock
        with self.issues_lock:
            return [result[0] for result in process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                                            processor=processor, limit=5)] 
Author: python-telegram-bot, Project: rules-bot, Lines: 13, Source: util.py

Example 10: partial_ratio

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def partial_ratio(str1: str, str2: str) -> int:
    """Get partial fuzzy ratio with korean text"""

    return fuzz.partial_ratio(
        normalize_korean_nfc_to_nfd(str1), normalize_korean_nfc_to_nfd(str2),
    ) 
Author: item4, Project: yui, Lines: 8, Source: fuzz.py

Example 11: pdf_annotate

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def pdf_annotate(self, data):

        title_text = data.get('title')
        if not title_text:
            log.error('Unable to run pubmed matching since we have no title')
            # unable to do pubmed unless we have a title, so just return the original data
            return data

        vec_q = self.vectorizer.transform([title_text])
        token_overlap = vec_q.dot(self.vec_ti.T)
        self.to = token_overlap
        best_ind = token_overlap.indices[token_overlap.data.argmax()]
        pmid = int(self.pmid_ind[best_ind])

        # checking both the overall similarity, and overlap similarity

        pubmed_data = self.query_pubmed(pmid)

        match_pc = fuzz.ratio(title_text.lower(), pubmed_data['title'].lower())
        match_pc_overlap = fuzz.partial_ratio(title_text.lower(), pubmed_data['title'].lower())

        # seems like a reasonable heuristic but not checked
        # (given that sometimes our query is a partial title
        # retrieved by Grobid)
        pubmed_data['pubmed_match_quality'] = sum([match_pc, match_pc_overlap])

        var_map = [('abstract', pubmed_data['abstract']),
                   ('pmid', pubmed_data['pmid']),
                   ('mesh', pubmed_data['mesh'])]


        if pubmed_data['pubmed_match_quality'] > 180:
            data.data['pubmed'] = pubmed_data # until setattr is worked out
        else:
            # keep it just in case, but don't replace better quality match
            data.data['dubious'] = pubmed_data # until setattr is worked out

        return data 
Author: ijmarshall, Project: robotreviewer, Lines: 40, Source: pubmed_robot.py

Example 12: fuzzy

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def fuzzy(tokens):
    averages = []
    for token in tokens:
        sameTokenRemoved = False
        result = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        for each in result:
            score = each[1]
            if score == 100 and not sameTokenRemoved:
                sameTokenRemoved = True
                continue
            scores.append(score)
        average = statistics.mean(scores)
        averages.append(average)
    return statistics.mean(averages) 
Author: s0md3v, Project: Bolt, Lines: 17, Source: bolt.py

Example 13: check_names_fuzzy_match

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def check_names_fuzzy_match(row):
    row["name_match"] = fuzz.partial_ratio(row["Player"], row["PLAYER_NAME"]) > 60
    return row 
Author: rd11490, Project: NBA_Tutorials, Lines: 5, Source: deduplicate_sources.py

Example 14: write_cluster

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def write_cluster(cluster_topic, filepath):
    with open(filepath, "w") as w:
        for key, value in cluster_topic.items():
            cluster = [(text, sim) for text, sim in value if text != key]
            cluster = json.dumps(cluster, ensure_ascii=False)
            w.write("{}\t{}\n".format(key, cluster))

#def fuzz_sim(text1, text2, lower=True):
#    if lower:
#        text1, text2 = text1.lower(), text2.lower()
#    partial_ratio = fuzz.partial_ratio(text1, text2)/100
#    simple_ratio = fuzz.ratio(text1, text2)/100
#    return 0.8*partial_ratio + 0.2*simple_ratio 
Author: HaowenHOU, Project: single-pass-clustering-for-chinese-text, Lines: 15, Source: cluster_for_short_text.py

Example 15: adjust_score

# Required module: from fuzzywuzzy import fuzz [as alias]
# Or: from fuzzywuzzy.fuzz import partial_ratio [as alias]
def adjust_score(partial_ratio, text1, text2):
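    # Note: the partial_ratio argument here appears to be a 0-1 normalized similarity
    # (compare fuzz_sim in the previous example, which divides the raw fuzz scores by 100),
    # not the raw 0-100 value returned by fuzz.partial_ratio.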
    min_len = min(len(text1), len(text2))
    if partial_ratio >= 0.8 and min_len >= 4:
        partial_ratio += 0.15
    if partial_ratio >= 0.8 and min_len <= 2:
        partial_ratio -= 0.15
        head_hit = hit_head(text1, text2)
        if head_hit:
            partial_ratio += 0.15
        else:
            partial_ratio -= 0.15
    if min_len == 1:
        partial_ratio = 0.0
    return partial_ratio 
Author: HaowenHOU, Project: single-pass-clustering-for-chinese-text, Lines: 16, Source: cluster_for_short_text.py


Note: The fuzzywuzzy.fuzz.partial_ratio examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors; please follow each project's License when distributing or using it. Do not reproduce this article without permission.