本文整理汇总了Python中fuzzywuzzy.fuzz.QRatio方法的典型用法代码示例。如果您正苦于以下问题:Python fuzz.QRatio方法的具体用法?Python fuzz.QRatio怎么用?Python fuzz.QRatio使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类fuzzywuzzy.fuzz
的用法示例。
在下文中一共展示了fuzz.QRatio方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import QRatio [as 别名]
def extract_features(df):
df["question1"] = df["question1"].fillna("").apply(preprocess)
df["question2"] = df["question2"].fillna("").apply(preprocess)
print("token features...")
token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
df["cwc_min"] = list(map(lambda x: x[0], token_features))
df["cwc_max"] = list(map(lambda x: x[1], token_features))
df["csc_min"] = list(map(lambda x: x[2], token_features))
df["csc_max"] = list(map(lambda x: x[3], token_features))
df["ctc_min"] = list(map(lambda x: x[4], token_features))
df["ctc_max"] = list(map(lambda x: x[5], token_features))
df["last_word_eq"] = list(map(lambda x: x[6], token_features))
df["first_word_eq"] = list(map(lambda x: x[7], token_features))
df["abs_len_diff"] = list(map(lambda x: x[8], token_features))
df["mean_len"] = list(map(lambda x: x[9], token_features))
print("fuzzy features..")
df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
df["longest_substr_ratio"] = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
return df
示例2: _create_fuzzy_wuzzy_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import QRatio [as 别名]
def _create_fuzzy_wuzzy_features(self, df):
df['fuzzy_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.ratio(row['spn_1'], row['spn_2']), axis=1)
df['fuzzy_set_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_set_ratio(row['spn_1'], row['spn_2']), axis=1)
df['fuzzy_partial_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.partial_ratio(row['spn_1'], row['spn_2']), axis=1)
df['fuzzy_token_sort_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_sort_ratio(row['spn_1'], row['spn_2']), axis=1)
df['fuzzy_qratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.QRatio(row['spn_1'], row['spn_2']), axis=1)
df['fuzzy_WRatio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.WRatio(row['spn_1'], row['spn_2']), axis=1)
def _get_longest_substr_ratio(a, b):
strs = list(distance.lcsubstrings(a, b))
if len(strs) == 0:
return 0
else:
return len(strs[0]) / (min(len(a), len(b)) + 1)
df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1)
示例3: extract_stat_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import QRatio [as 别名]
def extract_stat_features(self,df):
df["title1_zh"] = df["title1_zh"].fillna("").apply(self.__preprocess__)
df["title2_zh"] = df["title2_zh"].fillna("").apply(self.__preprocess__)
print("token features...")
token_features = df.apply(lambda x: self.__get_token_features__(x["title1_zh"], x["title2_zh"]), axis=1)
df["cwc_min"] = list(map(lambda x: x[0], token_features))
df["cwc_max"] = list(map(lambda x: x[1], token_features))
df["csc_min"] = list(map(lambda x: x[2], token_features))
df["csc_max"] = list(map(lambda x: x[3], token_features))
df["ctc_min"] = list(map(lambda x: x[4], token_features))
df["ctc_max"] = list(map(lambda x: x[5], token_features))
df["last_word_eq"] = list(map(lambda x: x[6], token_features))
df["first_word_eq"] = list(map(lambda x: x[7], token_features))
df["abs_len_diff"] = list(map(lambda x: x[8], token_features))
df["mean_len"] = list(map(lambda x: x[9], token_features))
print("fuzzy features..")
df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["title1_zh"], x["title2_zh"]), axis=1)
df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
df["longest_substr_ratio"] = df.apply(lambda x: self.__get_longest_substr_ratio__(x["title1_zh"], x["title2_zh"]), axis=1)
if 'label' in df.columns.tolist():
return df.drop(["title1_zh", "title2_zh", "label"], axis=1).values
else:
return df.drop(["title1_zh", "title2_zh"], axis=1).values
示例4: extract_string_similarity_vector
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import QRatio [as 别名]
def extract_string_similarity_vector(instance: dict):
"""
Returns a vector encoding a variety of lexical similarity metrics given a dictionary containing keys
sentence_1,sentence_2
:return: a vector containing similarity scores
"""
s1 = instance['sentence_1']
s2 = instance['sentence_2']
return torch.tensor([
normalized_levenshtein.similarity(s1,s2),
jarowinkler.similarity(s1,s2),
metric_lcs.distance(s1,s2),
qgram2.distance(s1,s2),
qgram3.distance(s1,s2),
qgram4.distance(s1,s2),
jaccard.similarity(s1,s2),
cosine.similarity(s1,s2),
fuzz.partial_token_set_ratio(s1,s2),
fuzz.partial_token_sort_ratio(s1,s2),
fuzz.token_set_ratio(s1,s2),
fuzz.token_sort_ratio(s1,s2),
fuzz.QRatio(s1,s2),
fuzz.UQRatio(s1,s2),
fuzz.UWRatio(s1,s2),
fuzz.WRatio(s1,s2)
])
示例5: fuzzy_fuzzywuzzy_list
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import QRatio [as 别名]
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
'''编辑距离,速度比较慢,比起匹配方法,能够处理字符不一样的问题'''
start_time = time.time()
# user_input_set = set([user_input_one for user_input_one in user_input])
user_input_set = [user_input_one for user_input_one in user_input]
same_char_list = []
max_data = 0
max_data_list = []
count_collection_new_one = 0
for collection_new_one in collection: # 获取相同字符串多的问题
count_same_char_one = len([x for x in user_input_set if x in collection_new_one])
if count_same_char_one > 0:
same_char_list.append((count_collection_new_one, count_same_char_one))
if count_same_char_one > max_data:
max_data_list.append(count_same_char_one)
max_data = count_same_char_one
count_collection_new_one += 1
end_time1 = time.time()
list_max_count = []
len_max_data_list = len(max_data_list)
for x in range(len_max_data_list): # 获取前20排名
for k,l in same_char_list:
if l == max_data_list[len_max_data_list -1 - x]:
list_max_count.append(qa_list[k]) #问答重这里取出来
if len(list_max_count) >= 5000:
list_max_count = list_max_count[0:5000]
break
end_time2 = time.time()
# end_time1: 0.34090662002563477
# end_time2: 0.4080846309661865
# end_time1: 0.06417036056518555
# end_time2: 0.08422374725341797
# same_char_list.sort(key=lambda x: x[1], reverse=True)
# if len(same_char_list) >= 20:
# same_char_list = same_char_list[0: 20]
result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
end_time3 = time.time()
# print('end_time1: ' + str(end_time1 - start_time))
# print('end_time2: ' + str(end_time2 - start_time))
# print('end_time3: ' + str(end_time3 - start_time))
return result
# [fuzz.WRatio, fuzz.QRatio,
# fuzz.token_set_ratio, fuzz.token_sort_ratio,
# fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
# fuzz.UWRatio, fuzz.UQRatio]