本文整理汇总了Python中fuzzywuzzy.fuzz.token_set_ratio方法的典型用法代码示例。如果您正苦于以下问题:Python fuzz.token_set_ratio方法的具体用法?Python fuzz.token_set_ratio怎么用?Python fuzz.token_set_ratio使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块fuzzywuzzy.fuzz
的用法示例。
在下文中一共展示了fuzz.token_set_ratio方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_features(df):
    """Add token-overlap and fuzzy-similarity feature columns to *df*.

    Params:
    - df: pandas DataFrame with string columns "question1" and "question2".
    Returns:
    - df: the same DataFrame, mutated in place with the new feature columns.
    """
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)
    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    # get_token_features returns a 10-tuple per row; unpack all columns in a
    # single zip(*) pass instead of ten separate map() scans over the Series.
    token_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min",
                     "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for column, values in zip(token_columns, zip(*token_features)):
        df[column] = list(values)
    print("fuzzy features..")
    df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df
示例2: similar_string_fast
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def similar_string_fast(first_string, second_string):
    """Determine if two strings are similar (using two most effective methods).
    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.
    Returns:
    - result: (type: bool) match result.
    """
    partial_score = fuzz.ratio(first_string, second_string)
    token_score = fuzz.token_set_ratio(first_string, second_string)
    # Return the comparison directly instead of branching to True/False.
    return max(partial_score, token_score) >= SCORE_THRESHOLD_FAST
示例3: fuzzy_fuzzywuzzy
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    """Edit-distance matching via fuzzywuzzy; slower than exact matching but
    tolerant of character differences.

    Params:
    - fuzz: the fuzzywuzzy.fuzz module (passed in; shadows any global import).
    - user_input: (type: string) query string.
    - collection: iterable of candidate strings.
    Returns:
    - result: top-20 matches from process.extract, or None when no candidate
      shares a character with the query.
    """
    # Keep only candidates sharing at least one character with the query.
    # any() adds each candidate once; the original appended once per matching
    # character and then deduplicated. The set keeps dedup for repeated inputs.
    collection_new = list({coll for coll in collection
                           if any(ch in coll for ch in user_input)})
    if not collection_new:
        return None
    # Rank candidates by how many characters they share with the query.
    same_char_list = [(candidate, count_same_char(user_input, candidate))
                      for candidate in collection_new]
    same_char_list.sort(key=lambda x: x[1], reverse=True)
    # Cap the expensive fuzzy-matching stage at the 500 best candidates.
    if len(same_char_list) >= 500:
        same_char_list = same_char_list[0: 500]
    result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20)
    return result
示例4: _create_fuzzy_wuzzy_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _create_fuzzy_wuzzy_features(self, df):
    """Append fuzzywuzzy similarity columns computed from spn_1/spn_2."""
    pairs = df[['spn_1', 'spn_2']]
    # Column-name → scorer table replaces six near-identical apply lines.
    scorers = [
        ('fuzzy_ratio', fuzz.ratio),
        ('fuzzy_set_ratio', fuzz.token_set_ratio),
        ('fuzzy_partial_ratio', fuzz.partial_ratio),
        ('fuzzy_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzzy_qratio', fuzz.QRatio),
        ('fuzzy_WRatio', fuzz.WRatio),
    ]
    for column, scorer in scorers:
        # Bind scorer as a default arg to avoid late-binding in the lambda.
        df[column] = pairs.apply(
            lambda row, s=scorer: s(row['spn_1'], row['spn_2']), axis=1)

    def _get_longest_substr_ratio(a, b):
        # Longest common substring length over shorter string (+1 avoids /0).
        strs = list(distance.lcsubstrings(a, b))
        if len(strs) == 0:
            return 0
        return len(strs[0]) / (min(len(a), len(b)) + 1)

    df['longest_substr_ratio'] = pairs.apply(
        lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1)
示例5: extract_stat_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_stat_features(self, df):
    """Build token-overlap and fuzzy-similarity features for title pairs.

    Params:
    - df: pandas DataFrame with string columns "title1_zh" and "title2_zh"
      (optionally a "label" column, dropped from the output).
    Returns:
    - numpy array of the feature columns (title/label columns dropped).
    """
    df["title1_zh"] = df["title1_zh"].fillna("").apply(self.__preprocess__)
    df["title2_zh"] = df["title2_zh"].fillna("").apply(self.__preprocess__)
    print("token features...")
    token_features = df.apply(lambda x: self.__get_token_features__(x["title1_zh"], x["title2_zh"]), axis=1)
    # __get_token_features__ yields a 10-tuple per row; unpack every column in
    # one zip(*) pass instead of ten separate map() scans over the Series.
    token_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min",
                     "ctc_max", "last_word_eq", "first_word_eq",
                     "abs_len_diff", "mean_len"]
    for column, values in zip(token_columns, zip(*token_features)):
        df[column] = list(values)
    print("fuzzy features..")
    df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
    df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["title1_zh"], x["title2_zh"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda x: self.__get_longest_substr_ratio__(x["title1_zh"], x["title2_zh"]), axis=1)
    if 'label' in df.columns.tolist():
        return df.drop(["title1_zh", "title2_zh", "label"], axis=1).values
    else:
        return df.drop(["title1_zh", "title2_zh"], axis=1).values
示例6: worker
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def worker(num, total, foodStrings):
    """Process worker: score this worker's stripe of foodList keys against the
    candidate strings and pickle the matches to '<num>.p'.

    Params:
    - num: (type: int) 0-based index of this worker.
    - total: (type: int) total worker count; keys are striped by i % total == num.
    - foodStrings: iterable of candidate strings to match.
    """
    stringMatches = []
    for foodString in foodStrings:
        for i, key in enumerate(foodList.keys()):
            if i % total == num:
                leven1 = fuzz.token_set_ratio(key, foodString)
                leven2 = Levenshtein.ratio(foodString, key)
                if leven2 > 0.5:
                    stringMatches.append((key, foodList[key], leven1, leven2))
    # 'with' guarantees the pickle file handle is closed; the original
    # pickle.dump(..., open(...)) leaked it. (Unused partialList removed.)
    with open(str(num) + '.p', 'wb') as fh:
        pickle.dump(stringMatches, fh)
    return
示例7: set_ratio_fuzzywuzzy
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def set_ratio_fuzzywuzzy(str1, str2):
    """Return fuzzywuzzy's token-set similarity (0-100) for two strings."""
    score = fuzz.token_set_ratio(str1, str2)
    return score
示例8: extract_string_similarity_vector
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_string_similarity_vector(instance: dict):
    """
    Returns a vector encoding a variety of lexical similarity metrics given a dictionary containing keys
    sentence_1,sentence_2
    :return: a vector containing similarity scores
    """
    s1 = instance['sentence_1']
    s2 = instance['sentence_2']
    # NOTE(review): the scorer objects below (normalized_levenshtein,
    # jarowinkler, metric_lcs, qgram*, jaccard, cosine) are module-level
    # instances defined elsewhere in the file — presumably strsimpy-style
    # metrics; verify against the surrounding module.
    # The vector mixes similarities (higher = closer) with distances
    # (lower = closer) and fuzzywuzzy 0-100 ratios; element order is part of
    # the feature contract and must not change.
    return torch.tensor([
        normalized_levenshtein.similarity(s1,s2),
        jarowinkler.similarity(s1,s2),
        metric_lcs.distance(s1,s2),
        qgram2.distance(s1,s2),
        qgram3.distance(s1,s2),
        qgram4.distance(s1,s2),
        jaccard.similarity(s1,s2),
        cosine.similarity(s1,s2),
        fuzz.partial_token_set_ratio(s1,s2),
        fuzz.partial_token_sort_ratio(s1,s2),
        fuzz.token_set_ratio(s1,s2),
        fuzz.token_sort_ratio(s1,s2),
        fuzz.QRatio(s1,s2),
        fuzz.UQRatio(s1,s2),
        fuzz.UWRatio(s1,s2),
        fuzz.WRatio(s1,s2)
    ])
示例9: _scholar_score
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _scholar_score(txt, bib):
    """Similarity between free text and a scholar bib record.

    Sums fuzzywuzzy token_set_ratio over whichever of title/author/abstract
    keys are present in *bib*; a high score means high similarity.
    """
    from fuzzywuzzy.fuzz import token_set_ratio
    # Generator expression avoids building a throwaway list for sum().
    return sum(token_set_ratio(bib[k], txt)
               for k in ['title', 'author', 'abstract'] if k in bib)
示例10: _crossref_score
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _crossref_score(txt, r):
    """Similarity between free text and a CrossRef record (higher = closer)."""
    from fuzzywuzzy.fuzz import token_set_ratio
    # Collect the comparable text fragments first, then score them in one pass.
    parts = []
    if 'author' in r:
        parts.append(' '.join(p['family'] for p in r.get('author', []) if 'family' in p))
    if 'title' in r:
        parts.append(r['title'][0])
    if 'abstract' in r:
        parts.append(r['abstract'])
    return sum(token_set_ratio(part, txt) for part in parts)
示例11: compare_entries
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def compare_entries(e1, e2, fuzzy=False):
    """assess two entries' similarity
    """
    # Guard clauses from strongest match down to weakest.
    if e1 == e2:
        return EXACT_DUPLICATES
    id1 = entry_id(e1)
    id2 = entry_id(e2)
    logger.debug('{} ?= {}'.format(id1, id2))
    if id1 == id2:
        return GOOD_DUPLICATES
    # Only compare fields that are defined (truthy) on both sides.
    shared = [(f1, f2) for f1, f2 in zip(id1, id2) if f1 and f2]
    if all(f1 == f2 for f1, f2 in shared):
        return FAIR_DUPLICATES
    if any(f1 == f2 for f1, f2 in shared):
        return PARTIAL_DUPLICATES
    if not fuzzy:
        return 0
    # Fall back to fuzzy comparison of the tag component of each id.
    from fuzzywuzzy.fuzz import token_set_ratio
    _doi1, tag1 = id1
    _doi2, tag2 = id2
    return token_set_ratio(tag1, tag2)
示例12: detect_xss
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def detect_xss(self, payload, browser_object, user_screenshot_name,
               injected_link):
    """Check the HTML source to determine if XSS payload was reflected."""
    # NOTE(review): Python 2 print statements — this module targets Python 2.
    # If fuzzy detection chosen, evaluate partial reflection of XSS
    # by tokenizing the HTML source and detecting parts of the payload
    # and source common to both.
    #
    # Other methods of scoring include fuzz.ratio(), fuzz.partial_ratio()
    # and fuzz.token_sort_ratio()
    partial_score = fuzz.token_set_ratio(
        payload.lower(), browser_object.html.lower())
    # Set the level of detection asked for by the user, e.g. Only detect
    # matches with score higher than 50% fuzzy detection
    fuzzy_level = self.user_args.FUZZY_DETECTION
    # Exact (case-insensitive) reflection of the payload in the page source.
    if payload.lower() in browser_object.html.lower():
        print Color.GREEN + "\n[+] XSS vulnerability found:" + \
            Color.END
        # If user set the --screen flag to target, capture screenshot of
        # payload
        if user_screenshot_name is not None:
            self.take_screenshot(user_screenshot_name,
                                 browser_object, self.screen_index)
        # Add link to list of all positive XSS hits
        self.xss_links.append(injected_link)
        print Color.BLUE + injected_link + Color.END
    # If user enabled fuzzy detection and partial score was larger than
    # fuzz level, add it to partials list and print results
    elif fuzzy_level and (partial_score >= fuzzy_level):
        print Color.YELLOW + \
            "\n[-] Partial XSS vulnerability found:" + Color.END
        print Color.BLUE + injected_link + Color.END
        self.xss_partials.append(injected_link)
        print "Detection score: %s" % partial_score
    else:
        print Color.RED + "\n[+] No XSS detected at: \n" + \
            Color.BLUE + injected_link + Color.END
        if (fuzzy_level):
            print "Detection score: %s" % partial_score
示例13: getStringMatches
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def getStringMatches(foodString):
    """Fuzzy-match *foodString* (and its 2/3-word sub-phrases) against foodList.

    Fans the work out over NUM_PROCESSORS worker processes; each worker
    pickles its partial results to '<i>.p', which are collected, merged and
    deleted here.

    Returns:
    - matches: list of (key, foodList[key], token_set_score, levenshtein_ratio)
      tuples, sorted best-first by the two scores.
    """
    print(foodString)
    foodString = foodString.replace(',', ' ').lower()
    foodStrings = [foodString]
    foodWords = foodString.split()
    # Also try every 2-word and 3-word combination of the query words.
    for size in (2, 3):
        if len(foodWords) > size:
            for words in combinations(foodWords, size):
                foodStrings.append(' '.join(words))
    processes = [Process(target=worker, args=(i, NUM_PROCESSORS, foodStrings))
                 for i in range(NUM_PROCESSORS)]
    for t in processes:
        t.start()
    for t in processes:
        t.join()
    stringMatches = []
    for i in range(NUM_PROCESSORS):
        path = str(i) + '.p'
        # 'with' closes the handle; the original pickle.load(open(...)) leaked it.
        with open(path, 'rb') as fh:
            stringMatches += pickle.load(fh)
        # os.remove is portable and safe; replaces os.system('rm ...').
        os.remove(path)
    return sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True)
示例14: fuzzy_score_string
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).
    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.
    Returns:
    - result: (type: int) score.
    """
    score = 0
    if len(first_string) < len(second_string):
        # Slide a window the size of the shorter string across the longer one
        # and keep the best windowed score.
        shorter, longer = first_string, second_string
        span = len(shorter)
        for start in range(len(longer) - span + 1):
            window = longer[start:start + span]
            l_ratio = Levenshtein.ratio(window, shorter) * 100
            if l_ratio > 60:
                candidate = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])
            else:
                candidate = l_ratio
            score = max(score, candidate)
    else:
        # Equal/longer first string: score the whole strings directly.
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])
    # Blend in the fuzzywuzzy scorers and keep the best overall score.
    score = max(score,
                fuzz.ratio(first_string, second_string),
                fuzz.partial_ratio(first_string, second_string),
                fuzz.token_sort_ratio(first_string, second_string),
                fuzz.token_set_ratio(first_string, second_string))
    # Scores under 75 are treated as no match at all.
    if score < 75:
        score = 0
    return score * 0.85
示例15: fuzzy_fuzzywuzzy_list
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    """Edit-distance matching over a large collection; slower than exact
    matching but tolerant of character differences.

    Pre-filters candidates by shared-character count, then runs fuzzywuzzy's
    process.extract on the best ones.

    Params:
    - fuzz: the fuzzywuzzy.fuzz module (passed in; shadows any global import).
    - user_input: (type: string) query string.
    - qa_list: answers aligned by index with *collection*.
    - collection: question candidates to filter and score.
    - topn: (type: int) number of results to return.
    Returns:
    - result: (candidate, score) pairs from process.extract.
    """
    # Dead timing code (start_time/end_time1..3, commented prints) removed.
    user_input_chars = list(user_input)
    same_char_list = []   # (collection index, shared-char count), count > 0
    max_data = 0
    max_data_list = []    # strictly increasing record of the running maxima
    for idx, candidate in enumerate(collection):
        count_same_char_one = len([ch for ch in user_input_chars if ch in candidate])
        if count_same_char_one > 0:
            same_char_list.append((idx, count_same_char_one))
            if count_same_char_one > max_data:
                max_data_list.append(count_same_char_one)
                max_data = count_same_char_one
    # Walk the recorded maxima from highest to lowest, gathering the answers
    # for candidates with that shared-char count; cap the workload at 5000.
    list_max_count = []
    len_max_data_list = len(max_data_list)
    for x in range(len_max_data_list):
        for k, l in same_char_list:
            if l == max_data_list[len_max_data_list - 1 - x]:
                list_max_count.append(qa_list[k])
        if len(list_max_count) >= 5000:
            list_max_count = list_max_count[0:5000]
            break
    return process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
# [fuzz.WRatio, fuzz.QRatio,
# fuzz.token_set_ratio, fuzz.token_sort_ratio,
# fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
# fuzz.UWRatio, fuzz.UQRatio]