本文整理汇总了 Python 中 fuzzywuzzy.fuzz.token_sort_ratio 方法的典型用法代码示例。如果您正苦于以下问题:Python fuzz.token_sort_ratio 方法的具体用法?Python fuzz.token_sort_ratio 怎么用?Python fuzz.token_sort_ratio 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 fuzzywuzzy.fuzz 的用法示例。
在下文中一共展示了fuzz.token_sort_ratio方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def extract_features(df):
    """Add token-based and fuzzy-matching feature columns to ``df``.

    The ``question1`` and ``question2`` columns are null-filled with empty
    strings and passed through ``preprocess``; every feature is then computed
    row by row from that pair of columns.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    for text_column in ("question1", "question2"):
        df[text_column] = df[text_column].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1,
    )
    # The result of get_token_features is indexed positionally: item i of each
    # row's result feeds the i-th column named here.
    token_columns = (
        "cwc_min", "cwc_max",
        "csc_min", "csc_max",
        "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq",
        "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"], row["question2"]),
            axis=1,
        )
    return df
示例2: _create_fuzzy_wuzzy_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def _create_fuzzy_wuzzy_features(self, df):
    """Add fuzzywuzzy scores and a longest-common-substring ratio to ``df``.

    Every feature is computed row by row from the ``spn_1`` / ``spn_2``
    columns. The columns are written onto ``df`` in place; nothing is returned.
    """
    def _get_longest_substr_ratio(a, b):
        # Length of the first common substring reported by
        # distance.lcsubstrings, relative to the shorter input (+1 keeps the
        # denominator non-zero).
        common = list(distance.lcsubstrings(a, b))
        if not common:
            return 0
        return len(common[0]) / (min(len(a), len(b)) + 1)

    pair_scorers = (
        ('fuzzy_ratio', fuzz.ratio),
        ('fuzzy_set_ratio', fuzz.token_set_ratio),
        ('fuzzy_partial_ratio', fuzz.partial_ratio),
        ('fuzzy_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzzy_qratio', fuzz.QRatio),
        ('fuzzy_WRatio', fuzz.WRatio),
        ('longest_substr_ratio', _get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        df[column] = df[['spn_1', 'spn_2']].apply(
            lambda row, scorer=scorer: scorer(row['spn_1'], row['spn_2']),
            axis=1,
        )
示例3: extract_stat_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def extract_stat_features(self, df):
    """Build the statistical feature matrix for the ``title1_zh``/``title2_zh`` pair.

    Both title columns are null-filled with empty strings and passed through
    ``self.__preprocess__``; token and fuzzy features are then added to ``df``
    in place.

    Args:
        df: DataFrame with ``title1_zh`` and ``title2_zh`` columns and,
            optionally, a ``label`` column.

    Returns:
        ``.values`` of ``df`` after dropping the two title columns and, when
        present, the ``label`` column.
    """
    for text_column in ("title1_zh", "title2_zh"):
        df[text_column] = df[text_column].fillna("").apply(self.__preprocess__)

    print("token features...")
    token_features = df.apply(
        lambda row: self.__get_token_features__(row["title1_zh"], row["title2_zh"]),
        axis=1,
    )
    # Item i of each row's token-feature result feeds the i-th column below.
    token_columns = (
        "cwc_min", "cwc_max",
        "csc_min", "csc_max",
        "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq",
        "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", self.__get_longest_substr_ratio__),
    )
    for column, scorer in pair_scorers:
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["title1_zh"], row["title2_zh"]),
            axis=1,
        )

    # Only numeric feature columns are returned; the label is dropped as well
    # whenever the frame carries one.
    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
示例4: token_sort_ratio
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def token_sort_ratio(str1: str, str2: str) -> int:
    """Return the token-sort fuzzy ratio of two strings containing Korean text.

    Both inputs are passed through ``normalize_korean_nfc_to_nfd`` before they
    are handed to ``fuzz.token_sort_ratio``.
    """
    left = normalize_korean_nfc_to_nfd(str1)
    right = normalize_korean_nfc_to_nfd(str2)
    return fuzz.token_sort_ratio(left, right)
示例5: match
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def match(s1: str, s2: str) -> int:
    """Get custom ratio for yui functions.

    The plain ``ratio`` of the two strings is boosted by their token-sort
    ratio in proportion to how much their lengths differ, then truncated to an
    int and clamped to ``0..100``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The combined score, between 0 and 100 inclusive.
    """
    shorter, longer = sorted((len(s1), len(s2)))
    tsr = token_sort_ratio(s1, s2)
    r = ratio(s1, s2)
    # Two empty strings have equal lengths, so the length-difference weight is
    # 0; computing it via the division would raise ZeroDivisionError.
    weight = (1 - (shorter / longer)) if longer else 0
    return max(0, min(100, int(r * (1 + weight * tsr / 100))))
示例6: partial_string_based
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def partial_string_based(str1, str2):
    """Score two strings with fuzzywuzzy's token-sort ratio.

    The comparison is made with ``force_ascii=False`` and the outcome is
    logged at debug level.

    Args:
        str1: A string value to check.
        str2: A string value to check.

    Returns:
        float: The token-sort ratio divided by 100.
    """
    from fuzzywuzzy import fuzz

    score = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
    logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, score)
    return score / 100.0
示例7: sort_ratio_fuzzywuzzy
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def sort_ratio_fuzzywuzzy(str1, str2):
    """Return ``fuzz.token_sort_ratio`` for the two strings."""
    score = fuzz.token_sort_ratio(str1, str2)
    return score
示例8: extract_string_similarity_vector
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def extract_string_similarity_vector(instance: dict):
    """Encode a sentence pair as a tensor of lexical similarity metrics.

    :param instance: dictionary containing the keys ``sentence_1`` and
        ``sentence_2``
    :return: a ``torch.tensor`` holding one value per metric, in the order
        listed in the body (some entries are distances rather than
        similarities)
    """
    first, second = instance['sentence_1'], instance['sentence_2']
    # Order matters: it fixes the position of every metric in the vector.
    metrics = (
        normalized_levenshtein.similarity,
        jarowinkler.similarity,
        metric_lcs.distance,
        qgram2.distance,
        qgram3.distance,
        qgram4.distance,
        jaccard.similarity,
        cosine.similarity,
        fuzz.partial_token_set_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.token_sort_ratio,
        fuzz.QRatio,
        fuzz.UQRatio,
        fuzz.UWRatio,
        fuzz.WRatio,
    )
    return torch.tensor([metric(first, second) for metric in metrics])
示例9: erPredict
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def erPredict(self, chunks):
    """Classify each chunk as an ``'entity'`` or a ``'relation'``.

    The words of every chunk are joined into one phrase. The phrase is looked
    up in the ``dbentityindex11`` Elasticsearch index, and its embedding plus
    the top hit's score and fuzzy label similarities are passed to
    ``self.ermodel``; a first output above 0.5 means ``'entity'``.

    Args:
        chunks: Iterable of chunks. Each chunk is a sequence of word records
            read as ``word[0]`` (text) and ``word[2]`` / ``word[3]``
            (presumably start offset and length -- confirm with the caller).

    Returns:
        A list of dicts with the keys ``chunk``, ``surfacestart``,
        ``surfacelength`` and ``class``.
    """
    combinedchunks = []
    for chunk in chunks:
        surfacestart = chunk[0][2]
        phrase = ' '.join([word[0] for word in chunk])
        # The surface span runs from the first word's start to the last
        # word's end.
        last_word = chunk[-1]
        surfacelength = last_word[2] + last_word[3] - surfacestart
        combinedchunks.append((phrase, surfacestart, surfacelength))

    erpredictions = []
    for phrase, start, length in combinedchunks:
        chunkwords = phrase.encode('ascii', 'ignore').translate(None, string.punctuation)
        embedding = self.embed(chunkwords)
        esresult = self.es.search(index="dbentityindex11", body={"query":{"multi_match":{"query":chunkwords,"fields":["wikidataLabel", "dbpediaLabel^1.5"]}},"size":1})
        hits = esresult['hits']['hits']

        x = None
        if len(hits) == 1:
            hit = hits[0]
            # When both labels are present the wikidata label wins, because it
            # is checked last.
            for label_key in ('dbpediaLabel', 'wikidataLabel'):
                if label_key in hit['_source']:
                    x = (embedding
                         + [hit['_score']]
                         + [fuzz.ratio(chunkwords, hit['_source'][label_key])/100.0]
                         + [fuzz.partial_ratio(chunkwords, hit['_source'][label_key])/100.0]
                         + [fuzz.token_sort_ratio(chunkwords, hit['_source'][label_key])/100.0])
        else:
            # No hit: pad the embedding with zeros for the four lookup features.
            x = embedding + [0.0,0.0,0.0,0.0]

        x = torch.FloatTensor(x)
        pred = self.ermodel(x)
        print(chunkwords,pred,pred[0])
        predicted_class = 'entity' if pred[0] > 0.5 else 'relation'
        erpredictions.append({'chunk':chunkwords, 'surfacestart': start, 'surfacelength': length , 'class':predicted_class})
    return erpredictions
示例10: detect_xss
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def detect_xss(self, payload, browser_object, user_screenshot_name,
               injected_link):
    """Check the HTML source to determine if XSS payload was reflected.

    An exact, case-insensitive occurrence of ``payload`` in the page source is
    reported as a vulnerability and ``injected_link`` is added to
    ``self.xss_links``. Otherwise, when fuzzy detection is enabled, the
    payload is compared against the source with ``fuzz.token_set_ratio`` and a
    score at or above the configured level is reported as a partial hit and
    added to ``self.xss_partials``.

    Args:
        payload: The injected XSS payload string.
        browser_object: Object whose ``html`` attribute holds the page source.
        user_screenshot_name: Screenshot name, or ``None`` to skip the
            screenshot on a positive hit.
        injected_link: The link that carried the payload.
    """
    payload_lower = payload.lower()
    # Read and lowercase the page source once so the exact check and the
    # fuzzy score work on the same snapshot.
    html_lower = browser_object.html.lower()

    # Set the level of detection asked for by the user, e.g. Only detect
    # matches with score higher than 50% fuzzy detection
    fuzzy_level = self.user_args.FUZZY_DETECTION

    if payload_lower in html_lower:
        print(Color.GREEN + "\n[+] XSS vulnerability found:" + Color.END)

        # If user set the --screen flag to target, capture screenshot of
        # payload
        if user_screenshot_name is not None:
            self.take_screenshot(user_screenshot_name,
                                 browser_object, self.screen_index)

        # Add link to list of all positive XSS hits
        self.xss_links.append(injected_link)
        print(Color.BLUE + injected_link + Color.END)
        return

    # If fuzzy detection chosen, evaluate partial reflection of XSS by
    # tokenizing the HTML source and detecting parts of the payload and
    # source common to both. Scoring the whole page is expensive, so it is
    # skipped when fuzzy detection is off.
    #
    # Other methods of scoring include fuzz.ratio(), fuzz.partial_ratio()
    # and fuzz.token_sort_ratio()
    partial_score = None
    if fuzzy_level:
        partial_score = fuzz.token_set_ratio(payload_lower, html_lower)

    # If user enabled fuzzy detection and partial score was larger than
    # fuzz level, add it to partials list and print results
    if fuzzy_level and (partial_score >= fuzzy_level):
        print(Color.YELLOW + "\n[-] Partial XSS vulnerability found:" + Color.END)
        print(Color.BLUE + injected_link + Color.END)
        self.xss_partials.append(injected_link)
        print("Detection score: %s" % partial_score)
    else:
        print(Color.RED + "\n[+] No XSS detected at: \n" +
              Color.BLUE + injected_link + Color.END)
        if fuzzy_level:
            print("Detection score: %s" % partial_score)
示例11: fuzzy_score_string
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    When the first string is shorter, it is slid across the second one and the
    best window score is kept; otherwise the two strings are compared as a
    whole. That score competes with four fuzzywuzzy ratios, anything below 75
    is zeroed, and the winner is scaled by 0.85.

    Params:
        - first_string: (type: string) first string.
        - second_string: (type: string) second string.

    Returns:
        - result: (type: float) score.
    """
    def blended(left, right, l_ratio):
        # Mean of a distance-based penalty and the ratio counted twice.
        return statistics.mean(
            [100 - Levenshtein.distance(left, right) * 15, l_ratio, l_ratio])

    if len(first_string) < len(second_string):
        shorter, longer = first_string, second_string
        width = len(shorter)
        best = 0
        for start in range(len(longer) - width + 1):
            window = longer[start:start + width]
            l_ratio = Levenshtein.ratio(window, shorter) * 100
            # Only reasonably close windows get the distance-based blend.
            window_score = blended(window, shorter, l_ratio) if l_ratio > 60 else l_ratio
            best = max(best, window_score)
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        best = blended(first_string, second_string, l_ratio)

    fuzzy_scores = [
        fuzz.ratio(first_string, second_string),
        fuzz.partial_ratio(first_string, second_string),
        fuzz.token_sort_ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    ]
    best = max([best] + fuzzy_scores)

    if best < 75:
        best = 0
    return best * 0.85
示例12: score_match
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def score_match(first_string, second_string, domain_score=False):
    """Produce a similarity score for two strings.

    ``SCORE_THRESHOLD_NORMAL`` is returned straight away when the strings are
    equal, when they are equal after ``remove_tld`` (domain comparison only),
    or when ``second_string`` occurs inside ``first_string``. Otherwise the
    score is derived from the Levenshtein distance or the token-sort ratio,
    plus ``fuzzy_score_string`` for sufficiently long first strings.

    Params:
        - first_string: (type: string) first string.
        - second_string: (type: string) second string.
        - domain_score: (type: bool) whether the comparison is of two domains.

    Returns:
        - result: score.
    """
    if first_string == second_string:
        return SCORE_THRESHOLD_NORMAL
    if domain_score and remove_tld(first_string) == remove_tld(second_string):
        return SCORE_THRESHOLD_NORMAL
    if second_string in first_string:
        return SCORE_THRESHOLD_NORMAL

    if domain_score:
        first_string, second_string = remove_tld(first_string), remove_tld(second_string)

    edit_distance = Levenshtein.distance(first_string, second_string)
    sorted_ratio = fuzz.token_sort_ratio(first_string, second_string)

    if edit_distance <= 2:
        # 100 for distance 0, 75 for distance 1, 50 for distance 2.
        total = 50 + 25 * (2 - edit_distance)
    elif sorted_ratio > 80:
        total = sorted_ratio - 25
    else:
        total = 0

    first_len, second_len = len(first_string), len(second_string)
    if first_len > 4 and first_len > second_len / 2:
        total += fuzzy_score_string(first_string, second_string)
    return total
示例13: fuzzy_fuzzywuzzy_list
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    """Pre-filter candidates by shared characters, then rank them by edit distance.

    Edit-distance ranking is relatively slow, but unlike exact matching it can
    cope with differing characters, so a cheap character-overlap pass first
    narrows ``qa_list`` down to at most 5000 candidates.

    Args:
        fuzz: Object providing the ``token_set_ratio`` scorer.
        user_input: The query string.
        qa_list: Candidate items, indexed in parallel with ``collection``.
        collection: Iterable of containers tested with ``char in entry``.
        topn: Maximum number of results requested from ``process.extract``.

    Returns:
        The result of ``process.extract`` over the selected candidates.
    """
    query_chars = list(user_input)

    overlaps = []        # (index, shared-character count) for entries with any overlap
    running_maxima = []  # every count that set a new record during the scan, ascending
    best_so_far = 0
    for index, entry in enumerate(collection):
        shared = sum(1 for char in query_chars if char in entry)
        if shared > 0:
            overlaps.append((index, shared))
        if shared > best_so_far:
            running_maxima.append(shared)
            best_so_far = shared

    # Collect entries whose overlap equals one of the record counts, highest
    # record first, and stop once 5000 candidates have been gathered.
    candidates = []
    for level in reversed(running_maxima):
        candidates.extend(qa_list[index] for index, shared in overlaps if shared == level)
        if len(candidates) >= 5000:
            candidates = candidates[:5000]
            break

    return process.extract(user_input, candidates, scorer=fuzz.token_set_ratio, limit=topn)
# [fuzz.WRatio, fuzz.QRatio,
# fuzz.token_set_ratio, fuzz.token_sort_ratio,
# fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
# fuzz.UWRatio, fuzz.UQRatio]
示例14: process_group
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def process_group(
        data, group, toc, toc_table, page_num, section,
        sectionid, html):
    """Retrieves a group from the full data, and creates toc stuff

    Elements of the group are clustered by page title: an element joins the
    cluster of the first remaining element whose title it matches with a
    token-sort ratio of at least 70. Elements titled ``'Unknown'`` are appended
    last.

    Args:
        data (List): Full set of data containing all hosts
        group (String): String representing group to process
        toc (String): HTML for Table of Contents
        toc_table (String): HTML for Table in ToC
        page_num (int): Page number we're on in the report
        section (String): Display name of the group
        sectionid (String): Unique ID for ToC navigation
        html (String): HTML for current page of report

    Returns:
        List: Elements for category sorted and grouped
        String: HTML representing ToC
        String: HTML representing ToC Table
        String: HTML representing current report page
    """
    group_data = sorted([x for x in data if x.category == group],
                        key=lambda k: str(k.page_title))

    grouped_elements = []
    if not group_data:
        # Nothing in this group: leave the ToC and page HTML untouched.
        return grouped_elements, toc, toc_table, html

    if page_num == 0:
        toc += ("<li><a href=\"report.html#{0}\">{1} (Page 1)</a></li>").format(
            sectionid, section)
    else:
        toc += ("<li><a href=\"report_page{0}.html#{1}\">{2} (Page {0})</a></li>").format(
            str(page_num+1), sectionid, section)

    html += "<h2 id=\"{0}\">{1}</h2>".format(sectionid, section)

    unknowns = [x for x in group_data if x.page_title == 'Unknown']
    group_data = [x for x in group_data if x.page_title != 'Unknown']

    while group_data:
        test_element = group_data.pop(0)
        # Score every remaining element once and split it into the current
        # cluster or the leftovers for the next round.
        similar = []
        remaining = []
        for candidate in group_data:
            score = fuzz.token_sort_ratio(
                test_element.page_title, candidate.page_title)
            if score >= 70:
                similar.append(candidate)
            else:
                remaining.append(candidate)
        similar.append(test_element)
        grouped_elements.extend(sorted(similar, key=lambda k: k.page_title))
        group_data = remaining

    grouped_elements.extend(unknowns)
    toc_table += ("<tr><td>{0}</td><td>{1}</td>").format(section,
                                                         str(len(grouped_elements)))
    return grouped_elements, toc, toc_table, html
示例15: search_keybindings
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_sort_ratio [as 别名]
def search_keybindings(software, search_key):
    """
    Print the key bindings of a software whose action matches the search key.

    An entry matches when the token-sort ratio between ``search_key`` and the
    entry's ``action`` is at least 50. An error message is printed when the
    software's config file does not exist or nothing matches.

    :param software: name used to locate the software's key-binding file
    :param search_key: text to match against each entry's action
    """
    # importing fuzzywuzzy in this function improves load time for all yoda commands
    from fuzzywuzzy import fuzz

    SOFTWARE_FILE_PATH = get_software_file_path(software)
    if not os.path.isfile(SOFTWARE_FILE_PATH):
        click.echo(
            chalk.red(
                "Software's config file doesn't exist. Type 'yoda dev keybindings --help'"
            )
        )
        return

    with open(SOFTWARE_FILE_PATH) as fin:
        # safe_load instead of load: calling yaml.load without a Loader can
        # construct arbitrary objects and is rejected by PyYAML 6+. The file
        # is read here as a plain mapping with an "entries" list.
        contents = yaml.safe_load(fin)

    matched_keys_actions_pairs = []
    for entry in contents["entries"]:
        act = entry["action"]
        key = entry["key"]
        if fuzz.token_sort_ratio(search_key, act) >= 50:
            matched_keys_actions_pairs.append((key, act))

    ## Beautify matched output
    if matched_keys_actions_pairs:
        click.echo("Key Bindings:")
        click.echo("---------------------------------------")
        click.echo(" key | action ")
        click.echo("---------------|-----------------------")
        for key, act in matched_keys_actions_pairs:
            click.echo(" " + key + " | " + act)
    else:
        click.echo(chalk.red("No key matched, please try another option"))