本文整理汇总了 Python 中 fuzzywuzzy.fuzz.partial_ratio 方法的典型用法代码示例。如果您正苦于以下问题:Python fuzz.partial_ratio 方法的具体用法?Python fuzz.partial_ratio 怎么用?Python fuzz.partial_ratio 使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 fuzzywuzzy.fuzz 的用法示例。
在下文中一共展示了fuzz.partial_ratio方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def extract_features(df):
    """Add token-level and fuzzy-matching feature columns to ``df``.

    Both question columns are null-filled with ``""`` and passed through
    ``preprocess`` first.  Every derived column is written onto ``df`` in
    place, and the same frame is returned.
    """
    for question_col in ("question1", "question2"):
        df[question_col] = df[question_col].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1,
    )
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Position ``idx`` of each per-row result feeds the column listed at the
    # same position above.
    for idx, column in enumerate(token_columns):
        df[column] = [features[idx] for features in token_features]

    print("fuzzy features..")
    fuzzy_columns = (
        ("token_set_ratio", "token_set_ratio"),
        ("token_sort_ratio", "token_sort_ratio"),
        ("fuzz_ratio", "QRatio"),
        ("fuzz_partial_ratio", "partial_ratio"),
    )
    for column, scorer_name in fuzzy_columns:
        scorer = getattr(fuzz, scorer_name)
        df[column] = df.apply(
            lambda row: scorer(row["question1"], row["question2"]),
            axis=1,
        )

    df["longest_substr_ratio"] = df.apply(
        lambda row: get_longest_substr_ratio(row["question1"], row["question2"]),
        axis=1,
    )
    return df
示例2: inquiry
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def inquiry(self):
    """Look up the typed sentence among ``self.paragraphs`` and show the hits.

    A paragraph is a hit when its ``fuzz.partial_ratio`` against the query
    reaches the threshold from ``self.getScoreThresh()`` and the paragraph is
    at least as long as the query.  With an empty query a warning box is
    shown and no paragraph is scored.  The formatted result (or a
    "nothing matched" line) is written to ``self.text``.
    """
    query = self.line_edit.text()
    hits = []
    threshold = self.getScoreThresh()
    if query:
        for paragraph in self.paragraphs:
            score = fuzz.partial_ratio(paragraph, query)
            if score >= threshold and len(query) <= len(paragraph):
                hits.append((score, paragraph))
    else:
        QMessageBox.warning(self, "Warning", '请先输入需要查询的鲁迅名言')

    rendered = ['[匹配度]: %d\n[内容]: %s\n' % (score, paragraph) for score, paragraph in hits]
    if not rendered:
        rendered.append('未匹配到任何相似度大于%d的句子.\n' % threshold)
    # The slice trims the final newline left by the last entry.
    self.text.setText('\n\n\n'.join(rendered)[:-1])
示例3: add_dup_simhash_caches
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    """Re-check candidate duplicates of ``simhashcache`` with a fuzzy text match.

    Each id in ``dup_obj_ids`` is looked up through
    ``SimHashCache.objects.get(obj_id=...)`` and its text is scored against
    ``simhashcache.text`` with ``fuzz.partial_ratio``.  Ids scoring 50 or
    less are dropped from the working set; ids scoring above 50 are kept
    (or re-added if a duplicate entry had removed them earlier).

    Returns ``None``.  Nothing happens when ``dup_obj_ids`` is empty.

    NOTE(review): the filtered id set is neither returned nor stored in the
    code visible here -- confirm whether callers expect it.
    """
    if not dup_obj_ids:
        return
    old_dup_obj_ids = set(dup_obj_ids)
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            try:
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            except Exception as e:
                # Best effort: report the failed lookup and move on.
                print(e)
                continue
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info(simhashcache.text)
            logging.info('--' * 20)
            logging.info(dup_simhash.text)
            logging.info("%d %s %s", sim_ratio, simhashcache.obj_id, dup_simhash.obj_id)
            # The set holds ids, so membership must be tested with the id.
            # Testing the fetched object made the removal branch unreachable.
            if dup_obj_id not in old_dup_obj_ids:
                if sim_ratio > 50:
                    old_dup_obj_ids.add(dup_obj_id)
            elif sim_ratio <= 50:
                old_dup_obj_ids.remove(dup_obj_id)
示例4: search
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def search(self, targets, partial=True, fuzzy=False):
    """Return the instances whose name or alias matches any of ``targets``.

    Matching is case-insensitive.  An exact name match scores 100 and, when
    ``partial`` is true, a substring match scores 99.  When ``fuzzy`` is
    true, ``fuzz.partial_ratio`` is also computed and the instance is
    recorded with that score if it exceeds 85 or the host is a substring of
    the name.

    Each instance appears once in the result, at the position of its lowest
    recorded score, and the list is sorted by ascending score.

    NOTE(review): ascending order puts the strongest match last, although
    the original comment spoke of ordering "by the most probable match" --
    confirm which end callers read from before changing it.
    """
    all_instances = self.instances()
    matched = set()
    for host in targets:
        host_lc = host.lower()
        for instance in all_instances:
            names = [instance.name]
            if instance.aliases is not None:
                names += list(instance.aliases)
            for name in names:
                name_lc = name.lower()
                if host_lc == name_lc:
                    matched.add((100, instance))
                elif partial and host_lc in name_lc:
                    matched.add((99, instance))
                if fuzzy:
                    score = fuzz.partial_ratio(host_lc, name_lc)
                    if score > 85 or host_lc in name_lc:
                        matched.add((score, instance))
    # Sort on the score alone: comparing whole tuples falls back to comparing
    # the instance objects on ties, which raises TypeError when they define
    # no ordering.
    ranked = sorted(matched, key=lambda pair: pair[0])
    # The same instance can be matched more than once; the ordered dict keeps
    # only its first position.
    return list(collections.OrderedDict((instance, None) for _, instance in ranked).keys())
示例5: _create_fuzzy_wuzzy_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def _create_fuzzy_wuzzy_features(self, df):
    """Write fuzzywuzzy scores and a longest-common-substring ratio onto ``df``.

    Every feature is computed row by row from the ``spn_1`` / ``spn_2``
    columns and stored as a new column on ``df``.  Nothing is returned.
    """

    def _common_substring_share(left, right):
        # Length of the first common substring reported by
        # ``distance.lcsubstrings``, relative to the shorter input's
        # length plus one.
        common = list(distance.lcsubstrings(left, right))
        if not common:
            return 0
        return len(common[0]) / (min(len(left), len(right)) + 1)

    scorer_columns = (
        ('fuzzy_ratio', 'ratio'),
        ('fuzzy_set_ratio', 'token_set_ratio'),
        ('fuzzy_partial_ratio', 'partial_ratio'),
        ('fuzzy_token_sort_ratio', 'token_sort_ratio'),
        ('fuzzy_qratio', 'QRatio'),
        ('fuzzy_WRatio', 'WRatio'),
    )
    for column, scorer_name in scorer_columns:
        scorer = getattr(fuzz, scorer_name)
        df[column] = df[['spn_1', 'spn_2']].apply(
            lambda row: scorer(row['spn_1'], row['spn_2']),
            axis=1,
        )

    df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(
        lambda row: _common_substring_share(row['spn_1'], row['spn_2']),
        axis=1,
    )
示例6: find_near_matches
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def find_near_matches(w, sentence):
    """Return ``(start, end)`` spans of the words in ``sentence`` closest to ``w``.

    Each whitespace-separated word is scored as the mean of ``fuzz.ratio``
    and ``fuzz.partial_ratio`` against ``w``.  All words tied for the best
    score are reported as character offsets into ``sentence``.  An empty
    list is returned unless the best score exceeds 85.
    """
    best_score = 0
    spans = []
    cursor = 0
    for token in sentence.split():
        # Skip the whitespace gap: the next occurrence of the token's first
        # character at or after the cursor is where the token starts.
        cursor = sentence.index(token[0], cursor)
        end = cursor + len(token)
        score = (fuzz.ratio(w, token) + fuzz.partial_ratio(w, token)) / 2
        if score > best_score:
            best_score = score
            spans = [(cursor, end)]
        elif score == best_score:
            spans.append((cursor, end))
        cursor = end
    if best_score > 85:
        return spans
    return []
示例7: fuzz_list
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def fuzz_list(node1_list, node2_list, score_baseline=66, proposal_num=10, string_map=None):
    """Propose fuzzy matches from ``node2_list`` for every node in ``node1_list``.

    For each ``node1``, every differing ``node2`` is scored with
    ``fuzz.partial_ratio`` (on ``string_map(node)`` when ``string_map`` is
    given).  Candidates scoring above ``score_baseline`` are kept, best
    first, up to ``proposal_num`` per node.

    ``node2_list`` is pruned in place: entries equal to a ``node1``, or whose
    mapped string equals the mapped ``node1``, are removed.

    Returns a tuple ``(node_dict, node2_list)`` where ``node_dict`` maps each
    ``node1`` to a list of ``[node2, score]`` pairs and ``node2_list`` is the
    pruned input list.
    """
    node_dict = {}
    for i, node1 in enumerate(node1_list):
        match_score_dict = {}
        # Iterate over a snapshot: node2_list is pruned inside this loop, and
        # removing from the list being iterated makes the iterator skip the
        # element that follows each removal.
        for node2 in list(node2_list):
            if node1 == node2:
                node2_list.remove(node2)
                continue
            if string_map is not None:
                n1 = string_map(node1)
                n2 = string_map(node2)
                score = fuzz.partial_ratio(n1, n2)
                if n1 == n2:
                    node2_list.remove(node2)
            else:
                score = fuzz.partial_ratio(node1, node2)
            if score > score_baseline:
                match_score_dict[node2] = score
        ranked = sorted(match_score_dict, key=match_score_dict.get, reverse=True)
        node_dict[node1] = [[n, match_score_dict[n]] for n in ranked[:proposal_num]]
        # Progress output: current index and total number of nodes.
        print("%d %d" % (i, len(node1_list)))
    return node_dict, node2_list
示例8: extract_stat_features
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def extract_stat_features(self, df):
    """Build the statistical feature matrix for the two title columns.

    ``title1_zh`` and ``title2_zh`` are null-filled with ``""`` and passed
    through ``self.__preprocess__``; token-level and fuzzy-matching features
    are then added to ``df`` in place.  The return value is ``.values`` of
    the frame with the two title columns (and ``label``, when present)
    dropped.
    """
    for title_col in ("title1_zh", "title2_zh"):
        df[title_col] = df[title_col].fillna("").apply(self.__preprocess__)

    print("token features...")
    token_features = df.apply(
        lambda row: self.__get_token_features__(row["title1_zh"], row["title2_zh"]),
        axis=1,
    )
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Position ``idx`` of each per-row result feeds the column listed at the
    # same position above.
    for idx, column in enumerate(token_columns):
        df[column] = [features[idx] for features in token_features]

    print("fuzzy features..")
    fuzzy_columns = (
        ("token_set_ratio", "token_set_ratio"),
        ("token_sort_ratio", "token_sort_ratio"),
        ("fuzz_ratio", "QRatio"),
        ("fuzz_partial_ratio", "partial_ratio"),
    )
    for column, scorer_name in fuzzy_columns:
        scorer = getattr(fuzz, scorer_name)
        df[column] = df.apply(
            lambda row: scorer(row["title1_zh"], row["title2_zh"]),
            axis=1,
        )

    df["longest_substr_ratio"] = df.apply(
        lambda row: self.__get_longest_substr_ratio__(row["title1_zh"], row["title2_zh"]),
        axis=1,
    )

    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
示例9: search
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def search(self, query):
    """Return up to five entries of ``self.issues`` that best match ``query``.

    Candidates are ranked by ``process.extract`` with ``fuzz.partial_ratio``
    as the scorer.  Only the matched entries are returned; the scores are
    discarded.
    """

    def processor(x):
        # Issues are compared by title; anything else is used as given.
        text = x.title if isinstance(x, Issue) else x
        return text.strip().lower()

    # self.issues must not be updated while it is being searched, so the
    # lookup runs with the lock held.
    with self.issues_lock:
        ranked = process.extract(
            query,
            self.issues,
            scorer=fuzz.partial_ratio,
            processor=processor,
            limit=5,
        )
        # The score is of no interest here; keep the first element of each hit.
        return [hit[0] for hit in ranked]
示例10: partial_ratio
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def partial_ratio(str1: str, str2: str) -> int:
    """Return the partial fuzzy ratio of two strings holding Korean text.

    Each argument is passed through ``normalize_korean_nfc_to_nfd`` before
    the pair is scored with ``fuzz.partial_ratio``.
    """
    first = normalize_korean_nfc_to_nfd(str1)
    second = normalize_korean_nfc_to_nfd(str2)
    return fuzz.partial_ratio(first, second)
示例11: pdf_annotate
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def pdf_annotate(self, data):
    """Attach the best-matching PubMed record to ``data``, keyed by match quality.

    The title from ``data`` is vectorised and compared against
    ``self.vec_ti``; the entry with the largest token overlap is fetched via
    ``self.query_pubmed``.  Match quality is ``fuzz.ratio`` plus
    ``fuzz.partial_ratio`` of the lower-cased titles.  Above 180 the record
    is stored under ``data.data['pubmed']``, otherwise under
    ``data.data['dubious']``.

    ``data`` is returned unchanged when it has no title, or when the title
    shares no tokens with the index.  The overlap vector is also kept on
    ``self.to``.
    """
    title_text = data.get('title')
    if not title_text:
        log.error('Unable to run pubmed matching since we have no title')
        # unable to do pubmed unless we have a title, so just return the original data
        return data

    vec_q = self.vectorizer.transform([title_text])
    token_overlap = vec_q.dot(self.vec_ti.T)
    self.to = token_overlap
    if len(token_overlap.data) == 0:
        # argmax() raises ValueError on an empty array, so a title with no
        # overlapping tokens is treated like a missing title.
        log.error('Unable to run pubmed matching since the title shares no tokens with the index')
        return data

    best_ind = token_overlap.indices[token_overlap.data.argmax()]
    pmid = int(self.pmid_ind[best_ind])

    # checking both the overall similarity, and overlap similarity
    pubmed_data = self.query_pubmed(pmid)
    query_title = title_text.lower()
    pubmed_title = pubmed_data['title'].lower()
    match_pc = fuzz.ratio(query_title, pubmed_title)
    match_pc_overlap = fuzz.partial_ratio(query_title, pubmed_title)

    # seems like a reasonable heuristic but not checked
    # (given that sometimes our query is a partial title
    # retrieved by Grobid)
    pubmed_data['pubmed_match_quality'] = match_pc + match_pc_overlap

    if pubmed_data['pubmed_match_quality'] > 180:
        data.data['pubmed'] = pubmed_data  # until setattr is worked out
    else:
        # keep it just in case, but don't replace better quality match
        data.data['dubious'] = pubmed_data  # until setattr is worked out
    return data
示例12: fuzzy
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def fuzzy(tokens):
    """Return the mean, over all tokens, of each token's mean match score.

    Every token is matched against the full ``tokens`` collection with
    ``process.extract`` using ``fuzz.partial_ratio``.  One score of 100 per
    token is discarded before averaging, to discount the token matching
    itself.
    """
    per_token_means = []
    for token in tokens:
        matches = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = [match[1] for match in matches]
        # Drop only the first perfect score; later 100s are genuine matches.
        if 100 in scores:
            scores.remove(100)
        per_token_means.append(statistics.mean(scores))
    return statistics.mean(per_token_means)
示例13: check_names_fuzzy_match
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def check_names_fuzzy_match(row):
    """Set ``row["name_match"]`` from a fuzzy comparison of the two name fields.

    The flag is true when ``fuzz.partial_ratio`` of ``row["Player"]`` and
    ``row["PLAYER_NAME"]`` is above 60.  The same ``row`` is returned.
    """
    similarity = fuzz.partial_ratio(row["Player"], row["PLAYER_NAME"])
    row["name_match"] = similarity > 60
    return row
示例14: write_cluster
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def write_cluster(cluster_topic, filepath):
    """Write each cluster to ``filepath`` as one tab-separated line.

    Every line is ``key<TAB>json`` where the JSON part is the list of
    ``(text, sim)`` members of that cluster, excluding the member whose text
    equals the key itself.

    Args:
        cluster_topic: mapping of key text to an iterable of ``(text, sim)``
            pairs.
        filepath: destination path; an existing file is overwritten.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 rather than left to the platform default.
    with open(filepath, "w", encoding="utf-8") as w:
        for key, value in cluster_topic.items():
            members = [(text, sim) for text, sim in value if text != key]
            w.write("{}\t{}\n".format(key, json.dumps(members, ensure_ascii=False)))
#def fuzz_sim(text1, text2, lower=True):
# if lower:
# text1, text2 = text1.lower(), text2.lower()
# partial_ratio = fuzz.partial_ratio(text1, text2)/100
# simple_ratio = fuzz.ratio(text1, text2)/100
# return 0.8*partial_ratio + 0.2*simple_ratio
示例15: adjust_score
# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import partial_ratio [as 别名]
def adjust_score(partial_ratio, text1, text2):
    """Adjust a partial-ratio score using text length and ``hit_head``.

    A score of at least 0.8 gains 0.15 when the shorter text has four or
    more characters and loses 0.15 when it has two or fewer.  The score then
    gains 0.15 if ``hit_head(text1, text2)`` is truthy and loses 0.15
    otherwise.  When the shorter text is a single character the result is
    forced to 0.0.
    """
    shortest = min(len(text1), len(text2))
    score = partial_ratio

    # The two length adjustments are mutually exclusive, so they nest under
    # a single threshold check.
    if score >= 0.8:
        if shortest >= 4:
            score += 0.15
        elif shortest <= 2:
            score -= 0.15

    if hit_head(text1, text2):
        score += 0.15
    else:
        score -= 0.15

    return 0.0 if shortest == 1 else score