This article collects typical usage examples of Python's nltk.metrics.edit_distance function. If you are wondering what edit_distance does, how to call it, or what it looks like in real code, the curated examples below should help.
The 15 code examples of edit_distance shown below are ordered by popularity by default.
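Before the examples, here is a minimal sketch of the function itself. edit_distance computes the Levenshtein distance between two sequences; the optional transpositions flag (part of NLTK's signature) additionally counts an adjacent swap as a single edit:

>>> from nltk.metrics import edit_distance
>>> edit_distance('kitten', 'sitting')  # 2 substitutions + 1 insertion
3
>>> edit_distance('abc', 'acb', transpositions=True)  # one adjacent swap
1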
Example 1: get_abstract_by_title

import math

import metapub
from nltk.metrics import edit_distance

def get_abstract_by_title(title):
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print('searching entry with title: ' + title)
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 0:
        print('warning: no entry retrieved for given title')
        return None, ''
    elif len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        # Accept the match if the titles differ by at most 10% of the query length.
        if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
            print('successfully matched title: ' + article.title)
            return article.title, article.abstract
        else:
            print('warning: found one entry but not a match')
            return None, ''
    else:
        print('warning: retrieved more than one entry for given title')
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
                print('successfully matched title: ' + article.title)
                return article.title, article.abstract
        print('warning: no entry is a match')
        return None, ''
Example 2: get_related_evidence

import metapub
from nltk.metrics import edit_distance

def get_related_evidence(title):
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>')
    try:
        print('given title: ' + title)
        # TODO: fix this...
    except UnicodeEncodeError:
        print('title cannot be printed - it contains characters the console encoding cannot represent')
        return [], {}, 0
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        # Accept the match if the titles differ by at most 10% of the query length.
        if edit_distance(article.title, title) <= len(title) * 0.1:
            print('matched title: ' + article.title)
            related_pmids = fetch.related_pmids(pmids[0])
            return _merge_related_pmids(pmids[0], related_pmids, fetch)
    elif len(pmids) > 1:
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= len(title) * 0.1:
                print('matched title: ' + article.title)
                related_pmids = fetch.related_pmids(pmids[i])
                return _merge_related_pmids(pmids[i], related_pmids, fetch)
    print('no match found')
    return [], {}, 0
Example 3: matches_author

from operator import itemgetter

from nltk.metrics import edit_distance

def matches_author(self, string, fuzzy=False, distance_threshold=3):
    """
    Retrieves from the KnowledgeBase the possible authors matching the search string.
    None is returned if no matches are found.
    :param string: the string to be matched
    :param fuzzy: whether exact or fuzzy string matching should be applied
    :param distance_threshold: the maximum edit distance threshold (ignored if `fuzzy==False`)
    :return: a list of tuples, ordered by distance between the search and the matching string, where:
        tuple[0] contains the id (i.e. CTS URN) of the matching author
        tuple[1] contains a label of the matching author
        tuple[2] is the distance, measured in characters, between the search string and the matching string
    or None if no match is found.
    """
    #string = string.lower()
    author_matches, abbr_matches = [], []
    if not fuzzy:
        author_matches = [(id.split("$$")[0]
                           , self._author_names[id]
                           , len(self._author_names[id]) - len(string))
                          for id in self._author_idx.searchAllWords(string)]
        abbr_matches = [(id.split("$$")[0]
                         , self._author_abbreviations[id]
                         , len(self._author_abbreviations[id]) - len(string))
                        for id in self._author_abbr_idx.searchAllWords(string)]
    else:
        abbr_matches = [(id.split("$$")[0]
                         , self._author_abbreviations[id]
                         , edit_distance(string, self._author_abbreviations[id]))
                        for id in self._author_abbreviations
                        if edit_distance(string, self._author_abbreviations[id]) <= distance_threshold]
        abbr_matches = sorted(abbr_matches, key=itemgetter(2))
        author_matches = []
        for id in self._author_names:
            if string.endswith("."):
                # The query looks like an abbreviation ("Hom."): try substring
                # matching against the full names first.
                if string.replace(".", "") in self._author_names[id]:
                    if len(string) > (len(self._author_names[id]) // 2):
                        try:
                            assert abbr_matches[0][2] == 0
                            distance = len(self._author_names[id]) - len(string)
                            if distance < 0:
                                distance = 1
                            author_matches.append((id.split("$$")[0], self._author_names[id], distance))
                        except Exception:
                            author_matches.append((id.split("$$")[0], self._author_names[id], 0))
                else:
                    if edit_distance(string, self._author_names[id]) <= distance_threshold:
                        author_matches.append((id.split("$$")[0], self._author_names[id], edit_distance(string, self._author_names[id])))
            else:
                if edit_distance(string, self._author_names[id]) <= distance_threshold:
                    author_matches.append((id.split("$$")[0], self._author_names[id], edit_distance(string, self._author_names[id])))
    # Reconstructed ending, following the docstring's contract: combine both
    # match lists, sort by distance, and return None when nothing matched.
    all_matches = sorted(author_matches + abbr_matches, key=itemgetter(2))
    return all_matches if all_matches else None
Example 4: searchEvidenceByTitle

import json
import pprint
from functools import reduce
from itertools import chain

from django.core import serializers
from django.core.exceptions import ObjectDoesNotExist
from django.db.models import Q
from django.http import HttpResponse
from nltk.metrics import edit_distance
from rest_framework import status

# Evidence, EvidenceTopic, Collection, TopicModeler and flattenSerializedJson
# are project-specific and come from the surrounding Django app.

def searchEvidenceByTitle(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        title = data['title']
        result_limit = data['result_limit']
        include_personal = data['include_personal']
        user_id = data['user_id']
        # DONE: we can alternatively change this to treat the given title as a series of separate terms
        title_terms = title.split(' ')
        print(title_terms)
        evidence = Evidence.objects.filter(Q(created_by=collection_id) & reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
        if include_personal:
            personal_evidence = Evidence.objects.filter(Q(created_by=user_id) & reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
            evidence = chain(evidence, personal_evidence)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        pprint.pprint(evidence)
        # Rank the candidates by edit distance to the query title and keep the closest ones.
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        print('result limit')
        print(result_limit)
        evidence = sorted(evidence, key=lambda e: e['dist'])[:result_limit]
        for e in evidence:
            e['topic'] = -1
            try:
                e['topic'] = EvidenceTopic.objects.get(evidence=e['id']).primary_topic
            except ObjectDoesNotExist:
                if len(e['abstract']) > 50:
                    name = Collection.objects.get(collection_id=collection_id).collection_name
                    topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                    primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
                    e['topic'] = primary_topic_tuple[0]
                else:
                    print('warning: evidence with no topic')
        return HttpResponse(json.dumps(evidence), status=status.HTTP_200_OK)
    elif request.method == 'GET':
        collection_id = 13
        title = 'UpSet: Visualization of Intersecting Sets'
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e: e['dist'])
        return HttpResponse(json.dumps(evidence[:20]), status=status.HTTP_200_OK)
Example 5: string_matching

import sys
import traceback

from nltk.metrics import edit_distance

def string_matching(label1, label2):  # after Maedche and Staab
    """ (string, string) -> float
    Return the similarity coefficient between two strings, based on the
    Levenshtein (edit) distance. It equals 1 for an exact match and
    0 for no similarity.
    >>> string_matching('power', 'power')
    1.0
    >>> string_matching('power', 'abba')
    0.0
    """
    try:
        sm = float(
            min(len(label1), len(label2)) -
            edit_distance(label1, label2)
        ) / min(len(label1), len(label2))
        # The raw value goes negative when the edit distance exceeds the
        # shorter label's length, so clamp it to 0.
        if sm < 0:
            return 0.0
        else:
            return sm
    except Exception:
        # e.g. ZeroDivisionError when one of the labels is empty
        print("Error found:")
        traceback.print_exc(file=sys.stdout)
        return 0
Example 6: replace

from nltk.metrics import edit_distance

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    distance = [edit_distance(word, suggestedWord) for suggestedWord in suggestions]
    print(suggestions)
    print(distance)
    retVal = ""
    lengthMatched = False
    if distance and min(distance) <= self.max_dist:
        # Default to the closest suggestion...
        retVal = suggestions[distance.index(min(distance))]
        # ...but prefer the first equally close suggestion that has the
        # same length as the original word.
        for i, ed in enumerate(distance):
            if ed == min(distance):
                if len(word) == len(suggestions[i]) and not lengthMatched:
                    retVal = suggestions[i]
                    lengthMatched = True
    else:
        retVal = word
    return retVal
Example 7: _GetScore

from nltk.metrics import edit_distance

def _GetScore(self, query, match):
    """Custom edit-distance based scoring."""
    str_query = str(query)
    str_candidate = str(match.key)
    dist = float(edit_distance(str_query, str_candidate))
    max_len = float(max(len(str_query), len(str_candidate)))
    # Normalize into [0, 1]: identical strings score 1, fully dissimilar strings score 0.
    return (max_len - dist) / max_len
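For intuition, plugging the classic kitten/sitting pair (edit distance 3) into this score:

>>> from nltk.metrics import edit_distance
>>> dist = float(edit_distance('kitten', 'sitting'))
>>> max_len = float(max(len('kitten'), len('sitting')))
>>> (max_len - dist) / max_len
0.5714285714285714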
Example 8: replace_word

from nltk.metrics import edit_distance

def replace_word(self, word):
    if self.dictionary.check(word):
        return word
    suggestions = self.dictionary.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    # Note: implicitly returns None when no suggestion is close enough.
Example 9: replace

from nltk.metrics import edit_distance

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    distance = [edit_distance(word, suggestedWord) for suggestedWord in suggestions]
    retVal = ""
    if distance and min(distance) <= self.max_dist:
        # Default to the closest suggestion, but prefer the first equally
        # close suggestion that has the same length as the original word.
        retVal = suggestions[distance.index(min(distance))]
        for i, ed in enumerate(distance):
            if ed == min(distance):
                if len(word) == len(suggestions[i]):
                    retVal = suggestions[i]
                    break
    else:
        retVal = word
    return retVal
Example 10: spellChecker

import csv

import enchant
from nltk.metrics import edit_distance

def spellChecker(sentences, file_name_s):
    dict_name = 'en_GB'
    spell_dict = enchant.Dict(dict_name)
    max_dist = 3
    corrected = []
    csv_writer = csv.writer(open(file_name_s, 'w', newline=''))
    #csv_writer.writerow(HEADER2)
    for sentence in sentences:
        corrected_sent = ''
        sentence = str(sentence)
        # Strip list/quote characters left over from stringified input.
        sc = set(["[", "]", "'", '"'])
        words = ''.join([c for c in sentence if c not in sc])
        words = words.split()
        for word in words:
            print(word)
            suggestions = spell_dict.suggest(word)
            if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
                corrected_sent = corrected_sent + " " + suggestions[0]
            else:
                corrected_sent = corrected_sent + " " + word
        corrected_sent = corrected_sent.replace("[", "").replace("]", "").replace("'", "")
        corrected.append(corrected_sent)
        csv_writer.writerow([corrected_sent])
    print(corrected)
Example 11: fuzzy_comparison

import logging

from nltk import metrics

def fuzzy_comparison(tokens_1, tokens_2, max_dist=1):
    """Compares the tokens based on fuzzy matching."""
    # init_term_1 / init_term_2 are module-level globals in the source
    # project, apparently holding the original lengths of the two token lists.
    matched = 0
    matched_len_1 = init_term_1 - len(tokens_1)
    matched_len_2 = init_term_2 - len(tokens_2)
    for token in reversed(tokens_1):
        # Skip (and drop) tokens of one or two characters.
        if len(token) <= 2:
            tokens_1.remove(token)
            continue
        for tkn in reversed(tokens_2):
            if len(tkn) <= 2:
                tokens_2.remove(tkn)
                continue
            # Count a match when the tokens are within max_dist edits,
            # then remove both so they cannot be matched again.
            if metrics.edit_distance(token, tkn) <= max_dist:
                matched = matched + 1
                logging.debug("Match found for:" + token + " - " + tkn)
                tokens_2.remove(tkn)
                tokens_1.remove(token)
                break
    logging.info("Fuzzy match count:" + str(matched))
    score_1 = (matched_len_1 + matched) / float(init_term_1)
    score_2 = (matched_len_2 + matched) / float(init_term_2)
    return score_1, score_2
Example 12: replace

from nltk.metrics import edit_distance

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    else:
        return word
Example 13: spell_check

from nltk.metrics import edit_distance

def spell_check(r, a, s, scores, weight=1):
    # Normalized similarity between r and a, scaled by weight.
    change = weight * (1 - (edit_distance(r, a) / float(max(len(r), len(a)))))
    if s in scores:
        # penalty for returning multiple of the same result when
        # one instance is incorrectly spelled
        return (scores[s] + change) / 2.0
    else:
        return change
Example 14: check_replace_word

from nltk.metrics import edit_distance

def check_replace_word(word):
    # spell_dict is a module-level dictionary object (e.g. an enchant.Dict)
    # in the source project.
    if spell_dict.check(word):
        return word
    suggestions = spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) < 2:
        return suggestions[0]
    else:
        return word
Example 15: ordered_content_distance

from nltk.metrics import edit_distance

def ordered_content_distance(self, sentence, normalized=True):
    """Normalized Levenshtein distance on (ordered) content words
    between `self` and `sentence`."""
    self_content_words = self.content_words
    sentence_content_words = sentence.content_words
    # edit_distance accepts any sequences, so it works on word lists too.
    distance = edit_distance(self_content_words, sentence_content_words)
    norm = max(len(self_content_words), len(sentence_content_words))
    return distance / norm if normalized else distance
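As Example 15 relies on, nltk.metrics.edit_distance accepts any pair of sequences, not just strings, so it can compute a word-level distance directly:

>>> from nltk.metrics import edit_distance
>>> edit_distance(['the', 'quick', 'brown', 'fox'],
...               ['the', 'slow', 'brown', 'fox'])
1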