本文整理汇总了 Python 中 ngram.NGram.compare 方法的典型用法代码示例。如果您正苦于以下问题：Python NGram.compare 方法的具体用法？Python NGram.compare 怎么用？Python NGram.compare 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 ngram.NGram 的用法示例。
在下文中一共展示了NGram.compare方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: ngram_similarity
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def ngram_similarity(univ_name):
    """Look up the first CSV university name that resembles ``univ_name``.

    Every row of ``static/UniqueFBUnivNames.csv`` is stringified, has each
    run of characters other than letters, digits and spaces replaced by a
    single space, and is lower-cased before being compared.

    Returns a dict:

    * ``'score'`` -- unigram (``N=1``) similarity between the last row
      examined and ``univ_name`` as passed in (not lower-cased).
    * ``'score_used'`` / ``'univ'`` -- set only when a row's default n-gram
      similarity to the lower-cased ``univ_name`` exceeds 0.5; the scan
      stops at the first such row.

    The dict is empty when the file contains no rows.
    """
    result = {}
    with open("static/UniqueFBUnivNames.csv", 'rb') as handle:
        for record in csv.reader(handle):
            # str() of a csv row is the list repr; the regex strips the
            # brackets, quotes and commas along with other punctuation.
            cleaned = re.sub('[^A-Za-z0-9 ]+', ' ', str(record))
            cleaned = re.sub(' ', ' ', cleaned)
            candidate = cleaned.lower()
            result['score'] = NGram.compare(candidate, univ_name, N=1)
            is_match = NGram.compare(candidate, str(univ_name).lower()) > 0.5
            if is_match:
                result['score_used'] = NGram.compare(candidate, univ_name)
                result['univ'] = cleaned
                return result
    return result
示例2: verify
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def verify(self, text_compare):
    """Print how similar the text file ``text_compare`` is to ``self.lsn``.

    For every line of the file that is not a bare newline, two things are
    recorded:

    * the result of ``self.ng.items_sharing_ngrams(line)``, from which the
      largest value is later reported, and
    * the best ``NGram.compare`` score between the line and any entry of
      ``self.lsn`` (0.0 when ``self.lsn`` is empty).

    The per-line maxima and the mean of the best scores are printed.
    A file with no usable lines reports a similarity of 0.0 instead of
    raising ``ZeroDivisionError``.

    :param text_compare: path of the text file to check.
    :returns: ``None``; the results are written to standard output.
    """
    results = []
    dictio = []
    # ``with`` guarantees the handle is closed even if a comparison raises.
    with open(text_compare, "r") as source:
        for line in source:
            if line == '\n':
                continue
            dictio.append(self.ng.items_sharing_ngrams(line))
            # Seeding with 0.0 keeps the original floor for the best score.
            scores = [NGram.compare(parrafo, line) for parrafo in self.lsn]
            results.append(max([0.0] + scores))

    # Largest shared-ngram count per line, floored at 0 like the original.
    major_ocurrences = [max([0] + list(shared.values())) for shared in dictio]

    # Guard against an empty or blank-only file (no lines were scored).
    avg_perc = sum(results) / len(results) if results else 0.0

    print("Mayor numero de ocurrencias por parrafo del texto copia: "+repr(major_ocurrences))
    print("Porcentaje Similitud: "+repr(avg_perc))
示例3: simtitle
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def simtitle( request ):
"""calculate similarity based on title and naive threshold"""
# NOTE(review): the indentation of this snippet was lost during extraction.
# The nesting of the branches below -- in particular whether the 0.7 check
# belongs inside the `is_duplicate` branch, and which statement the `else`
# pairs with -- cannot be confirmed from here; verify against the upstream
# source before re-indenting.
# Similarity index over articles, keyed on their title. WARP and enrich are
# defined elsewhere in the project.
n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
# At most 1000 "live" articles, ordered by publication date.
articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
# Articles collected for rendering in the template.
results = []
for article in articles:
# Reset any previously stored duplicate marking and persist it.
article.is_duplicate = False
article.duplicate_of = None
article.save()
# Keep only index hits whose second element (the score) is at least 0.4.
sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
for match in sim:
nearest = match[0]
if nearest.is_duplicate:
# Follow the hit to the article it was itself marked a duplicate of.
nearest = nearest.duplicate_of
# A plain title comparison below 0.7 puts the article in the result list.
if NGram.compare( article.title, nearest.title ) < 0.7:
results.append( article )
break
# Otherwise record the article as a duplicate of `nearest` and persist it.
article.is_duplicate = True
article.duplicate_of = nearest
article.save()
break
else:
results.append( article )
# Add the article to the index so later articles can match against it.
n.add( article )
return render( request, "dump.html", dictionary = { "article_list": results, } )
示例4: compare_ngrams
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def compare_ngrams(left, right, N=2, pad_len=0):
    """Return the n-gram similarity of ``left`` and ``right``.

    Both inputs are first passed through ``ascii`` (presumably a project
    helper that reduces text to ASCII -- confirm which ``ascii`` is in
    scope). Pairs of single-character strings are compared for equality
    directly; everything else is delegated to ``NGram.compare`` with the
    given ``N`` and ``pad_len``.
    """
    lhs, rhs = ascii(left), ascii(right)
    both_single = len(lhs) == 1 and len(rhs) == 1
    if not both_single:
        return NGram.compare(lhs, rhs, N=N, pad_len=pad_len)
    # NGram.compare returns 0.0 for a 1 letter comparison, even if the
    # letters are equal, so that case is decided here.
    if lhs == rhs:
        return 1.0
    return 0.0
示例5: test_ngram_search
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def test_ngram_search(self):
    """Check search and pairwise scores against the original ngram.py.

    The expected values come from the original implementation, so a pass
    shows the rewrite still uses the same underlying algorithm.
    """
    index = NGram(self.items)

    # Basic searching of the index: full result list for an indexed item.
    expected_exact = [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664),
    ]
    self.assertEqual(index.search('askfjwehiuasdfji'), expected_exact)

    # Only the two best hits are checked for a query that is not indexed.
    expected_top_two = [
        ('adfwe', 0.59999999999999998),
        ('asdfawe', 0.20000000000000001),
    ]
    top_two = index.search('afadfwe')[:2]
    self.assertEqual(top_two, expected_top_two)

    # Pairwise comparison of strings: identical vs. nothing in common.
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
示例6: cumulative_score_strings
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def cumulative_score_strings(iline, jline, N):
    """Sum the n-gram similarities of two records for sizes N down to 1.

    Each record is split on ``" :::: "``; its first three fields are joined
    with single spaces and the resulting strings are compared once for
    every n-gram size from ``N`` down to 1. The scores are added without
    weighting. Returns 0 when ``N`` is below 1.
    """
    def leading_fields(record):
        # Only the first three " :::: "-separated fields take part.
        return " ".join(record.split(" :::: ")[:3])

    left = leading_fields(iline)
    right = leading_fields(jline)

    total = 0
    size = N
    while size >= 1:
        total += NGram.compare(left, right, N=size)
        size -= 1
    return total
示例7: backoff_score_strings
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def backoff_score_strings(iline, jline, N, T=0.0):
    """Score two records, backing off to smaller n-grams until above ``T``.

    Each record is split on ``" :::: "`` and its first three fields are
    joined with single spaces. Starting at size ``N`` and decreasing by one,
    the strings are compared until a score exceeds ``T`` or size 1 has been
    tried. Returns the last score computed, or -1 if no comparison ran.
    """
    def leading_fields(record):
        # Only the first three " :::: "-separated fields take part.
        return " ".join(record.split(" :::: ")[:3])

    left = leading_fields(iline)
    right = leading_fields(jline)

    score = -1
    size = N
    while size >= 1 and score <= T:
        score = NGram.compare(left, right, N=size)
        size -= 1
    return score
示例8: guess_image
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def guess_image(name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity.

    ``name`` is tokenized and compared against every alias listed in
    ``IMAGES``; the key whose alias scores highest is returned. On equal
    scores the first one encountered wins. ``'404'`` is returned when
    ``IMAGES`` holds no aliases at all.
    '''
    name = tokenize(name)
    best = '404'
    best_score = None
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only);
    # the loop variable no longer shadows this function's own name.
    for image, names in IMAGES.items():
        for guess in names:
            score = NGram.compare(guess, name)
            if best_score is None or score > best_score:
                best_score = score
                best = image
    # Pass the arguments separately so formatting is left to the logger.
    app.logger.info('Pick image %s for name "%s"', best, name)
    return best
示例9: guess_meme_image
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def guess_meme_image(meme_name):
    '''
    Pick the meme image whose alias is most similar to the requested name.

    Both the requested name and every alias in ``MEMES`` are tokenized
    before comparison. Each time a new best alias is found it is logged.
    On equal scores the first alias encountered wins; an empty string is
    returned when ``MEMES`` holds no aliases.
    '''
    meme_name = tokenize(meme_name)
    best, best_score = '', None
    for image, aliases in MEMES.items():
        for alias in aliases:
            guess = tokenize(alias)
            score = NGram.compare(guess, meme_name)
            if best_score is not None and not score > best_score:
                continue
            best_score, best = score, image
            app.logger.info('New best meme for "%s": "%s" (Score: %s)', meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s"' % (best, meme_name))
    return best
示例10: smart_read
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def smart_read(url):
    """Print the text that ``url`` has and its closest sibling page lacks.

    The page is fetched and every ``href`` inside ``<body>`` is examined.
    Among links on the same host, the one whose path is most similar to the
    page's own path (but not identical, i.e. similarity below 1) is taken
    as the "mirror". The bodies of both pages are diffed with ``htmldiff``
    and every text node inside an ``<ins>`` element of the diff is printed.

    If no same-host link qualifies there is nothing to diff against, and
    the function returns without printing (previously this raised
    ``UnboundLocalError``).

    :param url: address of the page to read.
    :returns: ``None``.
    """
    resp = urllib2.urlopen(url)
    # Resolve redirects: continue with the URL reported by the response.
    url = resp.url
    target = urlparse(url)
    domain = target.netloc
    path = target.path
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")

    nmax = 0
    mirror = None
    for link in links:
        parsed = urlparse(link)
        if parsed.netloc != domain:
            continue
        ng = NGram.compare(parsed.path, path)
        # Strictly below 1 so the page is never chosen as its own mirror.
        if nmax < ng < 1:
            nmax = ng
            mirror = link

    if mirror is None:
        # No comparable page on the same host: nothing to diff.
        return

    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    for d in tree.xpath("//ins//text()"):
        print(d)
示例11: process
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def process(hr,sr,he,se):
    """Evaluate extracted sections against relevant ones, paired by header.

    Entries of ``hr`` and ``he`` whose ``NGram.compare`` score exceeds 0.95
    are paired up. For each pair, the entry of ``sr`` at the ``hr`` index
    and the entry of ``se`` at the ``he`` index (with ``\\r`` and ``\\n``
    removed) are handed to ``Evaluator``, whose 3-gram evaluation is
    returned.

    NOTE(review): presumably ``hr``/``he`` are relevant/extracted headers
    and ``sr``/``se`` the matching section texts -- confirm with the caller.
    NOTE(review): the snippet's indentation was lost; the evaluation is
    assumed to run at function level rather than inside ``if he:``.

    :returns: ``(0, 0)`` when ``he`` is non-empty and its length differs
        from ``se``; otherwise the pair unpacked from
        ``Evaluator.evaluate_using_ngrams(3)``.
    """
    # Cheap sanity check first: no point in running the quadratic header
    # comparison when the extracted lists are inconsistent.
    if he and len(he) != len(se):
        return 0, 0

    category_idx_list = [
        (i, j)
        for i, relevant_header in enumerate(hr)
        for j, extracted_header in enumerate(he)
        if NGram.compare(relevant_header, extracted_header) > 0.95
    ]

    categories_relevant = {}
    categories_extracted = {}
    for k, (i, j) in enumerate(category_idx_list):
        categories_relevant[k] = sr[i]
        categories_extracted[k] = se[j].replace('\r', '').replace('\n', '')

    e = Evaluator(categories_relevant, categories_extracted)
    p, r = e.evaluate_using_ngrams(3)
    return p, r
示例12: sim
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def sim(a, b):
    """Return one minus the n-gram similarity of the two objects' titles.

    The titles are compared with ``NGram.compare`` using the module's
    ``WARP`` and ``enrich`` settings, so a higher similarity gives a
    smaller result.
    """
    similarity = NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)
    return 1 - similarity
示例13: SPARQLWrapper
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
# Fetch (subject, label) bindings from the SPARQL endpoint and print every
# pair of entries with different subjects whose labels are more than 0.8
# similar.
query = dup_label

# Set up
pp = pprint.PrettyPrinter(indent=2)
sparql = SPARQLWrapper("http://husky-big.cs.uwaterloo.ca:8890/sparql")

# Send query
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
entries = results["results"]["bindings"]

"""
for result in entries:
pp.pprint((result['s']['value'], result['o']["value"]))
"""

# Approximately 20k entries
print(len(entries), " entries retrieved")

# Visit each unordered pair once (indexB starts at indexA); an entry paired
# with itself is discarded by the subject check below.
for indexA, entryA in enumerate(entries):
    for indexB in range(indexA, len(entries)):
        entryB = entries[indexB]
        distance = NGram.compare(entryA['o']['value'], entryB['o']['value'])
        if distance <= 0.8:
            continue
        if entryA['s']['value'] == entryB['s']['value']:
            continue
        pp.pprint((distance, entryA['s']['value'], entryA['o']['value'],
                   entryB['s']['value'], entryB['o']['value']))
示例14: nearlySameText
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def nearlySameText(text_1, text_2):
    """Tell whether two texts are at least 0.9 similar by n-gram comparison.

    Leading and trailing whitespace is ignored on both sides.
    """
    first = text_1.strip()
    second = text_2.strip()
    similarity = NGram.compare(first, second)
    return similarity >= 0.9
示例15: ng_pare
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import compare [as 别名]
def ng_pare(needle='default', fn='/usr/share/dict/words', pth=.50):
    """Collect the lines of a word list that resemble ``needle``.

    Every line of the file ``fn`` is lower-cased and compared with
    ``needle`` using unigrams (``N=1``). Lines scoring at least ``pth`` are
    returned as a set. The lines are used exactly as read from the file, so
    any trailing newline takes part in the comparison and is kept in the
    returned strings.
    """
    ng_haystack = set()
    with open(fn, 'r') as fh:
        for line in fh:
            lowered = line.lower()
            score = NGram.compare(needle, lowered, N=1)
            if score - pth >= 0.0:
                ng_haystack.add(lowered)
    return ng_haystack