This article collects typical usage examples of the NGram.search method from the Python ngram module. The curated examples below show what NGram.search does and how to call it; see also the containing class, ngram.NGram.
Thirteen code examples of NGram.search are shown below, sorted by popularity by default.
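Before the examples, a minimal sketch of the method itself may help. This is a self-contained illustration assuming only that the python-ngram package is installed; the sample strings, and the scores in the comments, are illustrative rather than taken from the examples below.

from ngram import NGram

# Index a few strings; search() returns (item, similarity) pairs,
# best match first, with similarity in [0.0, 1.0].
index = NGram(["spam", "spams", "eggs"])
print(index.search("spam"))                 # e.g. [('spam', 1.0), ('spams', ...)]
print(index.search("spam", threshold=0.5))  # drops matches scoring below 0.5
print(NGram.compare("spam", "spams"))       # pairwise comparison, no index needed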
Example 1: main
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
import csv

from ngram import NGram

def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    # Index every row of the right-hand file, keyed on the normalized
    # join column. lowstrip() is a helper from the surrounding module;
    # see the hypothetical sketch below.
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
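Note that lowstrip() is not defined in this snippet; it is a normalization helper from the surrounding module. A plausible stand-in (a hypothetical definition, not the original helper) lowercases the term and collapses whitespace so that trivially different spellings produce the same n-grams:

import re

def lowstrip(term):
    # Hypothetical helper: lowercase and collapse runs of whitespace.
    return re.sub(r'\s+', ' ', term).strip().lower()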
Example 2: simtitle
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=WARP, iconv=enrich, key=lambda x: x.title)
    articles = Article.objects.filter(status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare(article.title, nearest.title) < 0.7:
                    results.append(article)
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            # No close match: keep the article and index it for later queries.
            results.append(article)
            n.add(article)
    return render(request, "dump.html", dictionary={"article_list": results})
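The search-then-add loop above is the core of incremental deduplication: each incoming item is checked against the index of everything accepted so far, and only non-duplicates are indexed. A minimal standalone sketch of the same pattern, using plain strings instead of Django Article objects (the titles and threshold are illustrative):

from ngram import NGram

seen = NGram()
duplicates = []
for title in ["Review: Halo 3", "Review: Halo 3!", "Portal 2 announced"]:
    matches = seen.search(title.lower(), threshold=0.4)
    if matches:
        duplicates.append((title, matches[0][0]))  # nearest earlier title
    else:
        seen.add(title.lower())
print(duplicates)  # e.g. [('Review: Halo 3!', 'review: halo 3')]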
Example 3: map
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def map(self, phrase):
    # Python 2 code: Set is the deprecated sets.Set class and fuzz
    # comes from the fuzzywuzzy package.
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = Set(term) & Set(word)
            matches = []
            if len(z) > 0 and len(z) < len(term):
                g = NGram(z - Set(term))
                #matches = g.search(term)
            else:
                # At this point we assume context is not informative,
                # and resort to a fuzzy lookup instead.
                g = NGram(word)
                #matches = g.search(term)
            g.remove(term)
            matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = [matches[0], Pz_, Px_, 1]
                    self.emit(key, value)
Example 4: build_multiclusters
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {}
        icluster[iline] = -1
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum([y for x, y in results]) / len(results) \
                if len(results) > 0 else 0.0
            print score
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
Example 5: test_set_operations
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def test_set_operations(self):
    """Test advanced set operations"""
    items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
    items2 = set(["cdefg", "lmnop"])
    idx1 = NGram(items1)
    idx2 = NGram(items2)
    results = lambda L: sorted(x[0] for x in L)
    # Item removal
    self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
    idx1.remove('abcde')
    self.assertEqual(results(idx1.search('cde')), ["cdefg"])
    # Set intersection operation
    items1.remove('abcde')
    idx1.intersection_update(idx2)
    self.assertEqual(idx1, items1.intersection(items2))
    self.assertEqual(results(idx1.search('lmn')), [])
    self.assertEqual(results(idx1.search('ijk')), [])
    self.assertEqual(results(idx1.search('def')), ['cdefg'])
Example 6: test_ngram_search
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Example 7: main
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
import csv

from ngram import NGram

# Python 2 variant of Example 1, with a doctest.
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')
    >>> open('right.csv', 'w').write('''ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print open('out.csv').read()  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = right_file.next()
    # lowstrip() is assumed to be defined elsewhere in the module
    # (see the hypothetical sketch after Example 1).
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = left_file.next()
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example 8: wordsoccurrences
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def wordsoccurrences(self, words_list, option='ortony'):
    frequencies = FreqDist(words_list)
    ordered_unigrams = frequencies.most_common()
    if option == 'ortony':
        lexicon = self.ortony_list
    else:
        lexicon = self.profane_words
    count = 0
    for t_word, count_w in ordered_unigrams:
        lower_word = t_word.lower()
        # Note: the index is rebuilt on every iteration; constructing
        # NGram(lexicon) once before the loop would behave the same.
        three_grams = NGram(lexicon)
        likely_words = three_grams.search(lower_word)
        if len(likely_words) > 0:
            # if lower_word in lexicon:
            count += 1 * count_w
        if lower_word in lexicon:
            count += 1
    return count
Example 9: verify
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def verify(self, text_compare):
    results = []
    texto = []
    '''
    file2 = open(text_compare, "r")
    for linea2 in file2.readlines():
        texto += linea2.split(" ")
    tng = NGram(texto)
    file2.close()
    '''
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        texto += linea2.split(" ")
        linea2 = file2.readline()
    tng = NGram(texto)
    file2.close()
    # Count, for each topic's n-gram set, how many of its words match the text.
    for ngs in self.ng:
        count = 0
        for word in list(ngs):
            for porc in tng.search(word):
                if porc[1] > 0.3:
                    count += 1
        results += [count]
    print list(results)
    # Pick the topic with the highest match count.
    pos = 0
    count = 0
    i = 0
    for res in results:
        if count < res:
            count = res
            pos = i
        i += 1
    if results[pos] > 2:
        print("Most likely topic of the text: " + repr(self.topic[pos]))
    else:
        print("Could not determine what the text is about")
    print ""
Example 10: handle
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def handle(self, *args, **options):
    if "simonly" in args:
        new_count = 100000
    else:
        new_count = 0
        for source in Source.objects.filter(scraper='feedparser', status__in=('silent', 'live')):
            l = feedparser.parse(source.scraper_config)
            ok = True
            if l["bozo"] == 1:
                if not isinstance(l["bozo_exception"], feedparser.ThingsNobodyCaresAboutButMe):
                    ok = False
            if ok:
                for article in l["entries"]:
                    #print "Reading feed entry %s: '%s'" % (article["id"], article["title"])
                    a, created = Article.objects.get_or_create(
                        source=source,
                        # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                        source_reference=article["id"],
                        defaults={
                            'date_created': datetime.now(),
                            'source_url': article["link"],
                            'title': self.normalise(article["title"]),
                            'num_comments': article.get("slash_comments", 0),
                            'summary': article["summary"],
                            'author': article.get("author", ""),
                            'date_published': datetime(*(article["updated_parsed"][:6])),
                            'status': "live",
                        }
                    )
                    if created:
                        #print "Creating new article."
                        pass
                    else:
                        #print "Updating article."
                        pass
                    new_count += 1
                    if article.has_key("content"):
                        # TODO test for multiple content blocks and pick most appropriate
                        a.body = article["content"][0]["value"]
                    a.tags.clear()
                    for tag in article.get("tags", ()):
                        a.tags.add(tag["term"])
                    a.save()
            else:
                logging.error("Could not read feed for file '%s': %s" % (source.scraper_config, l["bozo_exception"]))
                logging.error("Skipping '%s': %s" % (source.scraper_config, l["bozo_exception"]))
                break

    # calculate similarities
    # create a similarity corpus of recent docs
    def enrich(obj):
        s = unicode(obj)
        # simple stop words
        s = re.sub(r"\b(the|of|in|a)\b", "", s, flags=re.IGNORECASE)
        # type prefixes
        s = re.sub(r"^(trailer|review|report|screenshots|video):\s*", "", s, flags=re.IGNORECASE)
        return s

    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(status="live").order_by("date_published")[:(new_count * 4)]
    for article in articles:
        if "simonly" in args:
            article.is_duplicate = False
            article.duplicate_of = None
            article.save()
            continue
        #articles = Article.objects.filter(status="live", is_duplicate=False).order_by("-date_published")[:new_count]
        #for article in articles:
        #print(u"similarity for %s" % (article.title,))
        sim = filter(lambda a: a[1] > 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.source == article.source:
                continue
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
            article.is_duplicate = True
            article.duplicate_of = nearest
            #print u" is duplicate of %s" % (nearest.title,)
            article.save()
            break
        n.add(article)
Example 11: _location_choices
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def _location_choices(self, search):
    ngram_index = NGram(key=self._location_to_name)
    ngram_index.update(Ward.objects.all())
    ngram_index.update(District.objects.all())
    locations = ngram_index.search(search)[:self.num_choices]
    return [self._location_to_choice(l) for l, _score in locations]
Example 12: run
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
def run(self):
    # Python 2 code: Set is the deprecated sets.Set class and fuzz
    # comes from the fuzzywuzzy package.
    N = len(self.context)
    imatches = []
    found = {}
    Y = range(0, len(self.bag))
    for i in range(0, N):
        Xo_ = list(self.bag[i])  # skip_gram
        #Y = (Set(range(0,N)) - (Set([i]) | Set(imatches)))
        for ii in Y:
            if self.bag[i] == self.bag[ii]:
                imatches.append(ii)
                continue
            # We are sure we are not comparing identical phrases.
            # NOTE: Repetition doesn't yield learning; context does.
            # Let's determine whether there are common terms.
            Z = Set(self.bag[i]) & Set(self.bag[ii])
            if len(Z) > 0 and len(Xo_) > 0:
                Xo_ = Set(Xo_) - Z  # - list(Set(bag[i]) - Set(bag[ii]))
                Yo_ = Set(self.bag[ii]) - Z  # list(Set(bag[ii]) - Set(bag[i]))
                size = len(Xo_)
                g = NGram(Yo_)
                for term in Xo_:
                    xo = g.search(term)
                    if len(xo) > 0 and len(term) < 4:
                        xo = xo[0]
                    else:
                        continue
                    xo = list(xo)
                    xo_i = self.bag[i].index(term)
                    yo_i = self.bag[ii].index(xo[0])
                    # We have the pair, and we will compute the distance.
                    ratio = fuzz.ratio(term, xo[0]) / 100
                    is_subset = len(Set(term) & Set(xo[0])) == len(term)
                    if is_subset and len(term) < len(xo[0]) and ratio > 0.5 and xo_i == yo_i:
                        xo[1] = [ratio, xo_i]
                        if term not in self.info:
                            #xo[1] = ratio
                            self.info[term] = [term, xo[0]] + xo[1]
                        elif term in self.info and ratio > self.info[term][1]:
                            self.info[term] = [term, xo[0]] + xo[1]
                        imatches.append(ii)
                        break
    # At this point we consolidate all that has been learnt
    # and make it available to the outside world; otherwise the client should retrieve it.
    self.lock.acquire()
    if self.queue is not None:
        for term in self.info:
            value = ['thread # ', self.name] + list(self.info[term])
            self.queue.put(value)
    self.lock.release()
Example 13: open
# Required import: from ngram import NGram [as alias]
# Alternatively: from ngram.NGram import search [as alias]
"""
address_longlat = []
for address in location:
    g = geocoder.google(address)
    list_longlat = g.latlnga
    list_longlat.insert(0, address)
    address_longlat.append(list_longlat)
print address_longlat
"""

# Get long/lat for POI data using NGram.
# `location` is assumed to be defined earlier in the script.
with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file:
    reader = csv.reader(file)
    #reader.next()
    corpus = []
    for row in reader:
        corpus.append(row[0])
corpus_name = []
for word in corpus:
    corpus_name.append(word.split(';')[0])
address = []
G = NGram(corpus_name)
G_latlng = NGram(corpus)
for word in location:
    out = G.search(word)
    # The original called G_latlng.append(out[0][0]); NGram (a set subclass)
    # has no append() method, so search() -- which matches the indexing on
    # the next line -- is the likely intent.
    out1 = G_latlng.search(out[0][0])
    address.append(out1[0][0])