This article collects typical usage examples of the ngram.NGram class in Python. If you are wondering what the Python NGram class does, how to use it, or want to see it in action, the curated examples here should help.
Below are 15 code examples of the NGram class, ordered by popularity.
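Most of the examples use NGram from the `ngram` package (python-ngram), which indexes a collection of items by their character n-grams and supports fuzzy search against that index; a couple of the later examples (12 and 13) use project-specific NGram language-model classes that happen to share the name. As a minimal sketch of the core API, assuming the package is installed (pip install ngram):

from ngram import NGram

# Build an index over a few strings (character trigrams by default, N=3).
index = NGram(['spam', 'spain', 'eggs'])

# Fuzzy search returns (item, similarity) pairs, best match first.
print(index.search('spa'))

# Static pairwise similarity between two strings, no index needed.
print(NGram.compare('spam', 'spain'))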
Example 1: build_multiclusters

def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {iline: -1}
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum(y for x, y in results) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
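The function expects each input line to hold fields separated by " :::: " and groups lines whose first three fields score above the threshold. A hypothetical invocation (the field values are made up for illustration; the clustering you get depends on the threshold):

lines = [
    "Smith, J. :::: 2001 :::: Deep parsing :::: rest-of-record",
    "Smith, John :::: 2001 :::: Deep parsing :::: rest-of-record",
]
for cluster in build_multiclusters(lines, threshold=0.05):
    print(list(cluster))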
Example 2: filterByOp

def filterByOp(self, clone):
    opStr1 = ""
    opStr2 = ""
    indx1, start1, end1 = clone[1]
    indx2, start2, end2 = clone[2]
    for i in range(start1, end1 + 1):
        opStr1 += str(self.op1_hash.get(i, -1))
    for i in range(start2, end2 + 1):
        opStr2 += str(self.op2_hash.get(i, -1))
    if config.DEBUG:
        print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
        print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))
    # Keep the clone pair only if both operation strings have changed.
    if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
        return None
    idx = NGram(N=config.NGRAM)
    ngram1 = list(idx.ngrams(opStr1))
    ngram2 = list(idx.ngrams(opStr2))
    metric = self.compareList(ngram1, ngram2)
    return metric
Example 3: map

def map(self, phrase):
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = set(term) & set(word)
            if len(z) > 0 and len(z) < len(term):
                g = NGram(z - set(term))
            else:
                # At this point we assume context is not informative,
                # so we resort to a fuzzy lookup.
                g = NGram(word)
            # Drop the term itself so it cannot match itself.
            g.remove(term)
            matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = [matches[0], Pz_, Px_, 1]
            self.emit(key, value)
Example 4: select_translation

def select_translation(sentence, idx, word, translations):
    # Make sure a subject pronoun is rendered in subject form.
    # Heuristic: if it is the first word, or the previous word is
    # punctuation or a conjunction, treat it as a subject.
    if word[1] == 'r' and word[0] in subject_pronoun:
        if idx == 0 or sentence[idx-1][1] in ['x', 'c']:
            return (subject_pronoun[word[0]], 'pron')
    # Handle the special case: <digits>/m 日/m
    if word[1] == 'm':
        if DIGITS_PATTERN.match(word[0]):
            if idx + 1 < len(sentence) and sentence[idx+1][0] == u'日':
                # Return a proper date string.
                return (translate_date(int(word[0])), 'n')
            else:
                # Return the digits directly.
                return (word[0], 'n')
        elif word[0] == u'日':
            # Symmetric case: the date was handled with the preceding digits.
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx-1][0]):
                return ('', '')
    # Construct a list of translations with the same POS as word.
    same_pos_translations = [t for t in translations
                             if match_pos(word[1], t[1])]
    ng = NGram()
    if len(same_pos_translations) > 0:
        # ng.get(t[0]) presumably returns a unigram score for the candidate.
        max_unigram_trans = max(same_pos_translations, key=lambda t: ng.get(t[0]))
        return max_unigram_trans
    return translations[0]
Example 5: main

def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
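This example (and Example 9 below) relies on a lowstrip key function that is not shown here. A plausible definition, in the spirit of the python-ngram tutorial, lowercases the field and normalises its whitespace before indexing:

import re

def lowstrip(term):
    # Assumed helper: collapse whitespace and lowercase for matching.
    term = re.sub(r'\s+', ' ', term)
    return term.lower()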
Example 6: simtitle

def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=WARP, iconv=enrich, key=lambda x: x.title)
    articles = Article.objects.filter(status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={"article_list": results})
Example 7: test

def test():
    op_filter = opFilter()
    opStr1 = "nnn+"
    opStr2 = "nn+"
    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))
    print(op_filter.compareList(l1, l2))
Example 8: ngram_similarity

def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', str(row))  # collapse repeated spaces
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
Example 9: main

def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> _ = open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')
    >>> _ = open('right.csv', 'w').write('''ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example 10: get_distr

def get_distr(strlist, n_len):
    alphabet = ['A', 'C', 'G', 'T', 'N']
    n = NGram(N=n_len, pad_len=0)
    all_ngrams = 0
    grams = init_grams_dict(n_len, alphabet)
    for item in strlist:
        if item == '':
            continue
        # _split is python-ngram's private generator of a string's n-grams.
        ngram_list = list(n._split(item))
        for ng in ngram_list:
            if ng in grams:
                grams[ng] += 1.0
            all_ngrams += 1
    for item in grams:
        grams[item] /= all_ngrams
    return grams
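get_distr depends on an init_grams_dict helper that is not shown. A minimal sketch consistent with its use above, assuming it maps every possible n-gram over the alphabet to a zero count:

from itertools import product

def init_grams_dict(n_len, alphabet):
    # Assumed behaviour: one zero-count entry per possible n-gram.
    return {''.join(gram): 0.0 for gram in product(alphabet, repeat=n_len)}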
Example 11: verify

def verify(self, text_compare):
    results = []
    dictio = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        if linea2 != '\n':
            dictio += [self.ng.items_sharing_ngrams(linea2)]
            compares = 0.0
            for parrafo in self.lsn:
                comp = NGram.compare(parrafo, linea2)
                if compares < comp:
                    compares = comp
            results += [compares]
        linea2 = file2.readline()
    file2.close()
    major_ocurrences = []
    for d in dictio:
        major = 0
        for val in d.values():
            if major < val:
                major = val
        major_ocurrences += [major]
    avg_perc = 0.0
    for r in results:
        avg_perc += r
    avg_perc = avg_perc / len(results)
    print("Highest number of shared n-grams per paragraph of the copied text: " + repr(major_ocurrences))
    print("Similarity percentage: " + repr(avg_perc))
Example 12: main

def main():
    questions_path, answers_path = sys.argv[1:]
    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)
    print('\nTraining on Corpus')
    # Note: this NGram is a project-specific n-gram language model
    # (train_model, sentences_probabilities, perplexity), not the
    # similarity index from the `ngram` package.
    model = NGram.train_model(train_sentences, disp=True)
    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True,
                                token_start_end=('<s>', '</s>'))
    dev_sentences = answers[:520]
    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)
    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True,
                                  token_start_end=('<s>', '</s>'))
    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)
    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            out_file.write('{}\t{}\n'.format(
                ' '.join(questions[i]).replace('<s0> <s1>', '<s>'), perplexity))
Example 13: test_count_1gram

def test_count_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(ngram.count(gram), c)
Example 14: compare_ngrams

def compare_ngrams(left, right, N=2, pad_len=0):
    # `ascii` here is assumed to be a local transliteration helper,
    # not the Python builtin of the same name.
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for a single-letter comparison,
        # even if the letters are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
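A quick illustration of the single-character guard (assuming, as noted in the comment above, that ascii is the module's own transliteration helper):

print(compare_ngrams('a', 'a'))           # 1.0, where NGram.compare alone gives 0.0
print(compare_ngrams('color', 'colour'))  # bigram similarity strictly between 0 and 1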
Example 15: test_ngram_search

def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)