This article collects typical usage examples of the Python function nltk.util.ngrams. If you are unsure what ngrams does or how to call it, the curated code samples below may help.
Fifteen code examples of the ngrams function are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the site recommend better Python code samples.
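Before the examples, here is a minimal sketch of how nltk.util.ngrams is typically imported and called (the sample sentence is only illustrative):

from nltk.util import ngrams

tokens = "the quick brown fox".split()
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]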
Example 1: modified_precision

from nltk.util import ngrams

def modified_precision(candidate, references, n):
    # Collect the candidate's n-grams.
    candidate_ngrams = []
    candidate_n = ngrams(candidate, n)
    for x in candidate_n:
        candidate_ngrams.append(x)
    if len(candidate_ngrams) == 0:
        return 0
    c_words = set(candidate_ngrams)
    for word in c_words:
        # Add-one smoothed count of this n-gram in the candidate.
        count_w = candidate_ngrams.count(word) + 1
        count_max = 0
        # Maximum (smoothed) count of the same n-gram over all references.
        for reference in references:
            reference_ngrams = []
            reference_n = ngrams(reference, n)
            for x in reference_n:
                reference_ngrams.append(x)
            count = reference_ngrams.count(word) + 1
            if count > count_max:
                count_max = count
    return min(count_w, count_max) / (len(candidate) + len(c_words))
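A hypothetical call with tokenized sentences, just to show the expected inputs (note that this example uses its own add-one smoothed variant rather than standard BLEU modified precision):

cand = ['the', 'cat', 'sat', 'on', 'the', 'mat']
refs = [['the', 'cat', 'is', 'on', 'the', 'mat']]
print(modified_precision(cand, refs, 2))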
Example 2: getTrainData

import numpy as np
from nltk.util import ngrams

def getTrainData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]
        # Union of the n-grams of the two sentences.
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        for ng in all_ngrams:
            datap.append([ng, label])
    f.close()
    X = np.zeros((len(datap), ngramsize, embedsize))
    Y = np.zeros((len(datap), 3))
    wildcard = np.array([0.0] * embedsize)
    for i in range(len(datap)):
        item = datap[i]
        ngram = item[0]
        label = item[1]
        # getEmbedVectors and getLabels are helper functions defined elsewhere in the source file.
        vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
        labels = getLabels(label)
        X[i] = vectors
        Y[i] = labels
    return X, Y
Example 3: str_common_grams

from nltk.util import ngrams

def str_common_grams(str1, str2, length=3):
    '''Return how many times the character n-grams (of the given length) of str1
    appear in str2.
    '''
    grams1 = list(ngrams(str1, length))
    grams2 = list(ngrams(str2, length))
    return sum(grams2.count(gram) for gram in grams1)
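A quick, hypothetical call: it counts how often character trigrams of the first string occur in the second, including repeats:

print(str_common_grams('sheet metal', 'metal sheets'))  # 6 shared trigram occurrences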
Example 4: getTestData

import numpy as np
from nltk.util import ngrams

def getTestData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]
        # Union of the n-grams of the two sentences.
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        datap.append(list(all_ngrams))
    f.close()
    Xs = []
    wildcard = np.array([0.0] * embedsize)
    for ngs in datap:
        X = np.zeros((len(ngs), ngramsize, embedsize))
        for i in range(len(ngs)):
            ngram = ngs[i]
            # getEmbedVectors is a helper function defined elsewhere in the source file.
            vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
            X[i] = vectors
        Xs.append(X)
    return Xs
Example 5: extract_terms_features

from nltk.util import ngrams

def extract_terms_features(terms, separateGrams=False):
    # Drop empty tokens.
    while '' in terms:
        terms.remove('')
    # Build joined bigram and trigram strings from the term list.
    g2 = ngrams(terms, 2)
    g3 = ngrams(terms, 3)
    g2j = [' '.join(gterms) for gterms in g2]
    g3j = [' '.join(gterms) for gterms in g3]
    vec1 = {}
    vec2 = {}
    vec3 = {}
    for t in terms:
        if t not in vec1:
            vec1[t] = 1
        else:
            vec1[t] += 1
    for t in g2j:
        if t not in vec2:
            vec2[t] = 1
        else:
            vec2[t] += 1
    for t in g3j:
        if t not in vec3:
            vec3[t] = 1
        else:
            vec3[t] += 1
    # Merge the three count dictionaries into a single feature vector.
    vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))
    if separateGrams:
        return (vector, vec1, vec2, vec3)
    else:
        return vector
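A hypothetical call on a pre-tokenized term list; the returned dictionary maps each unigram, joined bigram, and joined trigram to its count:

print(extract_terms_features(['machine', 'learning', 'for', 'text']))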
Example 6: format_text

import os
import base64
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams

def format_text(entries, LSTM_shape=True):
    THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Decode the base64 payload and strip the b'...' wrapper from its string repr.
    decoded = base64.b64decode(entries)
    decoded = str(decoded)
    decoded = decoded[2:]
    decoded = decoded[:-1]
    decoded = decoded.split(".")
    for entry in decoded:
        token_sentences = tokenizer.tokenize(entry)
        for sentence in token_sentences:
            sentences.append(sentence)
    # Tokenize each sentence and drop punctuation and stop words.
    tokenized_sentences = []
    remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stop_words = set(stopwords.words('english'))
    tweet_tknzr = TweetTokenizer()
    for sentence in sentences:
        tokens = tweet_tknzr.tokenize(sentence)
        tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
        tokenized_sentences.append(tokens)
    # Load the n-gram-to-index dictionaries built by ngrams.py.
    # Once the model gets updated with good data, ngrams.py needs to be updated too.
    all_ngrams1 = np.load(THIS_FOLDER + '/ngrams1.npy', allow_pickle=True).item()
    all_ngrams2 = np.load(THIS_FOLDER + '/ngrams2.npy', allow_pickle=True).item()
    all_ngrams3 = np.load(THIS_FOLDER + '/ngrams3.npy', allow_pickle=True).item()
    # Binary indicator features for unigrams, bigrams and trigrams.
    X = np.zeros((len(sentences), len(all_ngrams1) + len(all_ngrams2) + len(all_ngrams3)))
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 1)
        for gram in my_ngrams:
            if gram in all_ngrams1:
                index = all_ngrams1[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 2)
        for gram in my_ngrams:
            if gram in all_ngrams2:
                index = len(all_ngrams1) + all_ngrams2[gram]
                X[i][index] = 1
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        my_ngrams = ngrams(sentence, 3)
        for gram in my_ngrams:
            if gram in all_ngrams3:
                index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
                X[i][index] = 1
    if LSTM_shape:
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
    else:
        X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X
Example 7: getNgramProbs

from nltk.util import ngrams

def getNgramProbs(file):
    f = open(file, 'r')
    unigramList = []
    for line in f.read().split():
        unigramList.append(line)
    bigramList = ngrams(unigramList, 2)
    trigramList = ngrams(unigramList, 3)
    # Count dictionaries for unigrams, bigrams and trigrams.
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()
    # Counts for unigrams.
    countUni = 0
    for item in unigramList:
        countUni += 1
        if item not in unigramDict:
            unigramDict[item] = 1
        else:
            unigramDict[item] += 1
    # Counts for bigrams.
    for item in bigramList:
        if item not in bigramDict:
            bigramDict[item] = 1
        else:
            bigramDict[item] += 1
    # Counts for trigrams.
    for item in trigramList:
        if item not in trigramDict:
            trigramDict[item] = 1
        else:
            trigramDict[item] += 1
    # Probabilities: trigrams conditioned on their bigram prefix.
    for key in trigramDict:
        trigramDict[key] /= float(bigramDict[(key[0], key[1])])
    # Probabilities: bigrams conditioned on their unigram prefix.
    for key in bigramDict:
        bigramDict[key] /= float(unigramDict[key[0]])
    # Probabilities: unigrams relative to the total token count.
    for key in unigramDict:
        unigramDict[key] /= float(countUni)
    return [unigramDict, bigramDict, trigramDict]
Example 8: scoreScopeOverlap

import logging
from nltk.util import ngrams

def scoreScopeOverlap(self, scopeHyp, scopeRef):
    totalScore = 0
    for scope_h in scopeHyp:
        bestScore = 0
        for scope_r in scopeRef:
            if scope_r == [] or scope_h == []:
                partialScore = 0
                if partialScore > bestScore:
                    bestScore = partialScore
            else:
                # Weight each n-gram order so that the weights sum to 1.
                ngram_range = range(1, len(scope_h) + 1)
                logging.info("ngram_range")
                logging.info(ngram_range)
                score_weights = [round(x / sum(ngram_range), 4) for x in ngram_range]
                logging.info(score_weights)
                partialScore = 0.0
                for i in ngram_range:
                    hyp = ngrams(scope_h, i)
                    ref = ngrams(scope_r, i)
                    # Weighted count of n-grams shared by hypothesis and reference scope.
                    partialScore += len(set(hyp).intersection(set(ref))) * score_weights[i - 1]
                logging.info("partialScore")
                logging.info(partialScore)
                if partialScore > bestScore:
                    bestScore = partialScore
        totalScore += bestScore
        logging.info("totalScore")
        logging.info(totalScore)
    return totalScore
Example 9: create_candidate_list

import nltk
from nltk.util import ngrams

def create_candidate_list(sentence):
    # create_candidates_lists and ENGLISH_STOPWORDS are defined elsewhere in the source file.
    tokens = nltk.tokenize.word_tokenize(sentence)
    candidates_lists = create_candidates_lists(tokens)
    # Create list of 1-grams.
    candidates = []
    for l in candidates_lists:
        candidates += l
    # Remove irrelevant stop words from the 1-grams.
    res = [token for token in candidates
           if token not in ENGLISH_STOPWORDS]
    # Create list of bigrams.
    bigrams = []
    for l in candidates_lists:
        bigrams += ngrams(l, 2)
    # Create list of trigrams.
    trigrams = []
    for l in candidates_lists:
        trigrams += ngrams(l, 3)
    # Create list of 4-grams.
    fourgrams = []
    for l in candidates_lists:
        fourgrams += ngrams(l, 4)
    # Join multi-word candidates back into strings.
    res += [' '.join(a) for a in bigrams]
    res += [' '.join(a) for a in trigrams]
    res += [' '.join(a) for a in fourgrams]
    return res
Example 10: calc_ngram

import nltk
from nltk.util import ngrams

def calc_ngram(htokens, etokens):
    features = []
    # n-gram precision, recall and F1 for n = 1..4.
    for n in range(1, 5):
        hgrams = nltk.FreqDist(ngrams(htokens, n))
        egrams = nltk.FreqDist(ngrams(etokens, n))
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        # calc_f1 is a helper function defined elsewhere in the source file.
        features.append(calc_f1(prec, recall))
    return features
Example 11: rouge_s

from nltk.util import ngrams, skipgrams

def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):
    # tokenizer and the Rouge helper class are defined elsewhere in the source file.
    rouge_s_list = []
    k_c = len(candidate) if d_skip is None else d_skip
    cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate),
                                    n=2, k=k_c))
    for ref in references:
        k_ref = len(ref) if d_skip is None else d_skip
        ref_skip_list = list(skipgrams(tokenizer.tokenize(ref),
                                       n=2, k=k_ref))
        # Count candidate skip-bigrams that also occur in the reference.
        count = 0
        for bigram in cand_skip_list:
            if bigram in ref_skip_list:
                count = count + 1
        if not smoothing:
            r_skip = count / len(ref_skip_list)
            p_skip = count / len(cand_skip_list)
        else:
            # Unigram smoothing: also count shared unigrams.
            cand_ungm = list(ngrams(tokenizer.tokenize(candidate),
                                    n=1))
            ref_ungm = list(ngrams(tokenizer.tokenize(ref),
                                   n=1))
            for ungm in cand_ungm:
                if ungm in ref_ungm:
                    count += 1
            r_skip = count / (len(ref_skip_list) + len(ref_ungm))
            p_skip = count / (len(cand_skip_list) + len(cand_ungm))
        score = Rouge.get_score(r_skip, p_skip, beta)
        rouge_s_list.append(score)
    return Rouge.jacknifing(rouge_s_list, averaging=averaging)
Example 12: char_ngram_similarity

from collections import Counter
import numpy as np
from nltk.util import ngrams

def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Give a positive dissimilarity score for two documents with respect to the distribution of their
    `top` most frequent character n-grams. A value of 0 means the documents are identical (or at
    least share an identical top character n-gram distribution).
    :param doc1: first document (string)
    :param doc2: second document (string)
    :param n: the n-gram length
    :param top: only use the `top` most frequent n-grams from each document
    :return: a positive dissimilarity score; 0 means identical top n-gram distributions
    """
    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))
    profile1 = [g[0] for g in ngrams1.most_common(top)]
    profile2 = [g[0] for g in ngrams2.most_common(top)]
    # Normalise the two n-gram distributions to relative frequencies.
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1
    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2
    # Accumulate the global dissimilarity score over the union of both profiles.
    score = 0
    for gram in set(profile1 + profile2):
        f1 = ngrams1[gram]
        f2 = ngrams2[gram]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score
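A hypothetical comparison of two short strings, just to show the call shape (real use would pass longer documents); 0 means identical profiles and larger values mean more dissimilar:

print(char_ngram_similarity('the quick brown fox', 'the quick brown dog', n=2))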
Example 13: jaccardIdx

from nltk.util import ngrams

def jaccardIdx(w1, w2):
    # Character-bigram sets of the two words.
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))
    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)
    # Jaccard distance: 1 - |intersection| / |union|.
    return 1.0 - float(len(intersect)) / float(len(union))
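A quick, hypothetical call: 'night' and 'nacht' share only the character bigram ('h', 't'), so the distance is 1 - 1/7:

print(jaccardIdx('night', 'nacht'))  # ≈ 0.857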
Example 14: count_word

from nltk.tokenize import word_tokenize
from nltk.util import ngrams

def count_word(self, doc, unigram=True, bigram=False, binary=False):
    # remove_non_ascii and count_word_sub are methods defined elsewhere on this class.
    tokens = word_tokenize(self.remove_non_ascii(doc))
    doc_voc = {}
    if unigram:
        uni = ngrams(tokens, 1)
        self.count_word_sub(doc_voc, uni, binary)
    if bigram:
        bi = ngrams(tokens, 2)
        self.count_word_sub(doc_voc, bi, binary)
Example 15: trainModel

from nltk.util import ngrams

def trainModel(self, listOfFilenames):
    # Dictionaries of unigram, bigram and trigram counts.
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()
    # Total counts of unigrams, bigrams and trigrams.
    countUni = 0
    countBi = 0
    countTri = 0
    i = 1
    # Iterate over the list of files.
    for fileName in listOfFilenames:
        print("Reading", i)
        i += 1
        # STagger is a tagger class defined elsewhere in the source file.
        stag = STagger(fileName)
        stag.find_unigrams(True, False)
        for item in stag.unigrams:
            countUni += 1
            if item not in unigramDict:
                unigramDict[item] = 1
            else:
                unigramDict[item] += 1
        codeBigrams = ngrams(stag.unigrams, 2)
        codeTrigrams = ngrams(stag.unigrams, 3)
        for item in codeBigrams:
            countBi += 1
            if item not in bigramDict:
                bigramDict[item] = 1
            else:
                bigramDict[item] += 1
        for item in codeTrigrams:
            countTri += 1
            if item not in trigramDict:
                trigramDict[item] = 1
            else:
                trigramDict[item] += 1
    # Write the n-gram counts to a file.
    outputFile = open('corpus.txt', 'w')
    outputFile.write(str(countUni) + "\n")
    for key, x in unigramDict.items():
        outputFile.write(str(key) + " " + str(x) + "\n")
    outputFile.write(str(countBi) + "\n")
    for key, x in bigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(x) + "\n")
    outputFile.write(str(countTri) + "\n")
    for key, x in trigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")
    outputFile.close()