This article collects typical usage examples of the Python nltk.ngrams function. If you are wondering what exactly nltk.ngrams does and how to use it, the curated code samples below may help.
The following presents 15 code examples of the ngrams function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
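Before diving into the examples, here is a minimal sketch of the basic call, shown purely for orientation (the sample sentence and variable names are illustrative, not taken from any of the projects below). Note that nltk.ngrams returns a generator, so it is usually wrapped in list():

from nltk import ngrams

tokens = "the quick brown fox jumps".split()
# ngrams() yields n-length tuples over the token sequence; list() materializes them
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps')]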
Example 1: ngrams
def ngrams(self, ns=[2, 3, 5]):
    # Join each token with its POS tag into "token/POS" strings
    _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
    for n in ns:
        # Binary n-gram features over tokens and over token/POS pairs
        ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
        ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
        self.features.update(ngf)
        self.features.update(ngfp)
Example 2: update_freqs
def update_freqs(self, doc_text, id_str):
    # Count bigrams/trigrams and remember which document ids they occur in
    for bigram in list(ngrams(doc_text, 2)):
        k = bigram[0] + u"_" + bigram[1]
        self.bicount.update([k])
        self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
    for trigram in list(ngrams(doc_text, 3)):
        k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
        self.tricount.update([k])
        self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]
Example 3: get_gram_ratio
def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    # iter_product is presumably itertools.product imported under an alias
    pairs = list(iter_product(t1, t2, repeat=1))
    # Average pairwise similarity over the cross product of n-grams
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)
Example 4: ngrams_extract
def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    l = list
    # Character n-grams of length 2-5 over the input string
    grams = l(ngrams(string, 2)) + l(ngrams(string, 3)) + l(ngrams(string, 4)) + l(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))
    # Hash each n-gram into a fixed-size bag-of-n-grams vector
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)  # zeros/log come from numpy
Example 5: build_ngram
def build_ngram(source):
    ngram_set = {}
    for key, value in source.items():
        ngram = []
        for line in value:
            if IS_PAD:
                # pad_symbol is the parameter name in older NLTK releases;
                # newer ones use left_pad_symbol/right_pad_symbol
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL, pad_left=True, pad_right=True, pad_symbol='SSS'))
            else:
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
        ngram_set[key] = ngram
    return ngram_set
Example 6: read_data
def read_data(type):
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)
            meta = read_file(metapath)
            if text is not None:
                count += 1
                # print(count)
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])
                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))
                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))
                # data[c][i]['dependencies'] = dependency_parse(text[0])
                # deppath = filename + '.dep'
                # with open(deppath, 'w') as f:
                #     json.dump(data[c][i]['dependencies'], f)
                # with open(deppath, 'r') as f:
                #     data[c][i]['dependencies'] = json.load(f)
                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])
        data[c]['sequences'] = gen_sequences(data[c])
    data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
    data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
    # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)
    # pprint.pprint(data['unigram_model'])
    # pprint.pprint(data['bigram_model'])
    # pprint.pprint(data['dependencies'])
    # print(type, count)
    return data
Example 7: lookup_phrases
def lookup_phrases(sentence, noun_types, ignore_case=False):
    # nltk.ngrams returns a generator, so materialize each length before concatenating
    phrases = list(ngrams(sentence, 3)) + list(ngrams(sentence, 2)) + list(ngrams(sentence, 1))
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)
Example 8: extract_ngrams
def extract_ngrams(self, memes):
    for meme_type in memes:
        for meme in memes[meme_type]:
            top_unigrams = meme[0]
            bottom_unigrams = meme[1]
            all_unigrams = top_unigrams + bottom_unigrams
            # Wrap in list() so the bigram generators can be concatenated
            top_bigrams = list(ngrams(meme[0], 2))
            bottom_bigrams = list(ngrams(meme[1], 2))
            all_bigrams = top_bigrams + bottom_bigrams
            # NOTE: `key` is not defined in this snippet; in the original code it is
            # presumably the meme identifier (e.g. meme_type)
            self.add_ngrams(key, top_unigrams, bottom_unigrams, all_unigrams, top_bigrams, bottom_bigrams, all_bigrams)
Example 9: get_gram_ratio
def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    arr = np.ndarray((w, h), np.float32)
    arr.fill(0)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except:
                # Out-of-vocabulary words raise; leave the cell at 0
                pass
    return arr
Example 10: generate_location_vector
def generate_location_vector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for doc_word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                    # NOTE: this punctuation literal was mangled by e-mail obfuscation on the
                    # source page; the original presumably listed plain punctuation characters
                    # any change in line below should be replicated in corpus.py also
                    symbols = ".,[]();:<>+=&+%[email protected]#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs both strings the same length
                    replace = maketrans(symbols, whitespace)
                    doc_word = word_location[1].translate(replace)
                    doc_word = doc_word.lstrip()
                    doc_word = doc_word.rstrip()
                    if len(doc_word) > 1 and not len(doc_word) > 16:
                        self.doc_words.append(doc_word)
                # bigrams/trigrams/ngrams return generators in recent NLTK, so wrap in list()
                doc_bigrams = list(bigrams(words))
                if not len(doc_bigrams) < 1:
                    doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                    for bi_gram in doc_bigrams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                doc_trigrams = list(trigrams(words))
                if not len(doc_trigrams) < 1:
                    doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                    for tri_gram in doc_trigrams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                doc_fourgrams = list(ngrams(words, 4))
                if not len(doc_fourgrams) < 1:
                    doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                    for four_gram in doc_fourgrams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                doc_fivegrams = list(ngrams(words, 5))
                if not len(doc_fivegrams) < 1:
                    doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                    for five_gram in doc_fivegrams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
    else:
        for subtree in range(0, len(branch)):
            LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Example 11: get_top_ngrams_tfidf
def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    bigs = list(nltk.ngrams(text, NGRAM))
    print('totally', len(bigs), 'bigrams')
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    # In older NLTK, FreqDist.keys() was frequency-sorted; most_common() is the modern equivalent
    topwords = [w for w, _ in freqdist.most_common(cutoff)]
    # print(len(topwords), 'topwords:', topwords[:30], freqdist[topwords[0]], freqdist[topwords[1]])
    from math import log
    if True:  # do_tfidf
        df = {}
        df_les = {}
        df_time = {}
        tfidf = {}
        # Document frequencies: overall, per les_id, and per date prefix of time_id
        for doc_id, text in docs.items():
            words = [w for w in nltk.ngrams(text, NGRAM)]
            les_id, time_id = doc_id.split(':')
            time_id = time_id.replace('.csv', '')
            time_id = time_id[0:8]
            for w in words:
                df.setdefault(w, set())
                df[w].add(doc_id)
                df_les.setdefault(w, set())
                df_les[w].add(les_id)
                df_time.setdefault(w, set())
                df_time[w].add(time_id)
        _cutoff = 10000
        _topwords = [w for w, _ in freqdist.most_common(_cutoff)]
        df0, df1, df2 = {}, {}, {}
        for w in _topwords:
            # print(w)
            try: df0[w] = len(df[w])
            except: df0[w] = 0
            try: df1[w] = len(df_les[w])
            except: df1[w] = 0
            try: df2[w] = len(df_time[w])
            except: df2[w] = 0
            tfidf[w] = freqdist[w] / (1 + df0[w])
        # print(df0)
        # get sorted words in decreasing order of tf-idf values
        sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)
        sortedwords = sortedwords[:cutoff]
        topwords = [w for w, s in sortedwords]
        sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
        sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
        sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
        print('TF-IDF topwords:')
        print(len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]])
        print(sortedwords0[:30])
        print(sortedwords1[:30])
        print(sortedwords2[:30])
        return topwords, freqdist, df0, df1, df2
    return topwords, freqdist
Example 12: __call__
def __call__(self, words):
    grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
    # Keep the n-grams recognized as collocations, with their position and length
    positives = [
        (i, len(gram), gram) for i, gram in enumerate(grams)
        if self.colls[len(gram)][gram]
    ]
    if not positives:
        return words
    positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
    matches, covered = self.__non_overlapping(positives)
    unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
    catted = sorted(matches + unigrams)
    # zip() returns an iterator in Python 3, so materialize it before indexing
    return list(zip(*catted))[1]
Example 13: generateLocationVector
def generateLocationVector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                #sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                    # NOTE: this punctuation literal was mangled by e-mail obfuscation on the
                    # source page; the original presumably listed plain punctuation characters
                    symbols = ",[]();:<>+=&+%[email protected]#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs both strings the same length
                    replace = maketrans(symbols, whitespace)
                    spec_word = word_location[1].translate(replace)
                    spec_word = spec_word.lstrip()
                    spec_word = spec_word.rstrip()
                    if len(spec_word) > 1 and not len(spec_word) > 16:
                        self.spec_words.append(spec_word)
                # bigrams/trigrams/ngrams return generators in recent NLTK, so wrap in list()
                bi_grams = list(bigrams(words))
                if not len(bi_grams) < 1:
                    for bi_gram in bi_grams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                tri_grams = list(trigrams(words))
                if not len(tri_grams) < 1:
                    for tri_gram in tri_grams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                four_grams = list(ngrams(words, 4))
                if not len(four_grams) < 1:
                    for four_gram in four_grams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                five_grams = list(ngrams(words, 5))
                if not len(five_grams) < 1:
                    for five_gram in five_grams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
    else:
        for subtree in range(0, len(branch)):
            Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))
Example 14: __init__
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
    split_text = text.split()
    if len(split_text) < shingle_length:
        raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))
    self.minhash = []
    self.shingles = ngrams(split_text, shingle_length)
    # One minhash value per random seed: the smallest mmh3 hash over all shingles
    for hash_seed in generate_random_seeds(minhash_size, random_seed):
        min_value = float('inf')
        for shingle in ngrams(split_text, shingle_length):
            value = mmh3.hash(' '.join(shingle), hash_seed)
            min_value = min(min_value, value)
        self.minhash.append(min_value)
Example 15: train
def train(self, words, tagged=False):
    if tagged is True:
        # Input is already a list of (word, tag) pairs; keep only the tags
        tags = []
        for i in range(len(words)):
            tags.append(words[i][1])
        self.ngrams = list(nltk.ngrams(tags, self.n))
    else:
        # text = nltk.word_tokenize(words)
        tagged_words = nltk.pos_tag(words)
        universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
        self.ngrams = list(nltk.ngrams(universal_tags, self.n))
    self.frequencies = nltk.FreqDist(self.ngrams)
    self.probs_ng = nltk.MLEProbDist(self.frequencies)
    print(self.probs_ng)