This article collects typical usage examples of the bigrams function from Python's nltk.util module. If you have been wondering what exactly bigrams does and how to use it, the curated code samples below may help.
The following shows 15 code examples of the bigrams function, sorted by popularity by default.
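As a quick orientation before the examples, here is a minimal sketch of what nltk.util.bigrams does: it takes any sequence and yields consecutive pairs. Note that in NLTK 3 it returns a generator, which is why many of the snippets below wrap it in list() or set() before reusing the result.

from nltk.util import bigrams

tokens = ['the', 'dog', 'runs']
print(list(bigrams(tokens)))  # [('the', 'dog'), ('dog', 'runs')]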
Example 1: wiki_to_feature
def wiki_to_feature(wiki):
    """
    Builds features for a single wiki document.
    :param wiki: dict of wiki fields
    :type wiki: dict
    :return: tuple of wiki id and list of feature strings
    :rtype: tuple
    """
    # bigrams comes from nltk.util, TextBlob from textblob, format_exc from traceback;
    # normalize is presumably a project-local tokenizer/normalizer helper (imports not shown)
    try:
        features = []
        bow = []
        features += [u'ORIGINAL_HUB:%s' % wiki.get(u'hub_s', u'')]
        features += [u'TOP_CAT:%s' % u'_'.join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        bow += [u"_".join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        features += [u'TOP_ART:%s' % u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        bow += [u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        desc_ngrams = [u"_".join(n) for grouping in
                       [bigrams(normalize(np))
                        for np in TextBlob(wiki.get(u'description_txt', [u''])[0]).noun_phrases]
                       for n in grouping]
        bow += desc_ngrams
        features += [u'DESC:%s' % d for d in desc_ngrams]
        bow += [u"_".join(b) for b in bigrams(normalize(wiki[u'sitename_txt'][0]))]
        mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
        bow += [u"_".join(bg) for grouping in [bigrams(normalize(n)) for n in mp_nps] for bg in grouping]
        bow += [u''.join(normalize(w)) for words in [np.split(u" ") for np in mp_nps] for w in words]
        return wiki[u'id'], bow + features
    except Exception as e:
        print(e, format_exc())
        raise
Example 2: getFeatures
def getFeatures(tokens, typefeat='unigrams'):
    if typefeat == 'unigrams':
        _features = FreqDist(tokens)
    elif typefeat == 'bigrams':
        _bigrams = bigrams(tokens)
        _features = FreqDist(_bigrams)
    elif typefeat == 'uni+bigrams':
        # bigrams() returns a generator in NLTK 3, so materialize it before concatenating
        _bigrams = list(bigrams(tokens))
        _features = FreqDist(_bigrams + tokens)
    return _features
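A hypothetical call, assuming the caller already has a plain list of tokens (the tokenizer itself is outside this snippet):

from nltk import FreqDist
from nltk.util import bigrams

tokens = ['i', 'love', 'my', 'dog', 'and', 'my', 'cat']
fd = getFeatures(tokens, typefeat='uni+bigrams')
print(fd.most_common(3))  # unigram and bigram counts mixed in one FreqDist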
Example 3: score_by_topic
def score_by_topic(pkg, scores):
    '''Examines the pkg and adds scores according to topics in it.'''
    themes = Themes.instance()
    for level in range(3):
        pkg_text = package_text(pkg, level)
        words, words_without_stopwords = normalize_text(pkg_text)
        for num_words in (1, 2, 3):
            if num_words == 1:
                ngrams = words_without_stopwords
                topic_ngrams = themes.topic_words
                topic_ngrams_set = themes.topic_words_set
            elif num_words == 2:
                # materialize the generator so it can be turned into a set and counted below
                ngrams = list(bigrams(words))
                topic_ngrams = themes.topic_bigrams
                topic_ngrams_set = themes.topic_bigrams_set
            elif num_words == 3:
                ngrams = list(trigrams(words))
                topic_ngrams = themes.topic_trigrams
                topic_ngrams_set = themes.topic_trigrams_set
            matching_ngrams = set(ngrams) & topic_ngrams_set
            if matching_ngrams:
                for ngram in matching_ngrams:
                    occurrences = ngrams.count(ngram)
                    score = (3 - level) * occurrences * num_words
                    theme = topic_ngrams[ngram]
                    ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram
                    reason = '"%s" matched %s' % (ngram_printable, LEVELS[level])
                    if occurrences > 1:
                        reason += ' (%s times)' % occurrences
                    scores[theme].append((score, reason))
                    log.debug(' %s %s %s', theme, score, reason)
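The list() calls around bigrams() and trigrams() above matter because NLTK 3 returns generators: a generator would be exhausted by the set() conversion, and it has no count() method for the occurrence tally. A small illustration of the pitfall:

from nltk.util import bigrams

gen = bigrams(['a', 'b', 'c'])
print(set(gen))   # {('a', 'b'), ('b', 'c')}
print(list(gen))  # [] -- the generator is already exhausted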
Example 4: aggregate_topics_of_segmented_reports
def aggregate_topics_of_segmented_reports(self, cut_of_segmented_reports, topics):
    aggregated_topics = []
    # wrap in list() so the bigram pairs can be indexed and measured with len()
    bigrams_of_topics = list(bigrams(map(lambda x: [x.decode('utf-8')], topics)))
    for i in range(len(bigrams_of_topics)):
        for j in range(len(cut_of_segmented_reports)):
            start = cut_of_segmented_reports[j].index(bigrams_of_topics[i][0])
            end = cut_of_segmented_reports[j].index(bigrams_of_topics[i][1])
            aggregated_topics.extend(cut_of_segmented_reports[j][start:end])
    return aggregated_topics
Example 5: autocorrect_query
def autocorrect_query(query, df, cutoff=0.8, warning_on=True):
    """
    Autocorrect a query based on the training set.
    """
    train_data = df.values[df['search_term'].values == query, :]
    s = ""
    for r in train_data:
        s = "%s %s %s" % (s,
                          BeautifulSoup(r[1]).get_text(" ", strip=True),
                          BeautifulSoup(r[2]).get_text(" ", strip=True))
    s = re.findall(r'[\'\"\w]+', s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []
    for q in query.lower().split():
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        if bool(re.search(r'\d', q)):  # skip words containing digits, like "4.5in"
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, s, n=1, cutoff=cutoff)
        if len(corrected_word) > 0:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print("WARNING: cannot find matched word for '%s' -> used the original word" % (q))
            corrected_query.append(q)
    return ' '.join(corrected_query)
Example 6: generate_unibitrigrams
def generate_unibitrigrams(key_score_file):
    with open(key_score_file, 'r') as infile:
        infile.readline()
        key_list = list()
        for line in infile:
            row = list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        unigrams_ls = []
        bigrams_ls = []
        trigrams_ls = []
        for word in nltk.word_tokenize(phrase):
            # strip punctuation characters
            word = re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@\[\]\^_`{|}~]', '', word)
            words.append(word)
        unigrams_ls = words
        #bigrams_ls=list(bigrams(words))
        for x in list(bigrams(words)):
            bigrams_ls.append(x[0] + ' ' + x[1])
        for x in list(trigrams(words)):
            trigrams_ls.append(x[0] + ' ' + x[1] + ' ' + x[2])
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
Example 7: gender_feature
def gender_feature(text, feature_vect):
    """
    Extract the gender features.
    :param text:
    :param feature_vect: contains a bag of words followed by a list of bigrams
    :return: a dictionary mapping each feature to its computed value
    """
    # sentence length and vocabulary features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])
    # bag-of-words features (first 29 entries of feature_vect)
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens
    # bigram features; build the set once, since bigrams() returns a generator
    token_bigrams = set(bigrams(tokens))
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in token_bigrams
    # POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)
    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)
    return dict(dict(d, **bigram_dict), **freq_POS)
Example 8: get_bigram
def get_bigram(text_list):
    # text_list is a list of strings
    new_list = []
    for i in range(len(text_list)):
        new_list.append(list(bigrams(text_list[i])))
    return new_list
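One thing to keep in mind with this helper: bigrams() iterates over whatever sequence it receives, so passing raw strings produces character bigrams. If word pairs are wanted, the strings should be tokenized first, e.g.:

from nltk.util import bigrams

print(list(bigrams("cat")))              # [('c', 'a'), ('a', 't')]
print(list(bigrams("the cat".split())))  # [('the', 'cat')]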
Example 9: BigramAll
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])
        fdist = FreqDist(str(w) for w in myBig)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example 10: generate_ds
def generate_ds(self, words):
    learning_info_dict = {lang: {w: float(t)
                                 for w, t in self._language_model_cfd[lang].most_common()}
                          for lang in self._language_model_cfd.keys()}
    testing_info_dict = {w: float(t)
                         for w, t in FreqDist([tpl for word in words for tpl in bigrams(word)]).most_common()}
    return learning_info_dict, testing_info_dict
Example 11: bigramsPhi
def bigramsPhi(comment):
    """The basis for a bigrams feature function.
    """
    sent = [stemmer.stem(tok) for tok in comment.split()]  # stemming; punctuation kept as-is
    unis = Counter()
    sent = ["<<START>>"] + sent + ["<<END>>"]
    unis.update(bigrams(sent))  # count the padded bigrams
    return unis
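A hypothetical call, assuming `stemmer` is bound to an NLTK PorterStemmer (under which "runs" stems to "run"):

from collections import Counter
from nltk.stem import PorterStemmer
from nltk.util import bigrams

stemmer = PorterStemmer()
print(bigramsPhi("the dog runs"))
# Counter({('<<START>>', 'the'): 1, ('the', 'dog'): 1, ('dog', 'run'): 1, ('run', '<<END>>'): 1})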
Example 12: perplexity
def perplexity(self, sentence, method):
    """
    Compute the perplexity of a sentence given an estimation method.
    You do not need to modify this code.
    """
    return 2.0 ** (-1.0 * mean([method(context, word) for context, word in
                                bigrams(self.tokenize_and_censor(sentence))]))
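This follows the usual definition of bigram-model perplexity, PP = 2^(-(1/N) * Σ log2 p(w_i | w_{i-1})), so `method` is presumably expected to return a log2 probability for each (context, word) pair produced by bigrams().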
Example 13: bigram_format
def bigram_format(test_corpus):
    """
    >>> bigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')], [('the', 'cat'), ('cat', 'walks'), ('walks', 'STOP')], [('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')]]
    """
    wl = [[word for word in sentence.split()] for sentence in test_corpus]
    # materialize each generator so the result matches the doctest above
    return [list(util.bigrams(l)) for l in wl]
Example 14: get_ngram_tokens
def get_ngram_tokens(self, line):
    tokens = nltk.wordpunct_tokenize(line)
    message = [self.stemmer.stem(x) for x in tokens if len(x) > 2 and x not in self.stops]
    # materialize the bigrams first: appending to `message` while iterating a
    # generator over it would never terminate
    bigram = list(bigrams(message))
    for pair in bigram:
        joined = " ".join(pair)
        message.append(joined)
    return list(set(message))
Example 15: sentProbaility
def sentProbaility(self, sent, smooth_const):
    V = 217847  # vocabulary size used for smoothing
    tool = MyToolKit()
    bigrs = bigrams(tool.words(sent))
    p = 1
    for w1, w2 in bigrs:
        # multiply the smoothed bigram probabilities in log space
        p = math.exp(math.log(p) + math.log(self.LaplaceSmoothing(w2, w1, smooth_const, V)))
        #p = math.exp(math.log(p) + math.log(self.AbsoluteDiscountingSmoothing(w2, w1, smooth_const, V)))
    return p
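The LaplaceSmoothing method itself is not part of this snippet; as a rough, hypothetical sketch of the usual add-k estimate it presumably implements (names and counters here are placeholders, not the author's API):

def laplace_smoothing(word, context, k, V, bigram_counts, unigram_counts):
    # add-k estimate: P(word | context) = (c(context, word) + k) / (c(context) + k * V)
    return (bigram_counts.get((context, word), 0) + k) / (unigram_counts.get(context, 0) + k * V)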