This page collects typical usage examples of the Python method nltk.probability.FreqDist.iteritems. If you are wondering what FreqDist.iteritems does, how to call it, or where to find working code, the curated examples below should help. You can also read further about the containing class, nltk.probability.FreqDist.
The following 15 code examples of FreqDist.iteritems are ordered by popularity.
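Before the examples, a minimal sketch of the method itself (an assumption-laden illustration: it targets NLTK 2.x on Python 2, where FreqDist is dict-backed and exposes iteritems; on NLTK 3 / Python 3 the same loop is written with items() or most_common()):

from nltk.probability import FreqDist

fd = FreqDist(['a', 'b', 'a', 'c', 'a'])
for sample, count in fd.iteritems():
    print sample, count
# NLTK 3 / Python 3 equivalent:
# for sample, count in fd.items(): ...
# fd.most_common(2)  # the top samples with their counts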
Example 1: create_word_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_word_scores():
    # posdata is a list of length 1084; each element is itself a list holding
    # the word segmentation of one review, e.g. [u'\u7535\u6c60', u'\u4e0d\u7ed9\u529b', u'\u90fd']
    # ("battery", "weak", "all"), taken from a review such as
    # "battery weak all very good wife bought with 16G card".
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1
    # N() is the total count of observed samples, i.e. the total number of
    # word occurrences under the 'pos' condition.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
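The chi_sq call follows NLTK's marginals convention: the first argument is the word's count under one condition (n_ii), the pair is (total count of the word, total count of all words under that condition), and the last argument is the grand total. Example 1 returns raw scores but never selects features; a minimal follow-up sketch for picking the top-N words (the helper name find_best_words and the cutoff 1500 are illustrative assumptions, not part of the example):

def find_best_words(word_scores, number):
    # Sort words by chi-square score, highest first, and keep the top N.
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    return set(w for w, s in best_vals)

best_words = find_best_words(create_word_scores(), 1500)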
Example 2: __init__
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.lower())  # .inc() is the NLTK 2.x API (removed in NLTK 3)
        label_word_fd['pos'].inc(word.lower())
    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    self.train_classifier()
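Example 2 (and several examples below) rely on the NLTK 2.x FreqDist.inc() mutator, which was removed in NLTK 3. A minimal sketch of the counter-style equivalent, which works on current NLTK (the toy word list is illustrative):

from nltk.probability import FreqDist, ConditionalFreqDist

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in ['fun', 'boring', 'fun']:
    word_fd[word] += 1              # NLTK 3 replacement for word_fd.inc(word)
    label_word_fd['pos'][word] += 1
# On Python 3, iterate with items() instead of iteritems():
# for word, freq in word_fd.items(): ...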
Example 3: create_words_bigrams_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # One finder per polarity; in the original snippet the second from_words()
    # call overwrote the first, so posBigrams was mined from the negative words.
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
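For reference, a minimal self-contained sketch of the collocation step used above (the toy word list is an assumption; the real inputs are the chained review tokens):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = ['the', 'battery', 'lasts', 'long', 'the', 'screen', 'is', 'sharp']
finder = BigramCollocationFinder.from_words(words)
# nbest ranks adjacent word pairs by the given association measure and
# returns the top n as (word, word) tuples.
top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 3)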
Example 4: GetHighInformationWordsChi
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:num_bestwords]
    bestwords = set([w for w, s in best])
    return bestwords
Example 5: setup
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def setup():
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        # Strip punctuation consistently; the original stripped only the
        # word_fd key, so the two distributions used different keys.
        w = word.strip('\'"?,.').lower()
        word_fd.inc(w)
        label_word_fd['pos'].inc(w)
    for word in movie_reviews.words(categories=['neg']):
        w = word.strip('\'"?,.').lower()
        word_fd.inc(w)
        label_word_fd['neg'].inc(w)
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return train(best_bigram_word_features)
Example 6: create_word_bigram_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # One finder per polarity (the original snippet overwrote the pos finder
    # before posBigrams was extracted).
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # unigrams plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1            # formerly word_fd.inc(word) (NLTK 2.x)
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example 7: create_word_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_word_scores(posWords, negWords):
    file_scores = file("cn_sample_data/scores.txt", "w")  # Python 2 built-in; use open() on Python 3
    # Build frequency distributions over the two word sequences.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Write the scores in descending order; the original called sorted(),
    # discarded the result, and then iterated the unordered dict.
    for key, score in sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True):
        file_scores.write(str(key) + " : " + str(score) + "\n")
    file_scores.close()
    return word_scores
Example 8: getWordScores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example 9: clean_train_data_and_find_best_features
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def clean_train_data_and_find_best_features(self):
    # The top-n best unigram features are selected.
    freq_dist_obj = FreqDist()
    cond_freq_dist_obj = ConditionalFreqDist()
    self.book_category_set = set()
    for instance in self.book_instances:
        try:
            raw_data = instance and instance.strip() and instance.strip().split("\t")
            if not raw_data or len(raw_data) != 4:
                continue
            bookid = raw_data[0]
            self.book_category_set.add(bookid)
            features = []
            features.extend(self.clean_book_title(raw_data[2]))
            features.extend(self.clean_author_name(raw_data[3]))
            features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
            for feat in features:
                freq_dist_obj.inc(feat)
                cond_freq_dist_obj[bookid].inc(feat)
        except Exception:
            self.logging.info("Exception while running this instance %s \n" % instance)
    total_word_count = 0
    for bookid in self.book_category_set:
        total_word_count += cond_freq_dist_obj[bookid].N()
    word_score_dict = {}
    for word, freq in freq_dist_obj.iteritems():
        score = 0
        if word and word.lower() in self.stopwords_set:
            continue
        for bookid in self.book_category_set:
            score += BigramAssocMeasures.chi_sq(cond_freq_dist_obj[bookid][word], (freq, cond_freq_dist_obj[bookid].N()), total_word_count)
        word_score_dict[word] = score
    self.select_top_n_best_features(word_score_dict)
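Example 9 ends by calling self.select_top_n_best_features(word_score_dict), which is not shown. A minimal sketch of what such a selector could look like (the attribute name best_features and the default cutoff are assumptions, not the project's actual code):

def select_top_n_best_features(self, word_score_dict, n=10000):
    # Rank features by chi-square score and keep the top n as the vocabulary.
    ranked = sorted(word_score_dict.iteritems(), key=lambda (w, s): s, reverse=True)
    self.best_features = set(w for w, s in ranked[:n])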
Example 10: tfidf
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def tfidf(phrase_lists, corpus=nltk.corpus.brown.words(), ngram_range=(1, 6)):
    # CorpusRanker is project-specific (not part of NLTK).
    ranker = CorpusRanker(corpus, ngram_range)
    phrase_frequencies = FreqDist(tuple(p) for p in phrase_lists)
    phrase_scores = {}
    for phrase, freq in phrase_frequencies.iteritems():
        phrase_scores[phrase] = ranker.score(phrase, freq)
    return phrase_scores, phrase_frequencies
Example 11: get_bestwords
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print 'Loaded from cache'
                print 'bestwords count = %d' % (len(bestwords))
                return bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]  # the original used labels != 0, which selects the positives again
    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))
    for word in pos_words:
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in neg_words:
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:limit]
    bestwords = set([w for w, s in best])
    print 'all words count = %d' % (len(word_scores))
    print 'bestwords count = %d' % (len(bestwords))
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'wb')
            pickle.dump(bestwords, f)
            print 'Dumped to cache'
    return bestwords
Example 12: best_word_feats
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def best_word_feats(self, words):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    # chi_sq marginals, in NLTK's notation:
    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    # Note: as written, the scores are recomputed on every call; in practice
    # bestwords would be computed once and reused across documents.
    return dict([(word, True) for word in words if word in bestwords])
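For context, feature dicts like the one returned above are what nltk.NaiveBayesClassifier.train consumes. A hedged sketch of that hand-off, assuming best_word_feats is available as a plain function rather than the method shown here:

import nltk
from nltk.corpus import movie_reviews

# Label each review's feature dict with its category, then train.
pos_feats = [(best_word_feats(movie_reviews.words(fileids=[f])), 'pos')
             for f in movie_reviews.fileids('pos')]
neg_feats = [(best_word_feats(movie_reviews.words(fileids=[f])), 'neg')
             for f in movie_reviews.fileids('neg')]
classifier = nltk.NaiveBayesClassifier.train(pos_feats + neg_feats)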
Example 13: create_word_bigram_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # One finder per polarity (the original snippet overwrote the pos finder
    # before posBigrams was extracted).
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        last_word['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        last_word['neg'].inc(word)
    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example 14: create_word_scores
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)
    # Build a frequency distribution of all words, plus per-label distributions.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd.inc(word.lower())
        # Note: as written, every word is counted under both labels, so the
        # 'pos' and 'neg' distributions are identical.
        cond_word_fd['pos'].inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())
    # Count the positive and negative words, and the total number of words.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # Build a dictionary of word scores based on the chi-squared test.
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example 15: get_best_words
# Required imports: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import iteritems [as alias]
def get_best_words(words_list, num_best_words):
    import nltk  # needed for nltk.word_tokenize below
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for pair in words_list:
        line, sent = pair
        for word in nltk.word_tokenize(line):
            word_fd.inc(word.lower())
            label_word_fd[sent].inc(word.lower())
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:num_best_words]
    bestwords = set([w for w, s in best])
    return bestwords
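A hedged usage sketch for Example 15 (toy input lines with the 'pos'/'neg' labels the function assumes; nltk.word_tokenize additionally requires the punkt tokenizer data to be installed):

toy_lines = [("the plot was wonderful and moving", 'pos'),
             ("the plot was dull and predictable", 'neg')]
print get_best_words(toy_lines, 5)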