本文整理汇总了Python中nltk.metrics.BigramAssocMeasures.chi_sq方法的典型用法代码示例。如果您正苦于以下问题：Python BigramAssocMeasures.chi_sq方法的具体用法？Python BigramAssocMeasures.chi_sq怎么用？Python BigramAssocMeasures.chi_sq使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.metrics.BigramAssocMeasures的用法示例。
在下文中一共展示了BigramAssocMeasures.chi_sq方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: create_word_bigram_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_bigram_scores():
    """Score words and top bigrams by their chi-square association with a label.

    Tokens from the positive and negative corpora are combined with the
    5000 highest-ranked chi-square bigrams of the *same* corpus, counted
    overall and per label, and each distinct item is given the sum of its
    chi-square scores against the 'pos' and 'neg' labels.

    Returns:
        dict mapping each word or bigram tuple to its combined score.
    """
    # NOTE(review): both corpora are loaded with identical arguments and "~"
    # looks like a placeholder path -- confirm the real pos/neg sources.
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    # The loader's result is treated as an iterable of token sequences.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # One finder per corpus. Re-using a single variable made the positive
    # bigrams come from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # Item assignment works on both old and new NLTK; FreqDist.inc() was
    # removed in NLTK 3.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    # .items() exists on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(
            last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例2: high_words
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the (up to) 10000 tokens most associated with a sentiment label.

    Args:
        posids: iterable of positive reviews; each must support
            ``review['text']`` returning a string.
        negids: iterable of negative reviews, same shape as ``posids``.
        cutoff: 1-based position of the review to leave out of each corpus.
        score_fn: association measure called as
            ``score_fn(n_ii, (n_ix, n_xi), n_xx)``. Previously this argument
            was ignored and chi-square was always used; the default keeps
            that behaviour.
        min_score: currently unused; kept for interface compatibility.

    Returns:
        set of the highest-scoring tokens.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    def _count(reviews, label):
        # Tally every token of every review except the one at `cutoff`.
        for position, review in enumerate(reviews, 1):
            # NOTE(review): only the single review whose position equals
            # `cutoff` is skipped. If `cutoff` was meant as an upper bound
            # this should be `<=` -- confirm with the caller.
            if position != cutoff:
                for word in review['text'].split(' '):
                    # tokenize_simple is called once per distribution, as in
                    # the original, because its return type is not visible
                    # here (it may be a one-shot iterator).
                    word_fd.update(token_helpers.tokenize_simple(word))
                    label_word_fd[label].update(token_helpers.tokenize_simple(word))

    _count(posids, 'pos')
    _count(negids, 'neg')

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_fn(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_fn(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    return {w for w, _ in best}
"""
示例3: create_word_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_scores(self):
    """Score every word by its chi-square association with 'pos' and 'neg'.

    Word lists come from ``self.getAllWords()``, which must return a pair
    of iterables of token sequences (positive first, negative second).

    Returns:
        dict mapping each word to the sum of its positive and negative
        chi-square scores.
    """
    posWords, negWords = self.getAllWords()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    log("Total number of words: %d" % total_word_count)

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例4: create_word_bigram_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_bigram_scores(posWords, negWords):
    """Score words plus top bigrams by chi-square association with a label.

    Args:
        posWords: list of tokens from the positive corpus.
        negWords: list of tokens from the negative corpus.

    Returns:
        dict keyed by ``str(item)`` (a word, or the string form of a bigram
        tuple) whose value is the item's positive plus negative chi-square
        score.
    """
    # Each corpus needs its own finder. The original reassigned one variable,
    # so the "positive" bigrams were really the negative ones.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    # Words together with two-word collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['pos'][key] += 1
    for word in neg:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['neg'][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例5: setup
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def setup():
    """Populate the global ``bestwords`` set and train on bigram features.

    Every token of the ``movie_reviews`` corpus is normalised, counted
    overall and per category, and scored by the sum of its chi-square
    association with 'pos' and 'neg'. The 10000 best-scoring tokens are
    stored in the module-level ``bestwords``.

    Returns:
        whatever ``train(best_bigram_word_features)`` returns.
    """
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # Use the SAME normalised key in both distributions. The original
            # stripped punctuation only for word_fd, so the per-label lookups
            # below could miss or mismatch the overall counts.
            token = word.strip('\'"?,.').lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-parameter lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = {w for w, _ in best}
    return train(best_bigram_word_features)
示例6: create_word_bigram_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_bigram_scores():
    """Score words plus top bigrams from two pickled review corpora.

    The positive and negative reviews are unpickled from fixed paths,
    flattened to token lists, extended with each corpus's 5000 best
    chi-square bigrams, and every item is scored by the sum of its
    chi-square association with 'pos' and 'neg'.

    Returns:
        dict mapping each word or bigram tuple to its combined score.
    """
    # Binary mode is required for pickle on Python 3 and harmless on
    # Python 2; `with` closes the handles the original leaked.
    # NOTE: pickle must only be used on trusted files.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Separate finders: sharing one variable made both bigram lists come
    # from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words together with two-word collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例7: create_word_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_scores(posWords, negWords, posTag, negTag):
    """Score each word by its chi-square association with two labels.

    Args:
        posWords: iterable of token sequences from the positive texts.
        negWords: iterable of token sequences from the negative texts.
        posTag: key under which positive counts are stored.
        negTag: key under which negative counts are stored.

    Returns:
        dict mapping each word to its informativeness: the chi-square
        statistic against ``posTag`` plus the one against ``negTag``.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    # Flatten the nested sequences into one-dimensional token lists.
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()                  # frequency of every word overall
    cond_word_fd = ConditionalFreqDist()  # frequency of every word per label
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        # Chi-square is used here; another association measure such as
        # mutual information could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例8: create_word_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_word_scores():
    """Score each word of the global corpora by chi-square informativeness.

    Reads the module-level ``datap`` (positive) and ``datan`` (negative)
    collections, each an iterable of token sequences.

    Returns:
        dict mapping each word to the sum of its chi-square statistics
        against the 'pos' and 'neg' labels.
    """
    # Flatten the nested sequences into one-dimensional token lists.
    posWords = list(itertools.chain(*datap))
    negWords = list(itertools.chain(*datan))

    word_fd = nltk.FreqDist()
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies
    for label, words in (('pos', posWords), ('neg', negWords)):
        for word in words:
            word_fd[word] += 1
            cond_word_fd[label][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        # Chi-square is used here; another association measure such as
        # mutual information could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例9: create_bigram_scores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def create_bigram_scores():
    """Score the top bigrams of each corpus by chi-square association.

    Positive and negative reviews are loaded from fixed spreadsheet paths,
    flattened to token lists, and the 8000 best chi-square bigrams of each
    corpus are scored against the 'pos' and 'neg' labels.

    Returns:
        dict mapping each bigram tuple to the sum of its positive and
        negative chi-square scores.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Keep one finder per corpus. The original overwrote the positive finder,
    # so both bigram lists were drawn from the negative reviews.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
    for bigram in posBigrams:
        word_fd[bigram] += 1
        cond_word_fd['pos'][bigram] += 1
    for bigram in negBigrams:
        word_fd[bigram] += 1
        cond_word_fd['neg'][bigram] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例10: getWordScores
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def getWordScores():
    """Score every lower-cased token of the two polarity files.

    Lines of ``RT_POLARITY_POS_FILE`` and ``RT_POLARITY_NEG_FILE`` are split
    into word tokens and the punctuation marks ``. , ! ? ;``; each distinct
    lower-cased token is scored by the sum of its chi-square association
    with 'pos' and 'neg'.

    Returns:
        dict mapping token to combined chi-square score.
    """
    # Compiled once instead of being looked up for every line.
    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    def _read_tokens(path):
        # Return all tokens of the file at `path` as one flat list.
        tokens = []
        with open(path, 'r') as sentences:
            for line in sentences:
                tokens.extend(token_pattern.findall(line.rstrip()))
        return tokens

    posWords = _read_tokens(RT_POLARITY_POS_FILE)
    negWords = _read_tokens(RT_POLARITY_NEG_FILE)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, words in (('pos', posWords), ('neg', negWords)):
        for word in words:
            lowered = word.lower()
            word_fd[lowered] += 1
            cond_word_fd[label][lowered] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例11: getBestWords
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def getBestWords(posWords, negWords, limit=None):
    """Return the words most associated with the 'pos' / 'neg' labels.

    Args:
        posWords: iterable of tokens from the positive corpus.
        negWords: iterable of tokens from the negative corpus.
        limit: optional maximum number of words to keep, highest combined
            chi-square score first. The default ``None`` keeps every word,
            which is what this function has always returned: it sorted by
            score but then put all of the words into a set.

    Returns:
        set of lower-cased words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in (("pos", posWords), ("neg", negWords)):
        for word in words:
            lowered = word.lower()
            word_fd[lowered] += 1
            label_word_fd[label][lowered] += 1

    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    if limit is None:
        # No cutoff requested: ordering is irrelevant for a set, so skip
        # the sort entirely.
        return set(word_scores)

    ranked = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    return {w for w, _ in ranked[:limit]}
示例12: get_bestwords
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    """Return the ``limit`` words most associated with the sentiment labels.

    Args:
        contents: array-like of documents that supports boolean-mask
            indexing (``contents[labels == 1]``).
        labels: array-like aligned with ``contents``; 1 marks a positive
            document and 0 a negative one.
        limit: maximum number of words to return.
        n: cache key component; caching is only active when it is truthy.
        cache: when true (and ``n`` is truthy) the result is read from /
            written to ``cache/<limit>_<n>.pkl``.

    Returns:
        set of lower-cased words with the highest combined chi-square score.
    """
    use_cache = bool(cache and n)
    cache_path = 'cache/%s_%s.pkl' % (limit, n)

    if use_cache and os.path.exists(cache_path):
        # Binary mode is required for pickle on Python 3; `with` closes the
        # handle the original leaked. Only load caches you wrote yourself.
        with open(cache_path, 'rb') as f:
            bestwords = pickle.load(f)
        print('Loaded from cache')
        print('bestwords count = %d' % (len(bestwords)))
        return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    # The original mask `labels != 0` selected the positive documents again
    # instead of the negative ones.
    neg_contents = contents[labels == 0]

    # Distinct surface forms per class: each token counts once per class no
    # matter how often it occurs. update() avoids rebuilding the set per doc.
    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words.update(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words.update(word_tokenize(neg_content))

    # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
    for label, words in (('pos', pos_words), ('neg', neg_words)):
        for word in words:
            lowered = word.lower()
            word_fd[lowered] += 1
            label_word_fd[label][lowered] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-parameter lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:limit]
    bestwords = {w for w, _ in best}
    print('all words count = %d' % (len(word_scores)))
    print('bestwords count = %d' % (len(bestwords)))

    if use_cache:
        # Closing the file guarantees the pickle is flushed to disk.
        with open(cache_path, 'wb') as f:
            pickle.dump(bestwords, f)
        print('Dumped to cache')
    return bestwords
示例13: best_word_feats
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def best_word_feats(self, words):
    """Build a feature dict containing only the most informative words.

    The 10000 tokens of the ``movie_reviews`` corpus with the highest
    combined chi-square association to 'pos' and 'neg' are computed, and
    ``words`` is filtered down to them.

    Args:
        words: iterable of tokens for one document.

    Returns:
        dict ``{word: True}`` for every word in ``words`` that is among the
        best-scoring corpus tokens.
    """
    # NOTE(review): the whole corpus is rescored on every call; consider
    # computing the best-word set once outside this method.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            lowered = word.lower()
            word_fd[lowered] += 1
            label_word_fd[label][lowered] += 1

    # chi_sq argument mapping:
    #   n_ii = label_word_fd[label][word]
    #   n_ix = word_fd[word]
    #   n_xi = label_word_fd[label].N()
    #   n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-parameter lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = {w for w, _ in best}
    return {word: True for word in words if word in bestwords}
示例14: computeFreqDistribution
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def computeFreqDistribution():
    """Return the words whose three-way chi-square score exceeds 2.

    Reads the module-level ``word_fd`` (overall counts) and
    ``label_word_fd`` (counts per 'positive' / 'negative' / 'neutral'
    label). When the module-level ``DEBUG`` flag is truthy, intermediate
    results are printed.

    Returns:
        list of words whose summed chi-square score over the three labels
        is greater than the threshold.
    """
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # bare print statement is a syntax error on Python 3.
    if DEBUG:
        print(word_fd)

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    neu_word_count = label_word_fd['neutral'].N()
    total_word_count = pos_word_count + neg_word_count + neu_word_count

    word_scores = {}
    # .items() works on Python 2 and 3; .iteritems() is Python-2-only.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        neu_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + neu_score

    if DEBUG:
        print(json.dumps(word_scores, indent = 4))

    threshold = 2
    temp = [item for item in word_scores if word_scores[item] > threshold]
    if DEBUG:
        print(temp)
    return temp
示例15: get_best_words
# 需要导入模块: from nltk.metrics import BigramAssocMeasures [as 别名]
# 或者: from nltk.metrics.BigramAssocMeasures import chi_sq [as 别名]
def get_best_words(words_list, num_best_words):
    """Return the tokens most associated with the 'pos' / 'neg' labels.

    Args:
        words_list: iterable of ``(line, sent)`` pairs, where ``line`` is a
            text and ``sent`` is its label. Counts are stored under whatever
            label is given, but only 'pos' and 'neg' enter the score.
        num_best_words: maximum number of tokens to return.

    Returns:
        set of lower-cased tokens with the highest combined chi-square score.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
    for line, sent in words_list:
        for word in nltk.word_tokenize(line):
            lowered = word.lower()
            word_fd[lowered] += 1
            label_word_fd[sent][lowered] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-parameter lambdas are a syntax error on Python 3; index instead.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_best_words]
    return {w for w, _ in best}