本文整理汇总了Python中nltk.collocations.BigramCollocationFinder类的典型用法代码示例。如果您正苦于以下问题：Python BigramCollocationFinder类的具体用法？Python BigramCollocationFinder怎么用？Python BigramCollocationFinder使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BigramCollocationFinder类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with pos/neg text.

    Loads both corpora through ``tp.seg_fil_senti_excel``, flattens each into
    a word list, appends the 5000 best chi-square bigrams of each class to
    that class's list, counts every item overall and per class, and sums the
    item's chi-square score against the two classes.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    # NOTE(review): both corpora are read from the same "~" placeholder with
    # identical arguments -- confirm the real positive/negative sources.
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so the "positive" bigrams
    # were actually mined from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    class_fd = ConditionalFreqDist()
    # ``fd[x] += 1`` replaces FreqDist.inc(), which newer NLTK releases no
    # longer provide.
    for word in pos:
        word_fd[word] += 1
        class_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        class_fd['neg'][word] += 1

    pos_word_count = class_fd['pos'].N()
    neg_word_count = class_fd['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            class_fd['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(
            class_fd['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例2: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with pos/neg text.

    Reads the positive corpus from ``/home/hadoop/goodnew.txt`` and the
    negative corpus from ``/home/hadoop/badnew.txt`` via ``tp.seg_fil_txt``,
    extends each flattened word list with that class's 350000 best chi-square
    bigrams, and scores every counted item against both classes.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # ``fd[x] += 1`` replaces FreqDist.inc(), which newer NLTK releases no
    # longer provide; item assignment works on old and new releases alike.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例3: create_words_bigrams_scores
def create_words_bigrams_scores():
    """Score words and top bigrams by chi-square association with pos/neg reviews.

    Loads the positive and negative review spreadsheets through
    ``tp.seg_fil_senti_excel``, extends each flattened word list with that
    class's 5000 best chi-square bigrams, and scores every counted item
    against both classes.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    posdata = tp.seg_fil_senti_excel(
        "./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel(
        "./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so the "positive" bigrams
    # were actually mined from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例4: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with pos/neg reviews.

    Unpickles the positive and negative review data, extends each flattened
    word list with that class's 5000 best chi-square bigrams, and scores every
    counted item against both classes.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    # NOTE(review): pickle executes arbitrary code while loading -- only use
    # this with review files you produced yourself.
    # Files are opened in binary mode inside ``with`` so the handles are
    # closed promptly; pickle data is bytes, and text mode fails on Python 3.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so the "positive" bigrams
    # were actually mined from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例5: _get_bigram_scores
def _get_bigram_scores(self, posdata, negdata):
    """Score words and top bigrams by chi-square association with pos/neg data.

    Args:
        posdata: iterable of word sequences for the positive class; flattened
            with ``itertools.chain``.
        negdata: iterable of word sequences for the negative class.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    pos_words = list(itertools.chain.from_iterable(posdata))
    neg_words = list(itertools.chain.from_iterable(negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus the 5000 best bigram collocations for each class.
    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems(), so the method runs
    # on Python 2 and 3 alike.
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例6: create_bigram_scores
def create_bigram_scores():
    """Score the top bigrams by chi-square association with pos/neg reviews.

    Loads both review spreadsheets through ``tp.seg_fil_senti_excel``, takes
    the 8000 best chi-square bigrams of each class, and scores every such
    bigram against both classes. Single words are not scored here.

    Returns:
        dict: bigram -> ``pos_score + neg_score``.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so both bigram lists were
    # mined from negWords and were therefore identical.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    pos = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    neg = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # ``fd[x] += 1`` replaces FreqDist.inc(), which newer NLTK releases no
    # longer provide.
    for bigram in pos:
        word_fd[bigram] += 1
        cond_word_fd['pos'][bigram] += 1
    for bigram in neg:
        word_fd[bigram] += 1
        cond_word_fd['neg'][bigram] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例7: get_unibigram_features
def get_unibigram_features(all_words, uni_feanum, bi_feanum):
    """Select the most frequent unigrams and the best chi-square bigrams.

    Args:
        all_words: sequence of tokens; it is consumed twice (once for the
            unigram counts, once for the bigram counts).
        uni_feanum: number of unigram features to keep, or the string
            ``'max'`` for all distinct words. Larger requests are clamped to
            the number of distinct words.
        bi_feanum: number of bigram features to keep, or ``'max'`` for all
            distinct bigrams. Larger requests are clamped likewise.

    Returns:
        list: the selected words followed by the selected bigrams.
    """
    word_fd = nltk.FreqDist(all_words)
    bigram_fd = nltk.FreqDist(nltk.bigrams(all_words))

    # The equality test runs first, so 'max' is never compared with an int.
    distinct_words = len(word_fd)
    if uni_feanum == 'max' or uni_feanum > distinct_words:
        uni_feanum = distinct_words

    distinct_bigrams = len(bigram_fd)
    if bi_feanum == 'max' or bi_feanum > distinct_bigrams:
        bi_feanum = distinct_bigrams

    finder = BigramCollocationFinder(word_fd, bigram_fd)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, bi_feanum)

    # Single pre-formatted argument: prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("the number of unigram features is %s" % (uni_feanum,))
    print("the number of bigram features is %s" % (bi_feanum,))

    selected_words = [word for word, _count in word_fd.most_common(uni_feanum)]
    return list(itertools.chain(selected_words, bigrams))
示例8: create_word_bigram_scores
def create_word_bigram_scores(posWords, negWords):
    """Score words and top bigrams by chi-square association with pos/neg text.

    Args:
        posWords: list of positive-class words (concatenated with a list, so
            it must itself be a list).
        negWords: list of negative-class words.

    Returns:
        dict: ``str(item)`` -> ``pos_score + neg_score``, where an item is a
        word or one of the 2000 best chi-square bigrams of either class.
    """
    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so the "positive" bigrams
    # were actually mined from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Every item is keyed by its str() form, bigrams included.
    for word in pos:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['pos'][key] += 1
    for word in neg:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['neg'][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例9: best_bigrams
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300):
    """Return the best-scoring bigrams found in positive and negative sentences.

    Args:
        sents_tagged: iterable of ``(tag, sentence)`` pairs; tag ``1`` marks a
            positive sentence, ``-1`` a negative one, anything else is skipped.
        stopwords: not used by this function.
        score_fn: association measure handed to ``nbest``.
        n: how many bigrams to request per polarity.

    Returns:
        list: bigrams from either polarity whose two words are both longer
        than three characters and both absent from ``ex``.
    """
    # NOTE(review): ``ex`` is not defined in this function, so it must come
    # from the enclosing module, while the ``stopwords`` argument is never
    # read -- confirm whether ``ex`` was meant to be ``stopwords``.

    def _lowered_tokens(sentences):
        # Tokenize each sentence, drop tokens found inside string.punctuation,
        # and lower-case whatever remains.
        kept = []
        for sentence in sentences:
            for token in word_tokenize(sentence):
                if token not in string.punctuation:
                    kept.append(token.lower())
        return kept

    # Split the sentences by polarity.
    positive_sents, negative_sents = [], []
    for tag, sent in sents_tagged:
        if tag == 1:
            positive_sents.append(sent)
        elif tag == -1:
            negative_sents.append(sent)

    positive_tokens = _lowered_tokens(positive_sents)
    negative_tokens = _lowered_tokens(negative_sents)

    # Best collocations of each polarity, computed independently.
    top_positive = BigramCollocationFinder.from_words(positive_tokens).nbest(score_fn, n)
    top_negative = BigramCollocationFinder.from_words(negative_tokens).nbest(score_fn, n)

    candidates = list(set(top_positive) | set(top_negative))

    # Keep a bigram only when BOTH words are longer than three characters and
    # neither of them appears in ``ex``.
    bigrams_best = []
    for candidate in candidates:
        first, second = candidate[0], candidate[1]
        if len(first) <= 3 or len(second) <= 3:
            continue
        if first in ex or second in ex:
            continue
        bigrams_best.append(candidate)
    return bigrams_best
示例10: create_word_bigram_scores
def create_word_bigram_scores(posWords, negWords, n = 5000):
    """Score words and top bigrams by chi-square association with pos/neg text.

    Args:
        posWords: iterable of word sequences for the positive class.
        negWords: iterable of word sequences for the negative class.
        n: number of best chi-square bigrams taken from each class.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score``.
    """
    pos_flat = list(itertools.chain(*posWords))
    neg_flat = list(itertools.chain(*negWords))

    chi_sq = BigramAssocMeasures.chi_sq
    pos_top_bigrams = BigramCollocationFinder.from_words(pos_flat).nbest(chi_sq, n)
    neg_top_bigrams = BigramCollocationFinder.from_words(neg_flat).nbest(chi_sq, n)

    # Each class contributes its words followed by its best bigrams.
    items_by_label = (
        ('pos', pos_flat + pos_top_bigrams),
        ('neg', neg_flat + neg_top_bigrams),
    )

    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, items in items_by_label:
        for item in items:
            overall_fd[item] += 1
            per_label_fd[label][item] += 1

    pos_total = per_label_fd['pos'].N()
    neg_total = per_label_fd['neg'].N()
    grand_total = pos_total + neg_total

    scores = {}
    for item, overall_count in overall_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        from_pos = chi_sq(per_label_fd['pos'][item], (overall_count, pos_total), grand_total)
        from_neg = chi_sq(per_label_fd['neg'][item], (overall_count, neg_total), grand_total)
        scores[item] = from_pos + from_neg
    return scores
示例11: create_word_bigram_scores
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    Measure how informative each word is, using words together with bigrams.

    :param posWords: list of positive-class words (concatenated with a list,
        so it must itself be a list).
    :param negWords: list of negative-class words.
    :param score_method: association measure used both to rank bigrams and to
        score each counted item.
    :return: dict mapping item (word or bigram) to ``pos_score + neg_score``.
    '''
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = pos_bigram_finder.nbest(score_method, 5000)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = neg_bigram_finder.nbest(score_method, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems(); the function
    # already uses print() as a call, so this keeps it runnable on Python 3.
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
示例12: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words plus top bigrams of both classes via ``get_scores``.

    Reads ``posWords`` and ``negWords`` from the enclosing scope, extends each
    with its own 5000 best chi-square bigrams, and returns whatever
    ``get_scores(pos, neg)`` returns.
    """
    # One finder per class. A single variable used to be rebound to the
    # negative finder before either nbest() call, so the "positive" bigrams
    # were actually mined from negWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    return get_scores(pos, neg)
示例13: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams against the pos, neg and obj classes.

    Reads ``posdata``, ``negdata`` and ``objdata`` from the enclosing scope,
    flattens each, extends each word list with that class's 5000 best
    chi-square bigrams, and sums every counted item's chi-square score
    against the three classes.

    Returns:
        dict: item (word or bigram) -> ``pos_score + neg_score + obj_score``.
    """
    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))
    objWords = list(itertools.chain.from_iterable(objdata))

    # One finder per class. A single variable used to be rebound twice before
    # any nbest() call, so all three bigram lists were mined from objWords.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    obj_bigram_finder = BigramCollocationFinder.from_words(objWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    objBigrams = obj_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    obj = objWords + objBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    # Count ``obj`` (words + bigrams) like the other two classes; the loop
    # used to walk ``objWords`` only, leaving the obj bigrams uncounted.
    for word in obj:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    # .items() rather than the Python-2-only .iteritems().
    for word, freq in word_fd.items():
        # Arguments: count inside the class, (overall count, class total),
        # grand total.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores
示例14: best_bigram_word_features
def best_bigram_word_features(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build a feature dict from the best bigrams plus ``best_word_features``.

    Args:
        words: token sequence handed to both the bigram finder and
            ``best_word_features``.
        score_fn: association measure used to rank bigrams.
        n: number of bigrams to request.

    Returns:
        dict: every selected bigram mapped to ``True``, then updated with the
        mapping returned by ``best_word_features(words)``.
    """
    finder = BigramCollocationFinder.from_words(words)
    features = {}
    for bigram in finder.nbest(score_fn, n):
        features[bigram] = True
    # Entries from best_word_features win on any key collision.
    features.update(best_word_features(words))
    return features
示例15: collocations
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    The collocation list is stored on the instance and rebuilt only when no
    list is stored yet or when ``num`` / ``window_size`` differ from the
    values used to build the stored one.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_current = (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    )
    if not cache_is_current:
        self._num = num
        self._window_size = window_size

        # Imported here rather than at module level, as in the original.
        from nltk.corpus import stopwords

        # Built once as a frozenset: the word filter below tests membership
        # for every token, and scanning the stopword list each time is
        # needlessly linear.
        ignored_words = frozenset(stopwords.words('english'))
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        # Drop bigrams seen fewer than twice.
        finder.apply_freq_filter(2)
        # Drop bigrams containing a word shorter than 3 characters or a
        # stopword (compared lower-cased).
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))