This article collects typical usage examples of the FreqDist.inc method from Python's nltk.probability module. If you are wondering what FreqDist.inc does, how to use it, or where to find examples of it in practice, the curated code samples below may help. You can also read more about the class it belongs to, nltk.probability.FreqDist.
The following 15 code examples of FreqDist.inc are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
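Before the examples, a minimal sketch of what FreqDist.inc does: it adds an occurrence of a sample to a frequency distribution. Note that inc only exists in NLTK 2.x; from NLTK 3.0 onward FreqDist subclasses collections.Counter and the same effect is achieved with item assignment or update(). The sketch below assumes the NLTK 2.x API for the inc call and shows the NLTK 3+ equivalent in a comment.

# Minimal sketch: counting tokens with FreqDist (NLTK 2.x API assumed for .inc).
from nltk.probability import FreqDist

fd = FreqDist()
for token in ['to', 'be', 'or', 'not', 'to', 'be']:
    fd.inc(token)         # NLTK 2.x
    # fd[token] += 1      # equivalent in NLTK 3+
print fd.N()              # total number of observations: 6
print fd['to']            # count of a single sample: 2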
Example 1: getWordFrequencies
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def getWordFrequencies(self, sentences):
    freq_dist = FreqDist()
    for sentence in sentences:
        for token in nltk.word_tokenize(sentence):
            if token not in string.punctuation:
                freq_dist.inc(token)
    return freq_dist
Example 2: high_information_words
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd.inc(word)
            label_word_fd[label].inc(word)

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].iteritems():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
Example 3: text_to_dict
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def text_to_dict(docs, metric):
    """Create dictionaries of term frequencies based on documents.

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create a freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)

    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print ' dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such frequency metric: %s" % metric)
        dicts.append(d)

    return dicts
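For reference, the TF-IDF branch above weights each word by its relative frequency in the document times the log of the inverse document frequency. A small worked example with made-up counts:

import math

# Made-up counts, purely to illustrate the weight computed above.
tf = 3 / 50.0                    # word occurs 3 times in a 50-token document
num_docs, doc_freq = 10, 2       # word appears in 2 of the 10 documents
print tf * math.log(float(num_docs) / doc_freq)   # ~0.0966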
Example 4: create_bigram_scores
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Build a separate collocation finder per polarity and extract its bigrams
    # before the finder variable is reused for the other class (the original
    # snippet overwrote the positive finder before using it).
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
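The scoring call above follows NLTK's BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx) signature: n_ii is the item's count within one class, n_ix its total count, n_xi the total count of that class, and n_xx the grand total. An isolated call with made-up counts:

from nltk.metrics import BigramAssocMeasures

# Made-up contingency counts: an item seen 40 times in 'pos', 50 times overall,
# with 10,000 'pos' tokens out of 20,000 tokens in total.
print BigramAssocMeasures.chi_sq(40, (50, 10000), 20000)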
Example 5: train_supervised
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param labelled_sequences: the training data, a set of
        labelled sequences of observations
    :type labelled_sequences: list
    :param kwargs: may include an 'estimator' parameter, a function taking
        a FreqDist and a number of bins and returning a CProbDistI;
        otherwise a MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)

            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
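A hedged usage sketch for the estimator keyword described in the docstring: instead of the MLE default, a smoothing estimator such as LidstoneProbDist can be passed in. The names trainer (an nltk.tag.hmm.HiddenMarkovModelTrainer) and train_data (a list of [(word, tag), ...] sequences) are assumptions, not part of the example above.

from nltk.probability import LidstoneProbDist

# Assumed names: `trainer` and `train_data` are defined elsewhere.
lidstone = lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins)
tagger = trainer.train_supervised(train_data, estimator=lidstone)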
Example 6: train
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def train(labeled_featuresets, estimator=ELEProbDist):
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count how often each feature value occurred, given the label and feature name.
    for featureset, label in labeled_featuresets:
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            feature_freqdist[label, fname].inc(fval)
            feature_values[fname].add(fval)
            fnames.add(fname)

    # If a feature was missing from an instance, treat it as the implicit value None.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)

    # Create the P(label) distribution.
    label_probdist = estimator(label_freqdist)

    # Create the P(fval | label, fname) distributions.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
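A hypothetical call to train() above; labeled_featuresets is a list of (featureset, label) pairs, the same format NLTK's own NaiveBayesClassifier.train expects. The feature names and values below are made up.

# Hypothetical toy training data for train() above.
train_set = [
    ({'contains(good)': True, 'last_word': 'great'}, 'pos'),
    ({'contains(bad)': True, 'last_word': 'awful'}, 'neg'),
]
classifier = train(train_set)
print classifier.classify({'contains(good)': True, 'last_word': 'great'})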
Example 7: __init__
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first
        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """
        for ii in [x.lower() for x in words
                   if x.lower() not in self._stop and len(x) >= self._min_length]:
            self._counts.inc(ii)

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        if len(self._counts) > self._cut_first + size:
            return self._counts.keys()[self._cut_first:(size + self._cut_first)]
        else:
            return self._counts.keys()[:size]
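A hypothetical usage of the VocabBuilder class above (assumes the NLTK stopwords corpus is installed and, as in the class itself, the NLTK 2.x FreqDist API):

# Hypothetical usage; cut_first is set to 0 so the toy vocabulary is not truncated.
vb = VocabBuilder(lang="english", min_length=3, cut_first=0)
vb.scan("the quick brown fox jumps over the lazy dog".split())
print vb.vocab(size=10)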
Example 8: build_freqdists
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def build_freqdists(self, wordcount_range=150000):
    """
    Build word and label freq dists from the stored words with 'wordcount_range' words
    and store the resulting FreqDists in Redis.

    This cannot be cached as we have to continuously update these values
    from incremented word counts.
    """
    word_freqdist = FreqDist()
    label_word_freqdist = ConditionalFreqDist()

    pos_words = self.r.zrange('positive_wordcounts', 0, wordcount_range, withscores=True, desc=True)
    neg_words = self.r.zrange('negative_wordcounts', 0, wordcount_range, withscores=True, desc=True)

    assert pos_words and neg_words, 'Requires wordcounts to be stored in redis.'

    for word, count in pos_words:
        word_freqdist.inc(word, count=count)
        label_word_freqdist['pos'].inc(word, count=count)

    for word, count in neg_words:
        word_freqdist.inc(word, count=count)
        label_word_freqdist['neg'].inc(word, count=count)

    # Store for later use; these values are always recomputed.
    self.r.set('word_fd', pickle.dumps(word_freqdist))
    self.r.set('label_fd', pickle.dumps(label_word_freqdist))
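Note the count keyword used above: in NLTK 2.x, inc can add several occurrences at once (in NLTK 3+ the equivalent is fd[word] += count). A minimal illustration:

# Minimal illustration of inc() with a count (NLTK 2.x API).
fd = FreqDist()
fd.inc('good', count=5)
print fd['good']   # 5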
Example 9: __setTermsCHISQUARE__
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def __setTermsCHISQUARE__(self, size):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in self.reader.words(categories=['pos']):
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())

    for word in self.reader.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    termScore = sorted(wordScores.items(), key=lambda (w, s): s, reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
Example 10: clean_train_data_and_find_best_features
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def clean_train_data_and_find_best_features(self):
    # Top n best unigram features are selected
    freq_dist_obj = FreqDist()
    cond_freq_dist_obj = ConditionalFreqDist()
    self.book_category_set = set()

    for instance in self.book_instances:
        try:
            raw_data = instance and instance.strip() and instance.strip().split("\t")
            if not raw_data or len(raw_data) != 4:
                continue
            bookid = raw_data[0]
            self.book_category_set.add(bookid)
            features = []
            features.extend(self.clean_book_title(raw_data[2]))
            features.extend(self.clean_author_name(raw_data[3]))
            features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
            for feat in features:
                freq_dist_obj.inc(feat)
                cond_freq_dist_obj[bookid].inc(feat)
        except:
            self.logging.info("Exception while running this instance %s \n" % instance)

    total_word_count = 0
    for bookid in self.book_category_set:
        total_word_count += cond_freq_dist_obj[bookid].N()

    word_score_dict = {}
    for word, freq in freq_dist_obj.iteritems():
        score = 0
        if word and word.lower() in self.stopwords_set:
            continue
        for bookid in self.book_category_set:
            score += BigramAssocMeasures.chi_sq(cond_freq_dist_obj[bookid][word],
                                                (freq, cond_freq_dist_obj[bookid].N()),
                                                total_word_count)
        word_score_dict[word] = score

    self.select_top_n_best_features(word_score_dict)
Example 11: create_word_scores
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def create_word_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example 12: get_bestwords
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'r'))
                print 'Loaded from cache'
                print 'bestwords count = %d' % (len(bestwords))
                return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    neg_contents = contents[labels == 0]  # negative examples (was `labels != 0`, which also matched the positives)

    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))

    for word in pos_words:
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in neg_words:
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:limit]
    bestwords = set([w for w, s in best])

    print 'all words count = %d' % (len(word_scores))
    print 'bestwords count = %d' % (len(bestwords))

    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'w')
            pickle.dump(bestwords, f)
            print 'Dumped to cache'

    return bestwords
Example 13: best_word_feats
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def best_word_feats(self, words):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())

    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())

    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])

    return dict([(word, True) for word in words if word in bestwords])
Example 14: classify
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def classify(self, feats):
    counts = FreqDist()
    for classifier in self._classifiers:
        counts.inc(classifier.classify(feats))
    return counts.max()
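The method above is a simple majority vote: each sub-classifier's prediction is counted, and FreqDist.max() returns the most frequent label. For illustration, with made-up votes:

# Illustrative only: majority voting over made-up classifier outputs.
votes = FreqDist()
for label in ['pos', 'neg', 'pos']:
    votes.inc(label)
print votes.max()   # 'pos'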
Example 15: train_emission_number_distribution
# Module to import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import inc [as alias]
def train_emission_number_distribution(self, inputs):
    """
    Trains the distribution over the number of notes emitted from a
    chord class. It's not conditioned on the chord class, so the only
    training data needed is a segmented MIDI corpus.

    @type inputs: list of lists
    @param inputs: training data. The same format as is produced by
        L{jazzparser.taggers.segmidi.midi.midi_to_emission_stream}
    """
    self.add_history(
        "Training emission number probabilities using %d MIDI segments"
        % len(inputs))

    emission_number_counts = FreqDist()
    for sequence in inputs:
        for segment in sequence:
            notes = len(segment)
            # There should very rarely be more than the max number of notes
            if notes <= self.max_notes:
                emission_number_counts.inc(notes)

    # Apply simple Laplace smoothing
    for notes in range(self.max_notes):
        emission_number_counts.inc(notes)

    # Make a prob dist out of this
    emission_number_dist = prob_dist_to_dictionary_prob_dist(
        mle_estimator(emission_number_counts, None))
    self.emission_number_dist = emission_number_dist