This article collects typical usage examples of nltk.FreqDist in Python. If you are wondering what exactly nltk.FreqDist does and how to use it in practice, the curated code examples below may help. You can also explore further usage examples from the nltk module, where FreqDist is defined.
Below are 15 code examples of nltk.FreqDist, ordered by popularity by default.

Example 1: do_analysis

# Required import: import nltk
# Or: from nltk import FreqDist
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)
    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
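
dataset_obj and cfg above are project-specific helpers, so the snippet is not runnable on its own. A minimal standalone sketch of the same FreqDist/tabulate pattern, using made-up sentiment labels:

import nltk

labels = [0, 1, 1, 2, 2, 2, 4]        # hypothetical sentiment classes
pdf = nltk.FreqDist(labels)
pdf.tabulate()                        # prints each label with its count, most frequent first
print(pdf.most_common(2))             # [(2, 3), (1, 2)]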

Example 2: profile

# Required import: import nltk
# Or: from nltk import FreqDist
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, trigrams, FreqDist
    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)
    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1
    return fingerprint
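
Since remove_punctuation and the class's start/end markers are not shown, here is a self-contained sketch of the same character-trigram fingerprint idea, using '_' as a stand-in boundary character and a plain split() instead of word_tokenize:

from nltk import trigrams, FreqDist

fingerprint = FreqDist()
for token in "the quick brown fox".split():
    for tri in trigrams('_' + token + '_'):   # character trigrams of '_the_', '_quick_', ...
        fingerprint[''.join(tri)] += 1
print(fingerprint.most_common(5))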

Example 3: create_freq_dist

# Required import: import nltk
# Or: from nltk import FreqDist
def create_freq_dist(in_lst, exclude):
    """Create a frequency distribution.

    Parameters
    ----------
    in_lst : list of str
        Words to create the frequency distribution from.
    exclude : list of str
        Words to exclude from the frequency distribution.

    Returns
    -------
    freqs : nltk.FreqDist
        Frequency distribution of the words.

    Examples
    --------
    Compute the frequency distribution of a collection of words:

    >>> ArticlesAll.create_freq_dist(in_lst=['brain', 'brain', 'head', 'body'], exclude=['body'])
    FreqDist({'brain': 2, 'head': 1})

    If you want to visualize a frequency distribution, you can plot them as a wordcloud:

    >>> from lisc.plts.words import plot_wordcloud
    >>> freq_dist = nltk.FreqDist({'frontal': 26, 'brain': 26, 'lobe': 23, 'patients': 19})
    >>> plot_wordcloud(freq_dist, len(freq_dist))
    """
    freqs = nltk.FreqDist(in_lst)

    for excl in exclude:
        try:
            freqs.pop(excl.lower())
        except KeyError:
            pass

    return freqs
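
Outside the ArticlesAll class, the same behaviour can be reproduced with a plain FreqDist; a quick sketch mirroring the doctest above:

import nltk

freqs = nltk.FreqDist(['brain', 'brain', 'head', 'body'])
for excl in ['body']:
    freqs.pop(excl.lower(), None)     # silently skip words that are not present
print(freqs)                          # FreqDist({'brain': 2, 'head': 1})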

Example 4: bigram_counts

# Required import: import nltk
# Or: from nltk import FreqDist
from collections import Counter

def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
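
A quick usage sketch with a toy token list (assuming `import nltk` and the Counter import shown above):

counts = bigram_counts("the cat sat on the mat the cat".split())
print(counts[('the', 'cat')])         # 2
print(counts.most_common(2))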

Example 5: describe

# Required import: import nltk
# Or: from nltk import FreqDist
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and
    returns a dictionary with a variety of metrics
    concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words':  counts['words'],
        'vocab':  len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
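
paras() belongs to a custom corpus reader, but the words/vocab/lexical-diversity arithmetic can be checked with a plain token list; a minimal sketch:

import nltk

tokens = "the cat sat on the mat".split()
freq = nltk.FreqDist(tokens)
words, vocab = len(tokens), len(freq)
print({'words': words, 'vocab': vocab, 'lexdiv': words / vocab})   # 6 words, 5 types, 1.2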

Example 6: _calculate_word_scores

# Required import: import nltk
# Or: from nltk import FreqDist
def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        # The line above fails in Python 3, where filter() returns an iterator:
        degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
        for word in phrase:
            # word_freq.inc(word)  # old NLTK API, replaced by item assignment:
            word_freq[word] += 1
            # word_degree.inc(word, degree)  # other words
            word_degree[word] = degree
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word]  # itself
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores

Example 7: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))
    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))
    return data
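
Vol and the rest of the network plumbing are project-specific, but dist.freq() is plain NLTK: it returns the relative frequency of a sample rather than its raw count. A tiny illustration:

from nltk import FreqDist

dist = FreqDist("to be or not to be".split())
print(dist['to'], dist.freq('to'))    # 2 0.333... (2 of 6 tokens)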

Example 8: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))
    return data

Example 9: load_data

# Required import: import nltk
# Or: from nltk import FreqDist
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                        word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))
    return data

Example 10: test

# Required import: import nltk
# Or: from nltk import FreqDist
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print('Query:', documents[-1])
    tokenizer = RegexpTokenizer(r'\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print(m, documents[sims.index(m)])
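
GetTweets, volumize, doc_code and cos are project helpers, but the tokenise/filter/count front end can be tried in isolation; a sketch with a hard-coded document and a hypothetical stop-word set (the regex tokenizer already drops punctuation):

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer

stop_words = {'in', 'the'}            # stand-in for ENGLISH_STOP_WORDS
tokenizer = RegexpTokenizer(r'\w+')
doc = "Police in #Ferguson, police in the streets"
samples = [t.lower() for t in tokenizer.tokenize(doc) if t.lower() not in stop_words]
print(FreqDist(samples).most_common())   # [('police', 2), ('ferguson', 1), ('streets', 1)]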

Example 11: gene_token_freq_info

# Required import: import nltk
# Or: from nltk import FreqDist
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
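
A usage sketch with toy token lists (only `import nltk` is needed); each entry pairs a token's count in the context with its count in the question:

context = "the cat sat on the mat".split()
question = "where did the cat sit".split()
tf = gene_token_freq_info(context, question)
print(tf['context'][:3])              # [(2, 1), (1, 1), (1, 0)] for 'the', 'cat', 'sat'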

Example 12: __init__

# Required import: import nltk
# Or: from nltk import FreqDist
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
    self.terms = terms  # the terms used to build the lexicon
    self.documents = documents
    self.classes = classes
    self.terms_frequency = frequency
    self.terms_frequency_per_class = dict()
    self.main_class = main_class
    # the minimum support for a term (i.e., number of documents in the class of interest
    # in order to be considered)
    self.min_docs = min_docs
    self.class_occ = dict()
    for c in class_types:
        self.terms_frequency_per_class[c] = nltk.FreqDist()
        self.class_occ[c] = classes.count(c)
    for i, doc in enumerate(self.documents):
        cls = self.classes[i]
        for t in doc:
            # self.terms_frequency_per_class[cls].inc(t)  # old NLTK API, replaced below:
            self.terms_frequency_per_class[cls][t] += 1

# the scoring functions return the list of discriminative terms for the class of interest according to each metric

Example 13: __init__

# Required import: import nltk
# Or: from nltk import FreqDist
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
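
A usage sketch building a bigram model over two toy sentences (assuming the NLTK tokenizer data used by word_tokenize has been downloaded):

sentences = ["the cat sat on the mat.", "the cat ate the fish."]
lm = LangModel(2, 0.4, sentences)
print(lm.ngramFD.most_common(3))      # e.g. [(('the', 'cat'), 2), ...]
print(lm.backoff.v)                   # vocabulary size of the unigram back-off model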

Example 14: build_vocab

# Required import: import nltk
# Or: from nltk import FreqDist
import itertools
import pickle

def build_vocab(docs, save_path):
    print('Building vocab ...')

    sents = itertools.chain(*[text.split('<sssss>') for text in docs])
    tokenized_sents = [sent.split() for sent in sents]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
    print("%d unique words found" % len(word_freq.items()))

    # Cut-off
    retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
    print("%d words retained" % len(retained_words))

    # Get the most common words and build index_to_word and word_to_index vectors
    # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
    word_to_index = {'PAD': 0, 'UNK': 1}
    for i, w in enumerate(retained_words):
        word_to_index[w] = i + 2
    index_to_word = {i: w for (w, i) in word_to_index.items()}
    print("Vocabulary size = %d" % len(word_to_index))

    with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
        pickle.dump(word_to_index, f)
    with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
        pickle.dump(index_to_word, f)

    return word_to_index
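
A usage sketch with two toy documents; WORD_CUT_OFF is a module-level constant in the original code, so the value below is only an assumption, and the save path is hypothetical (two pickle files are written next to it):

WORD_CUT_OFF = 1
docs = ["the cat sat <sssss> the cat ate",
        "a dog barked <sssss> the dog slept"]
word_to_index = build_vocab(docs, save_path='/tmp/toy-vocab')
print(word_to_index)                  # only 'the', 'cat' and 'dog' survive the cut-off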

Example 15: get_min_count

# Required import: import nltk
# Or: from nltk import FreqDist
def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]

    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.
    '''
    global vocab_size
    from itertools import chain

    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1]  # the count of the top-kth word
    return min_count
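
A usage sketch; vocab_size is a module-level global in the original code, so the value here is only an assumption:

import nltk

vocab_size = 2
sents = [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]
print(get_min_count(sents))           # 2 (count of the 2nd most common token)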