This article collects typical usage examples of the nltk.probability.FreqDist.values method in Python. If you have been wondering how exactly FreqDist.values is used, or how to call it, the hand-picked example code below may help. You can also look further into usage examples of its containing class, nltk.probability.FreqDist.
A total of 11 code examples of the FreqDist.values method are shown below, ordered by popularity by default.
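Before the examples, here is a minimal sketch (not taken from any of the projects below, and assuming NLTK 3, where FreqDist behaves like a collections.Counter) of what FreqDist.values() returns and how the examples typically use it:

from nltk.probability import FreqDist

# FreqDist counts samples; .values() exposes the raw counts, one per distinct sample.
tokens = "the cat sat on the mat near the dog".split()
fd = FreqDist(tokens)

print(fd["the"])             # 3  -- count of a single sample
print(sum(fd.values()))      # 9  -- total number of tokens, equivalent to fd.N()
print(fd.most_common(2))     # [('the', 3), ('cat', 1)] -- top samples by count

# A pattern used repeatedly below: normalise raw counts into relative frequencies.
total = float(sum(fd.values()))
rel_freq = {w: c / total for w, c in fd.items()}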
Example 1: alpha
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def alpha(self):
"""Krippendorff 1980
"""
# check for degenerate cases
if len(self.K) == 0:
raise ValueError("Cannot calculate alpha, no data present!")
if len(self.K) == 1:
log.debug("Only one annotation value, alpha returning 1.")
return 1
if len(self.C) == 1 and len(self.I) == 1:
raise ValueError("Cannot calculate alpha, only one coder and item present!")
total_disagreement = 0.0
total_ratings = 0
all_valid_labels_freq = FreqDist([])
total_do = 0.0 # Total observed disagreement for all items.
for i, itemdata in self._grouped_data('item'):
label_freqs = FreqDist(x['labels'] for x in itemdata)
labels_count = sum(label_freqs.values())
if labels_count < 2:
# Ignore the item.
continue
all_valid_labels_freq += label_freqs
total_do += self.Disagreement(label_freqs) * labels_count
do = total_do / sum(all_valid_labels_freq.values())
de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
k_alpha = 1.0 - do / de
return k_alpha
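A side note on the accumulation step above: all_valid_labels_freq += label_freqs relies on FreqDist supporting Counter-style addition. A tiny illustration (assuming NLTK 3; the label values are made up):

from nltk.probability import FreqDist

# Per-item label distributions can be merged into one overall distribution,
# which is what alpha() does before computing the expected disagreement.
item1 = FreqDist(["pos", "pos", "neg"])
item2 = FreqDist(["neg", "neu"])
merged = item1 + item2
print(sum(merged.values()))   # 5 -- total ratings across both items
print(merged["neg"])          # 2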
Example 2: char_freq
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def char_freq(lines):
""" 返回 DataFrame,按字符频率倒序排列 """
corpus = nltk.Text(chain.from_iterable(lines)) # 需要一个长字符串,而不是字符串列表
wc = FreqDist(corpus)
df = pd.DataFrame({'word': wc.keys(), 'freq': wc.values()})
df.sort('freq', ascending=False, inplace=True)
df['idx'] = np.arange(len(wc.values()))
return df
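Note that DataFrame.sort() was removed from pandas some time ago in favour of sort_values(), so the function above would fail on that call against a current pandas release. A rough equivalent under recent pandas and Python 3 might look like this (char_freq_modern is a hypothetical name, not part of the original project):

import numpy as np
import pandas as pd
from itertools import chain
from nltk.probability import FreqDist

def char_freq_modern(lines):
    """Return a DataFrame of character frequencies, sorted in descending order."""
    wc = FreqDist(chain.from_iterable(lines))
    df = pd.DataFrame({'word': list(wc.keys()), 'freq': list(wc.values())})
    df = df.sort_values('freq', ascending=False).reset_index(drop=True)
    df['idx'] = np.arange(len(df))
    return df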
Example 3: get_top_words
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def get_top_words(directory, n, file):
num_docs = 0.0
flist = {}
result = {}
for f in os.listdir(directory):
#stop = "/Users/oliverfengpet/Dropbox/TwitterAffect/stoplist.txt"
num_docs+=1
rawContents = load_file_tokens(directory+'/'+f)
fdist = FreqDist( rawContents )
normalF = max(fdist.values())
for key in fdist.keys():
fdist[key]=float(float(fdist[key])/normalF)
flist[directory+'/'+f] = fdist
for key in flist[file].keys():
num_appear=0
for key_file in flist.keys():
if key in flist[key_file].keys():
num_appear+=1
result[key] = flist[file][key]*math.log(num_docs/(num_appear))
sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1),reverse=True)
top_x = sorted_x[:n]
result = []
for item in top_x:
result.append(item[0])
return result
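For reference, the score computed here is a standard tf-idf weighting: term frequency normalised by the document's maximum count, multiplied by log(num_docs / num_docs_containing_term). A hypothetical call (the directory and file names are assumptions, and the function is written for Python 2, e.g. iteritems):

# Rank the 10 tokens that best distinguish one document from the rest of the folder.
top_terms = get_top_words("corpus_dir", 10, "corpus_dir/doc1.txt")
print(top_terms)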
Example 4: createPDwithTeleport
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def createPDwithTeleport(readerWordlist,mergedWordList):
### teleportation parameter with a value of 1 percent
corpusPD = {}
readerPD = {}
unigramReaderWordList = FreqDist(readerWordlist)
unigramCorpusWordList = FreqDist(mergedWordList)
for word in unigramCorpusWordList.keys():
corpusPD[word] = unigramCorpusWordList[word]/float(sum(unigramCorpusWordList.values()))
if word in unigramReaderWordList:
readerPD[word] = unigramReaderWordList[word]/float(sum(unigramReaderWordList.values()))
else:
readerPD[word] = 0
readerPD[word] = 0.99*readerPD[word] + 0.01*corpusPD[word]
return readerPD
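The final line is a simple linear interpolation (99% reader model, 1% background corpus model), so every word in the corpus vocabulary gets a non-zero probability in the reader distribution. A hypothetical usage (the token lists are made up):

reader_tokens = "the reader liked the book".split()
corpus_tokens = "the merged corpus contains many other words".split() + reader_tokens
reader_pd = createPDwithTeleport(reader_tokens, corpus_tokens)
print(sum(reader_pd.values()))   # ~1.0 over the corpus vocabulary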
Example 5: wordprefixsuffixsubstringsprobdist
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def wordprefixsuffixsubstringsprobdist():
for w in englishdicttxt:
wtok=w.split()
if len(wtok) > 0:
computeprefixessuffixessubstrings(wtok[0])
wordlist.append(wtok[0])
#prefixf=open("WordPrefixesProbabilities.txt","w")
#suffixf=open("WordSuffixesProbabilities.txt","w")
prefixdict=FreqDist(prefixes)
suffixdict=FreqDist(suffixes)
substringsdict=FreqDist(substrings)
totalprefixes=sum(prefixdict.values())
totalsuffixes=sum(suffixdict.values())
totalsubstrings=sum(substringsdict.values())
for pk,pv in zip(prefixdict.keys(), prefixdict.values()):
prefixprobdict[pk] = float(pv)/float(totalprefixes)
for pk,pv in zip(suffixdict.keys(), suffixdict.values()):
suffixprobdict[pk] = float(pv)/float(totalsuffixes)
for pk,pv in zip(substringsdict.keys(), substringsdict.values()):
substringsprobdict[pk] = float(pv)/float(totalsubstrings)
#json.dump(prefixprobdict,prefixf)
#json.dump(suffixprobdict,suffixf)
#print "prefix probabilities:",prefixprobdict
#print "suffix probabilities:",suffixprobdict
return (prefixprobdict, suffixprobdict, substringsprobdict)
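Incidentally, FreqDist can perform the normalisation above directly: assuming NLTK 3, freq() returns count / N, and N() equals sum(values()), so the manual division could be avoided. A minimal sketch with made-up prefixes:

from nltk.probability import FreqDist

prefix_fd = FreqDist(["un", "re", "un", "pre", "re", "un"])
print(prefix_fd.N())           # 6   -- same as sum(prefix_fd.values())
print(prefix_fd.freq("un"))    # 0.5 -- 3 occurrences out of 6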
Example 6: plot_dist_productions_by_frequency
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def plot_dist_productions_by_frequency(productions):
f= FreqDist(productions)
fdd = FreqDist(f.values())
x = []
y = []
for k in fdd.keys():
x.append(k)
y.append(fdd[k])
plt.plot(x,y,lw=2,color= 'b')
plt.title('Productions by frequency' )
plt.xlabel('frequency')
plt.ylabel('number of rules with frequency')
plt.show()
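The FreqDist-of-values trick above builds a frequency-of-frequencies table (how many rules occur once, twice, and so on). Assuming NLTK 3, the same information is also available through Nr() and r_Nr(), as Example 11 further below does; a small sketch with made-up productions:

from nltk.probability import FreqDist

productions = ["S -> NP VP", "NP -> DT N", "S -> NP VP", "VP -> V NP", "S -> NP VP"]
f = FreqDist(productions)
print(f.Nr(1))          # 2 -- number of productions that occur exactly once
print(dict(f.r_Nr()))   # {3: 1, 1: 2} -- count r mapped to how many rules have it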
Example 7: _train
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
token_count = hit_count = 0
useful_contexts = set()
fd = ConditionalFreqDist()
tag_prob = FreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
tag_prob.inc(tag)
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context].inc(tag)
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
# Filter the useful contexts by entropy -- for each context,
# calculate the entropy and only keep contexts whose entropy
# does not exceed `cutoff`.
total_tags = float(sum(tag_prob.values()))
tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
useful_contexts_after_filter = useful_contexts.copy()
most_high = FreqDist()
for context in useful_contexts:
dd = fd[context]
# total_tags = float(sum(dd.values()))
# tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
h = self.H(dd.keys(),tags_probs)
if h > cutoff:
useful_contexts_after_filter.remove(context)
continue
most_high[context] = h
print most_high.keys()
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is.
for context in useful_contexts_after_filter:
best_tag = fd[context].max()
hits = fd[context][best_tag]
self._context_to_tag[context] = best_tag
hit_count += hits
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print "[Trained Unigram tagger:",
print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Example 8: filterTokens
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def filterTokens(tokens, typefeatures=None):
all_terms = FreqDist(tokens)
if typefeatures == 'unigrams':
minimal = 2
elif typefeatures == 'bigrams':
minimal = 2
else:
minimal = 1
other = FreqDist()
for freq,term in zip(all_terms.values(),all_terms.keys()):
if freq >= minimal:
other.inc(term, freq)
else:
break
return other
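filterTokens assumes NLTK 2, where iterating a FreqDist yields samples in decreasing frequency (which is what makes the early break safe). In NLTK 3 a FreqDist is a Counter, so the iteration order is insertion order and a sorted walk needs most_common(); a rough Python 3 sketch of the keep-if-frequent-enough step:

from nltk.probability import FreqDist

all_terms = FreqDist("to be or not to be".split())
minimal = 2
kept = FreqDist({term: freq for term, freq in all_terms.most_common() if freq >= minimal})
print(dict(kept))   # {'to': 2, 'be': 2}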
Example 9: get_buzzwords
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def get_buzzwords(docs):
buzzwords = []
for doc in docs:
freqdist = FreqDist(docs[doc])
vocab = freqdist.keys()
freqs = freqdist.values()
buzzwords = buzzwords + vocab[:50]
buzzwords = set(buzzwords)
freq_counts = {}
for buzzword in buzzwords:
print buzzword
l = []
for doc in docs:
freqdist = FreqDist(docs[doc])
t = (doc, freqdist[buzzword])
l.append(t)
freq_counts[buzzword] = l
dump_content('freqs', freq_counts)
return freq_counts
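get_buzzwords slices freqdist.keys()[:50], which again assumes NLTK 2 (keys returned as a list sorted by frequency). Under Python 3 / NLTK 3, keys() is an unsliceable view in insertion order, so the top-50 selection would be written with most_common(); a minimal sketch:

from nltk.probability import FreqDist

freqdist = FreqDist("spam spam spam eggs eggs ham".split())
top_vocab = [word for word, _ in freqdist.most_common(50)]
print(top_vocab)    # ['spam', 'eggs', 'ham']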
Example 10: getFreq
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
def getFreq(self, text, normalize=True):
stop_words = stopwords.words(self.detectLanguage(text))
words = self.getTokens(text)
clean_words = filter(lambda word: not word in stop_words and not word in punctuation, words)
fdist = FreqDist(clean_words)
#==============================================================================
# # same result
# fdist = FreqDist()
# for word in word_tokenize(text):
# word = word.lower()
# if not word in stop_words and not word in punctuation:
# fdist[word] += 1
#==============================================================================
# normalization by dividing by the maximum frequency
if normalize:
norm = float(max(fdist.values()))
for word in fdist.keys():
fdist[word] = fdist[word] / norm
# remove too frequent and too rare words
if fdist[word] >= self._upper_bound or fdist[word] <= self._lower_bound:
del fdist[word]
return fdist
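One caveat if getFreq is run under Python 3: it deletes entries while iterating fdist.keys(), which raises "RuntimeError: dictionary changed size during iteration". Iterating over a list copy of the keys avoids that; a minimal sketch of the normalise-then-prune step (the bounds are illustrative, not the class's real _upper_bound/_lower_bound):

from nltk.probability import FreqDist

fdist = FreqDist("a a a a b b c".split())
norm = float(max(fdist.values()))
for word in list(fdist.keys()):          # copy the keys before mutating
    fdist[word] = fdist[word] / norm
    if fdist[word] >= 1.0 or fdist[word] <= 0.2:
        del fdist[word]
print(dict(fdist))   # {'b': 0.5, 'c': 0.25}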
Example 11: for
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import values [as alias]
ngrams_most_common.append([k for (k,_) in fdist_ngrams.most_common(params.m)])
outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
# Write out the distribution of words in the document
with codecs.open("distributions-data/output/words_" + outputname, "w", encoding=my_encoding) as out:
for k,v in fdist_words.most_common():
prozent = fdist_words.freq(k)
out.write("{},{},{}\n".format(k,v, prozent))
# Write out the distribution of ngrams in the document
with codecs.open("distributions-data/output/letters_" + outputname, "w", encoding=my_encoding) as out:
for k,v in fdist_ngrams.most_common():
prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
out.write("{},{},{}\n".format(k,v, prozent))
# Write the size of bins of words that appear with the same frequency
with codecs.open("distributions-data/bins/" + outputname, "w", encoding=my_encoding) as out:
for i in sorted(set(fdist_words.values())):
bin_size = fdist_words.Nr(i)
out.write("{},{}\n".format(i,bin_size))
print('Output distributions saved in \'output\' folder.')
print('Output bins saved in \'bins\' folder.')
# If there are many documents -> compare their most common words and ngrams
if len(params.files) > 1:
print("Pairwise overlap between {} most frequent words:".format(params.n))
short_names = [f.name[-15:] for f in params.files]
for i, list1 in enumerate(words_most_common):
for j, list2 in enumerate(words_most_common[i+1:]):
print("{} | {} | ".format(short_names[i], short_names[i+j+1]), end="")
overlap = len([w for w in list1 if w in list2])
print(overlap)
print("Pairwise overlap between {} most frequent letters and letter pairs:".format(params.m))
short_names = [f.name[-15:] for f in params.files]