This article collects typical usage examples of the Python method nltk.FreqDist.tabulate. If you are wondering what FreqDist.tabulate does and how to use it, the curated examples below should help; you can also read further about the class it belongs to, nltk.FreqDist.
Five code examples of FreqDist.tabulate are shown below.
Example 1: FreqDist
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import tabulate [as alias]
from nltk import FreqDist
from nltk.book import text1  # requires the NLTK book corpora (nltk.download('book'))

word_len = [len(w) for w in text1]
print(word_len)
# Example Description
# fdist = FreqDist(samples) create a frequency distribution containing the given samples
# fdist[sample] += 1 increment the count for this sample
# fdist['monstrous'] count of the number of times a given sample occurred
# fdist.freq('monstrous') frequency of a given sample
# fdist.N() total number of samples
# fdist.most_common(n) the n most common samples and their frequencies
# for sample in fdist: iterate over the samples
# fdist.max() sample with the greatest count
# fdist.tabulate() tabulate the frequency distribution
# fdist.plot() graphical plot of the frequency distribution
# fdist.plot(cumulative=True) cumulative plot of the frequency distribution
# fdist1 |= fdist2 update fdist1 with counts from fdist2
# fdist1 < fdist2 test if samples in fdist1 occur less frequently than in fdist2
fdlist = FreqDist(len(w) for w in text1)
print(dict(fdlist))
print(fdlist.most_common(3))
print(fdlist.max())
print(fdlist[2])
fdlist.tabulate()  # tabulate() prints the table itself and returns None
fdlist.plot()
fdlist.plot(cumulative=True)
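The example above does not exercise freq(), the |= update, or the < comparison listed in the method summary; a minimal sketch with two small hand-built distributions (the sample words below are made up purely for illustration):
from nltk import FreqDist
fdist1 = FreqDist(['the', 'cat', 'sat', 'on', 'the', 'mat'])  # illustrative tokens
fdist2 = FreqDist(['the', 'the', 'the', 'dog', 'sat'])
print(fdist1.freq('the'))  # 2/6: proportion of all outcomes that are 'the'
fdist1 |= fdist2  # update fdist1 with counts from fdist2 (keeps the larger count per sample)
print(fdist1['the'])  # 3
print(FreqDist(['cat']) < fdist1)  # True: its samples occur no more often than in fdist1
fdist1.tabulate()  # print the updated distribution as a table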
Example 2: FreqDist
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import tabulate [as alias]
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
from nltk import FreqDist
fdist = FreqDist(samples) # create a frequency distribution of the data given in samples
fdist[sample] += 1 # increment the count for sample (replaces the NLTK 2-era fdist.inc(sample))
fdist['データ'] # number of occurrences of the given sample
fdist.freq('データ') # relative frequency of the given sample
fdist.N() # total number of samples
fdist.keys() # the samples (in NLTK 3 use most_common() to get them sorted by frequency)
for sample in fdist: # iterate over the samples
    pass
fdist.max() # the sample with the highest count
fdist.tabulate() # display the frequency distribution as a table
fdist.plot() # plot the frequency distribution
fdist.plot(cumulative=True) # plot cumulative frequencies
fdist1 < fdist2 # test whether samples in fdist1 occur less frequently than in fdist2
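The listing above is a method-by-method template rather than runnable code (samples is never defined); a minimal runnable sketch, using a small hand-made token list chosen purely for illustration:
from nltk import FreqDist
samples = ['to', 'be', 'or', 'not', 'to', 'be']  # illustrative tokens, not from the original post
fdist = FreqDist(samples)
fdist['maybe'] += 1  # increment the count for a sample
print(fdist['to'])  # 2: occurrences of 'to'
print(fdist.freq('to'))  # 2/7: relative frequency of 'to'
print(fdist.N())  # 7: total number of outcomes
print(fdist.max())  # the sample with the highest count ('to' and 'be' are tied here)
fdist.tabulate()  # the frequency distribution as a table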
Example 3: lemma
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import tabulate [as alias]
# lemma
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer  # requires nltk.download('wordnet')

def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]
nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title) # Frequency distribution of text
fdist_title.most_common(50) # most common 50
fdist_title['science'] # return count of a given word
fdist_title.max() # max counts
fdist_title.plot(50, cumulative=True) # plot
fdist_title.plot(50)
fdist_title.tabulate(50) # tabulate
total_words = len(set(nostop_title))
print("The total number of unique words in the dsc titles is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the dsc titles appears " + str(int(avg_words)) + " times")
# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
bigrams_title.plot(50,cumulative=True)
trigrams_title.most_common(20)
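Because text_title and remove_stopwords are defined elsewhere in the original script, the pipeline above is not runnable on its own; a self-contained sketch of the same bigram-counting idea, using an invented token list purely for illustration:
from nltk import FreqDist, bigrams
tokens = ['data', 'science', 'for', 'social', 'good', 'data', 'science', 'jobs']  # made-up tokens
bigram_fd = FreqDist(bigrams(tokens))  # count adjacent word pairs
print(bigram_fd.most_common(3))  # [(('data', 'science'), 2), ...]
bigram_fd.tabulate(3)  # the three most common pairs as a table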
Example 4: sorted
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import tabulate [as alias]
from nltk import FreqDist
from nltk.corpus import brown  # requires nltk.download('brown')

fd = FreqDist(brown.words())
# Words that occur more than 7 times and are more than 7 characters long
rare_and_long = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)
### SUMMARY
# Other functions and attributes of the frequency distribution object
fd['County'] # count of a specific word
fd.freq('County') # frequency of a specific word
fd.N() # total number of samples
fd.most_common(10)
for sample in fd:
    print(sample)
fd.max()
fd.tabulate()
fd.plot()
fd1 |= fd2 # update fd1 with counts from fd2
fd1 < fd2 # test if samples in fd1 occur less frequently than in fd2
### IMPORTING TEXT
# NLTK comes with a collection of texts to get started. To import a specific text:
from nltk.book import text1
from nltk.book import sent7
### USING CONDITIONALS
# select words based on their length
[w for w in sent7 if len(w) < 4]
# select words based on other attributes, e.g. the first character
[w for w in sent7 if w.startswith('t')]  # w.startswith('t') is the same as w[0] == 't'
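A small sketch that combines these conditionals with tabulate() (sent7 comes from nltk.book, so the NLTK book corpora must be downloaded first):
from nltk import FreqDist
from nltk.book import sent7  # requires nltk.download('book')
short = [w for w in sent7 if len(w) < 4]  # words shorter than 4 characters
t_words = [w for w in sent7 if w.startswith('t')]  # words starting with 't'
FreqDist(len(w) for w in short).tabulate()  # tabulate the lengths of the short words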
Example 5: print
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import tabulate [as alias]
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg  # requires nltk.download('gutenberg')

for idx, word in enumerate(gutenberg.words('austen-persuasion.txt')[:5]):
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist) # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items()) # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3)) # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Count the frequency of English letters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha()) # no need to wrap the generator in [] to turn it into a list
print(fdist.most_common(5)) # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()]) # the 26 letters ordered by frequency of use
##################################################################
## most_common(n): get the top n samples sorted by frequency
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # assumed word-level distribution of the same text; its definition is not part of the original excerpt
print(fd.most_common(5)) # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
fd.tabulate() # most_common() in table form
# alternatively, the same counts can be obtained with Counter
from collections import Counter
print(Counter(fd).most_common(5)) # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
# Jane Austen's novel Persuasion contains 98171 word tokens and 6141 unique words in total. The most common token is the comma, followed by the word the.
# If you compute such statistics over a large corpus, recording each word's count and relative frequency from high to low, you can see a clear relationship between a word's frequency and its rank in that list.
# In fact, Zipf showed that this relationship can be expressed mathematically: for any given word, f * r = k (approximately constant);
# f is the word's frequency, r is its rank in the sorted list, and k is a constant.
# A more elaborate form is given as: f * r = 1 / log(N), where N is the total number of words.
# For example, the 5th most frequent word should occur about twice as often as the 10th most frequent word. In the NLP literature this relationship is known as "Zipf's Law".
# Even if the mathematical relationship described by Zipf's law is not exactly accurate, it is still a useful characterization of how words are distributed in human language: low-rank words occur very often,
# words of somewhat higher rank occur less often, and words of very high rank hardly occur at all; the corresponding log-log relationship is shown in Figure 1, which makes this scaling in our corpus easy to see.
##################################################################
## Plotting Zipf's law with NLTK
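A minimal sketch of such a rank-frequency log-log plot, using matplotlib directly rather than FreqDist.plot(); the corpus choice follows the Persuasion example above and assumes the Gutenberg corpus is available:
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.corpus import gutenberg  # requires nltk.download('gutenberg')
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))
freqs = [count for _, count in fd.most_common()]  # frequencies in descending order
ranks = range(1, len(freqs) + 1)  # rank 1 = most frequent word
plt.loglog(ranks, freqs, marker='.', linestyle='none')  # a roughly straight line is what Zipf's law predicts
plt.xlabel('rank (log scale)')
plt.ylabel('frequency (log scale)')
plt.title("Zipf's law on Austen's Persuasion")
plt.show()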