This article collects typical usage examples of the Python method nltk.probability.FreqDist.plot. If you are wondering what FreqDist.plot is for or how to call it, the curated code examples below may help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.
Shown below are 14 code examples of the FreqDist.plot method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
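Before the examples, a minimal sketch of the basic call pattern may be useful (the Gutenberg corpus and the cutoff of 30 samples are illustrative choices, not taken from any example below; FreqDist.plot requires matplotlib):
import nltk
from nltk.probability import FreqDist
# nltk.download('gutenberg')  # needed once if the corpus data is not yet installed
tokens = [w.lower() for w in nltk.corpus.gutenberg.words('austen-emma.txt') if w.isalpha()]
fdist = FreqDist(tokens)           # count how often each token occurs
print(fdist.most_common(10))       # the ten most frequent tokens with their counts
fdist.plot(30, cumulative=True)    # cumulative plot of the 30 most frequent tokens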
Example 1: fun10
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def fun10():
"""frequency distribution"""
fdist1 = FreqDist(text1)
# print fdist1
vocabulary1 = fdist1.keys()
# print vocabulary1[:50]
fdist1.plot(50, cumulative=True)
Example 2: main
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def main():
    index = get_index("index.data")
    results = bfs('Obama', 'GAB', index)
    print_results(results)
    fdistAB = FreqDist([rel.A() for rel in results] + [rel.B() for rel in results])
    fdistAB.plot(10)
Example 3: main
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str, help='file to produce a frequency distribution for')
    args = argparser.parse_args()
    #toker = WhitespaceTokenizer()
    f = open(args.file)
    text = f.read()
    print(text)
    # note: FreqDist over a raw string counts individual characters, not words
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
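Because FreqDist over a raw string counts characters, a hedged variant is worth noting here: if word frequencies were the goal, the text would be tokenized first (nltk.word_tokenize is an illustrative choice and needs the punkt tokenizer data; 'sample.txt' is a placeholder path, not from the original example):
import nltk
from nltk.probability import FreqDist
with open('sample.txt') as f:         # placeholder input file
    text = f.read()
words = nltk.word_tokenize(text)      # split the raw string into word tokens
fdist = FreqDist(w.lower() for w in words if w.isalpha())
fdist.plot(30)                        # plot the 30 most frequent words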
Example 4: testFunc
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def testFunc():
    fw = open("./MZI/data.doc", "r", encoding="utf8")
    text = fw.read()
    tockens = getWordList(text)
    print(len(set(tockens)))
    from nltk.probability import FreqDist
    from nltk.util import bigrams
    fdist = FreqDist(w for w in tockens if len(w) > 1)
    fdist.tabulate(50)
    big = list(bigrams(w for w in tockens if len(w) > 1))
    print(big[:100])
    fdist = FreqDist(str(w) for w in big)
    fdist.tabulate(10)
    fdist.plot(50)
Example 5: create_enhanced_dale_chall_list
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def create_enhanced_dale_chall_list(self):
    # list of sites used to create the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']
    # bring all privacy texts into one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])
    # get the words of this list into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))
    # open the Dale-Chall word list
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')
    # create a text that consists of the words of the 20 privacy policies and delete all words that are on the Dale-Chall list of easy words
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())
    # create a frequency distribution of the words in this list
    fdist = FreqDist(new_corpus)
    # plot it
    fdist.plot(80, cumulative=True)
    # make a list of the words that make up 33% of the words that are not on the Dale-Chall list (cumulative)
    # (note: in NLTK 3, iterating a FreqDist is not frequency-ordered; fdist.most_common() guarantees that order)
    most_frequ = []
    cum_percentage = 0.0
    for sample in fdist:
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break
    # write those into a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
    privacy_file.close()
Example 6: content_fraction
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
# def content_fraction(text):
# stopwords = nltk.corpus.stopwords.words('spanish')
# content = [w for w in text if w.lower() not in stopwords]
# return len(content) / len(text)
# content_fraction(text)
# Step 4: stem words
# SOME INITIAL EXPLORATIONS OF THE TEXT
sorted(set(text)) # displays sorted unique words
fdist = FreqDist(text) # creates a frequency distribution for words
vocabulary = fdist.most_common(50)   # (word, count) pairs for the 50 most frequent words (fdist.keys() is not frequency-sorted or sliceable in NLTK 3)
vocabulary                           # displays the 50 most frequent words in the text
fdist.plot(50, cumulative=True) # frequency distribution for 50 most frequent words
text.collocations() # common word collocations
# APPROACH 1: POINTWISE MUTUAL INFORMATION (PMI)
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
#quadgram_measures = nltk.collocations.QuadgramAssocMeasures()
finder_bi = BigramCollocationFinder.from_words(text)
finder_tri = TrigramCollocationFinder.from_words(text)
finder_quad = QuadgramCollocationFinder.from_words(text)
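The snippet above only constructs the collocation finders; a minimal sketch of how the PMI scoring step typically follows (the frequency filter of 3 and the top-10 cutoff are illustrative choices, not part of the original example):
finder_bi.apply_freq_filter(3)                             # ignore bigrams seen fewer than 3 times
top_bigrams = finder_bi.nbest(bigram_measures.pmi, 10)     # 10 bigrams with the highest PMI
top_trigrams = finder_tri.nbest(trigram_measures.pmi, 10)  # same idea for trigrams
print(top_bigrams)
print(top_trigrams)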
Example 7: open
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
content = open(file_name, 'rb').read()
cutedText = " ".join(jieba.cut(content))
#nltkText = nltk.corpus.gutenberg.raw(cutedText)
fd = FreqDist(cutedText)
items = fd.items()
print items[:30]
#fd.plot()
#print cutedText
print dir(cutedText)
#print dir(nltkText)
print cutedText.count(u'ÃÏ¿Ì')  # note: this string literal is a Chinese keyword that was garbled by an encoding error in the source
tags = jieba.analyse.extract_tags(content, topK=30)
fd = FreqDist(tags)
for keyword in tags:
print "result of ",keyword
count = cutedText.count(keyword)
print count
fd[keyword] = count
#cutedText.split().concordance(keyword)
print fd
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.xlabel(u'')
plt.ylabel(u'次数')  # y-axis label "count"; the original literal was garbled by an encoding error in the source
plt.title(u'')
fd.plot()
Example 8: len
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
# Compute the Percentage of Hapax Legomena's Occurrences and the longest in them
hapax_legomenas = fdist.hapaxes() # Get the list of words that appeared just once in corpus
hapax_legomena_counts = len(hapax_legomenas) # Get the count of them
percentage_of_hapax_legomena = (hapax_legomena_counts/no_of_tokens)*100 # Compute percentage
print("Percentage of Hapax Legomena Occurrences", percentage_of_hapax_legomena)
max_len_hapax_legomena = max([len(word) for word in hapax_legomenas])
print("Longest hapax legomena are", [word for word in hapax_legomenas if len(word) == max_len_hapax_legomena])
# Compute the Percentage of dis legomena Occurrences and the longest in them
dis_legomenas = [key for key, value in fdist.items() if value == 2] # Get the words that occurred just twice
dis_legomena_counts = len(dis_legomenas) * 2 # Get their counts
percentage_of_dis_legomena = (dis_legomena_counts/no_of_tokens)*100 # Compute percentage
print("Percentage of Dis Legomena Occurrences", percentage_of_dis_legomena)
max_len_dis_legomena = max([len(word) for word in dis_legomenas])
print("Longest Dis Legomena's are ", [word for word in dis_legomenas if len(word) == max_len_dis_legomena])
# Plot the r vs Nr graph
fdist.plot(50)
# Compute the log-scaled version of r vs Nr (assumes: from math import log, and matplotlib.pyplot imported as plot)
log_rvsNr = {log(key): log(value) for key, value in (fdist.r_Nr()).items() if value != 0}
# Plot the graph of log(r) vs log(Nr); wrap the dict views in list() so matplotlib accepts them
plot.plot(list(log_rvsNr.keys()), list(log_rvsNr.values()), 'r.')
plot.axis([-1, 11, -1, 11])
plot.xlabel('log(r)')
plot.ylabel('log(Nr)')
plot.title('log(r) vs log(Nr) Brown Corpus')
plot.show()
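As a hedged follow-up (not part of the original example), the slope of the log-log relationship plotted above can be estimated with a simple least-squares fit; under Zipf's law the points lie roughly on a straight line with negative slope:
import numpy as np
xs = list(log_rvsNr.keys())
ys = list(log_rvsNr.values())
slope, intercept = np.polyfit(xs, ys, 1)   # degree-1 least-squares fit of log(Nr) against log(r)
print("Estimated slope:", slope)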
Example 9: print
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
#*******************************************************************************
# Question: Are there differences between word-length frequencies of converted
# vs. unconverted requests?
# Answer: No
# Correlation Coefficient:
#*******************************************************************************
print('Begin calculating word length frequencies...')
cnvtText = ' '.join([item['request_text'] for item in data
                     if len(item['request_text']) > 0
                     and item['requester_received_pizza'] == 1])
wl1 = [len(word) for word in nltk.word_tokenize(cnvtText) if word.isalpha()]
wl1fd = FreqDist(wl1)
if graphs == 'yes': wl1fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 18
print('...Word length frequencies for successful requests have been plotted.')
uncnvtText = ' '.join([item['request_text'] for item in data
                       if len(item['request_text']) > 0
                       and item['requester_received_pizza'] == 0])
wl2 = [len(word) for word in nltk.word_tokenize(uncnvtText) if word.isalpha()]
wl2fd = FreqDist(wl2)
if graphs == 'yes': wl2fd.plot()
## 4, 3, 2, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 17, 35, 20
print('...Word length frequencies for unsuccessful requests have been plotted.')
#*******************************************************************************
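The comment block above leaves the correlation coefficient blank; a minimal hedged sketch of how it could be computed from the two distributions built above (scipy is an added dependency, not part of the original code):
from scipy.stats import pearsonr
lengths = sorted(set(wl1fd) | set(wl2fd))        # word lengths seen in either distribution
converted = [wl1fd.freq(n) for n in lengths]     # relative frequencies, successful requests
unconverted = [wl2fd.freq(n) for n in lengths]   # relative frequencies, unsuccessful requests
r, p_value = pearsonr(converted, unconverted)
print('Correlation coefficient: %.3f (p = %.3g)' % (r, p_value))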
Example 10: PlaintextCorpusReader
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
# -*- coding: utf-8 -*-
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import LineTokenizer
FIRST = 0
END = 150
corpus_root = './data'
fileids = 'data_title_sample'
wordlists = PlaintextCorpusReader(corpus_root,
fileids,
sent_tokenizer=LineTokenizer(),
encoding='utf-8')
tokens = []
for word in wordlists.words():
    try:
        tokens += [word.lower()]
    except:
        pass
fdist = FreqDist(tokens)
fdist.plot(FIRST,END)
for k, v in fdist.items():
    print "{} {}".format(k.encode("utf-8"), v)
Example 11: plot_html_results
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
def plot_html_results(self, lemmatized_list_by_verb_noun_adj_adv, number_of_cat):
    fdist = FreqDist(w for w in lemmatized_list_by_verb_noun_adj_adv)
    fdist.plot(number_of_cat)
Example 12: set
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
# Creating main text object based on the Wall Street Journal corpora
# Setting all words to lowercase and removing non-alphabetical entrys
myText = [ word.lower() for word in text7 if word.isalpha() ]
# Creating text object based on myText, without repetitions
myTextSet = set( myText )
# Creating a frequency distribution with myText
fdMyText = FreqDist(myText)
# Creating histogram, and copying to file, in order of appearance
histogram = [ "%s - %s" % ( word, fdMyText[word] ) for word in myTextSet ]
fileObj = open("histogram.txt","w")
for wordInfo in histogram:
fileObj.write("%s\n" % (wordInfo) )
fileObj.close()
# Creating a list of the words sorted from most frequent to least frequent
# and copying it to file (fdMyText.keys() is not frequency-sorted in NLTK 3, so most_common is used)
sortedList = [word for word, _ in fdMyText.most_common()]
fileObj = open("sortedHistogram.txt","w")
for word in sortedList:
fileObj.write("%s - %d\n" % (word, fdMyText[word]) )
fileObj.close()
# Only showing 50 most frequent words in plot because of limited monitor space
fdMyText.plot(50)
Example 13: open
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
from nltk.probability import FreqDist
with open('christ-and-satan.txt') as f:
    cs_text = f.read()
word_list = cs_text.split()
first_letter = [word[0] for word in word_list if word[0].isalpha()]
letter_dist = FreqDist(first_letter)
letter_dist.plot(4,cumulative=True)
Example 14: FreqDist
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import plot [as alias]
print(state_union_text.count("war"))
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])
from nltk.probability import FreqDist
fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result
from nltk.corpus import stopwords
stopwords.words("english")
filtered = [w for w in state_union.words() if not w in stopwords.words("english")]
len(filtered)
fdist_filtered = FreqDist(filtered)
fdist_filtered.most_common(20)
fdist_filtered.freq("good")/fdist_filtered.freq("bad")
fdist_filtered.freq("bad")/fdist_filtered.freq("evil")
fdist_filtered.plot(30)