This article collects typical usage examples of the Python method nltk.FreqDist.plot: what FreqDist.plot does, how to call it, and what it looks like in real code. You can also explore the class it belongs to, nltk.FreqDist.
Below are 15 code examples of FreqDist.plot, sorted by popularity by default.
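Before the collected examples, here is a minimal sketch of the method itself (the token list is made up; NLTK and matplotlib are assumed to be installed):

from nltk import FreqDist

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'cat']  # toy data
fdist = FreqDist(tokens)
fdist.plot(3)                    # plot the 3 most common samples
fdist.plot(3, cumulative=True)   # cumulative variant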
Example 1: process_tweets
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def process_tweets(hashtag, addl_stops=[]):
    count = 0
    good_count = 0
    words_to_plot = []
    # Iterate through all chunked files with the relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname, 'r') as data_file:
                data = data_file.read()
            # Parse the raw string since the json.load() approach wasn't working
            data = data.split("\n\x00,")
            for tweet in data:
                count += 1
                # Tweets have a well-defined structure, so we can parse them
                # manually (even though the JSON approach would be cleaner)
                text = tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                # Skip tweets that contain Unicode
                if text.find('\u') >= 0:
                    continue
                else:
                    good_count += 1
                    # Tokenize and count word frequency, ignoring case
                    words = word_tokenize(text)
                    clean_words = [w.lower() for w in words if w.lower() not in set(stops + addl_stops)]
                    words_to_plot = words_to_plot + clean_words
    # Create a frequency histogram of the 50 most common words and print a summary of activity
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
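The comment above concedes that a JSON-based parse would be cleaner than manual string slicing. A minimal sketch of that alternative, assuming each chunk file holds a valid JSON array of tweet objects with a "text" field (the helper name and the file layout are assumptions, not taken from the original data):

import json

def extract_texts(fname):
    # Hypothetical helper: parse the chunk file as JSON and pull out the tweet texts.
    with open(fname, 'r') as data_file:
        tweets = json.load(data_file)  # assumes the file holds a JSON array of tweet objects
    return [tweet.get('text', '') for tweet in tweets]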
Example 2: freq_dist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def freq_dist(input, filtering_functions=[], plot=False, limit=None, return_counts=False):
    """Takes a list of words (hashtags, keywords, anything) and plots a frequency distribution.
    filtering_functions is an ORDERED list of functions to call on the raw input list, executed before the freq dist is built;
    that is, each item in input is run through f1, f2, ..., fn where filtering_functions = [f1, ..., fn].
    limit truncates the freq dist to the limit most common items.
    return_counts determines whether a list of (word, count) tuples is returned,
    or just a list of the limit most used words.
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input)
    nltk_fdist = FreqDist(list(input))
    if plot:  # use nltk's built-in plotting function before destroying the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()
    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))  # alphabetically sort equally counted items
    fdist = fdist[0:limit] if limit else fdist  # apply limit
    fdist = [i[0] for i in fdist] if not return_counts else fdist  # remove counts if desired
    return fdist
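A hypothetical call, just to illustrate the parameters (the hashtag list and the stripping function are made up):

hashtags = ['#NBA', '#nba ', '#Warriors', '#warriors', '#NBA']
top = freq_dist(hashtags, filtering_functions=[lambda s: s.lstrip('#')], limit=2, return_counts=True)
print(top)  # expected: [('nba', 3), ('warriors', 2)]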
Example 3: main
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def main():
    # frequency distribution object
    fd_text1 = FreqDist(book.text1)
    print(str.format('Frequency distribution object: {}', fd_text1))
    print(str.format(
        '50 most common words: {}', fd_text1.most_common(50)
    ))
    fd_text1.plot(50, cumulative=True)
Example 4: freq
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def freq(tokens, n=None):
    '''
    This function takes a list of tokens and returns a list of the top n most
    frequent tokens.
    It also plots a frequency distribution of the top 50 tokens.
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    [i[0] for i in fdist2.items()[:20]]
    if n is None:
        print fdist2.items()[:20]
        return [i[0] for i in fdist2.items()[:20]]
    else:
        print fdist2.items()[:n]
        return [i[0] for i in fdist2.items()[:n]]
Example 5: main
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def main():
    obj = TweetFilter()
    with open("NBA_Warriors.txt", "r") as myFile:
        data = myFile.read().replace('\n', ' ')
    data = unicode(data, 'utf-8')
    # Tokenize each of the words in data
    tokenz = word_tokenize(data)
    # Pass the tokenz to the filter function
    newTokenz = obj.filter(tokenz)
    # Run a frequency distribution on the entire word list
    fdist1 = FreqDist(newTokenz)
    # Plot the top 30 words
    fdist1.plot(30, cumulative=False)
Example 6: fenci
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def fenci(data):
    data = re.compile(r'\s+').sub('', data)
    data = re.compile(r'\!\[.*?\]\(.*?\)').sub('', data)
    seg_list = jieba.cut(data)
    # Keyword extraction based on the TF-IDF algorithm
    tags = jieba.analyse.extract_tags(data, topK=50)
    print ','.join(tags)
    # Keyword extraction based on the TextRank algorithm
    tags2 = jieba.analyse.textrank(data, topK=50)
    print ','.join(tags2)
    fdist = FreqDist([seg for seg in seg_list])
    fdist.plot(50)
Example 7: lemmas_distribution_rus
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def lemmas_distribution_rus(dist):
    dict_file = codecs.open('literature/processed_vocabulary', encoding='utf-8')
    dict_text = dict_file.readlines()
    dict_file.close()
    dict_dict = {}
    import pymorphy2
    morph = pymorphy2.MorphAnalyzer()
    from collections import defaultdict
    lemmas_dist = defaultdict(int)
    for line in dict_text:
        line_list = line.split(':')
        dict_dict[line_list[0]] = line_list[1]
    for word in dist.keys():
        if word in dict_dict:
            lemmas_dist[dict_dict[word]] += 1
        else:
            p = morph.parse(word)
            if len(p) > 0:
                print word
                print p[0].normal_form
                lemmas_dist[p[0].normal_form] += 1
                print lemmas_dist[p[0].normal_form]
    lemmas_dist = FreqDist(lemmas_dist)
    lemmas_dist.plot(100)
Example 8: CountFreq
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
class CountFreq(object):
    def __init__(self, *args, **kwargs):
        self.txt_file = codecs.open('new1.txt', encoding='utf-8')
        self.stop_words = stopwords.words('english')
        self.clean_words = []
        self.loose_words = loose_words

    def clean_text(self):
        '''
        Clean all the data in new1.txt and convert it from a text file into
        a tokenized format that nltk can readily work with.
        :return: sanitized and tokenized words.
        '''
        stop = self.stop_words
        text = self.txt_file
        for lines in text:
            clean_words = [word for word in lines.lower().split() if word not in stop]
            self.clean_words.append(clean_words)
        self.clean_words = [val for sublist in self.clean_words for val in sublist]
        return self.clean_words

    def word_freq(self):
        '''
        Single-word frequency without any context. This yields the top 100 words
        identified as the most repeated. Rigorous filtering is applied to the
        printed words, removing words that are not nouns.
        :return: the frequency distribution object.
        '''
        classified_text = pt(self.clean_words)
        noun_descriptor = [word for word, pos in classified_text if pos == 'NN']
        revised_noun_descriptor = [word for word in noun_descriptor if word not in self.loose_words]
        self.fdist = FreqDist(revised_noun_descriptor)
        return self.fdist

    def graph_freq(self, cumulative):
        '''
        :param cumulative: Boolean; when True, the cumulative counts are graphed,
        producing a diminishing-returns curve.
        :return: a matplotlib graph
        '''
        return self.fdist.plot(100, cumulative=cumulative)
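A hypothetical driver for the class above, assuming new1.txt, the module-level loose_words list, and the pt POS-tagging helper from the original script are available; it only illustrates the intended call order:

counter = CountFreq()
counter.clean_text()                  # tokenize new1.txt and drop stop words
counter.word_freq()                   # build the noun-only frequency distribution
counter.graph_freq(cumulative=False)  # plot the 100 most common nouns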
Example 9: FreqDist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
word_len = [len(w) for w in text1]
print word_len
# Example Description
# fdist = FreqDist(samples) create a frequency distribution containing the given samples
# fdist[sample] += 1 increment the count for this sample
# fdist['monstrous'] count of the number of times a given sample occurred
# fdist.freq('monstrous') frequency of a given sample
# fdist.N() total number of samples
# fdist.most_common(n) the n most common samples and their frequencies
# for sample in fdist: iterate over the samples
# fdist.max() sample with the greatest count
# fdist.tabulate() tabulate the frequency distribution
# fdist.plot() graphical plot of the frequency distribution
# fdist.plot(cumulative=True) cumulative plot of the frequency distribution
# fdist1 |= fdist2 update fdist1 with counts from fdist2
# fdist1 < fdist2 test if samples in fdist1 occur less frequently than in fdist2
fdlist = FreqDist(len(w) for w in text1)
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()
print fdlist[2]
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
Example 10: FreqDist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
# We can find the FIRST position of given word:
myText.index('about')
# Frequency distribution
from nltk import FreqDist
fdist1 = FreqDist(text1)
vocabulary = fdist1.keys()
frequencies = fdist1.values()
fdist1['whale']
fdist1.plot(20)
fdist1.plot(20, cumulative = True)
# List comprehension
# Counting the number of characters in each word in a text
[len(w) for w in text1]
# Bigram function returns a list of bigrams
from nltk import bigrams, trigrams
bigrams(myText2)
trigrams(myText2)
bigramsText1 = bigrams(text1) # bigramsText1[0] is the tuple containing the first bigram
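Note that in NLTK 3, bigrams() and trigrams() return generators, so indexing bigramsText1 directly raises a TypeError; materialize the generator first if random access is needed (a small sketch, assuming text1 is loaded from nltk.book):

from nltk import bigrams

bigramsText1 = list(bigrams(text1))  # materialize the generator
print(bigramsText1[0])               # the first bigram tuple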
Example 11: create_dist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def create_dist(nltk_text, stopwords):
    dist = FreqDist(w.lower() for w in nltk_text if len(w) >= 3 and w.isalnum() and w.lower() not in stopwords)
    dist.plot(50)
    print "Number of wordforms: " + str(len(dist))
    return dist
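A hypothetical call, assuming the nltk.book corpus is downloaded and using NLTK's English stopword list (the arguments are illustrative):

from nltk.book import text1
from nltk.corpus import stopwords as sw

dist = create_dist(text1, set(sw.words('english')))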
Example 12: BeautifulSoup
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
# soup = BeautifulSoup(open(path))
# chapter = soup.findAll(text=True)[0]
file = open(path)
chapter = file.read()
chapter_tuple = (chapter, 'real')
words = [ w.lower() for w in word_tokenize(chapter) ]
real_chapters.append(chapter_tuple)
real_words.extend(words)
word_total = len(real_words)
harry_total = real_words.count('harry')
fd = FreqDist(real_words)
fd.plot(26)
# filtered_real_words = [ w.lower() for w in real_words if w.isalpha() ]
filtered_real_words = [ w for w in real_words if w.isalpha() and w not in stop ]
Rowling = filtered_real_words
fd = FreqDist(filtered_real_words)
fd.plot(26)
file = open('ao_hp_stories.jl')
ao_chapters = []
ao_words = []
AO3 = []
AO3_normed = []
Example 13: open
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
import json
from textstat.textstat import textstat
from nltk import FreqDist
from matplotlib.pyplot import *
filename = 'bieber-raw-test.json'
READ = 'rb'
TEXT=1
stopwords = open('stopwords',READ).read().splitlines()
tweets = json.load(open(filename,READ))
#Identify retweets
words = ' '.join([tweet['text'] for tweet in tweets]).split()
fdist = FreqDist(words)
fdist.plot(20)
tight_layout()
Example 14: lemma
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]
nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title) # Frequency distribution of text
fdist_title.most_common(50) # most common 50
fdist_title['science'] # return count of a given word
fdist_title.max() # max counts
fdist_title.plot(50, cumulative=True) # plot
fdist_title.plot(50)
fdist_title.tabulate(50) # tabulate
total_words = len(set(nostop_title))
print("The total number of words in title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word appears in title of dsc is: " + str(int(avg_words)))
# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
Example 15: stem
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

def lexical_diversity(text):
    return len(text) / len(set(text))
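A quick sanity check of the regex stemmer above (illustrative inputs, not from the original):

print(stem('processing'))  # -> 'process' (the 'ing' suffix is stripped)
print(stem('women'))       # -> 'women' (no listed suffix, so the word is returned unchanged)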
nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)#plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of words in title of KD is: " + str(total_words))
avg_words = fdist_title.N()/total_words
print("Each word appears in title of KD is: " + str(int(avg_words)))
# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
len(tokens)