

Python FreqDist.plot Method Code Examples

This article compiles typical usage examples of the Python method nltk.FreqDist.plot. If you are wondering how FreqDist.plot is used, what it does, or what real-world calls look like, the curated examples below should help. You can also explore other usage examples for nltk.FreqDist, the class this method belongs to.


The following presents 15 code examples of FreqDist.plot, ordered by popularity by default.
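
Before the individual examples, here is a minimal sketch of the method itself: FreqDist counts how often each sample occurs, and plot() draws a matplotlib line chart of the most frequent samples. The sketch assumes nltk and matplotlib are installed; the token list is made up for illustration.

from nltk import FreqDist

tokens = "the quick brown fox jumps over the lazy dog the fox".split()  # made-up sample
fdist = FreqDist(tokens)
fdist.plot(10)                   # line chart of the 10 most common tokens
fdist.plot(10, cumulative=True)  # cumulative variant of the same chart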

Example 1: process_tweets

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def process_tweets (hashtag,addl_stops=[]):
    count=0
    good_count=0
    words_to_plot=[]
    #Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname,'r') as data_file:
                data=data_file.read()
                # Parse raw string since json.load() approach wasn't working
                data=data.split("\n\x00,")
            for tweet in data:
                count+=1
        
                # Tweets have a well-defined structure, so we can parse them 
                # manually (even though the JSON approach would be cleaner)
                text=tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                
                # Skip tweets that contain Unicode
                if text.find('\u')>=0:
                    continue
                else:
                    good_count+=1
                    # Tokenize and count word frequency, ignoring case
                    words = word_tokenize(text)
                    clean_words= [w.lower() for w in words if not w.lower() in set(stops+addl_stops)]
                    words_to_plot=words_to_plot+clean_words             
    
    #Create frequency histogram of 50 most common words and print summary of activity 
    fdist=FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
Developer: lanorzi, Project: MIDS-W205_A2-1, Lines: 35, Source file: create_histograms.py
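
The comment in Example 1 notes that parsing the raw tweet with json would be cleaner than manual string slicing. Below is a hedged sketch of that alternative; the "text" field name follows the standard Twitter API payload, and the fallback behaviour is an assumption, not part of the original project.

import json

def extract_text(raw_tweet):
    # Sketch: pull the tweet text via json.loads instead of manual slicing.
    try:
        return json.loads(raw_tweet).get("text", "")
    except ValueError:  # malformed chunk; caller can skip an empty result
        return ""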

Example 2: freq_dist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def freq_dist(input, filtering_functions=[], plot = False, limit = None, return_counts = False):
    """Takes a list of words (hashtags, keywrods, anything) and plots a frequency distribution
       
       Filtering functions is an ORDERED set of functions to call on the raw input list that are executed before the freq dist
       That is, each item in input is run though f1,f2..,fn where filtering_functions = [f1,...fn]
       
       limit truncates the freq_dist to the limit most common items
       
       return_counts determines whether a list of tuples (word, count) are returned, 
          or whether a list of just the limit most used words is returned
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input) 
    
    nltk_fdist = FreqDist(list(input))    
    
    if plot: #use nltks built in plotting function before destroying the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()      
    
    fdist = sorted(nltk_fdist.items(), key=lambda x:(-x[1], x[0]))   #alphabetically sort equally counted items
    fdist = fdist[0:limit] if limit else fdist                       #apply limit
    fdist = [i[0] for i in fdist] if not return_counts else fdist    #remove counts if desired

    return fdist
Developer: SumAll, Project: python3-analysis-tools, Lines: 28, Source file: statistical_functions.py
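
A hypothetical call to freq_dist (not taken from the original project): the filter strips a leading '#' from hashtags before the built-in lower/strip steps run, and the result keeps the 10 most common tags with their counts.

def strip_hash(tag):
    return tag.lstrip('#')  # hypothetical filtering function

top_tags = freq_dist(['#NBA', '#nba', '#Warriors'],
                     filtering_functions=[strip_hash],
                     limit=10, return_counts=True)
print(top_tags)  # e.g. [('nba', 2), ('warriors', 1)]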

Example 3: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def main():
    # frequency distribution class
    fd_text1 = FreqDist(book.text1)

    print(str.format('Frequency distribution object: {}', fd_text1))

    print(str.format(
        '50 most common words: {}', fd_text1.most_common(50)
    ))

    fd_text1.plot(50, cumulative=True)
Developer: arheo, Project: python_core, Lines: 13, Source file: app3.py

Example 4: freq

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def freq(tokens, n=None):
    '''
    This function takes a list of tokens and returns a list of the top n most 
    frequent tokens
    
    It also plots a cumulative frequency distribution of the top 50 tokens
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    [i[0] for i in fdist2.items()[:20]]
    if n is None:    
        print fdist2.items()[:20]
        return [i[0] for i in fdist2.items()[:20]]
    else:
        print fdist2.items()[:n]
        return [i[0] for i in fdist2.items()[:n]]
Developer: SpacaB, Project: Concordance-Collages, Lines: 18, Source file: preprocess.py

Example 5: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def main():
    obj = TweetFilter()
    with open("NBA_Warriors.txt", "r") as myFile:
        data = myFile.read().replace('\n',' ')
        data = unicode(data, 'utf-8')

    # This tokenizes each of the word in data
    tokenz = word_tokenize(data)

    # This passes the tokenz to the filter function
    newTokenz = obj.filter(tokenz)

    # Run a frequency distribution on the entire word list
    fdist1 = FreqDist(newTokenz)

    # Plots the top 30 words
    fdist1.plot(30, cumulative=False)
Developer: howardwen, Project: MIDS-W205_A2, Lines: 19, Source file: Text_Processing.py

Example 6: fenci

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def fenci(data):

    data = re.compile(r'\s+').sub('', data)
    data = re.compile(r'\!\[.*?\]\(.*?\)').sub('', data)

    seg_list = jieba.cut(data)

    # Keyword extraction based on the TF-IDF algorithm
    tags = jieba.analyse.extract_tags(data, topK=50)
    print ','.join(tags)

    # Keyword extraction based on the TextRank algorithm
    tags2 = jieba.analyse.textrank(data, topK=50)
    print ','.join(tags2)


    fdist = FreqDist([seg for seg in seg_list])
    fdist.plot(50)
Developer: cbbing, Project: wealth_spider, Lines: 20, Source file: fenci.py

Example 7: lemmas_distribution_rus

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def lemmas_distribution_rus(dist):
    dict_file = codecs.open('literature/processed_vocabulary',encoding='utf-8')
    dict_text = dict_file.readlines()
    dict_file.close()
    dict_dict = {}
    import pymorphy2
    morph = pymorphy2.MorphAnalyzer()
    from collections import defaultdict
    lemmas_dist = defaultdict(int)    
    for line in dict_text:
        line_list = line.split(':')
        dict_dict[line_list[0]] = line_list[1]
    for word in dist.keys():
        if word in dict_dict:
            lemmas_dist[dict_dict[word]] += 1
        else:
            p = morph.parse(word)
            if len(p) > 0:
                print word
                print p[0].normal_form
                lemmas_dist[p[0].normal_form] += 1
                print lemmas_dist[p[0].normal_form]
    lemmas_dist = FreqDist(lemmas_dist)
    lemmas_dist.plot(100)
Developer: denis-gordeev, Project: nltk_imageboards, Lines: 26, Source file: script2new.py

Example 8: CountFreq

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
class CountFreq(object):
    def __init__(self,  *args, **kwargs):
        self.txt_file = codecs.open('new1.txt', encoding='utf-8')
        self.stop_words = stopwords.words('english')
        self.clean_words = []
        self.loose_words = loose_words

    def clean_text(self):
        '''
        this method will clean all the data in new1.txt as well as transfer the data from a text file to
        a tokenized format that will be readily available for nltk to work with.
        :return: sanitized and tokenized words.
        '''
        stop = self.stop_words
        text = self.txt_file
        for lines in text:
            clean_words = [word for word in lines.lower().split() if word not in stop]
            self.clean_words.append(clean_words)
        self.clean_words = [val for sublist in self.clean_words for val in sublist]
        return self.clean_words

    def word_freq(self):
        '''
        single word frequency without any context. This will result in the top 100 words that will be shown and
        identified as the most repeated words. However, rigorous filtration will be applied to the printed words
        getting rid of words that are not Nouns
        :return: the frequency distribution, obj.
        '''
        classified_text = pt(self.clean_words)
        noun_descriptor = [word for word, pos in classified_text if pos == 'NN']
        revised_noun_descriptor = [word for word in noun_descriptor if word not in self.loose_words]
        self.fdist = FreqDist(revised_noun_descriptor)
        return self.fdist

    def graph_freq(self, cumulative):
        '''

        :param cumulative: Boolean value, when true it graphs the cumulative text score producing a diminishing
        return graph
        :return: a matplotlib graph
        '''

        return self.fdist.plot(100, cumulative=cumulative)
Developer: cmwaura, Project: Spider_crawl_indeed, Lines: 45, Source file: utils.py
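
A hypothetical driver for the CountFreq class above (not part of the original project; it assumes new1.txt, the loose_words list, and the pt tagger imported elsewhere in utils.py are all available):

counter = CountFreq()
counter.clean_text()                  # tokenize new1.txt and drop stopwords
counter.word_freq()                   # keep the nouns and build the FreqDist
counter.graph_freq(cumulative=False)  # plot the 100 most common nouns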

Example 9: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
word_len = [len(w) for w in text1]
print word_len





# Example	Description
# fdist = FreqDist(samples)	create a frequency distribution containing the given samples
# fdist[sample] += 1	increment the count for this sample
# fdist['monstrous']	count of the number of times a given sample occurred
# fdist.freq('monstrous')	frequency of a given sample
# fdist.N()	total number of samples
# fdist.most_common(n)	the n most common samples and their frequencies
# for sample in fdist:	iterate over the samples
# fdist.max()	sample with the greatest count
# fdist.tabulate()	tabulate the frequency distribution
# fdist.plot()	graphical plot of the frequency distribution
# fdist.plot(cumulative=True)	cumulative plot of the frequency distribution
# fdist1 |= fdist2	update fdist1 with counts from fdist2
# fdist1 < fdist2	test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print dict(fdlist)
print fdlist.most_common(3)
print fdlist.max()
print fdlist[2]
print fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
Developer: loveclj, Project: python, Lines: 32, Source file: counting_other3.4.py
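
The reference table in Example 9 also lists the update and comparison operators, which the snippet does not exercise. A minimal hedged sketch is below; the sample strings are made up, and the behaviour follows FreqDist's Counter-style operators as described in the table.

from nltk import FreqDist

fdist1 = FreqDist("abracadabra")   # per-character counts
fdist2 = FreqDist("banana")
fdist1 |= fdist2                   # union update: each count becomes the larger of the two
print(fdist1 < fdist2)             # subset test on the counts (False here)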

Example 10: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
# We can find the FIRST position of a given word:
myText.index('about')

# Frequency distribution
from nltk import FreqDist

fdist1 = FreqDist(text1)

vocabulary = fdist1.keys()

frequencies = fdist1.values()

fdist1['whale']

fdist1.plot(20)

fdist1.plot(20, cumulative = True)   

# List comprehension
# Counting the number of characters in each word in a text
[len(w) for w in text1]

# Bigram function returns a list of bigrams
from nltk import bigrams, trigrams

bigrams(myText2)

trigrams(myText2)

bigramsText1 = bigrams(text1) # bigramsText1[0] is the tuple containing the first bigram
Developer: STIMALiU, Project: TextMiningCourse, Lines: 32, Source file: Intro2NLTK.py

Example 11: create_dist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def create_dist(nltk_text, stopwords):
    dist = FreqDist(w.lower() for w in nltk_text if len(w)>=3 and w.isalnum() and w.lower() not in stopwords)
    dist.plot(50)
    print "Number of wordforms"+str(len(dist))
    return dist
Developer: denis-gordeev, Project: nltk_imageboards, Lines: 7, Source file: script2new.py

Example 12: BeautifulSoup

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
        # soup = BeautifulSoup(open(path))
        # chapter = soup.findAll(text=True)[0]
        file = open(path)
        chapter = file.read()
        chapter_tuple = (chapter, 'real')

        words = [ w.lower() for w in word_tokenize(chapter) ]

        real_chapters.append(chapter_tuple)
        real_words.extend(words)

word_total  = len(real_words)
harry_total = real_words.count('harry')

fd = FreqDist(real_words)
fd.plot(26)

# filtered_real_words = [ w.lower() for w in real_words if w.isalpha() ]
filtered_real_words = [ w for w in real_words if w.isalpha() and w not in stop ]

Rowling = filtered_real_words

fd  = FreqDist(filtered_real_words)
fd.plot(26)

file = open('ao_hp_stories.jl')

ao_chapters = []
ao_words    = []
AO3         = []
AO3_normed  = []
Developer: syntactician, Project: corpora, Lines: 33, Source file: fanfiction.py

Example 13: open

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
import json

from textstat.textstat import textstat

from nltk import FreqDist
from matplotlib.pyplot import *

filename = 'bieber-raw-test.json'
READ = 'rb'
TEXT=1
stopwords = open('stopwords',READ).read().splitlines()
tweets = json.load(open(filename,READ))
#Identify retweets

words = ' '.join([tweet['text'] for tweet in tweets]).split()

fdist = FreqDist(words)

fdist.plot(20)
tight_layout()
Developer: seanmaskey, Project: Maui, Lines: 22, Source file: sxsw.py

Example 14: lemma

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]

# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)  # Frequency distribution of text
fdist_title.most_common(50)  # most common 50
fdist_title['science']  # return count of a given word
fdist_title.max()  # max counts
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
fdist_title.tabulate(50)  # tabulate
total_words = len(set(nostop_title))
print("The total number of words in title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word appears in title of dsc is: " + str(int(avg_words)))

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
Developer: dmml, Project: NLTK, Lines: 32, Source file: dsc.py

Example 15: stem

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import plot [as alias]
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem


def lexical_diversity(text):
    return len(text) / len(set(text))

nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)#plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of words in title of KD is: " + str(total_words))
avg_words = fdist_title.N()/total_words
print("Each word appears in title of KD is: " + str(int(avg_words)))


# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
len(tokens)
Developer: dmml, Project: NLTK, Lines: 33, Source file: kd.py


Note: The nltk.FreqDist.plot examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers, and copyright remains with the original authors. Please consult each project's license before redistributing or reusing the code; do not republish without permission.