

Python FreqDist.most_common Method Code Examples

This article collects typical usage examples of the Python method nltk.probability.FreqDist.most_common. If you have been wondering how to use FreqDist.most_common, what exactly it does, or where to find working examples of it, the curated snippets below should help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.


The following presents 15 code examples of the FreqDist.most_common method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system surface better Python code examples.
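Before the examples, a minimal sketch of the method itself may help (the sample sentence is a made-up placeholder):

from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' tokenizer models: nltk.download('punkt')
text = "the quick brown fox jumps over the lazy dog the fox"
words = word_tokenize(text.lower())

fdist = FreqDist(words)
# most_common(n) returns the n highest-frequency (sample, count) pairs in
# descending order of count; with no argument it returns all pairs.
print(fdist.most_common(3))  # [('the', 3), ('fox', 2), ('quick', 1)]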

Example 1: __init__

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
    def __init__(self, words, sentences, language):
        self.num_words = len(words)
        self.unique_words = len(set(words))
        self.num_sentences = len(sentences)
        self.average_sentence_length = round(self.num_words / self.num_sentences)
        self.lexical_diversity = round(self.num_words / self.unique_words)

        fdist = FreqDist(words)
        stop_words = stopwords.words(language)
        not_stopwords = [w for w in words if w not in stop_words]
        fdist2 = FreqDist(not_stopwords)
        self.fifty_first_words = fdist.most_common(50)
        self.hundreds_nsw = fdist2.most_common(300)

        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(words)
        finder.apply_freq_filter(10)
        self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

        trigram_measures = TrigramAssocMeasures()
        finder3 = TrigramCollocationFinder.from_words(words)
        finder3.apply_freq_filter(10)
        self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

        # Each sent is a token list, so join it before lowercasing.
        self.stcs_width_words = [' '.join(sent) for sent in sentences
                                 if "malheureusement" in ' '.join(sent).lower()]
Developer ID: Raveline, Project: journal-imaginaire, Lines: 28, Source: analyst.py
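A hedged setup sketch for this constructor (the class name Analyst is only a guess from the source file name analyst.py; the sample text is invented):

from nltk.tokenize import word_tokenize, sent_tokenize

text = "Malheureusement, il pleut. Le chat dort sur le tapis."
words = word_tokenize(text, language='french')
sentences = [word_tokenize(s, language='french') for s in sent_tokenize(text, language='french')]
# stats = Analyst(words, sentences, 'french')  # hypothetical class name
# print(stats.fifty_first_words[:5])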

Example 2: freqSingle

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def freqSingle(tokens):
    global nouns
    global adjectives

    #fig, axs = plt.subplots(1,2)

    varPOS = nltk.pos_tag(tokens)

    ##################################################
    # ALL LANGUAGE - FREQUENCY
    varAll = FreqDist(tokens)

    ## USE THIS TO PRINT THE TOP WORDS
    print("TOP TERMS")
    varAll_common = varAll.most_common(25)
    print(varAll_common)
    print("")

    # PLOT TOP TERMS
    #varAll.plot(25, cumulative=False, title='All Language')
    #plt.show()

    ##################################################
    # NOUNS - FREQUENCY
    nouns = []
    for word, pos in varPOS:
        if pos in ['NN', 'NNP']:
            nouns.append(word)
    varNouns = FreqDist(nouns)

    ## USE THIS TO PRINT THE TOP NOUNS
    print("TOP NOUNS")
    varNouns_common = varNouns.most_common(25)
    print(varNouns_common)
    print("")

    # PLOT TOP NOUNS
    #varNouns.plot(25, cumulative=False, title='Nouns')
    #plt.show()

    ##################################################
    # ADJECTIVES - FREQUENCY
    adjectives = []
    for word, pos in varPOS:
        if pos in ['JJ', 'JJR', 'JJS']:
            adjectives.append(word)
    varAdjectives = FreqDist(adjectives)

    ## USE THIS TO PRINT THE TOP ADJECTIVES
    print("TOP ADJECTIVES")
    varAdjectives_common = varAdjectives.most_common(25)
    print(varAdjectives_common)
    print("")
Developer ID: chriskerns, Project: trendology, Lines: 56, Source: TRENDOLOGY_3_text_analysis.py

Example 3: _count

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
class wordCounter:
    # Minimal class wrapper so the doctests below can run; the default of 0
    # (meaning "return everything") is an assumption consistent with the
    # expected doctest output, not taken from the source project.
    def __init__(self, words_per_message=0):
        self.words_per_message = words_per_message

    def _count(self, words):
        """
        >>> wordCounter()._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
        [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
        >>> wordCounter()._count([])
        []
        >>> wordCounter(words_per_message=-1)._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
        [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
        """
        fdist1 = FreqDist(words)
        if self.words_per_message > 0:
            return fdist1.most_common(self.words_per_message)
        return fdist1.most_common()
Developer ID: minezy, Project: minezy_proto, Lines: 16, Source: word_counter.py
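The doctests above can be exercised with the standard doctest runner; a minimal sketch, assuming the snippet lives in a module of its own:

import doctest
doctest.testmod(verbose=True)  # reports each >>> example as it runs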

Example 4: trigramAll

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]

        # Build space-joined trigrams from consecutive valid tokens.
        tri_list = []
        vlen = len(valid_word)
        for i in range(0, vlen - 2):
            tri_list.append(valid_word[i] + " " + valid_word[i + 1] + " " + valid_word[i + 2])

        fdist = FreqDist(tri_list)

        # most_common() with no argument returns every (trigram, count) pair.
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Trigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 31, Source: MakeNormalData.py

Example 5: sentanceLenFrequency

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def sentanceLenFrequency():
    to_save_folder = "./#SentanceLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        sents = getSentancesTokens(text)
        freq = []
        for sent in sents:
            sent_len = getSentanceLen(sent)
            if sent_len == 147:
                # Debug print for one outlier sentence length.
                print(sent)
            if sent_len > 0:
                freq.append(sent_len)

        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Sentence Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[data]" + "[SentanceLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 32, Source: WordAndSentanceLenFreq.py

Example 6: get_frequency

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def get_frequency(data_file, all_vocab):

    input_file = open(data_file, "r")
    input_file_contents = input_file.read()
    input_file.close()

    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)

    output_file = open("../Training/vocab_freq.txt", "w")

    # Write out the up-to-4000 most frequent words that appear in the
    # vocabulary, skipping the bare '+' and '-' tokens.
    for word, frequency in fdist.most_common(4000):
        if word in all_vocab and word != '+' and word != '-':
            output_file.write(word + " : " + str(frequency) + "\n")

    output_file.close()
    return 1

#data = "data.txt"
#stop_words = "stopwords.txt"

#accuracy= multinomial_naive_bayes_unigram(data, data, stop_words)
#print(accuracy)
#print("Separating Done!!")
Developer ID: Anand-M-P, Project: NLP-Project, Lines: 27, Source: training.py

Example 7: most_frequent_words

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def most_frequent_words(path, top):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            #print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()

        words = get_bigrams(formated_text)
        # Keep non-English bigrams longer than one character.
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        for key in keys:
            #print(str(key[0]) + " , " + str(key[1]) + "\n")
            word_set.add(key[0])
    print(word_set)
    fw = open("./Features/Bigrams.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 34, Source: most_frequent_bigrams.py

Example 8: label_clusters

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def label_clusters(business_id, K, clusters):
    '''
    Label the clusters of a particular run specified by business_id
    as the most common noun in that cluster.
    '''
    base = '../Models/%s/Clusters/' % business_id
    sentence_count = FreqDist(clusters)
    total_sentences = len(clusters)
    labels = []
    for i in range(0, K):
        f = open(base + 'Cluster_%d' % i, 'r', encoding='utf-8')
        text = f.read()
        f.close()
        tokens = nltk.word_tokenize(text)
        tokens = [w for w in tokens if w.isalpha() and len(w) > 3 and w not in stopwords.words()]
        fd = FreqDist(tokens)
        frequent = fd.most_common(5)
        label = "None"
        label_freq = 0
        # Take the first noun among the five most frequent tokens.
        for f in frequent:
            if is_noun(f[0]):
                label, label_freq = f
                break

        relative_score = float(label_freq) / len(tokens)
        cluster_score = float(sentence_count[i]) / total_sentences
        print("test label:", i, label)
        labels.append((i, label, label_freq, len(tokens), sentence_count[i], total_sentences, relative_score * cluster_score))
    return labels
Developer ID: ParinSanghavi, Project: Mining-Quality-Parameters-from-Yelp-Reviews-for-Improving-Businesses, Lines: 31, Source: cluster_yelp.py

Example 9: ngram4All

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]

        # Build space-joined 4-grams from consecutive valid tokens.
        nlist4 = []
        vlen = len(valid_word)
        for i in range(0, vlen - 3):
            nlist4.append(valid_word[i] + " " + valid_word[i + 1] + " " + valid_word[i + 2] + " " + valid_word[i + 3])

        fdist = FreqDist(nlist4)
        # most_common() with no argument returns every (4-gram, count) pair.
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 29, Source: MakeNormalData.py

Example 10: BigramAll

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)

        # Space-join each bigram of valid tokens.
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])

        fdist = FreqDist(myBig)
        # most_common() with no argument returns every (bigram, count) pair.
        keys = fdist.most_common()
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 31, Source: MakeNormalData.py

Example 11: word_tag_model

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)

    # For each of the `limit` most frequent words, pick its most likely tag.
    most_freq = (word for word, count in fd.most_common(limit))

    return dict((word, cfd[word].max()) for word in most_freq)
Developer ID: byam, Project: predictEPL, Lines: 9, Source: tag_util.py
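A hedged usage sketch for word_tag_model (the Brown corpus slice and the UnigramTagger wiring are my own illustration, not from the source project):

from nltk.corpus import brown
from nltk.tag import UnigramTagger

# Needs a one-time nltk.download('brown').
words = brown.words(categories='news')
tagged_words = brown.tagged_words(categories='news')

model = word_tag_model(words, tagged_words, limit=200)

# UnigramTagger accepts a pre-built word -> tag dict as its model;
# words outside the 200-word model are tagged None.
tagger = UnigramTagger(model=model)
print(tagger.tag(['the', 'of', 'in', 'fugitive']))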

Example 12: wordLenFrequency

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def wordLenFrequency():
    to_save_folder = "./#WordLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)

        freq = []
        for word in words:
            word_len = getWordLen(word)
            if word_len == 20:
                # Debug print for one outlier word length.
                for char in word:
                    print(char, end=' ')
                print(word)
            if word_len > 0:
                freq.append(word_len)

        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Word Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[WordLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 34, Source: WordAndSentanceLenFreq.py

Example 13: experiments

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def experiments():
    f = open('classEvent_NEs.txt', 'r')
    # Display the top-K most frequently occurring NEs (one entity per line).
    text = nltk.Text(f.read().split('\n'))
    freqd = FreqDist(text)
    most_common = freqd.most_common(15)
    # Print each (entity, count) pair.
    for el in most_common:
        print(el)
Developer ID: macmania, Project: Team-A-NLP, Lines: 11, Source: chunking.py

Example 14: most_common_bigrams

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_freq = dict(bigram_finder.ngram_fd)
    # Drop bigrams where either word fails the relevance filter; iterate
    # over a copy so entries can be deleted safely.
    for k, v in list(bigram_freq.items()):
        if not is_feature_relevant(k[0]) or not is_feature_relevant(k[1]):
            del bigram_freq[k]

    fd = FreqDist(bigram_freq)
    return list(dict(fd.most_common(num_bigrams)).keys())
Developer ID: hermes95, Project: GoldwasserRsrch, Lines: 11, Source: Util.py
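A hedged usage sketch (is_feature_relevant lives elsewhere in the source project; the alphabetic-word stand-in below is an assumption for illustration):

def is_feature_relevant(word):
    # Stand-in for the project's real filter; assumed for this demo.
    return word.isalpha()

tokens = ['new', 'york', 'new', 'york', 'city', 'of', 'new', 'york']
print(most_common_bigrams(tokens, 2))  # e.g. [('new', 'york'), ('york', 'new')]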

Example 15: one_by_four

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def one_by_four(path, top=50):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    temp_set = set()
    writer_table = dict()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            #print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()

        words = getWordList(formated_text)
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        print(keys)
        writer_table[writer] = dict(keys)
        for key in keys:
            temp_set.add(key[0])

    # Keep a word if any writer uses it at least four times as often as
    # some other writer does.
    writers = writer_table.keys()
    for word in temp_set:
        for writer1 in writers:
            freq1 = writer_table[writer1].get(word, 0)
            for writer2 in writers:
                if writer2 == writer1:
                    continue
                freq2 = writer_table[writer2].get(word, 0)
                if freq1 >= freq2 * 4:
                    print(writer1 + " " + writer2 + " " + str(freq1) + " " + str(freq2) + " " + word)
                    word_set.add(word)

    print(word_set)
    fw = open("./Features/Modified word frequency.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
Developer ID: olee12, Project: Stylogenetics, Lines: 54, Source: one_by_four_frequency_feature.py


Note: The nltk.probability.FreqDist.most_common examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not repost without permission.