This page collects typical usage examples of the Python method nltk.probability.FreqDist.most_common. If you are wondering what FreqDist.most_common does, how to call it, or what it looks like in real projects, the curated code examples below should help. You can also read further about the enclosing class, nltk.probability.FreqDist.
The following 15 code examples of FreqDist.most_common are shown, sorted by popularity by default.
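Before the examples, a minimal self-contained sketch of the method itself: FreqDist is a Counter subclass, so FreqDist(tokens) counts any iterable of tokens, most_common(n) returns the n highest-count (token, count) pairs, and most_common() with no argument returns them all.

from nltk.probability import FreqDist

tokens = "the cat sat on the mat the end".split()
fdist = FreqDist(tokens)
print(fdist.most_common(2))  # [('the', 3), ('cat', 1)] -- ties keep first-seen order (CPython 3.7+)
print(fdist['the'])          # 3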
Example 1: __init__
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
from nltk.corpus import stopwords
from nltk.collocations import (BigramAssocMeasures, BigramCollocationFinder,
                               TrigramAssocMeasures, TrigramCollocationFinder)

def __init__(self, words, sentences, language):
    self.num_words = len(words)
    self.unique_words = len(set(words))
    self.num_sentences = len(sentences)
    self.average_sentence_length = round(self.num_words / self.num_sentences)
    self.lexical_diversity = round(self.num_words / self.unique_words)
    fdist = FreqDist(words)
    stop_words = stopwords.words(language)
    not_stopwords = [w for w in words if w not in stop_words]
    fdist2 = FreqDist(not_stopwords)
    self.fifty_first_words = fdist.most_common(50)
    self.hundreds_nsw = fdist2.most_common(300)
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(10)
    self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)
    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(10)
    self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)
    # `sentences` is a list of token lists, so join before lowercasing
    # (the original called .lower() on the list itself, an AttributeError).
    self.stcs_width_words = [' '.join(sent) for sent in sentences
                             if "malheureusement" in ' '.join(sent).lower()]
Example 2: freqSingle
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
import nltk
from nltk.probability import FreqDist

def freqSingle(tokens):   # parameter renamed from `list` to avoid shadowing the builtin
    global nouns
    global adjectives
    #fig, axs = plt.subplots(1,2)
    tagged = nltk.pos_tag(tokens)
    ##################################################
    # ALL LANGUAGE - FREQUENCY
    varAll = FreqDist(tokens)
    ## USE THIS TO PRINT THE TOP WORDS
    print("TOP TERMS")
    varAll_common = varAll.most_common(25)
    print(varAll_common)
    print("")
    # PLOT TOP TERMS
    #varAll.plot(25, cumulative=False, title='All Language')
    #plt.show()
    ##################################################
    # NOUNS - FREQUENCY
    nouns = []
    for word, pos in tagged:
        if pos in ['NN', 'NNP']:
            nouns.append(word)
    varNouns = FreqDist(nouns)
    ## USE THIS TO PRINT THE TOP NOUNS
    print("TOP NOUNS")
    varNouns_common = varNouns.most_common(25)
    print(varNouns_common)
    print("")
    # PLOT TOP NOUNS
    #varNouns.plot(25, cumulative=False, title='Nouns')
    #plt.show()
    ##################################################
    # ADJECTIVES - FREQUENCY
    adjectives = []
    for word, pos in tagged:
        if pos in ['JJ', 'JJR', 'JJS']:
            adjectives.append(word)
    varAdjectives = FreqDist(adjectives)
    ## USE THIS TO PRINT THE TOP ADJECTIVES
    print("TOP ADJECTIVES")
    varAdjectives_common = varAdjectives.most_common(25)
    print(varAdjectives_common)
    print("")
Example 3: _count
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def _count(self, words):
    """
    >>> wordCounter()._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
    [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
    >>> wordCounter()._count([])
    []
    >>> wordCounter(words_per_message=-1)._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
    [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
    """
    fdist1 = FreqDist(words)
    if self.words_per_message > 0:
        return fdist1.most_common(self.words_per_message)
    else:
        return fdist1.most_common()
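The doctests refer to a wordCounter class that the excerpt does not include; a minimal reconstruction under which they pass might look like the following, assuming _count above was defined at module level (the default of 0, meaning "return everything", is a guess):

from nltk.probability import FreqDist

class wordCounter:
    def __init__(self, words_per_message=0):  # default value is an assumption
        self.words_per_message = words_per_message
    _count = _count  # reuse the method defined above

print(wordCounter(words_per_message=2)._count(['a', 'b', 'b', 'c']))  # [('b', 2), ('a', 1)]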
Example 4: trigramAll
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        tri_list = []
        vlen = len(valid_word)
        for i in range(0, vlen - 2):
            tri_list.append(valid_word[i] + " " + valid_word[i+1] + " " + valid_word[i+2])
        fdist = FreqDist(tri_list)
        keys = fdist.most_common()  # no argument: every (trigram, count) pair, most frequent first
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Trigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
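As an aside, the manual index loop can be replaced by nltk's own n-gram helper; a behavior-equivalent sketch:

from nltk import trigrams

tri_list = [" ".join(t) for t in trigrams(valid_word)]  # same strings as the loop above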
Example 5: sentanceLenFrequency
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def sentanceLenFrequency():
    to_save_folder = "./#SentanceLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        sents = getSentancesTokens(text)
        freq = []
        for sent in sents:
            sent_len = getSentanceLen(sent)
            if sent_len == 147:   # debug: inspect one unusually long sentence
                print(sent)
            if sent_len > 0:
                freq.append(sent_len)
        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Sentence Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[data]" + "[SentanceLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example 6: get_frequency
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def get_frequency(data_file, all_vocab):
    with open(data_file, "r") as input_file:   # context manager: the original never closed this handle
        input_file_contents = input_file.read()
    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)
    with open("../Training/vocab_freq.txt", "w") as output_file:
        for word, frequency in fdist.most_common(4000):
            if word in all_vocab and word != '+' and word != '-':
                output_file.write(word + " : " + str(frequency) + "\n")
    return 1
#data = "data.txt"
#stop_words = "stopwords.txt"
#accuracy= multinomial_naive_bayes_unigram(data, data, stop_words)
#print(accuracy)
#print("Separating Done!!")
Example 7: most_frequent_words
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def most_frequent_words(path, top):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            #print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()
        words = get_bigrams(formated_text)
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        for key in keys:
            #print(str(key[0]) + " , " + str(key[1]) + "\n")
            word_set.add(key[0])
    print(word_set)
    fw = open("./Features/Bigrams.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
Example 8: label_clusters
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def label_clusters(business_id, K, clusters):
    '''
    Label the clusters of a particular run specified by business_id
    as the most common noun in that cluster
    '''
    base = '../Models/%s/Clusters/' % business_id
    sentence_count = FreqDist(clusters)
    total_sentences = len(clusters)
    labels = []
    for i in range(0, K):
        # open with an explicit encoding (the original used Python 2's .decode('utf-8'))
        with open(base + 'Cluster_%d' % i, 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = nltk.word_tokenize(text)
        tokens = [w for w in tokens if w.isalpha() and len(w) > 3 and w not in stopwords.words()]
        fd = FreqDist(tokens)
        frequent = fd.most_common(5)
        label = "None"
        label_freq = 0
        for f in frequent:
            if is_noun(f[0]):
                label, label_freq = f
                break
        relative_score = float(label_freq) / len(tokens)
        cluster_score = float(sentence_count[i]) / total_sentences
        print("test label:", i, label)
        labels.append((i, label, label_freq, len(tokens), sentence_count[i], total_sentences, relative_score * cluster_score))
    return labels
Developer: ParinSanghavi, Project: Mining-Quality-Parameters-from-Yelp-Reviews-for-Improving-Businesses, Lines: 31, Source file: cluster_yelp.py
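is_noun is a project helper the excerpt omits; a plausible stand-in built on nltk.pos_tag (an assumption, not the project's actual implementation):

import nltk

def is_noun(word):
    # Hypothetical reconstruction: tag the word in isolation and
    # check for a noun tag. Single-word tagging is crude but cheap.
    tag = nltk.pos_tag([word])[0][1]
    return tag.startswith('NN')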
Example 9: ngram4All
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        nlist4 = []
        vlen = len(valid_word)
        for i in range(0, vlen - 3):
            nlist4.append(valid_word[i] + " " + valid_word[i+1] + " " + valid_word[i+2] + " " + valid_word[i+3])
        fdist = FreqDist(nlist4)
        keys = fdist.most_common()  # every (4-gram, count) pair, most frequent first
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example 10: BigramAll
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])
        fdist = FreqDist(myBig)
        keys = fdist.most_common()  # every (bigram, count) pair, most frequent first
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example 11: word_tag_model
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
from nltk.probability import ConditionalFreqDist

def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    most_freq = (word for word, count in fd.most_common(limit))
    return dict((word, cfd[word].max()) for word in most_freq)
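A usage sketch: the returned word-to-tag dict has exactly the shape that nltk's UnigramTagger accepts as a pre-built model (treebank is just an illustrative corpus choice and needs the 'treebank' NLTK data package).

from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# Build a lookup tagger from the 200 most frequent words and their
# most likely tags, then tag a sentence with it.
model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)
print(tagger.tag(['the', 'cat', 'sat']))  # words outside the model get tag None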
Example 12: wordLenFrequency
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def wordLenFrequency():
    to_save_folder = "./#WordLenFreq[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        freq = []
        for word in words:
            word_len = getWordLen(word)
            if word_len == 20:   # debug: inspect 20-character tokens
                for char in word:
                    print(char, end=' ')
                print(word)
            if word_len > 0:
                freq.append(word_len)
        fdist = FreqDist(freq)
        keys = fdist.most_common()
        dataFreq = "Word Len,Frequency\n"
        for key in sorted(keys):
            dataFreq += str(key[0]) + "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[WordLen_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example 13: experiments
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def experiments():
    with open('classEvent_NEs.txt', 'r') as f:
        # Displaying the top-K occurring NEs
        text = nltk.Text(f.read().split('\n'))
    freqd = FreqDist(text)
    most_common = freqd.most_common(15)
    # Print each (entity, count) pair
    for el in most_common:
        print(el)
Example 14: most_common_bigrams
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_freq = dict(bigram_finder.ngram_fd)   # .viewitems() was Python 2 only
    # Filter in one pass instead of deleting while iterating (a RuntimeError in Python 3).
    bigram_freq = {k: v for k, v in bigram_freq.items()
                   if is_feature_relevant(k[0]) and is_feature_relevant(k[1])}
    fd = FreqDist(bigram_freq)
    return [bigram for bigram, count in fd.most_common(num_bigrams)]
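is_feature_relevant is another helper the excerpt omits; a hypothetical stand-in that keeps alphabetic, non-stopword tokens (an assumption about the project's intent):

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def is_feature_relevant(word):
    # Hypothetical filter: drop stopwords and non-alphabetic tokens.
    return word.isalpha() and word.lower() not in STOPWORDS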
Example 15: one_by_four
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import most_common [as alias]
def one_by_four(path, top=50):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    temp_set = set()
    writer_table = dict()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            #print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()
        words = getWordList(formated_text)
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        print(keys)
        writer_table[writer] = dict(keys)
        for key in keys:
            temp_set.add(key[0])
    writers = writer_table.keys()
    # Keep a word when one writer uses it at least 4x as often as another writer.
    for word in temp_set:
        for writer1 in writers:
            freq1 = writer_table[writer1].get(word, 0)
            for writer2 in writers:
                if writer2 == writer1:
                    continue
                freq2 = writer_table[writer2].get(word, 0)
                if freq1 >= freq2 * 4:
                    print(writer1 + " " + writer2 + " " + str(freq1) + " " + str(freq2) + " " + word)
                    word_set.add(word)
    print(word_set)
    fw = open("./Features/Modified word frequency.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()