This article collects typical usage examples of the Python method nltk.probability.FreqDist.items. If you are wondering what FreqDist.items does, how to call it, or simply want to see it used in context, the curated examples below may help. You can also read further about the enclosing class, nltk.probability.FreqDist.
The 15 code examples of FreqDist.items shown below are ordered by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
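Before diving into the examples, here is a minimal, self-contained sketch of what FreqDist.items returns (the variable names and the sample text are made up for illustration). Note that in NLTK 3 on Python 3, items() behaves like an ordinary dict view of unordered (sample, count) pairs, whereas much of the older example code below assumed the NLTK 2 behaviour, where items() returned a list already sorted by decreasing frequency; most_common(n) is the portable way to get the top n entries.
# Minimal sketch (hypothetical sample text).
from nltk.probability import FreqDist

tokens = "the cat sat on the mat and the dog sat too".split()
fdist = FreqDist(tokens)

for word, count in fdist.items():   # unordered (word, count) pairs in NLTK 3
    print(word, count)

print(fdist.most_common(3))         # top-3 pairs, sorted by frequency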
Example 1: summarize
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def summarize(self, input, num_sentences):
    s = []
    punt_list = ['.', ',', '!', '?']
    summ_sentences = []
    sentences = input
    #sentences = sent_tokenize(input)
    lowercase_sentences = [sentence.lower()
                           for sentence in sentences]
    #print(lowercase_sentences)
    saito = ' '.join(sentences)
    s = input
    ts = ''.join([o for o in s if o not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common(100) replaces the Python 2-era items()[:100] slice, which is
    # neither subscriptable nor frequency-sorted in NLTK 3
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]
    # add sentences with the most frequent words
    if len(s) < num_sentences:
        num_sentences = len(s)
    for word in most_frequent_words:
        for i in range(len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if (lowercase_sentences[i] not in summ_sentences
                        and word in lowercase_sentences[i]):
                    summ_sentences.append(lowercase_sentences[i])
            else:
                break
        if len(summ_sentences) >= num_sentences:
            break
    # reorder the selected sentences (key= replaces the removed cmp-style sort)
    summ_sentences.sort(key=lambda s1: saito.find(s1))
    return summ_sentences
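One way the method above might be called (the enclosing class is not shown in the example, so the Summarizer name and the input file below are assumptions):
# Hypothetical usage sketch for Example 1.
from nltk.tokenize import sent_tokenize

text = open('article.txt').read()
summarizer = Summarizer()                        # assumed class defining summarize()
print(summarizer.summarize(sent_tokenize(text), 3))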
Example 2: create_word_bigram_scores
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
# This example also relies on itertools and on nltk's BigramCollocationFinder,
# BigramAssocMeasures and ConditionalFreqDist.
def create_word_bigram_scores(posWords, negWords, n=5000):
    # (posWords, negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word] += 1
        conditionwordfd['pos'][word] += 1
    for word in negWords:
        wordfd[word] += 1
        conditionwordfd['neg'][word] += 1
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word, freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
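The wordscores dict returned above is typically handed to a small helper that keeps only the highest-scoring words and bigrams. That helper is not part of the example, so the sketch below (its name and default cutoff included) is only an assumption of how it might look:
# Hypothetical follow-up step: keep the n highest-scoring features.
def find_best_features(word_scores, number=2000):
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)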
Example 3: get_summarized
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def get_summarized(self, inputContent, num_sentences):
    base_words = [word.lower()
                  for word in nltk.word_tokenize(inputContent)]
    words = [word for word in base_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common() preserves the frequency ordering that the Python 2-era
    # items() call relied on
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common()]
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(inputContent)
    working_sentences = [sentence.lower()
                         for sentence in actual_sentences]
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences: break
        if len(output_sentences) >= num_sentences: break
    return output_sentences
Example 4: get_negative_grams
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def get_negative_grams(filePath, n):
    l = list()
    # Read the review file
    with codecs.open(filePath, 'r') as myfile:
        sentence = myfile.read()
    sentence = sentence.replace('points forts', ' ')
    sentence = sentence.replace('points faibles', ' ')
    sentence = sentence.replace('commentaires', ' ')
    n_grams = ngrams(sentence.split(), n)
    s = ''
    for grams in n_grams:
        if ('est pas' in grams or 'ai pas' in grams or 'pas' in grams or 'cher' in grams):
            s += str(grams) + '\n'
            l.append(grams)
    '''fe = open('negative-'+str(n)+'-gram.txt', 'w')
    fe.write(s)
    fe.close()'''
    Dict = FreqDist(l)
    Dict = sorted(Dict.items(), key=operator.itemgetter(1), reverse=True)
    t = ''
    for x in Dict:
        t += '(\'' + str(x[0]) + '\' , ' + str(x[1]) + ')\n'
    # write the sorted frequencies to the stats folder
    fe = open('stats/Freq_negative-'+str(n)+'-gram.txt', 'w')
    fe.write(t)
    fe.close()
Example 5: summarize
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def summarize(self, input, num_sentences):
    punt_list = [".", ",", "!", "?"]
    summ_sentences = []
    sentences = sent_tokenize(input)
    lowercase_sentences = [sentence.lower() for sentence in sentences]
    # print(lowercase_sentences)
    s = list(input)
    ts = "".join([o for o in s if o not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    # most_common(100) replaces the Python 2-era items()[:100] slice
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]
    # add sentences with the most frequent words
    for word in most_frequent_words:
        for i in range(0, len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]:
                    summ_sentences.append(sentences[i])
                    break
    # reorder the selected sentences (key= replaces the removed cmp-style sort)
    summ_sentences.sort(key=lambda s1: input.find(s1))
    return " ".join(summ_sentences)
Example 6: getBestWords
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def getBestWords(posWords, negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd["pos"][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd["neg"][word.lower()] += 1
    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]  # Python 2 version
    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    bestwords = set([w for w, s in sorted_x])
    return bestwords
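A common next step, not shown in this example, is to turn the bestwords set into a feature extractor for an NLTK classifier; the helper below is only an illustrative sketch with assumed names:
# Hypothetical feature extractor built on top of bestwords.
def best_word_features(words, bestwords):
    return {word: True for word in words if word in bestwords}

# e.g. train_set = [(best_word_features(tokens, bestwords), label) for tokens, label in labeled_reviews]
#      classifier = nltk.NaiveBayesClassifier.train(train_set)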
Example 7: train_MLT
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def train_MLT(self, tagged_train_data, untagged_training_data):
    """
    Builds a most-likely-tag tagger from the given tagged training data (as words).
    :param tagged_train_data: list of (word, tag) pairs
    :param untagged_training_data: list of sentences, each a list of words
    :return: model
    """
    # find the set of words
    words = set()
    for sent in untagged_training_data:
        for word in sent:
            words.add(word)
    # Define mlt_dict of format {word1: {(word1, tag1): count1, (word1, tag2): count2, ...}, ...}
    mlt_dict = dict()
    # Initialize its keys and values
    for word in words:
        mlt_dict[word] = dict()
    # Compute the freq dist of tagged words
    tagged_words_fdist = FreqDist(tagged_train_data)
    for tagged_word, count in tagged_words_fdist.items():
        mlt_dict[tagged_word[0]][tagged_word] = count
    # Update the dict to contain the most likely tag for each word
    # for word, inside_dict in mlt_dict.items():
    #     max_val = max(inside_dict.values())
    #     inside_dict =
    print("Training is done!")
    return mlt_dict
Example 8: make_summary
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def make_summary(text):
    sent = []
    stemmed = []
    tokens = word_tokenize(text)
    sent = sent_tokenize(text)
    # filter stop words (the original removed items from the list while iterating
    # over it, which silently skips words; filtering a copy is safer)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    stemmer = PorterStemmer()
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    #freq(stemmed)
    # lower-case the stems (the original called word.lower() without keeping the result)
    stemmed = [word.lower() for word in stemmed]
    word_freq = FreqDist(stemmed)
    # most_common(60) replaces the Python 2-era items()[:60] slice
    most_freq_words = [pair[0] for pair in word_freq.most_common(60)]
    working_sent = [sentence.lower() for sentence in sent]
    out_sent = []
    for word in most_freq_words:
        for i in range(0, len(working_sent)):
            if word in working_sent[i] and sent[i] not in out_sent:
                out_sent.append(sent[i])
                break
            if len(out_sent) >= 5:
                break
        if len(out_sent) >= 5:
            break
    # reorder() is defined elsewhere in the same module
    return reorder(out_sent, text)
Example 9: high_words
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
"""
Example 10: create_word_scores
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a flat word list
    negWords = list(itertools.chain(*negWords))  # likewise for the negative reviews
    word_fd = FreqDist()  # frequency of every word
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and the negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # number of words in the positive texts
    neg_word_count = cond_word_fd['neg'].N()  # number of words in the negative texts
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count),
                                               total_word_count)  # chi-square score on the positive side (mutual information or another measure could be used instead)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative side
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its positive and negative chi-square scores
    return word_scores  # maps every word to its informativeness
Example 11: create_word_bigram_scores
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Example 12: get_words_frequency
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def get_words_frequency(string, top_values):
    """
    Gets the word frequencies in a corpus.
    :param string: corpus
    :param top_values: maximum number of sorted values to return (0 returns everything)
    :return: list of frequencies of the words in their synset form
    """
    # import stop words from the nltk corpus
    stop_words_en_nltk = list(stopwords.words('english'))
    # create additional stop words for punctuation and others
    stop_words_en_custom = ['.', ',', '\'', '!', '(', ')', ':', ';', '?', '--', '*', '[', ']', '``', str("''"),
                            '&', '\'ll', '\'ve', '\'s', '\'re', 'a', 'b', 'c',
                            'i', '\'i', 'this', 'n\'t', 'a', 'could', 'should', 'would', 'can', 'will', 'shall',
                            'there', 'it', 'also', 'in', 'the', 'many', 'by', 'an',
                            '1990s', 'the', '+', '-', '...', '=', '%', '#', '[hide]', '[edit]', '.jpg', '/',
                            'be.v.01', 'have.v.01', 'use.v.01', 'besides.r.02', 'analysis.n.01', 'categorization.n.03',
                            'vitamin_e.n.01', 'vitamin_c.n.01', 'include.v.01', 'such.s.01', 'many.a.01', 'order.n.01',
                            'episode.n.01', 'show.n.01', 'not.r.01', 'standard.n.01', 'survey.n.01', 'factor.n.01',
                            'first.a.01']
    until_number = 300
    # treat the numbers 0..300 as stop words as well
    stop_words_en_custom_numbers = [str(i) for i in range(until_number + 1)]
    # add them together
    stop_words_en = stop_words_en_nltk + stop_words_en_custom + stop_words_en_custom_numbers
    words_list_tmp = word_tokenize(string.lower())
    words_list = []
    lemmatizer = WordNetLemmatizer()
    for word in nltk.pos_tag(words_list_tmp):
        # get_word_tag() (defined elsewhere) maps the POS tag to a WordNet POS letter or ''
        tag = get_word_tag(word[1])
        if tag != '':
            try:
                synset_word = wordnet.synsets(lemmatizer.lemmatize(word[0], pos=tag), pos=tag)[0]
                words_list.append(synset_word.name())
            except:  # no synset found; skip this word
                pass
    processed_word_list = [word for word in words_list if word not in stop_words_en]
    text_obj = nltk.Text(processed_word_list)
    fd = FreqDist(text_obj)
    result = list(fd.items())
    if top_values != 0:
        result.sort(key=lambda x: x[1], reverse=True)
        result = result[:top_values]
        return result
    else:
        return result
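A possible call to the function above (the corpus string is made up, and get_word_tag must be available in the same module):
# Hypothetical usage sketch for get_words_frequency.
corpus = "Cats sleep a lot. Dogs sleep a lot too, but dogs also play."
print(get_words_frequency(corpus, top_values=5))   # five most frequent synset names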
Example 13: probDist
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def probDist():
    ### file pointers for reading the documents
    f1 = open(os.path.join('allfiles', 'document01-finance.txt'), "r")
    f2 = open(os.path.join('allfiles', 'document02-finance.txt'), "r")
    f3 = open(os.path.join('allfiles', 'document03-finance.txt'), "r")
    f4 = open(os.path.join('allfiles', 'document04-ee.txt'), "r")
    f5 = open(os.path.join('allfiles', 'document05-ee.txt'), "r")
    ### read the file contents
    line1 = f1.read()
    line2 = f2.read()
    line3 = f3.read()
    line4 = f4.read()
    line5 = f5.read()
    ### document01-finance.txt is the writer document and the other files are
    ### reader files, so the word list comes from the writer document
    words = line1.split()
    X_words = []
    ### create a dictionary to store the frequency of each term
    dict_x1 = {}
    ### use nltk to calculate the frequency of each word
    unigramWordList = FreqDist(words)
    datalen = len(unigramWordList)  ### number of distinct words in the document
    for k, v in unigramWordList.items():
        # print(k, v)
        X_words.append(k)
        dict_x1[k] = v / float(datalen)
        pd_x1.append(v / float(datalen))  # pd_x1 .. pd_x5 and s are module-level lists
    # print(X_words)
    # print(dict_x1)
    # print(pd_x1)
    ### create the probability distribution of all files
    for word in X_words:
        pd_x2.append(line2.count(word) / float(datalen))
        pd_x3.append(line3.count(word) / float(datalen))
        pd_x4.append(line4.count(word) / float(datalen))
        pd_x5.append(line5.count(word) / float(datalen))
    # print(pd_x2)
    # print(pd_x3)
    ### calculate the total probability distribution across all files
    line_S = line1 + line2 + line3 + line4 + line5
    # print(line_S)
    for word in X_words:
        s.append(line_S.count(word) / float(datalen))
    print(s)
Example 14: opinion_tokens_Fr
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def opinion_tokens_Fr(liste):
    # Create the results folder in case it doesn't exist
    result = "stats"
    if not os.path.exists(result):
        os.mkdir(result, 0o777)  # 0o777 is the Python 3 spelling of the octal literal 0777
    i = 0
    comments = ''
    while i < len(liste):
        comments += liste[i] + '\n'
        i += 1
    comments = comments.lower()
    # Write the collected comments to a file
    f = open('opinions.txt', 'w')
    f.write(comments)
    f.close()
    w = ['"', '→', '–', '’', '»', '«', ',', '.', '[', ']', '|', '{', '}', ':', ';', '!', '?', '(', ')', '_', '-', '=', '/',
         ' qui ', ' cette ', ' mais ', ' ou ', ' où ', ' et ', ' donc ', ' or ', ' ni ', ' car ', ' la ', ' là ', ' le ',
         ' les ', ' de ', ' des ', ' du ', ' tout ', ' tous ', ' toutes ', ' que ', ' comme ', ' si ', ' quand ', ' je ',
         ' tu ', ' il ', ' elle ', ' nous ', ' vous ', ' ils ', ' elles ', ' un ', ' une ', ' au ', ' aux ', ' dans ', ' ce ',
         ' se ', ' ces ', ' ses ', ' on ', ' en ', ' leur ', ' leurs ', ' a ', ' à ', ' pour ', ' par ', ' sous ', ' sur ']
    # Read the file back and clean it up
    with codecs.open('opinions.txt', 'r') as myfile:
        content = myfile.read()
    content = content.replace('points forts', ' ')
    content = content.replace('points faibles', ' ')
    content = content.replace('commentaires', ' ')
    # remove numeric forms
    content = ''.join([i for i in content if not i.isdigit()])
    while w:
        # remove conjunctions, connectors, ...
        content = content.replace(w.pop(0), ' ')
    content = content.split()
    tokenDict = FreqDist(content)
    tokenDict = sorted(tokenDict.items(), key=operator.itemgetter(1), reverse=True)
    s = ''
    for x in tokenDict:
        # under Python 3 the tokens are already str, so the decode/encode round-trip is unnecessary
        s += '(\'' + x[0] + '\' , ' + str(x[1]) + ')\n'
    fe = open('stats/freq_tokens.txt', 'w')
    fe.write(s)
    fe.close()
    return tokenDict
Example 15: main
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import items [as alias]
def main():
    # Number of words to display
    count = 40
    # Open files as strings
    obama = open("obama.txt", "r").read()
    bush = open("bush.txt", "r").read()
    # Tokenize the texts into words, then count frequencies for all words
    top_obama = FreqDist(word.lower() for word in word_tokenize(obama))
    top_bush = FreqDist(word.lower() for word in word_tokenize(bush))
    # Print the top {count} most frequent words (most_common() replaces the
    # Python 2-era frequency-sorted items() slices)
    print("No stoplist".upper())
    print("Obama/2009\t".upper(), " ".join(item[0] for item in top_obama.most_common(count)))
    print("Bush/2001\t".upper(), " ".join(item[0] for item in top_bush.most_common(count)))
    # Print the most frequent words that are not in the NLTK English stoplist
    print()
    print("Stoplisted".upper())
    print("Obama/2009\t".upper(), " ".join([item[0] for item in top_obama.most_common() if item[0] not in stopwords.words('english')][:count]))
    print("Bush/2001\t".upper(), " ".join([item[0] for item in top_bush.most_common() if item[0] not in stopwords.words('english')][:count]))