This article collects typical usage examples of the nltk.FreqDist.items method in Python. If you have been wondering what FreqDist.items does, how to use it, or where it is useful, the hand-picked code examples below may help. You can also explore further usage examples of the containing class, nltk.FreqDist.
Below are 15 code examples of the FreqDist.items method, sorted by popularity by default.
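Note that most of these examples were written against NLTK 2, where FreqDist.items() returned a list of (sample, count) pairs already sorted by decreasing count, so slicing such as fd.items()[:n] picked the n most frequent items. In NLTK 3, FreqDist subclasses collections.Counter, items() is an unordered dict view, and most_common() is the sorted accessor. A minimal sketch of the two idioms (the toy token list is made up):

from nltk import FreqDist

tokens = "the cat sat on the mat with the cat".split()
fd = FreqDist(tokens)

# NLTK 2 idiom used throughout these examples (items() was already sorted):
# top_two = fd.items()[:2]

# NLTK 3 equivalent: most_common() returns (sample, count) pairs sorted by count
top_two = fd.most_common(2)   # [('the', 3), ('cat', 2)]
print(top_two)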
Example 1: make_cutOff
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def make_cutOff(flatList, bottomCutOff, topCutOff):
'''
INPUT:
flatList is a 1-d list of all tokens in a set of tweets; bottomCutOff and
topCutOff are integers
OUTPUT:
newVocab = a 1-d list of all tokens we want to keep
thrownOut = a 1-d list of all tokens to throw out
'''
fd = FreqDist(flatList)
newVocab = []
thrownOut = []
for item in fd.items()[:topCutOff]:
# append most common words
thrownOut.append(item)
for item in fd.items()[topCutOff:]:
if item[1] > bottomCutOff:
# append good words
newVocab.append(item[0])
else:
# append uncommon words
thrownOut.append(item)
print 'Cutoffs made...'
return newVocab, thrownOut
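A toy call to the function above (the token list and cutoff values are made up; the result described in the comments assumes the NLTK 2 behaviour where items() returns a list sorted by count):

# keep tokens seen more than once, after discarding the single most common token
flatList = ["rt", "great", "game", "rt", "rt", "boring", "game", "great"]
newVocab, thrownOut = make_cutOff(flatList, bottomCutOff=1, topCutOff=1)
# 'rt' (count 3) lands in thrownOut as the most common token,
# 'great' and 'game' (count 2 > 1) go to newVocab, 'boring' (count 1) is thrown out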
Example 2: get_most_frequent
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def get_most_frequent(self, rawText, number = None, cleaning_level = 3):
cleaned_tokens_levels = TokensCleaner.clean(self, rawText, cleaning_level)
freq_distributions_levels = dict()
for level, cleand_tokens in cleaned_tokens_levels.items():
all_words = FreqDist(cleand_tokens)
if number == None:
freq_distributions_levels[level] = all_words.items()
else:
freq_distributions_levels[level] = all_words.items()[:number]
return freq_distributions_levels
Example 3: main
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def main():
fileName = '../data/deals.txt'
words,lines = get_filter(fileName)
word_dist = FreqDist(words) # get distribution, descending order
print("Most Popular Term: ",word_dist.items()[0])# question 1
print("Least Popular Term: ", word_dist.items()[-1]) # question 2
# solution 1 for question 3
# print("Types of Guitars Found: ",len(count_guitar_types.count(lines)))
# Solution 2: better and more reasonable, though it could still be improved
print("Type of Guitars mentioned", count_guitar_types2.count(lines))
Example 4: __getTimelineFeatures
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def __getTimelineFeatures(self, timeline):
logger.info(u"Get timeline features")
tweets = []
self.__changePhase(PHASE["GET_TIMELINE_URLS"])
for t in timeline:
try:
tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
except:
logger.exception(u"Error: \"" + unicode(t) + u"\"")
raise ValueError(t)
logger.debug(u"Tweet:" + unicode(tweet))
tweets.append(tweet)
urls = []
ti = 0
for tweet in tweets:
for url in tweet.urls():
self.__breakIfStopped()
self.__urlResolver.addUrlToQueue(url)
urls.append(url)
logger.info(u"Tweet:" + unicode(tweet))
ti += 1
self.__proc = 100 * float(ti) / float(len(tweets))
# Categories
self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
url2labels = {}
ui = 0
for url in urls:
self.__breakIfStopped()
if not url.isError():
logger.debug(u"Classify " + unicode(url.getUrl()))
url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
ui += 1
self.__proc = 100 * float(ui) / float(len(urls))
labelsFreq = FreqDist()
for labels in url2labels.values():
for label in labels:
labelsFreq.inc(label)
self.__catFreq = labelsFreq.items()
logger.info(u"Categories: " + unicode(labelsFreq.items()))
labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items() if item[0] not in ['short', 'medium', 'long']]
# normalization
labelsFreqValues = {label: float(freq) / float(max([f for l,f in labelsFreqValues])) for label, freq in labelsFreqValues}
logger.info(u"Category factors: " + unicode(labelsFreqValues))
# Languages
langFreq = FreqDist()
for u in urls:
langFreq.inc(u.lang())
self.__langFreq = langFreq.items()
logger.info(u"Languages: " + unicode(langFreq.items()))
return labelsFreqValues
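The calls to labelsFreq.inc(label) and langFreq.inc(u.lang()) above only work in NLTK 2; FreqDist.inc() was removed in NLTK 3, where a distribution is incremented by indexing. A sketch of the equivalent counting loop under NLTK 3:

labelsFreq = FreqDist()
for labels in url2labels.values():
    for label in labels:
        labelsFreq[label] += 1   # NLTK 3 replacement for labelsFreq.inc(label)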
Example 5: __extract_bigram_words
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def __extract_bigram_words(self, bigrams, values):
bigrams_number_per_value = self.__configuration_map["most_frequent_bigrams_number_per_value"]
most_frequent_bigrams = {}
for value in values:
fdist = FreqDist(bigrams[value])
most_frequent_bigrams[value] = fdist.items()[:bigrams_number_per_value]
return most_frequent_bigrams
Example 6: findBestWords
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for category, words in wordsInCategories:
word_fd.update(words)
label_word_fd[category].update(words)
word_counts = {}
for condition in label_word_fd.conditions():
word_counts[condition] = label_word_fd[condition].N()
total_word_count = 0
for condition, count in word_counts.items():
total_word_count += count
word_scores = {}
for word, freq in word_fd.items():
score = 0
for condition, count in word_counts.items():
score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
word_scores[word] = score
best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
return set([w for w, s in best])
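A hypothetical call to findBestWords with two tiny labelled token lists (the categories and tokens are made up); scoreFunction defaults to BigramAssocMeasures.chi_sq, applied here to the per-category contingency counts of each word:

wordsInCategories = [
    ("pos", ["great", "fun", "great", "plot"]),
    ("neg", ["boring", "dull", "plot", "boring"]),
]
best_words = findBestWords(wordsInCategories, max_words=3)
print(best_words)   # a set of up to 3 highest-scoring words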
Example 7: freq_dist
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def freq_dist(input, filtering_functions=[], plot = False, limit = None, return_counts = False):
"""Takes a list of words (hashtags, keywrods, anything) and plots a frequency distribution
Filtering functions is an ORDERED set of functions to call on the raw input list that are executed before the freq dist
That is, each item in input is run though f1,f2..,fn where filtering_functions = [f1,...fn]
limit truncates the freq_dist to the limit most common items
return_counts determines whether a list of tuples (word, count) are returned,
or whether a list of just the limit most used words is returned
"""
for f in filtering_functions + [str.lower, str.strip]:
input = map(f, input)
nltk_fdist = FreqDist(list(input))
if plot: #use nltks built in plotting function before destroying the data structure
nltk_fdist.plot(limit) if limit else nltk_fdist.plot()
fdist = sorted(nltk_fdist.items(), key=lambda x:(-x[1], x[0])) #alphabetically sort equally counted items
fdist = fdist[0:limit] if limit else fdist #apply limit
fdist = [i[0] for i in fdist] if not return_counts else fdist #remove counts if desired
return fdist
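A hypothetical call to freq_dist, assuming a small list of hashtags and a lambda that strips the leading '#' before counting (all inputs are made up):

hashtags = ["#NLP", "#nlp", "#Python", "#python", "#python"]
top = freq_dist(hashtags,
                filtering_functions=[lambda s: s.lstrip('#')],
                limit=2)
print(top)   # the two most frequent (lower-cased) tags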
Example 8: count_pos
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def count_pos(input, language):
if language == 'english-nltk':
words = word_tokenize(input)
pos = pos_tag(words)
elif language == 'english':
s = pattern.en.parsetree(input, relations=True, lemmata=True)
words = []
pos = []
for sentence in s:
for w in sentence.words:
words.append(w.string)
pos.append((w.string, clean_text.clean_pos(w.type)))
elif language == 'spanish':
s = pattern.es.parsetree(input, relations=True, lemmata=True)
words = []
pos = []
for sentence in s:
for w in sentence.words:
words.append(w.string)
pos.append((w.string, clean_text.clean_pos(w.type)))
elif language == 'dutch':
words = word_tokenize(input, 'dutch')
tagger = nltk.data.load('taggers/alpino_aubt.pickle')
pos = tagger.tag(words)
tags = FreqDist(tag for (word, tag) in pos)
relative_frequency = []
for item in tags.items():
relative_frequency.append((item[0], float(item[1])/tags.N()))
return relative_frequency
Example 9: preprocess
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def preprocess(content):
stopset = set(stopwords.words('english'))
#replace punctuation and tag with space
tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
pos_list = pos_tag(tokens)
s_tokens = list()
#noun and verb only
for pos in pos_list:
#print pos[1]
#if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
if pos[1] in ['NN', 'NNS']:
s_tokens.append(pos[0])
wordfreq = FreqDist(s_tokens)
stemfreq = dict()
st = LancasterStemmer()
for word, freq in wordfreq.items():
#stopwords
if word in stopset:
del wordfreq[word]
continue
#tiny words
if len(word) <= 2:
del wordfreq[word]
continue
#stemmer
stem = st.stem(word)
try:
stemfreq[stem]+=freq
except:
stemfreq[stem]=freq
return stemfreq
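Deleting keys from wordfreq while iterating wordfreq.items() works under Python 2, where items() returns a list, but raises a RuntimeError under Python 3. A sketch of an iteration-safe version of the same filtering and stemming loop (assuming the same wordfreq, stopset and st objects as above):

from collections import defaultdict

stemfreq = defaultdict(int)
for word, freq in wordfreq.items():
    # skip stopwords and tiny words instead of deleting them mid-iteration
    if word in stopset or len(word) <= 2:
        continue
    stemfreq[st.stem(word)] += freq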
Example 10: summarize
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def summarize(self, text):
# get words from text
words = word_tokenize(text)
# filter out stop words and lower case
words = [word.lower() for word in words if word not in self.stopwords]
# filter non-alphameric chars from words
words = [filter(unicode.isalnum, word) for word in words]
words = filter(lambda w: len(w) > 0, words) # Remove empty words
# stemming
words = [self.pst.stem(word) for word in words]
word_frequencies = FreqDist(words)
most_frequent = [word[0] for word in word_frequencies.items()[:self.top_words_count]]
# get sentences
sentences = sent_tokenize(text)
sentence_score = defaultdict(int)
for i in range(len(sentences)):
sentence = sentences[i]
sentence_words = word_tokenize(sentence)
sentence_words = [self.pst.stem(word).lower() for word in sentence_words if word not in self.stopwords]
for sentence_word in sentence_words:
if sentence_word in most_frequent:
sentence_score[i] += 1
sorted_wordcounts = sorted(sentence_score.iteritems(), key=operator.itemgetter(1), reverse=True)[:self.number_of_sentences]
summary = "\n".join([sentences[num] for num, count in sorted_wordcounts])
return summary
Example 11: termfreq
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def termfreq(storytext, filename):
'''
This function takes a speech/text/article, preprocesses it into tokens,
removes stopwords, and outputs a csv of term counts and frequencies
relative to the size of the speech/text/article
'''
# Split into tokens, remove stopwords
tokens = make.preprocess(storytext)
stops = make.filter_stopwords(tokens)
numstops = len(stops)
# Create a FreqDist and turn it into a list of tuples
freq = FreqDist(stops)
data = freq.items()[:numstops]
# Build a pandas DataFrame of that list
df = pd.DataFrame(data)
df.columns = ['word', 'count']
# Add a 'relative frequency' column to the DataFrame
a = []
for i in df['count']:
a.append(float(i) / numstops)  # float division; i/numstops truncates to 0 under Python 2
df['pct'] = a
# Write the file to csv
df.to_csv('%s.csv' % filename, sep=',')
print df
print 'Check your files for the csv!'
Example 12: top_words_from_corpus
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def top_words_from_corpus(self, num_words, test_name):
corpus_tokens = []
for i in self.corpus_vars["corpus_member_ids"]:
title = 'document_' + str(i)
doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
corpus_tokens += doc_tokens
top_words = []
fdist_corpus = FreqDist(corpus_tokens)
fdist_list = fdist_corpus.items()
if test_name == "Function Word PCA":
function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
for i in fdist_list:
top_words.append(i[0])
if len(top_words) == num_words:
tagged_top = nltk.pos_tag(top_words)
for j,k in tagged_top:
if k not in function_pos:
top_words.remove(j)
if len(top_words) == num_words:
break
elif test_name == "Burrows's Delta":
for i in fdist_list:
top_words.append(i[0])
if len(top_words) == num_words:
break
return top_words
Example 13: palavrasChaves
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def palavrasChaves(self):
# NLTK function that returns the English stopword list
stopE = stopwords.words('english')
# NLTK function that returns the Portuguese stopword list
stop = stopwords.words('portuguese')
stopS = stopwords.words('spanish')
palavrasChaves = []
textoArtigo = []
# strip punctuation from the text and split it into words
for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
# drop Portuguese stopwords from the article text being processed
if i not in stop:
# drop English stopwords from the article text being processed
if i not in stopE:
# ignore words with fewer than 3 characters; this handles short words such as the verb 'é'
if i not in stopS:
if len(i) > 2:
textoArtigo.append(i)
# frequency distribution of the words in the article body
freq = FreqDist(textoArtigo)
# take the four most frequent words
items = freq.items()[:4]
# put the most frequent words of the text into the palavrasChaves variable
for i in range(0,len(items)):
palavrasChaves.append(items[i][0])
return palavrasChaves
Example 14: posAnalysis
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def posAnalysis(collection):
reviews = collection.find(timeout=False)
__reportProgress.counter = 0
skip = 1
for rev in reviews:
if skip%200 == 0:
print 'skip'+str(skip)
__reportProgress()
if rev.has_key('tags'):
skip += 1
if rev['tags'].has_key('NN'):
continue
sents = sent_tokenize(rev['text'])
tokens = [word for sent in sents for word in word_tokenize(sent)]
pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
tag_fd = FreqDist(tag for (word, tag) in pos)
tags = dict()
for (key,value) in tag_fd.items():
k = key.replace('$','S')
out = key.translate(string.maketrans("",""), string.punctuation)
if len(out)>0:
tags[k] = value
collection.update({'_id':rev['_id']},{"$set": {"tags": tags}})
Example 15: get_probs
# Required imports: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import items [as alias]
def get_probs(filename):
"""read the given text and calculate the probabilities for all symbols."""
with open(filename) as file_in:
text = file_in.read()
probs = FreqDist(text)
count_sum = sum(v for v in probs.values())
for k,v in probs.items():
probs[k] = v * 1.0 / count_sum
return probs