This article collects typical usage examples of the nltk.FreqDist.iteritems method in Python. If you are wondering what FreqDist.iteritems does, how to call it, or what real-world code using it looks like, the curated examples below may help. You can also look further into the containing class, nltk.FreqDist, for related usage examples.
The section below shows 12 code examples of the FreqDist.iteritems method, ordered by popularity by default.
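Before the examples, here is a minimal sketch of what the method does (assuming Python 2 with NLTK 2.x, where FreqDist is a dict subclass; in NLTK 3.x FreqDist subclasses collections.Counter and iteritems() is replaced by items() or most_common()):

from nltk import FreqDist

fd = FreqDist('abracadabra')           # counts every sample in the iterable (characters here)
for sample, count in fd.iteritems():   # (sample, count) pairs, dict-style
    print sample, count                # a 5, b 2, r 2, c 1, d 1 (order may vary)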
Example 1: find_abbreviations
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def find_abbreviations():
import db
from tokenizers import es
from nltk import FreqDist
corpus = db.connect()
#text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
text = '\n'.join([a['text'] for a in corpus.articles.find()])
tokens = es.tokenize(text, ignore_abbreviations=True)
fd = FreqDist()
fd_abbr = FreqDist()
fd_n_abbr = FreqDist()
n_tokens = len(tokens)
for i in range(n_tokens):
fd.inc(tokens[i])
if i < (n_tokens - 1) and tokens[i + 1] == u'.':
fd_abbr.inc(tokens[i])
else:
fd_n_abbr.inc(tokens[i])
adjusted = {}
    f_avg = float(len(fd.keys())) / fd.N()  # force float division under Python 2
    for t, n in fd_abbr.iteritems():
        f = float(fd.get(t, 0)) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)
items = adjusted.items()
items.sort(key=lambda i: i[1], reverse=True)
for t, n in items[:100]:
print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
Example 2: analyzeTitles
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def analyzeTitles():
fulltitles = []
titles = []
with open('../top100clean.csv', 'rb') as bookfile:
reader = csv.reader(bookfile)
for row in reader:
if "..." in row[0]:
row[0] = " ".join(row[0].split(" ")[:-1])
words = nltk.word_tokenize(row[0])
for w in words:
if w.isalpha() and w.lower() not in ['the','a']:
titles.append(w.lower())
fulltitles.append(row[0])
titleset = nltk.Text(titles)
wordsintitle = [len(f.split(" ")) for f in fulltitles]
wit_fd = FreqDist(wordsintitle)
print "\nw.i.t.\tfreq"
print "--------------------"
for numwords, times in wit_fd.iteritems():
print str(numwords) + "\t" + str(times)
print "\n"
print "\nword\t\tfreq"
print "--------------------"
fd = FreqDist(titleset)
common_words = fd.most_common(25)
for k, v in common_words:
print str(k) + "\t\t" + str(v)
Example 3: get_stats
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def get_stats(self, output_fname):
fd = FreqDist()
for text in self.texts:
fd.update(set(text))
fh = open(output_fname, 'w')
text = Text(self.paragraph_tokens)
fdist = FreqDist(text)
for (w,f) in fdist.iteritems():
print >> fh, "%s\t%i" % (w, f)
fh.close()
Example 4: description_and_tokens
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def description_and_tokens(self, id_, timestamp, soup):
overview = soup.find(id="description")
for scr in overview.find_all('script'):
scr.clear()
desc = overview.text
tokens = word_tokenize(desc)
freqdist = FreqDist(tokens)
self.redis.set('daftpunk:%s:description' % id_, desc)
for token, freq in freqdist.iteritems():
self.redis.zadd('daftpunk:%s:tokens' % id_, freq, token)
Example 5: main
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def main():
corpora = ['idwiki', 'kaskus', 'kompas', 'twitter']
corpora_dict = {}
for corpus in corpora:
fd = FreqDist()
for line in codecs.open('../data/' + corpus + '.1gram', 'r', 'utf-8'):
(word, freq) = line.split('\t')
fd[len(word)] += int(freq.strip())
sorted_fd = sorted(fd.iteritems(), key=operator.itemgetter(0))
lengths = [0] + [x for x, y in sorted_fd]
freqs = [0] + [y for x, y in sorted_fd]
plt.plot(lengths, freqs, label=corpus)
plt.grid(True)
plt.xlabel('length', fontsize=14, fontweight='bold')
plt.ylabel('frequency', fontsize=14, fontweight='bold')
plt.legend(loc='upper right')
plt.savefig('char.png')
plt.close()
Example 6: create_word_scores
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def create_word_scores():
tweets = get_tweets_from_db()
postweets = tweets[800001:]
negtweets = tweets[:800001]
posWords = []
negWords = []
for tweet in postweets:
posWords.append(tweet[0])
for tweet in negtweets:
negWords.append(tweet[0])
posWords = list(itertools.chain(*posWords))
negWords = list(itertools.chain(*negWords))
word_fd = FreqDist()
cond_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
cond_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
cond_word_fd['neg'][word.lower()] += 1
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
return word_scores
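The word_scores dict returned above is typically cut down to a fixed-size vocabulary of the most informative words (Example 10 below does this inline). A minimal sketch; the helper name find_best_words and the 10000 cutoff are illustrative, not part of the original code:

def find_best_words(word_scores, number=10000):
    # keep the `number` words with the highest combined chi-square score
    best = sorted(word_scores.iteritems(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)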
Example 7: get_frequent_features
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def get_frequent_features(self, min_support):
#get n item sets
wnl = WordNetLemmatizer()
features = [wnl.lemmatize(token) for token in self.candidate_feature_list()]
dist = FreqDist(features)
return [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
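The same min-support filter, stripped of the class context, looks roughly like this (a standalone sketch; the token list is illustrative):

from nltk import FreqDist

tokens = ['screen', 'battery', 'screen', 'price', 'battery', 'screen']
dist = FreqDist(tokens)
frequent = [(item, count) for item, count in dist.iteritems() if count >= 2]
# e.g. [('screen', 3), ('battery', 2)] (order not guaranteed)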
Example 8: get_frequent_features_list
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def get_frequent_features_list(self, min_support):
dist = FreqDist(self.get_candidate_feature_list())
features = [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
return self.prune_features(features, 3)
示例9: len
# 需要导入模块: from nltk import FreqDist [as 别名]
# 或者: from nltk.FreqDist import iteritems [as 别名]
import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams
assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest=argv[3]
leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()
print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
fold = int(train.split(".")[-2])
if fold > 3: continue
wordpostrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag
for t in open(train)
for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
testtrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag
for t in open(test).readlines()
for word,tag in zip(leaves.findall(t), pos.findall(t))), 3))
open(wordposngram+output, "w").writelines("%s\t%d\n" % (" ".join(a), b)
for a, b in testtrigrams.iteritems() if wordpostrigrams[a])
print output
print "done"
示例10: FreqDist
# 需要导入模块: from nltk import FreqDist [as 别名]
# 或者: from nltk.FreqDist import iteritems [as 别名]
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for item in train_set:
tweet = item[0].lower()
    words = word_tokenize(tweet)  # tokenize the lower-cased text computed above
word_fd.update(words)
label_word_fd[item[1]].update(words)
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:50]
print best
bestwords = set([w for w, s in best])
print bestwords
def tweet_features(tweet):
tweet_words = word_tokenize(tweet)
features = {}
for word in all_words:
Example 11: FeatureChoose
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def FeatureChoose(pos_wordlist, neg_wordlist, method=BigramAssocMeasures.chi_sq, featuregram='one', n=6000):
pos_feature = list()
neg_feature = list()
pos_all_words = list()
neg_all_words = list()
# pos_all_feature = dict()
# neg_all_feature = dict()
if featuregram == 'one':
for each in pos_wordlist:
cur = UniGramFeature(each)
pos_feature.append(cur)
# pos_all_feature.update(cur)
pos_all_words.extend(cur)
for each in neg_wordlist:
cur = UniGramFeature(each)
neg_feature.append(cur)
# neg_all_feature.update(cur)
neg_all_words.extend(cur)
elif featuregram == 'two':
for each in pos_wordlist:
cur = Mixup2Feature(each)
pos_feature.append(cur)
# pos_all_feature.update(cur)
pos_all_words.extend(cur)
for each in neg_wordlist:
cur = Mixup2Feature(each)
neg_feature.append(cur)
# neg_all_feature.update(cur)
neg_all_words.extend(cur)
elif featuregram == 'three':
for each in pos_wordlist:
cur = Mixup3Feature(each)
pos_feature.append(cur)
# pos_all_feature.update(cur)
pos_all_words.extend(cur)
for each in neg_wordlist:
cur = Mixup3Feature(each)
neg_feature.append(cur)
# neg_all_feature.update(cur)
neg_all_words.extend(cur)
else:
return []
fd = FreqDist()
cfd = ConditionalFreqDist()
for word in pos_all_words:
fd[word] += 1
cfd['pos'][word] += 1
for word in neg_all_words:
fd[word] += 1
cfd['neg'][word] += 1
pos_N = cfd['pos'].N()
neg_N = cfd['neg'].N()
N = fd.N()
score_list = dict()
for word, freq in fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
score_list[word] = pos_score + neg_score
best_topwords = sorted(score_list.iteritems(), key=lambda kk:kk[1], reverse=True)
# print json.dumps(best_topwords[-100:-1], ensure_ascii=False)
best_topwords = best_topwords[:n]
# print json.dumps(best_topwords[:100], ensure_ascii=False)
best_topwords = set(word for word, freq in best_topwords)
return pos_feature, neg_feature, best_topwords
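A hypothetical call, assuming pos_wordlist / neg_wordlist are lists of tokenized documents and that UniGramFeature / Mixup2Feature / Mixup3Feature are defined elsewhere in the project:

# sketch only; argument shapes and helper functions are assumptions
pos_feats, neg_feats, best_topwords = FeatureChoose(pos_wordlist, neg_wordlist,
                                                    featuregram='two', n=6000)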
示例12: books
# 需要导入模块: from nltk import FreqDist [as 别名]
# 或者: from nltk.FreqDist import iteritems [as 别名]
from nltk import FreqDist
# Download all the books (http://www.nltk.org/data.html)
# NOTE: if this does not work, run this code in Python from the terminal (not from inside an IDE)
# nltk.download()
# Import a text and examine its words
from nltk.corpus import brown
brown.words()
# Find the frequency of each word in a text
fd = FreqDist(brown.words())
# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.iteritems(), key=operator.itemgetter(1))
sorted(fd.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)
# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)
# See the words that occur only once in the text (these words are called hapaxes)
fd.hapaxes()
# Count all the word tokens in the corpus
len(brown.words())
# Count the distinct word types
len(set(brown.words()))
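For reference, the same frequency queries under NLTK 3.x / Python 3, where FreqDist subclasses collections.Counter and iteritems() no longer exists (a sketch):

import operator
from nltk import FreqDist
from nltk.corpus import brown

fd = FreqDist(brown.words())
fd.most_common(10)                           # replaces sorted(fd.iteritems(), ...)[:10]
max(fd.items(), key=operator.itemgetter(1))  # replaces max(fd.iteritems(), ...)
fd.N()                                       # total number of word tokens counted
len(fd)                                      # number of distinct word types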