

Python FreqDist.iteritems Method Code Examples

This article collects typical usage examples of the nltk.FreqDist.iteritems method in Python. If you are wondering what FreqDist.iteritems does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also browse further usage examples of nltk.FreqDist.


Twelve code examples of FreqDist.iteritems are shown below, ordered by popularity by default.
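A note on versions before diving in: FreqDist.iteritems dates from NLTK 2.x under Python 2, where FreqDist was dict-based and iteritems() yielded (sample, count) pairs. In NLTK 3, FreqDist subclasses collections.Counter and iteritems() no longer exists; the equivalents are items() and most_common(). A minimal sketch of the idiom and its modern replacement (the sample sentence is made up for illustration):

from nltk import FreqDist

fd = FreqDist('the cat sat on the mat'.split())

# NLTK 2.x / Python 2:
#   for word, count in fd.iteritems():
#       print word, count

# NLTK 3 / Python 3 equivalent:
for word, count in fd.items():
    print(word, count)

# Sorted by decreasing frequency:
for word, count in fd.most_common():
    print(word, count)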

Example 1: find_abbreviations

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])

    adjusted = {}
    f_avg = len(fd.keys()) / float(fd.N())  # float() avoids Python 2 integer division
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / float(fd.N())
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
Author: nosamanuel, Project: nlp, Lines: 34, Source: punctuation.py

Example 2: analyzeTitles

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the','a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"

    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Author: nelsonam, Project: booklytics, Lines: 31, Source: analyze_titles.py

Example 3: get_stats

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
    def get_stats(self, output_fname):
        fd = FreqDist()
        for text in self.texts:
            fd.update(set(text))  # document frequency per word (note: unused below)

        fh = open(output_fname, 'w')
        text = Text(self.paragraph_tokens)
        fdist = FreqDist(text)
        for (w,f) in fdist.iteritems():
            print >> fh, "%s\t%i" % (w, f)
        fh.close()
Author: yuedong111, Project: topical-spiders, Lines: 13, Source: topic_dictionary.py

Example 4: description_and_tokens

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
    def description_and_tokens(self, id_, timestamp, soup):
        overview = soup.find(id="description")
        for scr in overview.find_all('script'):
            scr.clear()

        desc = overview.text
        tokens = word_tokenize(desc)
        freqdist = FreqDist(tokens)

        self.redis.set('daftpunk:%s:description' % id_, desc)
        for token, freq in freqdist.iteritems():
            self.redis.zadd('daftpunk:%s:tokens' % id_, freq, token)
Author: nicr9, Project: daftpunk, Lines: 14, Source: worker.py

Example 5: main

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def main():
    corpora = ['idwiki', 'kaskus', 'kompas', 'twitter']
    corpora_dict = {}

    for corpus in corpora:
        fd = FreqDist()
        for line in codecs.open('../data/' + corpus + '.1gram', 'r', 'utf-8'):
            (word, freq) = line.split('\t')
            fd[len(word)] += int(freq.strip())

        sorted_fd = sorted(fd.iteritems(), key=operator.itemgetter(0))
        lengths = [0] + [x for x, y in sorted_fd]
        freqs = [0] + [y for x, y in sorted_fd]
        plt.plot(lengths, freqs, label=corpus)

    plt.grid(True)
    plt.xlabel('length', fontsize=14, fontweight='bold')
    plt.ylabel('frequency', fontsize=14, fontweight='bold')
    plt.legend(loc='upper right')
    plt.savefig('char.png')
    plt.close()
Author: ardwort, Project: freq-dist-id, Lines: 23, Source: genplot-charlen.py

Example 6: create_word_scores

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def create_word_scores():
    tweets = get_tweets_from_db()
    postweets = tweets[800001:]
    negtweets = tweets[:800001]
 
    posWords = []
    negWords = []
    for tweet in postweets:
        posWords.append(tweet[0])
    for tweet in negtweets:
        negWords.append(tweet[0])

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Author: asdvalenzuela, Project: moodmap, Lines: 38, Source: buildClassifier.py

Example 7: get_frequent_features

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
	def get_frequent_features(self, min_support):
		#get n item sets	
		wnl = WordNetLemmatizer()
		features = [wnl.lemmatize(token) for token in self.candidate_feature_list()]
		dist = FreqDist(features)
		return [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
Author: omkar20xx, Project: OpinionMiner, Lines: 8, Source: FeatureExtractor.py

Example 8: get_frequent_features_list

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
 def get_frequent_features_list(self, min_support):
     dist = FreqDist(self.get_candidate_feature_list())
     features = [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
     return self.prune_features(features, 3)
Author: sgudla, Project: OpninionMining, Lines: 6, Source: FeatureExtractor.py

Example 9: len

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams
assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest=argv[3]
leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()
print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
	fold = int(train.split(".")[-2])
	if fold > 3: continue
	wordpostrigrams  = FreqDist(ingrams((porter.stem(word)+"/"+tag
		for t in open(train)
		for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
	for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
		output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
		testtrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag
			for t in open(test).readlines()
			for word,tag in zip(leaves.findall(t), pos.findall(t))), 3))
		open(wordposngram+output, "w").writelines("%s\t%d\n" % (" ".join(a), b)
			for a, b in testtrigrams.iteritems() if wordpostrigrams[a])
		print output
print "done"
Author: andreasvc, Project: authident, Lines: 30, Source: ngrams.py

Example 10: FreqDist

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(tweet)  # tokenize the lowercased text; the original tokenized item[0], leaving tweet unused
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:50]
print best
bestwords = set([w for w, s in best])
print bestwords

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
Author: MARS87, Project: ieor242, Lines: 33, Source: naive_bayes_classifier_reduce_features.py

Example 11: FeatureChoose

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
def FeatureChoose(pos_wordlist, neg_wordlist, method=BigramAssocMeasures.chi_sq, featuregram='one', n=6000):
	pos_feature = list()
	neg_feature = list()
	pos_all_words = list()
	neg_all_words = list()
	# pos_all_feature = dict()
	# neg_all_feature = dict()
	if featuregram == 'one':
		for each in pos_wordlist:
			cur = UniGramFeature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = UniGramFeature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'two':
		for each in pos_wordlist:
			cur = Mixup2Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup2Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'three':
		for each in pos_wordlist:
			cur = Mixup3Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup3Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	else:
		return []

	fd = FreqDist()
	cfd = ConditionalFreqDist()
	for word in pos_all_words:
		fd[word] += 1
		cfd['pos'][word] += 1
	for word in neg_all_words:
		fd[word] += 1
		cfd['neg'][word] += 1
	pos_N = cfd['pos'].N()
	neg_N = cfd['neg'].N()
	N = fd.N()
	score_list = dict()
	for word, freq in fd.iteritems():
		pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
		neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
		score_list[word] = pos_score + neg_score

	best_topwords = sorted(score_list.iteritems(), key=lambda kk:kk[1], reverse=True)
	# print json.dumps(best_topwords[-100:-1], ensure_ascii=False)
	best_topwords = best_topwords[:n]
	# print json.dumps(best_topwords[:100], ensure_ascii=False)
	best_topwords = set(word for word, freq in best_topwords)
	return pos_feature, neg_feature, best_topwords
Author: l11x0m7, Project: SentimentAnalysis, Lines: 68, Source: MLMethod.py

Example 12: books

# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import iteritems [as alias]
from nltk import FreqDist
# Download all books (http://www.nltk.org/data.html)
# NOTE: if this does not work, run this code in Python from the Terminal (not from inside IDE)
# nltk.download()

# Import a text and examine its words
from nltk.corpus import brown
brown.words()

# Find the frequency of each word in a text
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.iteritems(), key=operator.itemgetter(1))
sorted(fd.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)

# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words with lowest frequency (these words are called hapaxes)
fd.hapaxes()

# Count all the words (text1 is available after: from nltk.book import *)
len(text1)
# count unique words
len(set(text1))
Author: DeepakSinghRawat, Project: Tutorials, Lines: 33, Source: NLP_tut.py


Note: The nltk.FreqDist.iteritems examples in this article were compiled by 纯净天空 from open-source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets are drawn from open-source projects contributed by various developers, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not republish without permission.