This article collects typical usage examples of the Python method nltk.FreqDist.get. Wondering what FreqDist.get does, how to use it, or where to find worked examples? The curated samples below may help. You can also read further about the enclosing class, nltk.FreqDist.
Below are 4 code examples of the FreqDist.get method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
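For orientation before the examples, here is a minimal sketch of the method itself. In NLTK 3, FreqDist subclasses collections.Counter, so get() behaves like dict.get(): it returns the count of a sample, or a default when the sample was never seen.

from nltk import FreqDist

fd = FreqDist(['the', 'cat', 'the'])
print(fd.get('the', 0))  # 2
print(fd.get('dog', 0))  # 0 -- 'dog' was never counted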
Example 1: find_abbreviations
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import get [as alias]
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist
    corpus = db.connect()
    # text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)
    fd = FreqDist()         # overall token counts
    fd_abbr = FreqDist()    # counts of tokens immediately followed by '.'
    fd_n_abbr = FreqDist()  # counts of tokens not followed by '.'
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd[tokens[i]] += 1  # FreqDist.inc() was removed in NLTK 3
        if i < (n_tokens - 1) and tokens[i + 1] == '.':
            fd_abbr[tokens[i]] += 1
        else:
            fd_n_abbr[tokens[i]] += 1
    adjusted = {}
    f_avg = len(fd) / fd.N()  # average relative frequency per distinct token
    for t, n in fd_abbr.items():  # iteritems() is Python 2 only
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        # Short tokens that mostly precede a period score higher.
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)
    items = sorted(adjusted.items(), key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print('%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0)))
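A rough intuition for the scoring above, as a toy sketch with invented counts: a token scores high when it almost always precedes a period, and shorter tokens score higher still, which matches how abbreviations behave in running text.

from nltk import FreqDist

fd = FreqDist({'sr': 40, 'calle': 200})        # overall counts (hypothetical)
fd_abbr = FreqDist({'sr': 38, 'calle': 3})     # occurrences followed by '.'
fd_n_abbr = FreqDist({'sr': 2, 'calle': 197})  # occurrences not followed by '.'
f_avg = len(fd) / fd.N()
for t in fd_abbr:
    f = fd.get(t, 0) / fd.N()
    deviation = 1 + (f - f_avg)
    score = fd_abbr[t] * deviation / fd_n_abbr.get(t, 1) / len(t)
    print(t, round(score, 3))  # 'sr' scores far higher than 'calle'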
Example 2: featureset
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import get [as alias]
def featureset(sample):
    comment, label = sample
    features = {}
    # comment is a list of statements; each statement is a list of (word, tag) pairs.
    # Tuple-unpacking lambdas are Python 2 only, so use comprehensions instead.
    # tags = [[t for (w, t) in statement] for statement in comment]
    words = [[w for (w, t) in statement] for statement in comment]
    words = sum(words, [])  # flatten to a single list of words
    # tags = sum(tags, [])
    size_ = sum([len(word) for word in words])
    features['stmt_len'] = len(words) / float(len(comment))
    features['word_len'] = size_ / float(len(words))
    features['size'] = size_
    # tags_dist = FreqDist(sum(tags, []))
    # for tag in TAGS:
    #     features[tag] = tags_dist.get(tag, 0)
    dist = FreqDist([word.lower() for word in words])
    # num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
    # features['prob_stop_words'] = num_stop_words / len(words)
    # EN_STOPWORDS, LANGUAGES, GRAMS, n and comment_similarity are module-level
    # globals defined elsewhere in the original project.
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0) / float(len(words))
    features['alwayson'] = 1.0
    for language in LANGUAGES:
        for i in range(1, n + 1):
            word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
            # features['s_sim_%d_%s' % (i, language)] = w_s_sim
    return (features, label)
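A toy illustration of the stop-word feature block above; EN_STOPWORDS here is an invented stand-in for the module-level list the original assumes:

from nltk import FreqDist

EN_STOPWORDS = ['the', 'and', 'of']  # hypothetical stand-in
words = ['The', 'cache', 'and', 'the', 'index']
dist = FreqDist(w.lower() for w in words)
features = {w: dist.get(w, 0) / float(len(words)) for w in EN_STOPWORDS}
print(features)  # {'the': 0.4, 'and': 0.2, 'of': 0.0}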
Example 3: transfer
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import get [as alias]
def transfer(fileDj, vocabulary):
    import nltk
    from nltk import FreqDist
    with open(fileDj, "r") as fo:
        content = fo.read()
    tokens = nltk.word_tokenize(content)
    # st = [SBStemmer.stem(t) for t in tokens]
    st = tokens
    fdist = FreqDist(st)
    # Build a bag-of-words vector aligned with the vocabulary order;
    # get(key, 0) covers both the present and absent cases.
    BOWDj = []
    for key in vocabulary:
        BOWDj.append(fdist.get(key, 0))
    return BOWDj
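A possible invocation, assuming NLTK's tokenizer data is installed; the file name and vocabulary below are invented for illustration:

# Hypothetical usage -- 'review.txt' and the vocabulary are made up.
vocabulary = ['good', 'bad', 'movie']
bow = transfer('review.txt', vocabulary)
print(bow)  # e.g. [2, 0, 1]: count of each vocabulary word in the file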
Example 4: doc_features
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import get [as alias]
def doc_features(doc):
    # word_features and isStopWord are module-level globals in the original code.
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = doc_words.get(word, 0)
    return features