This article collects typical usage examples of the FreqDist.samples method from Python's nltk.probability module. If you have been wondering what FreqDist.samples does, how to call it, or what real uses of it look like, the curated code examples below may help. You can also read further about the method's containing class, nltk.probability.FreqDist.
The following shows 6 code examples of FreqDist.samples, sorted by popularity by default.
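Note that FreqDist.samples() and FreqDist.inc() belong to the legacy NLTK 2.x API used throughout these examples; in NLTK 3, FreqDist became a collections.Counter subclass and both methods were removed. A minimal sketch of the old call next to its modern replacements, assuming only that NLTK is installed:
from nltk.probability import FreqDist

fd = FreqDist()
for word in "the cat sat on the mat".split():
    fd[word] += 1               # NLTK 3+ (Counter-style); NLTK 2.x used fd.inc(word)

# NLTK 2.x: fd.samples() returned the recorded samples.
# NLTK 3+ replacements:
print(list(fd))                 # the distinct samples observed
print(fd.most_common())         # (sample, count) pairs, most frequent first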
Example 1: text_to_dict
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
def text_to_dict(docs, metric):
""" Create dictionaries of term frequencies based on documents
Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
"""
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
tf_dists = [] # List of TF distributions per document
# Create freq_dist for each document
for doc in docs:
doc = preprocess.preprocess_text(doc)
fd = FreqDist()
for word in doc: fd.inc(word)
doc_freqs.update(fd.samples())
tf_dists.append(fd)
num_docs = len(docs)
# Build dictionaries
dicts = []
for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print ' dict', str(i) + '/' + str(len(tf_dists))
d = {}
if metric == FrequencyMetrics.TF:
for word in fd.samples():
d[word] = fd.freq(word)
elif metric == FrequencyMetrics.TF_IDF:
for word in fd.samples():
d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
else:
            raise ValueError("No such feature type: %s" % metric)
dicts.append(d)
return dicts
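A hypothetical call, assuming the FrequencyMetrics constants and the preprocess module referenced above are importable:
docs = ["the cat sat on the mat", "the dog sat"]
tf_dicts = text_to_dict(docs, FrequencyMetrics.TF)
# tf_dicts[0] maps each term of the first document to its relative frequency;
# with FrequencyMetrics.TF_IDF each weight is scaled by log(num_docs / doc_freq).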
Example 2: text_to_vector
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
def text_to_vector(docs, metric):
""" Create frequency based feature-vector from text
Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
"""
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
tf_dists = [] # List of TF distributions per document
# Create freq_dist for each document
for doc in docs:
doc = preprocess.preprocess_text(doc)
fd = FreqDist()
for word in doc: fd.inc(word)
doc_freqs.update(fd.samples())
tf_dists.append(fd)
all_tokens = doc_freqs.keys()
num_docs = len(docs)
num_features = len(all_tokens)
# Build feature x document matrix
matrix = np.zeros((num_features, num_docs))
for i, fd in enumerate(tf_dists):
if metric == FrequencyMetrics.TF:
v = [fd.freq(word) for word in all_tokens]
elif metric == FrequencyMetrics.TF_IDF:
v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
else:
            raise ValueError("No such feature type: %s" % metric)
matrix[:,i] = v
return matrix
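A hypothetical call, mirroring Example 1; here the result is a term-by-document matrix rather than a list of dicts:
matrix = text_to_vector(docs, FrequencyMetrics.TF_IDF)
# matrix[i, j] holds the weight of token i in document j; rows follow the
# order of doc_freqs.keys(), columns follow the order of docs.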
Example 3: getFeatures
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
def getFeatures(self, corpus):
stemmer = PorterStemmer()
stems = FreqDist()
onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
corpus = onlyLettersNumbers.sub(' ', corpus.lower())
corpus = TreebankWordTokenizer().tokenize(corpus)
count = 0
    for word in corpus:
        # Skip stop words and single-character tokens; stem everything else.
        if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1:
            stems.inc(stemmer.stem_word(word))
            count += 1
            # Optionally cap the number of stemmed tokens considered.
            if self.__maxFeatures > 0 and count >= self.__maxFeatures:
                break
features = stems.samples()
return features
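A hypothetical call; the owner class and its __maxFeatures attribute are not shown, so the names below are assumptions:
extractor = FeatureExtractor()   # hypothetical class that defines getFeatures
features = extractor.getFeatures("Stocks rose 5% on Monday!")
# features lists the distinct stems that survive the stop-word and length filters.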
Example 4: sorted
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
found = tags_found[tag]
actual = tags_actual[tag]
precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
print ' '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]])
print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
else:
sents = corpus.sents(**kwargs)
taglen = 7
if args.fraction != 1.0:
cutoff = int(math.ceil(len(sents) * args.fraction))
sents = sents[:cutoff]
for sent in sents:
for word, tag in tagger.tag(sent):
tags_found.inc(tag)
if len(tag) > taglen:
taglen = len(tag)
print ' '.join(['Tag'.center(taglen), 'Count'.center(9)])
print ' '.join(['='*taglen, '='*9])
for tag in sorted(tags_found.samples()):
print ' '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])
print ' '.join(['='*taglen, '='*9])
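The precision and recall calls above compare a set of reference items against a set of test items; a minimal standalone sketch of the same nltk.metrics functions:
from nltk.metrics import precision, recall

reference = set([('the', 'DT'), ('cat', 'NN')])   # gold (word, tag) pairs
test = set([('the', 'DT'), ('cat', 'JJ')])        # tagger output
print(precision(reference, test))   # 0.5 -- half of the test items are correct
print(recall(reference, test))      # 0.5 -- half of the reference items were found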
Example 5: int
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
chunked_sents = chunked_sents[:cutoff]
print chunker.evaluate(chunked_sents), '\n'
if args.trace:
print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__)
iobs_found = FreqDist()
sents = corpus.sents()
if args.fraction != 1.0:
cutoff = int(math.ceil(len(sents) * args.fraction))
sents = sents[:cutoff]
for sent in sents:
tree = chunker.parse(tagger.tag(sent))
for child in tree.subtrees(lambda t: t.node != 'S'):
iobs_found.inc(child.node)
iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])
print 'IOB'.center(justify) + ' Found '
print '='*justify + ' ========='
for iob in sorted(iobs):
print ' '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)])
print '='*justify + ' ========='
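The subtrees() filter above visits every subtree whose label is not 'S'; a minimal sketch on a hand-built tree, using the NLTK 2 .node attribute (renamed .label() in NLTK 3):
from nltk.tree import Tree

tree = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])
for child in tree.subtrees(lambda t: t.node != 'S'):
    print(child.node)   # prints: NP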
Example 6: test_add_to_freq_dist
# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import samples [as alias]
def test_add_to_freq_dist(self):
fd = FreqDist()
fd = coveyquery.add_words_to_freq_dist(fd, "hi how are you doing_today")
assert_equals(len(fd.samples()), 5)
assert_equals(fd.keys(), ['doing_today', 'how', 'you', 'hi', 'are'])
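coveyquery is project-specific and not shown; a hypothetical implementation consistent with the assertions above, written against the legacy NLTK 2 API the test relies on:
def add_words_to_freq_dist(fd, text):
    # Count whitespace-separated tokens; underscores are not split,
    # so "doing_today" remains a single sample.
    for word in text.split():
        fd.inc(word)   # NLTK 2.x; in NLTK 3+ use fd[word] += 1
    return fd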