当前位置: 首页>>代码示例>>Python>>正文


Python FreqDist.samples方法代码示例

本文整理汇总了Python中nltk.probability.FreqDist.samples方法的典型用法代码示例。如果您正苦于以下问题:Python FreqDist.samples方法的具体用法?Python FreqDist.samples怎么用?Python FreqDist.samples使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.probability.FreqDist的用法示例。


在下文中一共展示了FreqDist.samples方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: text_to_dict

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appear in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i%100==0: print '    dict',str(i)+'/'+str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % feature_type);
        dicts.append(d)
    return dicts
开发者ID:himanshusapra9,项目名称:TextNet,代码行数:35,代码来源:freq_representation.py

示例2: text_to_vector

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.

    :param docs: iterable of raw document texts
    :param metric: frequency weighting scheme (``FrequencyMetrics.TF`` or ``FrequencyMetrics.TF_IDF``)
    :returns: numpy matrix of shape (num_features, num_docs), one column per document
    :raises ValueError: if *metric* is not a recognized ``FrequencyMetrics`` value
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Build a term-frequency distribution per document. doc_freqs is updated
    # with each document's distinct words once, so it holds document frequency
    # (used below for the IDF factor), not total term counts.
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix; column i is document i's weight vector
    # over the shared vocabulary all_tokens.
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            # BUG FIX: the original referenced the undefined name 'feature_type'
            # here, raising NameError instead of the intended ValueError.
            raise ValueError("No such feature type: %s" % metric)
        matrix[:,i] = v

    return matrix
开发者ID:himanshusapra9,项目名称:TextNet,代码行数:36,代码来源:freq_representation.py

示例3: getFeatures

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
 def getFeatures(self, corpus):
     """Tokenize, filter and stem *corpus*, returning the stem features.

     Non-alphanumeric characters (except '%' and '!') are stripped before
     tokenizing; stop words and single-character tokens are discarded, and
     at most self.__maxFeatures tokens are stemmed when that cap is > 0.
     """
     stemmer = PorterStemmer()
     stem_counts = FreqDist()
     # Keep only letters, digits, '%' and '!'; everything else becomes a space.
     non_token_chars = re.compile('[^a-zA-Z0-9%!]')
     cleaned = non_token_chars.sub(' ', corpus.lower())
     tokens = TreebankWordTokenizer().tokenize(cleaned)

     kept = 0
     for token in tokens:
         # Skip stop words and tokens of length <= 1.
         if stopwords.STOP_WORDS.get(token) or len(token.strip()) <= 1:
             continue
         stem_counts.inc(stemmer.stem_word(token))
         kept += 1
         # Honour the configured feature cap (only enforced when positive).
         if self.__maxFeatures > 0 and kept >= self.__maxFeatures:
             break

     return stem_counts.samples()
开发者ID:artscoop,项目名称:django-classifier,代码行数:20,代码来源:__init__.py

示例4: sorted

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
	# Report branch with gold tags available: for every tag seen in either the
	# tagger's output (tags_found) or the corpus (tags_actual), print counts
	# plus precision/recall over the per-tag reference/test word sets.
	# NOTE(review): the 'if' matching the 'else:' below lies outside this chunk,
	# so the governing condition cannot be confirmed from here.
	for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
		found = tags_found[tag]
		actual = tags_actual[tag]
		precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
		recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
		# Fixed-width columns; precision/recall strings are truncated to 13 chars.
		print '  '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
			str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]])
	
	print '  '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
else:
	# No gold tags: just tally how often the tagger emits each tag.
	sents = corpus.sents(**kwargs)
	taglen = 7  # minimum width of the Tag column
	
	# Optionally analyze only a leading fraction of the corpus.
	if args.fraction != 1.0:
		cutoff = int(math.ceil(len(sents) * args.fraction))
		sents = sents[:cutoff]
	
	for sent in sents:
		for word, tag in tagger.tag(sent):
			tags_found.inc(tag)
			
			# Widen the column to fit the longest tag seen so far.
			if len(tag) > taglen:
				taglen = len(tag)
	
	print '  '.join(['Tag'.center(taglen), 'Count'.center(9)])
	print '  '.join(['='*taglen, '='*9])
	
	for tag in sorted(tags_found.samples()):
		print '  '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])
	
	print '  '.join(['='*taglen, '='*9])
开发者ID:ANB2,项目名称:nltk-trainer,代码行数:33,代码来源:analyze_tagger_coverage.py

示例5: int

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
		# NOTE(review): this chunk begins mid-block — the conditionals guarding
		# these first lines are outside the visible region.
		cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
		chunked_sents = chunked_sents[:cutoff]
	
	print chunker.evaluate(chunked_sents), '\n'

if args.trace:
	print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__)

# Count every IOB node label the chunker produces over the corpus.
iobs_found = FreqDist()
sents = corpus.sents()

# Optionally analyze only a leading fraction of the corpus.
if args.fraction != 1.0:
	cutoff = int(math.ceil(len(sents) * args.fraction))
	sents = sents[:cutoff]

for sent in sents:
	tree = chunker.parse(tagger.tag(sent))
	
	# Tally every subtree label except the sentence root 'S'.
	for child in tree.subtrees(lambda t: t.node != 'S'):
		iobs_found.inc(child.node)

iobs = iobs_found.samples()
# Column width: at least 7, or the longest IOB label found.
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + '    Found  '
print '='*justify + '  ========='

for iob in sorted(iobs):
	print '  '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)])

print '='*justify + '  ========='
开发者ID:alepharchives,项目名称:nltk-trainer,代码行数:33,代码来源:analyze_chunker_coverage.py

示例6: test_add_to_freq_dist

# 需要导入模块: from nltk.probability import FreqDist [as 别名]
# 或者: from nltk.probability.FreqDist import samples [as 别名]
 def test_add_to_freq_dist(self):
     """add_words_to_freq_dist should register each whitespace-separated
     token of the input string exactly once."""
     dist = FreqDist()
     dist = coveyquery.add_words_to_freq_dist(dist, "hi how are you doing_today")
     # Five distinct tokens were added.
     assert_equals(len(dist.samples()), 5)
     # NOTE(review): the expected key order depends on FreqDist's internal
     # ordering — confirm against the nltk version in use.
     assert_equals(dist.keys(), ['doing_today', 'how', 'you', 'hi', 'are'])
开发者ID:atiw003,项目名称:steelir,代码行数:7,代码来源:test_coveyquery.py


注:本文中的nltk.probability.FreqDist.samples方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。