

Python FreqDist.values Method Code Examples

This article collects typical usage examples of the Python method nltk.probability.FreqDist.values, gathered from open-source projects. If you are unsure what FreqDist.values does or how to call it, the curated examples below should help; see also the other usage examples for the nltk.probability.FreqDist class.


The sections below present 11 code examples of FreqDist.values, ordered by popularity.
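As a warm-up, here is a minimal, self-contained sketch (toy data) of what FreqDist.values returns: the raw counts of the distribution, whose sum equals the total number of samples.

from nltk.probability import FreqDist

# Count token frequencies in a toy token list.
fdist = FreqDist(['a', 'b', 'a', 'c', 'a', 'b'])

# values() yields the raw counts; their sum is the total token count
# (the same figure FreqDist.N() reports).
print(sorted(fdist.values(), reverse=True))  # [3, 2, 1]
print(sum(fdist.values()))                   # 6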

Example 1: alpha

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
    def alpha(self):
        """Krippendorff 1980

        """
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, allpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        total_disagreement = 0.0
        total_ratings = 0
        all_valid_labels_freq = FreqDist([])

        total_do = 0.0 # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha
Author: prz3m, Project: kind2anki, Lines: 35, Source: agreement.py
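NLTK ships a full implementation of this metric in nltk.metrics.agreement; for context, a minimal usage sketch with made-up (coder, item, label) triples:

from nltk.metrics.agreement import AnnotationTask

# Hypothetical annotations: two coders labeling three items.
data = [('c1', 'i1', 'yes'), ('c2', 'i1', 'yes'),
        ('c1', 'i2', 'yes'), ('c2', 'i2', 'no'),
        ('c1', 'i3', 'no'),  ('c2', 'i3', 'no')]
task = AnnotationTask(data=data)
print(task.alpha())  # Krippendorff's alpha for the toy data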

Example 2: char_freq

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def char_freq(lines):
    """Return a DataFrame of characters, sorted by frequency in descending order."""
    corpus = nltk.Text(chain.from_iterable(lines))  # needs one long sequence of characters, not a list of strings
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': list(wc.keys()), 'freq': list(wc.values())})
    df.sort_values('freq', ascending=False, inplace=True)  # DataFrame.sort was removed in pandas 0.20
    df['idx'] = np.arange(len(wc))
    return df
Author: ijustloveses, Project: machine_learning, Lines: 10, Source: lstm_w2v_segment.py
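FreqDist.most_common can produce the same sorted table more directly; a small sketch with toy input (column names are illustrative), assuming a current pandas:

import pandas as pd
from nltk.probability import FreqDist

chars = list('hello world')  # toy character stream
wc = FreqDist(chars)
df = pd.DataFrame(wc.most_common(), columns=['word', 'freq'])  # already sorted by freq, descending
df['idx'] = range(len(df))
print(df.head())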

Example 3: get_top_words

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def get_top_words(directory, n, file):
	num_docs = 0.0
	flist = {}
	result = {}
	for f in os.listdir(directory):
		num_docs += 1
		rawContents = load_file_tokens(directory + '/' + f)
		fdist = FreqDist(rawContents)
		normalF = max(fdist.values())
		# Normalize each count by the document's most frequent term.
		for key in fdist.keys():
			fdist[key] = float(fdist[key]) / normalF
		flist[directory + '/' + f] = fdist

	for key in flist[file].keys():
		# IDF: log of total documents over documents containing the term.
		num_appear = 0
		for key_file in flist:
			if key in flist[key_file]:
				num_appear += 1
		result[key] = flist[file][key] * math.log(num_docs / num_appear)

	sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
	return [term for term, score in sorted_x[:n]]
Author: byouloh, Project: twitter-social-web-mining, Lines: 37, Source: ComputeTFIDF.py
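The same normalized-TF × IDF computation, stripped of the file I/O, can be sketched on a toy in-memory corpus (all names here are illustrative):

import math
from nltk.probability import FreqDist

docs = {'d1': ['cat', 'cat', 'dog'], 'd2': ['dog', 'fish']}  # toy corpus
tf = {name: FreqDist(tokens) for name, tokens in docs.items()}

def tfidf(term, doc_name):
    # TF normalized by the document's most frequent term.
    norm_tf = float(tf[doc_name][term]) / max(tf[doc_name].values())
    df = sum(1 for d in tf.values() if term in d)  # document frequency
    return norm_tf * math.log(float(len(docs)) / df)

print(tfidf('cat', 'd1'))  # high: frequent in d1, absent from d2
print(tfidf('dog', 'd1'))  # 0.0: 'dog' appears in every document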

Example 4: createPDwithTeleport

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def createPDwithTeleport(readerWordlist, mergedWordList):
    """Reader word distribution, smoothed by 'teleporting' 1% of the mass to the corpus distribution."""
    corpusPD = {}
    readerPD = {}

    unigramReaderWordList = FreqDist(readerWordlist)
    unigramCorpusWordList = FreqDist(mergedWordList)

    # The totals do not change per word, so compute them once outside the loop.
    readerTotal = float(sum(unigramReaderWordList.values()))
    corpusTotal = float(sum(unigramCorpusWordList.values()))

    for word in unigramCorpusWordList.keys():
        corpusPD[word] = unigramCorpusWordList[word] / corpusTotal

        if word in unigramReaderWordList:
            readerPD[word] = unigramReaderWordList[word] / readerTotal
        else:
            readerPD[word] = 0

        # Interpolate: 99% reader model, 1% corpus model, so unseen words keep nonzero mass.
        readerPD[word] = 0.99 * readerPD[word] + 0.01 * corpusPD[word]

    return readerPD
Author: UW-INFX575, Project: Kirtika_dhathathri, Lines: 23, Source: updated-jargonDist.py
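The interpolation step is standard mixture ('teleport') smoothing; a compact sketch with toy data and the same 0.99/0.01 weights:

from nltk.probability import FreqDist

reader = FreqDist(['java', 'python', 'python'])        # toy reader vocabulary
corpus = FreqDist(['java', 'python', 'ruby', 'ruby'])  # toy background corpus

lam = 0.99  # weight on the reader model; the remaining 1% 'teleports' to the corpus model
smoothed = {w: lam * (reader[w] / float(reader.N())) +
               (1 - lam) * (corpus[w] / float(corpus.N()))
            for w in corpus}

print(smoothed['ruby'])  # nonzero even though 'ruby' never occurs in the reader text
print(abs(sum(smoothed.values()) - 1.0) < 1e-9)  # a proper distribution over corpus words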

Example 5: wordprefixsuffixsubstringsprobdist

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def wordprefixsuffixsubstringsprobdist():
	for w in englishdicttxt:
		wtok = w.split()
		if len(wtok) > 0:
			computeprefixessuffixessubstrings(wtok[0])
			wordlist.append(wtok[0])
	prefixdict = FreqDist(prefixes)
	suffixdict = FreqDist(suffixes)
	substringsdict = FreqDist(substrings)  # the original passed `suffixes` here, presumably a copy-paste slip
	totalprefixes = sum(prefixdict.values())
	totalsuffixes = sum(suffixdict.values())
	totalsubstrings = sum(substringsdict.values())
	for pk, pv in prefixdict.items():
		prefixprobdict[pk] = float(pv) / float(totalprefixes)
	for pk, pv in suffixdict.items():
		suffixprobdict[pk] = float(pv) / float(totalsuffixes)
	for pk, pv in substringsdict.items():
		substringsprobdict[pk] = float(pv) / float(totalsubstrings)
	return (prefixprobdict, suffixprobdict, substringsprobdict)
Author: shrinivaasanka, Project: asfer-github-code, Lines: 27, Source: WordSubstringProbabilities.py
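Dividing each count by the total, as above, is exactly the maximum-likelihood estimate that nltk.probability.MLEProbDist computes; a toy comparison:

from nltk.probability import FreqDist, MLEProbDist

fd = FreqDist(['un', 're', 'un', 'in'])
by_hand = {k: float(v) / sum(fd.values()) for k, v in fd.items()}
mle = MLEProbDist(fd)
print(by_hand['un'], mle.prob('un'))  # both 0.5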

Example 6: plot_dist_productions_by_frequency

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def plot_dist_productions_by_frequency(productions):
    f = FreqDist(productions)
    fdd = FreqDist(f.values())  # distribution over frequencies: how many rules occur k times
    # Sort by frequency so the line plot is monotone along the x axis.
    x, y = zip(*sorted(fdd.items()))
    plt.plot(x, y, lw=2, color='b')
    plt.title('Productions by frequency')
    plt.xlabel('frequency')
    plt.ylabel('number of rules with frequency')
    plt.show()
Author: atiassa, Project: recommend-2011, Lines: 15, Source: q2_2.py

Example 7: _train

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()  # NLTK 2-era API: FreqDist.inc was removed in NLTK 3
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,
    # calculate the entropy.  Only keep contexts whose
    # entropy is below `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Author: atiassa, Project: recommend-2011, Lines: 51, Source: q2.py
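The context filter above hinges on an entropy helper self.H, which the snippet does not include; a plain Shannon-entropy sketch (a hypothetical stand-in, not the author's exact method) looks like:

import math

def entropy(probs):
    # Shannon entropy in bits; zero-probability outcomes contribute nothing.
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

print(entropy([0.5, 0.5]))  # 1.0 bit: maximally uncertain over two tags
print(entropy([1.0]))       # 0.0: a fully predictable context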

Example 8: filterTokens

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def filterTokens(tokens, typefeatures=None):
    all_terms = FreqDist(tokens)

    # Minimum number of occurrences a term needs in order to be kept.
    if typefeatures == 'unigrams':
        minimal = 2
    elif typefeatures == 'bigrams':
        minimal = 2
    else:
        minimal = 1

    other = FreqDist()
    for term, freq in all_terms.items():
        if freq >= minimal:
            other.inc(term, freq)
        else:
            # FreqDist iterates from most to least frequent (NLTK 2),
            # so the first term below the threshold ends the scan.
            break

    return other
Author: diegocaro, Project: opinionapp, Lines: 21, Source: tokens.py
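FreqDist.inc was removed in NLTK 3, where FreqDist is a collections.Counter subclass; an equivalent threshold filter against the current API might be sketched as:

from nltk.probability import FreqDist

def filter_tokens(tokens, minimal=2):
    # Keep only terms that occur at least `minimal` times.
    all_terms = FreqDist(tokens)
    return FreqDist({t: f for t, f in all_terms.items() if f >= minimal})

print(dict(filter_tokens(['a', 'a', 'b', 'a', 'b', 'c'])))  # {'a': 3, 'b': 2}; the singleton 'c' is dropped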

Example 9: get_buzzwords

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
def get_buzzwords(docs):
	buzzwords = []
	# Build each document's frequency distribution once, up front.
	freqdists = {}
	for doc in docs:
		freqdists[doc] = FreqDist(docs[doc])
		vocab = freqdists[doc].keys()  # sorted by decreasing frequency in NLTK 2
		buzzwords = buzzwords + vocab[:50]

	buzzwords = set(buzzwords)

	freq_counts = {}
	for buzzword in buzzwords:
		print buzzword
		l = []
		for doc in docs:
			t = (doc, freqdists[doc][buzzword])
			l.append(t)
		freq_counts[buzzword] = l
	dump_content('freqs', freq_counts)
	return freq_counts
Author: JNazare, Project: inagural_speech_analysis, Lines: 23, Source: rerun_experiments.py

Example 10: getFreq

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
    def getFreq(self, text, normalize=True):
        stop_words = stopwords.words(self.detectLanguage(text))
        words = self.getTokens(text)
        clean_words = filter(lambda word: not word in stop_words and not word in punctuation, words)
        fdist = FreqDist(clean_words)
        # Equivalent construction:
        #     fdist = FreqDist()
        #     for word in word_tokenize(text):
        #         word = word.lower()
        #         if not word in stop_words and not word in punctuation:
        #             fdist[word] += 1
        # Normalize by dividing by the maximum frequency.
        if normalize:
            norm = float(max(fdist.values()))
            for word in list(fdist.keys()):  # copy the keys: entries are deleted below
                fdist[word] = fdist[word] / norm
                # Drop words that are too frequent or too rare.
                if fdist[word] >= self._upper_bound or fdist[word] <= self._lower_bound:
                    del fdist[word]
        return fdist
Author: rusad, Project: summarizer, Lines: 24, Source: summarizer.py
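The normalize-and-prune step on its own, with concrete toy data (the bounds here are made-up values standing in for self._upper_bound and self._lower_bound):

from nltk.probability import FreqDist

fdist = FreqDist('the quick brown fox the the'.split())
norm = float(max(fdist.values()))
upper, lower = 0.9, 0.1  # hypothetical bounds
for word in list(fdist.keys()):  # copy: entries are deleted during iteration
    fdist[word] = fdist[word] / norm
    if fdist[word] >= upper or fdist[word] <= lower:
        del fdist[word]
print(dict(fdist))  # 'the' (relative freq 1.0) is pruned; the rest keep ~0.33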

Example 11: for

# Required import: from nltk.probability import FreqDist [as alias]
# Alternatively: from nltk.probability.FreqDist import values [as alias]
     # Excerpt from the script's main loop: fdist_words, fdist_ngrams, params,
     # unigrams, bigrams and words_most_common are built earlier in the file.
     ngrams_most_common.append([k for (k, _) in fdist_ngrams.most_common(params.m)])
     outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
     
     # Write out the distribution of words in the document
     with codecs.open("distributions-data/output/words_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_words.most_common():
             prozent = fdist_words.freq(k)
             out.write("{},{},{}\n".format(k,v, prozent))
     # Write out the distribution of ngrams in the document
     with codecs.open("distributions-data/output/letters_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_ngrams.most_common():
             prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
             out.write("{},{},{}\n".format(k,v, prozent))  
     # Write the size of bins of words that appear with the same frequency               
     with codecs.open("distributions-data/bins/" + outputname, "w", encoding=my_encoding) as out:
         for i in sorted(set(fdist_words.values())):
             bin_size = fdist_words.Nr(i)
             out.write("{},{}\n".format(i,bin_size))     
 print('Output distributions saved in \'output\' folder.')
 print('Output bins saved in \'bins\' folder.')
 # If there are many documents -> compare their most common words and ngrams
 if len(params.files) > 1:
     print("Pairwise overlap between {} most frequent words:".format(params.n))
     short_names = [f.name[-15:] for f in params.files]
     for i, list1 in enumerate(words_most_common):
         for j, list2 in enumerate(words_most_common[i+1:]):
             print("{} | {} | ".format(short_names[i], short_names[i+j+1]), end="")
             overlap = len([w for w in list1 if w in list2])
             print(overlap)
     print("Pairwise overlap between {} most frequent letters and letter pairs:".format(params.m))
     short_names = [f.name[-15:] for f in params.files]
Author: daniilsorokin, Project: Web-Mining-Exercises, Lines: 33, Source: compute_distributions.py
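FreqDist.Nr(r), used for the bins above, counts how many samples occur exactly r times (NLTK 3 also exposes the full mapping as r_Nr()); a toy illustration:

from nltk.probability import FreqDist

fd = FreqDist(['a', 'a', 'a', 'b', 'b', 'c', 'd'])
for r in sorted(set(fd.values())):
    print(r, fd.Nr(r))
# 1 2   -> two words ('c', 'd') occur exactly once
# 2 1
# 3 1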


Note: the nltk.probability.FreqDist.values examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from community open-source projects; copyright in the source code remains with the original authors, and any distribution or use must follow the corresponding project's license. Do not reproduce without permission.