

Python FreqDist.keys Method Code Examples

This article collects typical usage examples of the nltk.probability.FreqDist.keys method in Python. If you are wondering how FreqDist.keys is used in practice, or looking for concrete examples of it, the hand-picked code samples below may help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.


Fifteen code examples of the FreqDist.keys method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
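
A note before the examples: most of the snippets below target Python 2 and NLTK 2.x, where FreqDist.keys() returned the samples sorted by decreasing frequency (so freq.keys()[:n] selected the n most frequent items) and counts were added with FreqDist.inc(). In NLTK 3.x, FreqDist is a subclass of collections.Counter: keys() is an unordered view, inc() has been removed, and the frequency-sorted listing comes from most_common(). A minimal sketch of the modern equivalents (the sample data is illustrative only):

from nltk.probability import FreqDist

fd = FreqDist()
for word in ["a", "b", "a", "c", "a", "b"]:
    fd[word] += 1                 # NLTK 3.x replacement for fd.inc(word)

print(fd.most_common(2))          # [('a', 3), ('b', 2)] -- formerly fd.keys()[:2]
print(fd.N())                     # 6 -- total number of observed tokens
print(list(fd.keys()))            # word types, no longer frequency-sorted in NLTK 3.x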

Example 1: __FreqFromCorpus

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
 def __FreqFromCorpus (self):
     r"""
         Questo metodo estrae le frequenze dal corpus
     """
     print "Calcolo bigrams..."
     bi = FreqDist(bigrams(self.words))
     print "Calcolo FreqDist..."
     wfr = FreqDist(self.words)
     
     print "Coda di elaborazione..."
     print 
           
     tot = len(bi.keys())
     i = 0
     for eles in bi.keys():
         a = wfr[eles[0]]
         b = wfr[eles[1]]
         ab = bi[eles]
         N = wfr.N()
         try:
             self.__col_logl.append (nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood  (a, b, ab, N))
             print "elemento %d / %d \t -> \tloglikelihood di %s %s \t\t ->  %f" % (i, tot,eles[0], eles[1], self.__col_logl[-1])
         except UnicodeEncodeError:
             # catch any encoding errors
             pass
         i += 1
Author: patriziobellan86, Project: LogLikeLihood, Lines: 28, Source: LogLikelihood.py

Example 2: pmi

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def pmi(features):
	'''
	Compute the PMI value for all features
	'''
	dic = FreqDist()
	dic_pos = FreqDist()
	pos = 0.0
	N = 0.0
	for i,feature in enumerate(features):
		N = N + 1
		for f in feature:
			if f[-1] == 1:
				pos = pos + 1
				for t in f[:-3]:
					dic_pos.inc(t)
					dic.inc(t)
			else:
				for t in f[:-3]:
					dic.inc(t)
	N = N + len(dic.keys())
	pos = pos + len(dic.keys())
	pmi_pos = {}
	for t in dic.keys():
		pmi_pos[t]=np.log(float((dic_pos[t]+1)*N)/float((dic[t]+1)*pos))
	pmi_pos = dict(sorted(pmi_pos.items(), key=itemgetter(1)))
	return pmi_pos
Author: 52nlp, Project: nlp, Lines: 28, Source: re.py
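
For reference, the quantity computed above is an add-one smoothed pointwise mutual information between each feature t and the positive class: pmi_pos[t] = log( ((dic_pos[t]+1) / pos) / ((dic[t]+1) / N) ) ≈ log( P(t | positive) / P(t) ). The +1 terms, together with the vocabulary size added to N and pos, keep the logarithm defined for features that never occur with the positive label; the result is returned sorted from the most negatively to the most positively associated features.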

Example 3: wordprefixsuffixsubstringsprobdist

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def wordprefixsuffixsubstringsprobdist():
	for w in englishdicttxt:
		wtok=w.split()
		if len(wtok) > 0:		
			computeprefixessuffixessubstrings(wtok[0])
			wordlist.append(wtok[0])
	#prefixf=open("WordPrefixesProbabilities.txt","w")
	#suffixf=open("WordSuffixesProbabilities.txt","w")
	prefixdict=FreqDist(prefixes)
	suffixdict=FreqDist(suffixes)
	substringsdict=FreqDist(suffixes)  # NOTE: likely intended to be built from the substrings list, not suffixes
	totalprefixes=sum(prefixdict.values())
	totalsuffixes=sum(suffixdict.values())
	totalsubstrings=sum(substringsdict.values())
	for pk,pv in zip(prefixdict.keys(), prefixdict.values()):
		prefixprobdict[pk] = float(pv)/float(totalprefixes)
	for pk,pv in zip(suffixdict.keys(), suffixdict.values()):
		suffixprobdict[pk] = float(pv)/float(totalsuffixes)
	for pk,pv in zip(substringsdict.keys(), substringsdict.values()):
		substringsprobdict[pk] = float(pv)/float(totalsubstrings)
	#json.dump(prefixprobdict,prefixf)
	#json.dump(suffixprobdict,suffixf)
	#print "prefix probabilities:",prefixprobdict
	#print "suffix probabilities:",suffixprobdict
	return (prefixprobdict, suffixprobdict, substringsprobdict)
Author: shrinivaasanka, Project: asfer-github-code, Lines: 27, Source: WordSubstringProbabilities.py

Example 4: get_most_common_ngrams

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
    def get_most_common_ngrams(self, n, nb_ngrams=None):
        """
        Compute and return the set of the most common ngrams in the documents.
        This set is cached inside the object.

        Args:
            n: The number of grams. Must be a positive integer.
            nb_ngrams: The number of ngrams to return, i.e. how many of the most common to keep.

        Returns:
            A list of the most common ngrams.
        """
        try:
            # return cached value
            return self._most_common_ngrams[n]
        except KeyError:
            pass

        # compute all ngrams
        all_ngrams = []
        for document in self.training_set:
            all_ngrams.extend(self.compute_ngrams(document, n))

        # get the frequency or return all ngrams
        freq = FreqDist(ngram for ngram in all_ngrams)
        # store and return the nb_ngrams most common ngrams
        if nb_ngrams:
            self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
        else:
            self._most_common_ngrams[n] = freq.keys()
        return self._most_common_ngrams[n]
Author: srom, Project: sentiment, Lines: 33, Source: sentiment.py
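
Note that under NLTK 3.x the freq.keys() slices above no longer yield the most frequent n-grams. A sketch of an equivalent caching step, keeping the example's variable names, might look like this:

        # store and return the nb_ngrams most common ngrams (NLTK 3.x)
        if nb_ngrams:
            self._most_common_ngrams[n] = [ngram for ngram, _ in freq.most_common(nb_ngrams)]
        else:
            self._most_common_ngrams[n] = [ngram for ngram, _ in freq.most_common()]
        return self._most_common_ngrams[n]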

Example 5: __init__

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first

        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """

        for ii in [x.lower() for x in words if x.lower() not in self._stop \
                       and len(x) >= self._min_length]:
            self._counts.inc(ii)

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        if len(self._counts) > self._cut_first + size:
            return self._counts.keys()[self._cut_first:(size + self._cut_first)]
        else:
            return self._counts.keys()[:size]
Author: ReedAnders, Project: StatisticalNLP, Lines: 36, Source: lda.py

Example 6: get_bot_nouns_verbs

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_bot_nouns_verbs(pos_tags, tagmap, n):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for (word, tag) in pos_tags:
        if tagmap[tag] == "VERB" and word not in funcwords and wn.synsets(word):
            fdVerb.inc(word) 
        elif tagmap[tag] == "NOUN" and word not in funcwords and wn.synsets(word):
            fdNoun.inc(word)
    return (fdNoun.keys()[::-1][:n], fdVerb.keys()[::-1][:n])
Author: bayomim, Project: CIS-530-Project, Lines: 13, Source: project.py
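
Because fdNoun.keys()[::-1][:n] relies on NLTK 2.x returning keys sorted by decreasing frequency, reversing them selects the n least frequent words. With NLTK 3.x (collections.Counter) the same selection could be sketched, using the example's names, as:

    # n least frequent nouns and verbs, least frequent first
    bot_nouns = [w for w, _ in fdNoun.most_common()[:-n - 1:-1]]
    bot_verbs = [w for w, _ in fdVerb.most_common()[:-n - 1:-1]]
    return (bot_nouns, bot_verbs)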

Example 7: get_all_nouns_verbs

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_all_nouns_verbs(tok_sents, tagmap):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for sent in tok_sents:
        for tup in sent:
            if tagmap[tup[2]] == "VERB" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdVerb.inc(tup[1]) 
            elif tagmap[tup[2]] == "NOUN" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdNoun.inc(tup[1])
    return (fdNoun.keys(), fdVerb.keys())
Author: closen39, Project: CIS-530-HW4, Lines: 14, Source: hw4_code_jmow_closen.py

Example 8: ExtractorOfWords

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class ExtractorOfWords():
    
    def __init__(self, pos_words, neg_words, type_of_Feature_extractor = 0): 
        self.pos_words_training =  reduce(lambda words,review: words + review.words(), pos_words, [])
        self.neg_words_training =  reduce(lambda words,review: words + review.words(), neg_words, [])
        
        if type_of_Feature_extractor == 1:
            formated_pos_words_training = self.Feature_extractor1(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor1(self.neg_words_training)
        elif type_of_Feature_extractor == 2:
            formated_pos_words_training = self.Feature_extractor2(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor2(self.neg_words_training)
        elif type_of_Feature_extractor == 3:
            formated_pos_words_training = self.Feature_extractor3(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor3(self.neg_words_training)
        elif type_of_Feature_extractor == 4:
            formated_pos_words_training = self.Feature_extractor4(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor4(self.neg_words_training)
        else: 
            formated_pos_words_training = self.pos_words_training
            formated_neg_words_training = self.neg_words_training
        
        self.pos_words_freqdist = FreqDist(formated_pos_words_training)
        self.neg_words_freqdist = FreqDist(formated_neg_words_training)
               
    #Extract n most Freq. words    
    def Extraxt_n_most_Freq_Words (self, n):
        return  self.pos_words_freqdist.keys()[:n], self.neg_words_freqdist.keys()[:n]
    
    #list of all words with their number of occurrences over *number_count*   
    def Extraxt_words_above_count (self, number_count):
        return  [word for word,count in self.pos_words_freqdist.iteritems() if count > number_count], [word for word,count in self.neg_words_freqdist.iteritems() if count > number_count]
    
    #PorterStemmer
    def Feature_extractor1(self, in_list):             
        ps =  PorterStemmer()
        return [ps.stem(w) for w in in_list]
    
    #lowercase versions of all the words
    def Feature_extractor2(self, in_list):             
        return [w.lower() for w in in_list]
    
    #Replace all number tokens with "NUM"
    def Feature_extractor3(self, in_list):             
        return ["NUM" if w.isdigit() else w for w in in_list]
    
    #combination of filters 1 and 2
    def Feature_extractor4(self, in_list):             
        return [w.lower() for w in in_list if w.isalpha() and w.lower() not in stopwords.words('english')]
Author: Labzin, Project: NLP, Lines: 51, Source: ExtractorOfWords.py
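
Extraxt_words_above_count uses FreqDist.iteritems(), which exists only under Python 2. A Python 3 sketch of the same filter, keeping the attribute names above, would be:

    def Extraxt_words_above_count(self, number_count):
        # words whose count exceeds number_count, per sentiment class
        return ([w for w, c in self.pos_words_freqdist.items() if c > number_count],
                [w for w, c in self.neg_words_freqdist.items() if c > number_count])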

Example 9: _train

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,
    # calculate the entropy.  Only include contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Author: atiassa, Project: recommend-2011, Lines: 51, Source: q2.py

Example 10: demo_similar

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def demo_similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.
        
        @param word: The word used to seed the similarity search
        @type word: C{str} 
        @param num: The number of words to generate (default=20)
        @type num: C{int}
        @seealso: L{ContextIndex.similar_words()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        while 1:
          word = raw_input('Enter a Chinese word such as "開心"(type 0 to exit):'); 
          print "word='"+ word + "'"
          if word == '0': break
          word = word.decode('utf-8')
          wci = self._word_context_index._word_to_contexts
          if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
          else:
            print "No matches"
Author: dreampocketit, Project: bocard, Lines: 35, Source: NLTK_tools.py

Example 11: getsimilar

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
	def getsimilar(self, word, num =20):
		"""
		@param word: The word used to seed the similarity search 
		@type word: C{str}  
		@param num: The number of words to generate (default=20) 
		@type num: C{int} 
		@seealso: L{ContextIndex.similar_words()}
		"""	
		if '_word_context_index' not in self.__dict__: 
			print 'Building word-context index...' 
			self._word_context_index = ContextIndex(self.tokens, 
		                                        filter=lambda x:x.isalpha(), 
		                                        key=lambda s:s.lower()) 
		#words = self._word_context_index.similar_words(word, num) 
		word = word.lower() 
		wci = self._word_context_index._word_to_contexts 
		if word in wci.conditions(): 
			contexts = set(wci[word]) 
			fd = FreqDist(w for w in wci.conditions() for c in wci[w] 
		              if c in contexts and not w == word) 
			words = fd.keys()[:num] #lists of words
			#print tokenwrap(words) 
			return words
		else: 
			print "No matches"
			return None
Author: wencanluo, Project: Summarization, Lines: 28, Source: OrigReader.py

Example 12: tokenize_clean

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def tokenize_clean(text):
    """Return list of items from tokenized text."""
    tokens = word_tokenize(text.lower())
    fdist = FreqDist(tokens)
    words = [w.lower() for w in fdist.keys()
             if w not in stopwords.words('english') and w.isalpha()]
    return words
Author: mkcor, Project: caption-words, Lines: 9, Source: term_frequency.py

Example 13: MyMarkovModel

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class MyMarkovModel(MarkovModel):
    def __init__(self, order):
        self.order = order
        self.filename = NGRAM_FILES[self.order]
       
        if 3 >= self.order >= 2:
            self.backoff = MyMarkovModel(order - 1)
            self.cfd = ConditionalFreqDist()
            self.charset = self.backoff.charset
            for ngram, count in self.get_data():
                context, char = tuple(ngram[:-1]), ngram[-1]
                self.cfd[context][char] = count

        elif self.order == 1:
            self.backoff = None
            self.n = 0
            self.fd = FreqDist()
            for char, count in self.get_data():
                self.fd[char] = count
            self.charset = set(self.fd.keys())

        else:
            raise NotImplementedError  # NotImplemented is a constant, not an exception

    def get_data(self):
        with open(self.filename) as fp:
            for line in fp.readlines():
                ngram, count = line.lower().split()
                count = int(count)
                yield ngram, count
Author: christiaanw, Project: NaNoGenMo-2014, Lines: 32, Source: markov.py

Example 14: get_term_freq_dict

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    
    # Lemmatize it
    word_freq = {}
    
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        
        # If it exists in word_freq, add the value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
            
        # Else, assign value
        else:
            word_freq[lemmatize_term] = val
    
    
    return word_freq
Author: Maverickwarrior, Project: Search-Engine, Lines: 28, Source: tokenize_docs.py
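
The snippet assumes that wordnet is already bound to a lemmatizer instance; that setup is not shown in the excerpt. A typical (assumed) setup and call might be:

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

wordnet = WordNetLemmatizer()   # assumed binding for the name used above
print(get_term_freq_dict("Dogs chase cats and cats chase dogs"))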

Example 15: text_to_vector

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:,i] = v

    return matrix
Author: himanshusapra9, Project: TextNet, Lines: 36, Source: freq_representation.py
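
FreqDist.inc() and FreqDist.samples() are gone in NLTK 3.x. A sketch of the per-document loop under the newer API, keeping the variable names above, would be:

    # Create freq_dist for each document (NLTK 3.x)
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist(doc)          # term frequencies for this document
        doc_freqs.update(set(fd))   # count each word at most once per document
        tf_dists.append(fd)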


Note: The nltk.probability.FreqDist.keys examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.