

Python probability.FreqDist Class Code Examples

This article collects typical usage examples of the Python class nltk.probability.FreqDist. If you are wondering what FreqDist is for, how to use it, or what real-world code using it looks like, the curated class examples below should help.


Below are 15 code examples of the FreqDist class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
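Before the examples, here is a minimal sketch of the core FreqDist API most of them rely on (assuming NLTK 3.x, where counts are updated by item assignment because the old inc() method was removed):

from nltk.probability import FreqDist

# count tokens directly from an iterable, or build up counts incrementally
fd = FreqDist(['the', 'cat', 'sat', 'on', 'the', 'mat'])
fd['dog'] += 1                # dict-style increment (FreqDist.inc() is gone in NLTK 3)

print(fd['the'])              # 2 -- raw count of a sample
print(fd.N())                 # 7 -- total number of observed tokens
print(fd.most_common(3))      # the three most frequent samples with their counts
print(fd.freq('the'))         # 2/7 -- relative frequency of a sample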

Example 1: create_word_scores

def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures
    import itertools
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a flat list of words
    negWords = list(itertools.chain(*negWords))  # same for the negative words

    word_fd = FreqDist()  # frequency of every word over both classes
    cond_word_fd = ConditionalFreqDist()  # per-class word frequencies (positive vs. negative texts)
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # total number of word tokens in the positive class
    neg_word_count = cond_word_fd[negTag].N()  # total number of word tokens in the negative class
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square score of the word in each class; other association measures (e.g. PMI) could be used instead
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of both chi-square scores

    return word_scores  # maps each word to its informativeness score
Developer ID: coolspiderghy, Project: sina_weibo_crawler, Lines of code: 27, Source: extractFeatures.py
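A minimal sketch of how this scorer might be called. The pos_docs/neg_docs lists are made-up placeholders; each class is assumed to be a list of tokenized documents, and posTag/negTag are simply the condition labels used inside the function:

# made-up inputs: each class is a list of tokenized documents
pos_docs = [['great', 'movie'], ['really', 'great', 'acting']]
neg_docs = [['terrible', 'plot'], ['boring', 'and', 'terrible']]

word_scores = create_word_scores(pos_docs, neg_docs, 'pos', 'neg')

# keep the N highest-scoring (most informative) words as classifier features
best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:1000]
best_words = set(word for word, score in best)

Summing the per-class chi-square scores ranks words that are strongly associated with either class, which is what makes them useful as classifier features.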

Example 2: summarize

def summarize(self, input, num_sentences):
    punt_list = ['.', ',', '!', '?']
    summ_sentences = []
    sentences = input  # input is expected to be a list of sentences; use sent_tokenize(input) for raw text
    lowercase_sentences = [sentence.lower() for sentence in sentences]
    saito = ' '.join(sentences)

    # strip punctuation characters, lowercase, and drop stopwords before counting
    ts = ''.join([ch for ch in saito if ch not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)

    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # add sentences containing the most frequent words
    if len(sentences) < num_sentences:
        num_sentences = len(sentences)
    for word in most_frequent_words:
        for i in range(len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if (lowercase_sentences[i] not in summ_sentences
                        and word in lowercase_sentences[i]):
                    summ_sentences.append(lowercase_sentences[i])
            else:
                break
        if len(summ_sentences) >= num_sentences:
            break

    # reorder the selected sentences to match their order in the original text
    summ_sentences.sort(key=saito.lower().find)
    return summ_sentences
Developer ID: benjbigot, Project: BNN_WIN, Lines of code: 35, Source: naivesumm.py

Example 3: create_words_bigrams_scores

def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # build one collocation finder per class so each class gets its own bigrams
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Developer ID: wac81, Project: LSI-for-ChineseDocument, Lines of code: 35, Source: pos_neg_ml_feature.py

Example 4: make_summary

def make_summary(text):
    tokens = word_tokenize(text)
    sent = sent_tokenize(text)
    # keep only non-stopword tokens (removing items from a list while iterating over it skips elements)
    tokens = [token for token in tokens if token not in stopwords.words('english')]

    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in tokens]
    stemmed = [word.lower() for word in stemmed]  # str.lower() returns a new string, so reassign the list
    word_freq = FreqDist(stemmed)

    most_freq_words = [pair[0] for pair in word_freq.most_common(60)]

    working_sent = [sentence.lower() for sentence in sent]

    out_sent = []

    # pick sentences that contain the most frequent stems, up to five sentences
    for word in most_freq_words:
        for i in range(0, len(working_sent)):
            if word in working_sent[i] and sent[i] not in out_sent:
                out_sent.append(sent[i])
                break
            if len(out_sent) >= 5:
                break

        if len(out_sent) >= 5:
            break

    return reorder(out_sent, text)
Developer ID: aigeano, Project: Summaly, Lines of code: 35, Source: summaly.py
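A usage sketch for make_summary. The reorder() helper it ends with is defined elsewhere in summaly.py; the stand-in below simply restores original sentence order, and "article.txt" is a made-up input file:

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

def reorder(selected_sentences, text):
    # stand-in for the project's reorder() helper: put the picked sentences
    # back into the order in which they appear in the original text
    return sorted(selected_sentences, key=text.find)

with open("article.txt", encoding="utf8") as f:   # made-up input file
    print(make_summary(f.read()))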

Example 5: train_supervised

    def train_supervised(self, labelled_sequences, **kwargs):
        """
        Supervised training maximising the joint probability of the symbol and
        state sequences. This is done via collecting frequencies of
        transitions between states, symbol observations while within each
        state and which states start a sentence. These frequency distributions
        are then normalised into probability estimates, which can be
        smoothed if desired.

        :return: the trained model
        :rtype: HiddenMarkovModelTagger
        :param labelled_sequences: the training data, a set of
            labelled sequences of observations
        :type labelled_sequences: list
        :param kwargs: may include an 'estimator' parameter, a function taking
            a FreqDist and a number of bins and returning a CProbDistI;
            otherwise a MLE estimate is used
        """

        # default to the MLE estimate
        estimator = kwargs.get('estimator')
        if estimator is None:
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # count occurrences of starting states, transitions out of each state
        # and output symbols observed in each state
        known_symbols = set(self._symbols)
        known_states = set(self._states)

        starting = FreqDist()
        transitions = ConditionalFreqDist()
        outputs = ConditionalFreqDist()
        for sequence in labelled_sequences:
            lasts = None
            for token in sequence:
                state = token[_TAG]
                symbol = token[_TEXT]
                if lasts is None:
                    starting[state] += 1  # FreqDist.inc() was removed in NLTK 3; use item assignment
                else:
                    transitions[lasts][state] += 1
                outputs[state][symbol] += 1
                lasts = state

                # update the state and symbol lists
                if state not in known_states:
                    self._states.append(state)
                    known_states.add(state)

                if symbol not in known_symbols:
                    self._symbols.append(symbol)
                    known_symbols.add(symbol)

        # create probability distributions (with smoothing)
        N = len(self._states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        B = ConditionalProbDist(outputs, estimator, len(self._symbols))

        return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Developer ID: pierrefribourg, Project: nltk, Lines of code: 60, Source: hmm.py
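In practice this method is usually reached through NLTK's HiddenMarkovModelTrainer rather than called directly. A minimal sketch, assuming NLTK 3.x and the treebank sample corpus, passing a Lidstone estimator so unseen words and transitions keep non-zero probability:

from nltk.corpus import treebank
from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

train_sents = treebank.tagged_sents()[:3000]    # lists of (word, tag) pairs
trainer = HiddenMarkovModelTrainer()
# pass an estimator so unseen words and transitions get smoothed, non-zero probability
tagger = trainer.train_supervised(
    train_sents,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

print(tagger.tag(['This', 'is', 'a', 'test']))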

Example 6: train_MLT

    def train_MLT(self, tagged_train_data, untagged_training_data):
        """
        Builds a most likely tag tagger from the given tagged training data as WORDS
        :param train_data:
        :return: model
        """
        # find the set of words
        words = set()
        for sent in untagged_training_data:
            for word in sent:
                words.add(word)
        # Define mlt_dict of format {word1:{(word1,tag1):count1, (word1, tag2):count2 ........},..........}
        mlt_dict = dict()
        # Initialize keys and values to it
        for word in words:
            mlt_dict[word] = dict()
        # Compute the freq dist of tagged words
        tagged_words_fdist = FreqDist(tagged_train_data)

        for tagged_word, count in tagged_words_fdist.items():
            (mlt_dict[tagged_word[0]])[tagged_word] = count

        # Update the dict to contain the most likely tag for each word
        #for word, inside_dict in mlt_dict.items():
        #   max_val = max(inside_dict.values())
        #    inside_dict =
        print("Training is done!")
        return mlt_dict
Developer ID: GaddipatiAsish, Project: Natural-Language-Processing, Lines of code: 28, Source: Q6_Part1.py
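The commented-out block hints at the step that is still missing: reducing each word's count dictionary to its single most likely tag. A small sketch of that lookup, assuming the dict layout documented above (the 'NN' default is an illustrative placeholder):

def most_likely_tag(mlt_dict, word, default='NN'):
    # pick the (word, tag) pair with the highest count; fall back to a default tag for unseen words
    counts = mlt_dict.get(word)
    if not counts:
        return default
    best_pair = max(counts, key=counts.get)
    return best_pair[1]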

Example 7: most_frequent_words

def most_frequent_words(path, top):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            #print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()

        words = get_bigrams(formated_text)
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and not isEnglish(w) and w != "``")
        keys = fdist.most_common(top)
        for key in keys:
            #print(str(key[0]) + " , " + str(key[1]) + "\n")
            word_set.add(key[0])
    print(word_set)
    fw = open("./Features/Bigrams.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
Developer ID: olee12, Project: Stylogenetics, Lines of code: 32, Source: most_frequent_bigrams.py

Example 8: classify

	def classify(self, feats):
		counts = FreqDist()

		for classifier in self._classifiers:
			# count each sub-classifier's vote; FreqDist.inc() was removed in NLTK 3
			counts[classifier.classify(feats)] += 1

		return counts.max()
Developer ID: RomanZacharia, Project: python_text_processing_w_nltk2_cookbook, Lines of code: 7, Source: classification.py
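The same majority-vote idea can be written as a standalone helper around FreqDist; a sketch assuming a list of already-trained NLTK-style classifiers that expose classify():

from nltk.probability import FreqDist

def vote(classifiers, feats):
    """Return the label most classifiers agree on (ties broken by FreqDist.max())."""
    counts = FreqDist()
    for classifier in classifiers:
        counts[classifier.classify(feats)] += 1
    return counts.max()

# e.g. vote([nb_classifier, dt_classifier, maxent_classifier], {'contains(great)': True})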

Example 9: scores

  def scores(self, docId):
    """
    Return the score from the given document to every other
    document in the index. Documents not listed are assumed
    to have no similarity detected by shared terms.

    :param docId: ID of doc to compare other docs to.
    :returns: A list of tuples of (document ID, similarity score).
      Larger scores are better.
    """
    if not self._idf:
      self._computeIdfs()
    # Track the scores
    #
    docScores = FreqDist()
    for termid, freq in self.termFrequencies[docId].items():
      # Find the frequency with which this term appears in other documents.
      #
      inverseDocumentFrequency = self._idf[termid]
      for otherDocId in self.termsToDocuments[termid]:
        if otherDocId == docId:
          # Skip this document
          continue
        # Find the term frequency of the term in the other document.
        #
        otherFreq = self.termFrequencies[otherDocId][termid]
        # Score proportional to product of frequencies times the inverse of
        # the document frequency.
        #
        docScores[otherDocId] += freq * otherFreq * inverseDocumentFrequency

    return docScores
Developer ID: timdestan, Project: quiz-bowl-entity-resolution, Lines of code: 32, Source: invertedindex.py

Example 10: word_tag_model

def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)

    most_freq = (word for word, count in fd.most_common(limit))

    return dict((word, cfd[word].max()) for word in most_freq)
Developer ID: byam, Project: predictEPL, Lines of code: 7, Source: tag_util.py
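The returned word-to-tag dict has the shape accepted by NLTK's UnigramTagger via its model argument; a sketch using the treebank sample corpus (the 'NN' backoff is an illustrative choice):

from nltk.corpus import treebank
from nltk.tag import UnigramTagger, DefaultTagger

tagged_words = treebank.tagged_words()
words = [word for word, tag in tagged_words]

model = word_tag_model(words, tagged_words, limit=200)

# use the most-likely-tag model as a lookup tagger; unknown words fall back to the default tag
tagger = UnigramTagger(model=model, backoff=DefaultTagger('NN'))
print(tagger.tag(['the', 'quick', 'brown', 'fox']))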

Example 11: get_term_freq_dict

def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    
    # Lemmatize it
    word_freq = {}
    
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        
        # If it exist in word_freq, add value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
            
        # Else, assign value
        else:
            word_freq[lemmatize_term] = val
    
    
    return word_freq
Developer ID: Maverickwarrior, Project: Search-Engine, Lines of code: 26, Source: tokenize_docs.py
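A sketch of how this helper might be wired up; the module-level wordnet name it calls is assumed to be a WordNetLemmatizer instance, and the sample sentence is made up:

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

wordnet = WordNetLemmatizer()   # assumed module-level name used inside get_term_freq_dict

doc = "The dog watches the dogs"
print(get_term_freq_dict(doc))  # singular and plural forms ('dog'/'dogs') are merged into one lemma count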

Example 12: choose_tag

	def choose_tag(self, tokens, index, history):
		tags = FreqDist()

		for tagger in self._taggers:
			# count each sub-tagger's vote; FreqDist.inc() was removed in NLTK 3
			tags[tagger.choose_tag(tokens, index, history)] += 1

		return tags.max()
Developer ID: ANB2, Project: nltk-trainer, Lines of code: 7, Source: taggers.py

Example 13: create_word_bigram_scores

def create_word_bigram_scores(posWords, negWords):
    # build one collocation finder per class so each class gets its own bigrams
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Developer ID: delili, Project: NLP_Comments_Sentiment_Analysis, Lines of code: 29, Source: process.py

Example 14: create_word_scores

def create_word_scores(posWords, negWords):
    file_scores = open("cn_sample_data/scores.txt", "w")

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # write the scores to disk, highest-scoring (most informative) words first
    for key, score in sorted(word_scores.items(), key=lambda item: item[1], reverse=True):
        file_scores.write(str(key) + " : " + str(score) + "\n")
    file_scores.close()
    return word_scores
Developer ID: delili, Project: NLP_Comments_Sentiment_Analysis, Lines of code: 25, Source: process.py

Example 15: GetHighInformationWordsChi

        def GetHighInformationWordsChi(num_bestwords):
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
 
            for word in movie_reviews.words(categories=['pos']):
                word_fd[word.lower()] +=1
                label_word_fd['pos'][word.lower()] +=1
 
            for word in movie_reviews.words(categories=['neg']):
                word_fd[word.lower()] +=1
                label_word_fd['neg'][word.lower()] +=1
 
            pos_word_count = label_word_fd['pos'].N()
            neg_word_count = label_word_fd['neg'].N()
            total_word_count = pos_word_count + neg_word_count
 
            word_scores = {}
 
            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score
 
            best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:num_bestwords]
            bestwords = set([w for w, s in best])
            return bestwords
Developer ID: ai2010, Project: machine_learning_for_the_web, Lines of code: 28, Source: views.py
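A sketch of how the returned bestwords set is typically used to build bag-of-words features for an NLTK classifier. In the source project the function above is nested inside a Django view, so this assumes it is callable at module level, along with the imports the snippet itself relies on:

# imports the nested snippet above relies on, plus the classifier
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier

bestwords = GetHighInformationWordsChi(10000)

def best_word_feats(words):
    # keep only high-information words as boolean bag-of-words features
    return dict((word, True) for word in words if word in bestwords)

feats = [(best_word_feats(movie_reviews.words(fileids=[f])), label)
         for label in movie_reviews.categories()
         for f in movie_reviews.fileids(label)]

classifier = NaiveBayesClassifier.train(feats)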


Note: The nltk.probability.FreqDist class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their authors, and copyright of the source code remains with the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.