

Python ConditionalFreqDist.conditions Method Code Examples

This article collects typical usage examples of the Python method nltk.probability.ConditionalFreqDist.conditions. If you are trying to work out what ConditionalFreqDist.conditions does, how to use it, or what calling it looks like in practice, the hand-picked code examples below should help. You can also explore further usage of the enclosing class, nltk.probability.ConditionalFreqDist.


The following presents 15 code examples of the ConditionalFreqDist.conditions method, sorted by popularity by default.

Example 1: high_information_words

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from labeled word sets, for efficiency.
    :param labeled_words: list of 2-tuples [(label, words)], where
                          label is a classification label (e.g. pos / neg) and
                          words is a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score for a word to count as highly informative
    :return: a set of highly informative words
    """
    print("Counting word frequencies")
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()

    # Count each word globally and per label.
    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()

    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)

        # Score each word by how strongly it associates with this label.
        for word, n_ii in labeled_word_fq[label].items():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
Developer: Saher-, Project: SATC, Lines: 36, Source: Sys_Params.py
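
For context, here is a minimal usage sketch; the labels and word lists are invented for illustration and are not from the SATC project:

labeled = [('pos', ['great', 'fun', 'great', 'crisp']),
           ('neg', ['boring', 'dull', 'boring'])]
print(high_information_words(labeled, min_score=1))
# -> the subset of words whose chi-squared association with a label is >= 1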

Example 2: Ae_kappa

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def Ae_kappa(self, cA, cB):
    """Expected chance agreement between coders cA and cB, as used by Cohen's kappa."""
    Ae = 0.0
    nitems = float(len(self.I))
    # label_freqs[label][coder] counts how often each coder assigned each label.
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Developer: DevilDante88, Project: MyCogs, Lines: 9, Source: agreement.py
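
For intuition, this is the expected chance agreement term of Cohen's kappa: Ae = sum over labels k of P(k|cA) * P(k|cB). A standalone sketch of the same computation, assuming (as in nltk.metrics.agreement) that each data record is a dict with 'coder', 'item', and 'labels' keys:

from nltk.probability import ConditionalFreqDist

def expected_agreement(data, cA, cB, n_items):
    # label_freqs[label][coder] counts how often each coder assigned each label.
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in data)
    return sum((label_freqs[k][cA] / float(n_items)) *
               (label_freqs[k][cB] / float(n_items))
               for k in label_freqs.conditions())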

Example 3: get_high_information_words

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # lwords maps each label to a list of documents, each a list of words.
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()

    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1

    n_words_total = label_word_freq_dist.N()
    high_info_words = set()

    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)

        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Developer: fruser, Project: review-analyzer, Lines: 29, Source: text_utils.py

Example 4: readFormatedData

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def readFormatedData(formatedData):
    # Forward and reverse conditional bigram distributions.
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    for pair in formatedData:
        words = pair[0].split(' ')
        count = int(pair[1])
        word2 = words[1]
        # Collapse rare second words into a single "unknown" token.
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]][word2] += count
    # Build the reverse distribution: condition on the second word.
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1:
            cBigramFd2[w2][w1] += bigram_w1[w2]
    return cBigramFd1, cBigramFd2
Developer: szha, Project: surprise-models, Lines: 28, Source: aggregatePickMax.py
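
A hedged sketch of the expected input format, inferred from the parsing code: each item is a ("word1 word2", count) pair. The data below is made up:

data = [("the cat", "7"), ("the dog", "3"), ("a cat", "12")]
fwd, rev = readFormatedData(data)
print(fwd['the']['cat'])      # 7
print(fwd['the']['unknown'])  # 3 ("dog" is collapsed, since its count < 5)
print(list(rev['cat']))       # ['the', 'a'] -- first words seen before 'cat'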

Example 5: high_information_words

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
Developer: RomanZacharia, Project: python_text_processing_w_nltk2_cookbook, Lines: 27, Source: featx.py

Example 6: _setSelectedPOSTags

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
   def _setSelectedPOSTags(self):

      buff = self._loadData('selective_pos.bin')

      if buff:
         self.selective_pos = buff
         return

      # First get all (word, tag) pairs in the corpus.
      # NLTK 3 note: tagset='universal' replaces the removed simplify_tags=True,
      # and the universal tagset spells the conjunction tag "CONJ" (not "CNJ").
      sentences = brown.tagged_sents(tagset='universal')
      self.selected_tags = ["ADJ", "ADV", "CONJ"]
      self.selective_pos = ConditionalFreqDist()
      temp_dist = ConditionalFreqDist()
      for sentence in sentences:
         for (word, tag) in sentence:
            if tag in self.selected_tags:
               temp_dist[tag][str(word).lower()] += 1

      # Now keep the words that occur more than 4 times.
      for category in temp_dist.conditions():
         fredist = temp_dist[category]
         for key in fredist.keys():
            if fredist[key] > 4:
               self.selective_pos[category][key] += 1

      self._saveData('selective_pos.bin', self.selective_pos)
Developer: okoye, Project: sentimentanalysis, Lines: 28, Source: opinionminer.py
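
A minimal standalone sketch of the same idea without the class's cache plumbing, assuming NLTK 3 and a downloaded brown corpus:

from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

temp = ConditionalFreqDist(
    (tag, word.lower())
    for sent in brown.tagged_sents(tagset='universal')
    for word, tag in sent
    if tag in ("ADJ", "ADV", "CONJ"))

# Keep only the words seen more than 4 times under each tag.
frequent = {tag: [w for w in temp[tag] if temp[tag][w] > 4]
            for tag in temp.conditions()}
print({tag: len(words) for tag, words in frequent.items()})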

Example 7: _train

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                        tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
Developer: Weiming-Hu, Project: text-based-six-degree, Lines: 61, Source: sequential.py
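
For context, _train runs automatically when a ContextTagger subclass such as UnigramTagger is constructed; a short sketch (the corpus slice is arbitrary):

from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents(categories='news')[:500]
tagger = UnigramTagger(train_sents, cutoff=2, verbose=True)
print(tagger.tag(['The', 'jury', 'said']))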

Example 8: _train

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob[tag] += 1
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Filter the useful contexts by entropy: keep only contexts
    # whose entropy is at most `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print(list(most_high.keys()))
    # Build the context_to_tag table -- for each surviving context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
Developer: atiassa, Project: recommend-2011, Lines: 51, Source: q2.py
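
The H helper is not part of this excerpt. Purely as an assumption, one implementation consistent with the call H(dd.keys(), tags_probs) would be the entropy of the corpus-wide tag distribution restricted to the tags seen in a context:

import math

def H(self, context_tags, tags_probs):
    # Hypothetical sketch only -- the project's real H is not shown here.
    ps = [p for t, p in tags_probs if t in context_tags]
    total = sum(ps)
    if total == 0:
        return 0.0
    return -sum((p / total) * math.log(p / total, 2) for p in ps)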

Example 9: validate_pcfg_generate

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    # Sample 1000 trees from the grammar and collect their productions.
    for i in range(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()][p.rhs()] += 1

    # Compare the empirical RHS distribution for each LHS against the
    # grammar's own distribution.
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print("KL_Divergence for %s = %f" % (c, div))
Developer: haozhuoran1991, Project: recommend-2011, Lines: 19, Source: q2_1.py
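
makeLhrProbDict, pcfg_generate, and KL_Divergence are project helpers that this excerpt does not show. A plausible KL divergence between two NLTK ProbDist objects -- an assumption, not the project's code -- could look like:

import math

def KL_Divergence(p, q):
    # D(p || q) = sum_x p(x) * log2(p(x) / q(x)); assumes q assigns nonzero
    # probability to every sample of p.
    return sum(p.prob(s) * math.log(p.prob(s) / q.prob(s), 2)
               for s in p.samples() if p.prob(s) > 0)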

Example 10: _train

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Like ContextTagger._train, but for each context it keeps every tag
        whose count exceeds `cutoff`, not just the single most likely tag.
        """
        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the contexts_to_tags table -- record every (tag, count)
        # pair seen more than `cutoff` times for each useful context.
        for context in useful_contexts:
            for (tag, hits) in fd[context].items():
                if hits > cutoff:
                    self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                    self._contexts_to_tags[context][tag] = hits
                    hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._contexts_to_tags)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
Developer: 0623forbidden, Project: nltk4russian, Lines: 45, Source: tagger.py

Example 11: sum_category_word_scores

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    # A word's score accumulates across every category it appears in.
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
Developer: Herka, Project: nltk-trainer, Lines: 22, Source: scoring.py
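
A quick usage sketch with invented categories; because scores accumulate, a word appearing under several categories is scored once per category:

from nltk.metrics import BigramAssocMeasures

categorized = [('sports', ['goal', 'match', 'goal']),
               ('politics', ['vote', 'match'])]
scores = sum_category_word_scores(categorized, BigramAssocMeasures.chi_sq)
print(sorted(scores.items(), key=lambda kv: -kv[1])[:3])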

Example 12: significantWords

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """
    Use the chi-squared test on a bigram contingency table to measure
    the association between a token and its sentiment.

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: lower bound of significance
    ratio: pos/neg ratio, used to determine the sentiment of a word

    Returns
    -------
    significant_words: a dict mapping 'total' and each label to a word set

    """
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and not (token in stopping_words):
                freq_dist[token] += 1
                label_freq_dist[label][token] += 1
    n_xx = label_freq_dist.N()
    for label in label_freq_dist.conditions():
        n_xi = label_freq_dist[label].N()
        for word, n_ii in label_freq_dist[label].items():
            n_ix = freq_dist[word]
            # 2x2 contingency table for (word, label).
            n_oi = n_xi - n_ii
            n_io = n_ix - n_ii
            n_oo = n_xx - n_oi - n_io - n_ii
            chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) \
                    / ((n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
            if chisq > min_chisq and n_ii > 10:
                significant_words['total'] |= set([word])
                if float(n_ii) / n_ix > ratio and (n_ix - n_ii) > 1:
                    significant_words[label] |= set([word])
    return significant_words
Developer: Applied-data-science-HW8, Project: Homework_08, Lines: 43, Source: tagger.py
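
The hand-rolled statistic above is the standard Pearson chi-squared for a 2x2 contingency table, which NLTK also exposes as BigramAssocMeasures.chi_sq; a spot check with made-up counts should yield the same number from both:

from nltk.metrics import BigramAssocMeasures

n_ii, n_ix, n_xi, n_xx = 30, 50, 200, 1000
n_oi, n_io = n_xi - n_ii, n_ix - n_ii
n_oo = n_xx - n_oi - n_io - n_ii
manual = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) \
         / ((n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
print(manual, BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))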

Example 13: sum_category_word_scores

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def sum_category_word_scores(categorized_words, score_fn):
    # Global word frequencies.
    word_fd = FreqDist()
    # Per-category conditional frequencies.
    category_word_fd = ConditionalFreqDist()
    # Count words under each category.
    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()

        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    # Return the accumulated scores.
    return scores
Developer: CloudFlix, Project: Project_CloudFlix, Lines: 24, Source: model_trainer.py

Example 14: high_information_words

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # Gathers the most informative words per label to improve classification.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)

        # Score every word under this label; both this loop and the
        # selection below must stay inside the label loop.
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Developer: DeamonSpawn, Project: UntitledSAProj, Lines: 26, Source: feature_extractor.py

Example 15: unicode

# Required import: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
				word = unicode(word)  # Python 2 idiom; under Python 3 this would be str(word)
				cfd[prev_word][word] += 1
				global_fd[word] += 1
				prev_word = word
	except:
		print("a link failed...")

print("finished building the probability distributions")

print("building the lists in the format SQLite expects")
global_frequencies = []
for word in sorted(global_fd.keys()):
	global_frequencies.append((word, global_fd[word]))

conditional_frequencies = []
for condition in sorted(cfd.conditions()):
	for word in sorted(cfd[condition].keys()):
		if condition:
			conditional_frequencies.append((condition, word, cfd[condition][word]))

print("starting the database part")
con = lite.connect("words-pt.db")

with con:
	cur = con.cursor()
	cur.execute("DROP TABLE IF EXISTS _1_gram")
	cur.execute("CREATE TABLE _1_gram(word TEXT, count INT)")
	cur.executemany("INSERT INTO _1_gram VALUES(?, ?)", tuple(global_frequencies))

	cur.execute("DROP TABLE IF EXISTS _2_gram")
Developer: rafaelrozendo, Project: ATBarDev, Lines: 33, Source: populate_db.py
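
The excerpt ends before the _2_gram table is filled from conditional_frequencies; presumably it mirrors the _1_gram inserts. A hedged sketch of querying the finished database (table and column names come from the excerpt; everything else is assumed):

import sqlite3 as lite

con = lite.connect("words-pt.db")
with con:
    cur = con.cursor()
    # Ten most frequent unigrams.
    for word, count in cur.execute(
            "SELECT word, count FROM _1_gram ORDER BY count DESC LIMIT 10"):
        print(word, count)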


Note: the nltk.probability.ConditionalFreqDist.conditions examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms; the snippets were selected from open-source projects contributed by many programmers. Copyright of the source code remains with the original authors; consult each project's license before distributing or reusing the code. Do not reproduce this page without permission.