

Python probability.ConditionalFreqDist Class Code Examples

This article collects typical usage examples of the Python class nltk.probability.ConditionalFreqDist, drawn from open-source projects. If you are unsure what ConditionalFreqDist does or how to use it, the curated examples below should help.


Fifteen code examples of the ConditionalFreqDist class are shown below, ordered by popularity by default.
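
Before diving in, here is a minimal sketch of the core API (NLTK 3 style, toy data): a ConditionalFreqDist maps each condition to its own FreqDist of sample counts.

from nltk.probability import ConditionalFreqDist

# Condition on word length; count each word under its length.
words = ["the", "quick", "brown", "fox", "the", "lazy", "dog"]
cfd = ConditionalFreqDist((len(w), w) for w in words)

print(cfd.conditions())   # observed conditions, e.g. [3, 5, 4]
print(cfd[3]["the"])      # 2 -- "the" was counted twice under length 3
print(cfd[5].max())       # the most frequent 5-letter word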

Example 1: cDist

    def cDist(self, params):
        """return conditional freq distribution (based on part of speech) using filtered_words from loadData"""

        president = params["president"]
        speech = params["speech"]

        if president == "All presidents":
            pipeline = [{"$match": {"type": speech}}, {"$project": {"tags": "$filtered_speech_tags"}}]
        else:
            pipeline = [
                {"$match": {"name": president, "type": speech}},
                {"$project": {"tags": "$filtered_speech_tags"}},
            ]

        tags = []
        for i in self.col.aggregate(pipeline):
            tags.extend(i["tags"])

        cfdist = ConditionalFreqDist()  # conditioned on pos_tag
        for word, tag in tags:
            condition = tag  # specify condition to group frequencies by
            cfdist[condition][word] += 1

        # Index with [] (not .get) so a missing tag yields an empty
        # FreqDist instead of None, which MLEProbDist cannot handle.
        VB = MLEProbDist(cfdist["VBP"])
        NN = MLEProbDist(cfdist["NN"])
        JJ = MLEProbDist(cfdist["JJ"])

        return VB, NN, JJ  # return verbs, nouns, adjectives
Author: cgerson | Project: presidential-haikus | Lines: 28 | Source: pres_words_spyre.py
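
A hypothetical usage sketch (the `obj` instance and the "inaugural" speech type are assumptions, not taken from the source):

VB, NN, JJ = obj.cDist({"president": "All presidents", "speech": "inaugural"})
print(NN.prob("freedom"))   # MLE probability of "freedom" among nouns
print(JJ.generate())        # draw a random adjective from the distribution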

Example 2: _setSelectedPOSTags

   def _setSelectedPOSTags(self):

      buff = self._loadData('selective_pos.bin')

      if buff:
         self.selective_pos = buff
         return

      #First get all (word, tag) pairs in the corpus
      sentences = brown.tagged_sents(simplify_tags=True)
      self.selected_tags = ["ADJ","ADV", "CNJ"]
      self.selective_pos = ConditionalFreqDist()
      temp_dist = ConditionalFreqDist()
      for sentence in sentences:
         for (word, tag) in sentence:
            if tag in self.selected_tags:
               temp_dist[tag].inc(str(word).lower())

      #Now, keep the words with frequency > 4
      for category in temp_dist.conditions():
         fredist = temp_dist[category]
         for key in fredist.keys():
            if fredist[key] > 4:
               self.selective_pos[category].inc(key)

      self._saveData('selective_pos.bin',self.selective_pos)
Author: okoye | Project: sentimentanalysis | Lines: 26 | Source: opinionminer.py
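
Note that this example targets NLTK 2: FreqDist.inc() was removed in NLTK 3, where the same update is a plain item increment (as several later examples show):

# NLTK 2 style, as used above:
temp_dist[tag].inc(str(word).lower())

# NLTK 3 equivalent:
temp_dist[tag][str(word).lower()] += 1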

Example 3: readFormatedData

def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = Set([])
    #dict2 = Set([])
    for tuple in formatedData:
        words = tuple[0].split(' ')
        count = int(tuple[1])
        #unigramFd.inc(words[0])
        #unigramFd.inc(words[1])
        #bigramFd.inc((words[0], words[1]), count)
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]].inc(word2, count)
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1.samples():
            cBigramFd2[w2].inc(w1, bigram_w1[w2])
    return cBigramFd1, cBigramFd2#, dict1, dict2
Author: szha | Project: surprise-models | Lines: 26 | Source: aggregatePickMax.py

Example 4: high_information_words

def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from a set of words, for EFFICIENCY
    :param labeled_words: list of 2 tuples [(label, words)]
                          label -> is a classification label (pos / neg)
                          words -> is a list of words that occur under that label
    :param score_fn: a scoring function to measure how informative that word is
    :param min_score: the minimum score for a word to be included as MOST INFORMATIVE WORD
    :return: a set of high informative words
    """
    print "Counting Word Frequencies"
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()

    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()

    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)

        for word, n_ii in labeled_word_fq[label].iteritems():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)

    return high_info_words
Author: Saher- | Project: SATC | Lines: 34 | Source: Sys_Params.py
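
A minimal usage sketch, assuming NLTK's movie_reviews corpus supplies the labeled words (the one-(label, words)-pair-per-document layout is an assumption, not part of the source):

from nltk.corpus import movie_reviews

labeled_words = [(label, movie_reviews.words(fileid))
                 for label in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(label)]

best_words = high_information_words(labeled_words, min_score=5)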

Example 5: get_high_information_words

def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()

    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1

    n_words_total = label_word_freq_dist.N()
    high_info_words = set()

    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)

        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score

        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Author: fruser | Project: review-analyzer | Lines: 27 | Source: text_utils.py

Example 6: high_information_words

def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()
	
	for label, words in labelled_words:
		for word in words:
			word_fd.inc(word)
			label_word_fd[label].inc(word)
	
	n_xx = label_word_fd.N()
	high_info_words = set()
	
	for label in label_word_fd.conditions():
		n_xi = label_word_fd[label].N()
		word_scores = collections.defaultdict(int)
		
		for word, n_ii in label_word_fd[label].iteritems():
			n_ix = word_fd[word]
			score = score_fn(n_ii, (n_ix, n_xi), n_xx)
			word_scores[word] = score
		
		bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
		high_info_words |= set(bestwords)
	
	return high_info_words
Author: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 25 | Source: featx.py

Example 7: Ae_kappa

 def Ae_kappa(self, cA, cB):
     Ae = 0.0
     nitems = float(len(self.I))
     label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
     for k in label_freqs.conditions():
         Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
     return Ae
Author: DevilDante88 | Project: MyCogs | Lines: 7 | Source: agreement.py
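
This method mirrors nltk.metrics.agreement.AnnotationTask.Ae_kappa; a minimal sketch of the public API it belongs to:

from nltk.metrics.agreement import AnnotationTask

# Each datum is a (coder, item, label) triple.
data = [("c1", "item1", "pos"), ("c2", "item1", "pos"),
        ("c1", "item2", "neg"), ("c2", "item2", "pos")]
task = AnnotationTask(data=data)
print(task.kappa())   # pairwise kappa; Ae_kappa supplies the expected agreement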

Example 8: _train

    def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                        tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0) / token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))
Author: Weiming-Hu | Project: text-based-six-degree | Lines: 59 | Source: sequential.py
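
This trainer is internal to NLTK's context taggers; in practice it runs when you construct one, e.g. a UnigramTagger (a sketch, assuming the Brown corpus is installed):

from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger

train_sents = brown.tagged_sents(categories="news")[:500]
# The constructor calls _train(); cutoff prunes low-frequency contexts.
tagger = UnigramTagger(train_sents, backoff=DefaultTagger("NN"), cutoff=2, verbose=True)
print(tagger.tag(["The", "cat", "sat"]))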

Example 9: __init__

	def __init__(self, r, name, cond_samples=None):
		self._r = r
		self._name = name
		ConditionalFreqDist.__init__(self, cond_samples)
		# initialize self._fdists for all matching keys
		for key in self._r.keys(encode_key('%s:*' % name)):
			condition = key.split(':')[1]
			self[condition] # calls self.__getitem__(condition)
Author: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 8 | Source: redisprob.py

Example 10: __init__

	def __init__(self, r, name, cond_samples=None):
		self._r = r
		self._name = name
		ConditionalFreqDist.__init__(self, cond_samples)
		
		for key in self._r.keys(encode_key('%s:*' % name)):
			condition = key.split(b':')[1].decode()
			self[condition] # calls self.__getitem__(condition)
Author: ShunyuanZ | Project: nltk3-cookbook | Lines: 8 | Source: redisprob.py

Example 11: words_by_followers

def words_by_followers(category):
    """Given a category from the brown corpus, lowercases everything,
    and returns a frequency distribution where the keys are words
    and the counts are the number of different contexts that each word can appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
Author: slee17 | Project: NLP | Lines: 10 | Source: languageModel.py
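
brown_bigrams() is not shown here; presumably it yields the bigrams of a Brown corpus category. A hypothetical call:

fdist = words_by_followers("news")
print(fdist.most_common(10))   # words with the most distinct preceding words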

Example 12: _train

def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Compute the entropy for each context; only keep contexts
    # whose entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Author: atiassa | Project: recommend-2011 | Lines: 49 | Source: q2.py

Example 13: __init__

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0
Author: Arttii | Project: TextBlob | Lines: 47 | Source: tnt.py
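
For reference, the TnT tagger this initializer belongs to (nltk.tag.tnt) is trained and applied roughly like this (a sketch, assuming the treebank sample is installed):

from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:1000]
tagger = tnt.TnT(N=1000, C=False)
tagger.train(train_sents)
print(tagger.tag(["The", "dog", "barked"]))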

Example 14: validate_pcfg_generate

def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()

    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()

    for p in productions:
        cfd[p.lhs()].inc(p.rhs())

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
Author: haozhuoran1991 | Project: recommend-2011 | Lines: 17 | Source: q2_1.py

Example 15: __init__

    def __init__(self, load_from_disk=True):
        self._corpus = reuters.words()

        self._unigram_fd = FreqDist()
        self._bigram_cfd = ConditionalFreqDist()
        self._trigram_cfd = ConditionalFreqDist()
        self._quadgram_cfd = ConditionalFreqDist()

        self._unigram_pd = None
        self._bigram_cpd = None
        self._trigram_cpd = None
        self._quadgram_cpd = None

        if load_from_disk:
            self._load_models()
        else:
            self._train()
Author: drewatk | Project: textPredictor | Lines: 17 | Source: predictor.py
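
_train() itself is not shown; a sketch of how such n-gram distributions are typically populated from the corpus (an assumption, not the author's code):

from nltk.util import ngrams

def _train(self):
    # Lowercase once, then count unigrams through quadgrams.
    words = [w.lower() for w in self._corpus]
    for w in words:
        self._unigram_fd[w] += 1
    for w1, w2 in ngrams(words, 2):
        self._bigram_cfd[w1][w2] += 1
    for w1, w2, w3 in ngrams(words, 3):
        self._trigram_cfd[(w1, w2)][w3] += 1
    for w1, w2, w3, w4 in ngrams(words, 4):
        self._quadgram_cfd[(w1, w2, w3)][w4] += 1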


Note: The nltk.probability.ConditionalFreqDist class examples above were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their original authors; copyright remains with those authors, and any use or redistribution must follow the corresponding project's license. Do not republish without permission.