This article collects typical usage examples of the Python method nltk.probability.ConditionalFreqDist. If you are wondering what probability.ConditionalFreqDist is for, or how exactly to use it, the curated code examples below may help. You can also explore further usage examples from the module it belongs to, nltk.probability.
The following presents 14 code examples of probability.ConditionalFreqDist, sorted by popularity by default.
Example 1: weighted_kappa_pairwise
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968
"""
total = 0.0
label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
for x in self.data
if x['coder'] in (cA, cB))
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
De = total / (max_distance * pow(len(self.I), 2))
log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
Do = self.Do_Kw_pairwise(cA, cB)
ret = 1.0 - (Do / De)
return ret
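For context, weighted_kappa_pairwise is defined on nltk.metrics.agreement.AnnotationTask, which is built from (coder, item, label) triples. A minimal usage sketch with made-up annotation data (not taken from the example above) might look like this:

from nltk.metrics.agreement import AnnotationTask

# Toy data: (coder, item, label) triples invented for illustration.
toy_data = [
    ('c1', 'item1', 'A'), ('c2', 'item1', 'A'),
    ('c1', 'item2', 'B'), ('c2', 'item2', 'A'),
    ('c1', 'item3', 'B'), ('c2', 'item3', 'B'),
]
task = AnnotationTask(data=toy_data)
print(task.kappa())                               # unweighted Cohen's kappa
print(task.weighted_kappa_pairwise('c1', 'c2'))   # the method shown above (binary distance by default)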
Example 2: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, n, vocabulary, unknown="<UNK>"):
"""
n is the size of the ngram
"""
if n < 1:
raise ValueError("ngram size must be greater than or equal to 1")
self.n = n
self.unknown = unknown
self.padding = {
"pad_left": True,
"pad_right": True,
"left_pad_symbol": "<s>",
"right_pad_symbol": "</s>"
}
self.vocabulary = vocabulary
self.allgrams = defaultdict(ConditionalFreqDist)
self.ngrams = FreqDist()
self.unigrams = FreqDist()
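The constructor above only sets up empty counters; the training/update logic is not shown in this example. A minimal sketch of how a ConditionalFreqDist is typically filled with ngram counts could be:

from nltk.probability import ConditionalFreqDist
from nltk.util import ngrams

tokens = ["<s>", "the", "cat", "sat", "</s>"]
cfd = ConditionalFreqDist()
for w1, w2 in ngrams(tokens, 2):
    cfd[(w1,)][w2] += 1             # condition on the (n-1)-gram context
print(cfd[("the",)].most_common())  # [('cat', 1)]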
Example 3: weighted_kappa_pairwise
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968
"""
total = 0.0
label_freqs = ConditionalFreqDist(
(x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
)
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
De = total / (max_distance * pow(len(self.I), 2))
log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
Do = self.Do_Kw_pairwise(cA, cB)
ret = 1.0 - (Do / De)
return ret
Example 4: pynlpir_feature
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def pynlpir_feature(number):  # select `number` feature words
normalWords = []
advWords = []
    for items in read_file('ad/normal.txt'):  # flatten the list of lists into a flat word list
for item in items:
normalWords.append(item)
for items in read_file('ad/advertise.txt'):
for item in items:
advWords.append(item)
    word_fd = FreqDist()  # frequency of every word across both corpora
    cond_word_fd = ConditionalFreqDist()  # word frequencies within normal texts and within advertisement texts
for word in normalWords:
word_fd[word] += 1
cond_word_fd['normal'][word] += 1
for word in advWords:
word_fd[word] += 1
cond_word_fd['adv'][word] += 1
    normal_word_count = cond_word_fd['normal'].N()  # number of tokens in normal texts
    adv_word_count = cond_word_fd['adv'].N()  # number of tokens in advertisement texts
total_word_count = normal_word_count + adv_word_count
    word_scores = {}  # maps each word to its information score
for word, freq in word_fd.items():
        # chi-squared statistic for the 'normal' class; mutual information or other measures would also work
normal_score = BigramAssocMeasures.chi_sq(cond_word_fd['normal'][word],
(freq, normal_word_count),
total_word_count)
        adv_score = BigramAssocMeasures.chi_sq(cond_word_fd['adv'][word],
                                               (freq, adv_word_count),
                                               total_word_count)  # likewise for the 'adv' class
        # a word's information score is the sum of its 'normal' and 'adv' chi-squared statistics
word_scores[word] = normal_score + adv_score
    # sort words by information score in descending order; `number` is the feature
    # dimensionality and can be tuned until results are optimal
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    # Chi-squared test: χ² = Σ(Oi − Ei)² / Ei ~ χ²(k−1), summing over i = 1..k,
    # where Oi is the observed count and Ei the expected count;
    # the null hypothesis is rejected when the statistic exceeds the critical value
best_words = set([w for w, s in best_vals])
return dict([(word, True) for word in best_words])
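A possible follow-up, assuming the ad/normal corpora and pynlpir_feature above are available (the feature count of 1000 is arbitrary, and the training-data construction is only sketched): the returned {word: True} dict can serve as a feature vocabulary for an NLTK classifier.

from nltk.classify import NaiveBayesClassifier

best_words = pynlpir_feature(1000)  # top 1000 words by chi-squared score (arbitrary choice)

def doc_features(words):
    # binary bag-of-words features restricted to the selected vocabulary
    return {w: True for w in words if w in best_words}

# train_set = [(doc_features(words), label), ...]   # built from labelled documents
# classifier = NaiveBayesClassifier.train(train_set)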
Example 5: Ae_kappa
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def Ae_kappa(self, cA, cB):
Ae = 0.0
nitems = float(len(self.I))
label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
for k in label_freqs.conditions():
Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
return Ae
Example 6: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
self._key = key
self._tokens = tokens
if context_func:
self._context_func = context_func
else:
self._context_func = self._default_context
if filter:
tokens = [t for t in tokens if filter(t)]
self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
for i, w in enumerate(tokens))
self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
for i, w in enumerate(tokens))
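This constructor appears to be nltk.text.ContextIndex, which indexes words by their surrounding contexts. A small usage sketch, assuming the Brown corpus has been downloaded, could be:

from nltk.text import ContextIndex
from nltk.corpus import brown

idx = ContextIndex(brown.words(categories='news')[:5000], key=lambda w: w.lower())
print(idx.similar_words('money')[:10])   # words that occur in similar contexts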
Example 7: create_word_scores
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def create_word_scores():
#creates lists of all positive and negative words
posWords = []
negWords = []
with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
for i in posSentences:
posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
posWords.append(posWord)
with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
for i in negSentences:
negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
negWords.append(negWord)
posWords = list(itertools.chain(*posWords))
negWords = list(itertools.chain(*negWords))
    #build frequency distribution of all words and then frequency distributions of words within positive and negative labels
word_fd = FreqDist()
cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1              # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
#finds the number of positive and negative words, as well as the total number of words
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
#builds dictionary of word scores based on chi-squared test
word_scores = {}
    for word, freq in word_fd.items():  # iteritems() exists only in Python 2
pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
return word_scores
#finds word scores
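A typical continuation (assuming RT_POLARITY_POS_FILE and RT_POLARITY_NEG_FILE point to the rt-polarity data files, and with an arbitrary cutoff of 1000) selects the highest-scoring words as the feature vocabulary:

word_scores = create_word_scores()
best = sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)[:1000]
best_words = {w for w, s in best}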
Example 8: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
self._key = key
self._tokens = tokens
if context_func:
self._context_func = context_func
else:
self._context_func = self._default_context
if filter:
tokens = [t for t in tokens if filter(t)]
self._word_to_contexts = CFD(
(self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
)
self._context_to_words = CFD(
(self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
)
Example 9: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, ngram_text=None):
"""Creates a new NgramCounter.
If `ngram_text` is specified, counts ngrams from it, otherwise waits for
`update` method to be called explicitly.
        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
:type ngram_text: Iterable(Iterable(tuple(str))) or None
"""
self._counts = defaultdict(ConditionalFreqDist)
self._counts[1] = self.unigrams = FreqDist()
if ngram_text:
self.update(ngram_text)
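A minimal usage sketch of nltk.lm.NgramCounter (available in NLTK 3.4+), counting everygrams over toy pre-tokenized sentences:

from nltk.lm import NgramCounter
from nltk.util import everygrams

sents = [["a", "b", "c"], ["a", "c", "d"]]
counter = NgramCounter(everygrams(s, max_len=2) for s in sents)
print(counter[1]["a"])             # unigram count of "a"
print(counter[2][("a",)]["b"])     # count of the bigram ("a", "b")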
Example 10: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, unk=None, Trained=False, N=1000, C=False):
'''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
:param unk: instance of a POS tagger, conforms to TaggerI
:type unk:(TaggerI)
:param Trained: Indication that the POS tagger is trained or not
:type Trained: boolean
:param N: Beam search degree (see above)
:type N:(int)
:param C: Capitalization flag
:type C: boolean
Initializer, creates frequency distributions to be used
for tagging
_lx values represent the portion of the tri/bi/uni taggers
to be used to calculate the probability
N value is the number of possible solutions to maintain
while tagging. A good value for this is 1000
C is a boolean value which specifies to use or
not use the Capitalization of the word as additional
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
'''
self._uni = FreqDist()
self._bi = ConditionalFreqDist()
self._tri = ConditionalFreqDist()
self._wd = ConditionalFreqDist()
self._eos = ConditionalFreqDist()
self._l1 = 0.0
self._l2 = 0.0
self._l3 = 0.0
self._N = N
self._C = C
self._T = Trained
self._unk = unk
# statistical tools (ignore or delete me)
self.unknown = 0
self.known = 0
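A brief usage sketch of the TnT tagger (nltk.tag.tnt), assuming the Penn Treebank sample corpus has been downloaded; the training-set size is arbitrary:

from nltk.tag import tnt
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = tnt.TnT(N=1000, C=False)
tagger.train(train_sents)
print(tagger.tag("the cat sat on the mat".split()))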
Example 11: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
        a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max()
hits = fd[context][best_tag]
if hits > cutoff:
self._context_to_tag[context] = best_tag
hit_count += hits
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print("[Trained Unigram tagger:", end=' ')
print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning))
######################################################################
#{ Tagger Classes
######################################################################
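_train above is invoked from the constructors of ContextTagger subclasses such as UnigramTagger; a sketch of how it gets exercised, assuming the treebank sample corpus is installed (verbose=True triggers the statistics printout at the end of _train):

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

tagger = UnigramTagger(treebank.tagged_sents()[:2000], cutoff=1, verbose=True)
print(tagger.tag(["the", "dog", "barked"]))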
Example 12: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
:param verbose: Not used
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None:
continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
if self.backoff is None or tag != self.backoff.tag_one(
tokens, index, tags[:index]
):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max() # Remove
weighted_tags = [(k, v/sum(fd[context].values())) for k, v in fd[context].items()]
hits = fd[context][best_tag] #INT
if hits > cutoff:
self._context_to_tag[context] = weighted_tags
hit_count += hits
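The weighted_tags list built in this variant is simply the per-context relative frequency of each tag. A tiny illustration with a hypothetical context:

from nltk.probability import ConditionalFreqDist

fd = ConditionalFreqDist()
for tag in ["NN", "NN", "VB"]:
    fd["dog"][tag] += 1
weighted = [(t, c / fd["dog"].N()) for t, c in fd["dog"].items()]
print(weighted)                # [('NN', 0.666...), ('VB', 0.333...)]
print(fd["dog"].freq("NN"))    # the same ratio, via FreqDist.freq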
Example 13: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
        a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context].inc(tag)
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max()
hits = fd[context][best_tag]
if hits > cutoff:
self._context_to_tag[context] = best_tag
hit_count += hits
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print "[Trained Unigram tagger:",
print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning)
######################################################################
#{ Tagger Classes
######################################################################
Example 14: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, unk=None, Trained=False, N=1000, C=False):
'''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
:param unk: instance of a POS tagger, conforms to TaggerI
:type unk:(TaggerI)
:param Trained: Indication that the POS tagger is trained or not
:type Trained: boolean
:param N: Beam search degree (see above)
:type N:(int)
:param C: Capitalization flag
:type C: boolean
Initializer, creates frequency distributions to be used
for tagging
_lx values represent the portion of the tri/bi/uni taggers
to be used to calculate the probability
N value is the number of possible solutions to maintain
while tagging. A good value for this is 1000
C is a boolean value which specifies to use or
not use the Capitalization of the word as additional
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
'''
self._uni = FreqDist()
self._bi = ConditionalFreqDist()
self._tri = ConditionalFreqDist()
self._wd = ConditionalFreqDist()
self._eos = ConditionalFreqDist()
self._l1 = 0.0
self._l2 = 0.0
self._l3 = 0.0
self._N = N
self._C = C
self._T = Trained
self._unk = unk
# statistical tools (ignore or delete me)
self.unknown = 0
self.known = 0