This article compiles typical usage examples of the Python method nltk.probability.FreqDist. If you have been wondering what nltk.probability.FreqDist does, or how and where to use it, the curated examples below should help; you can also explore the containing module, nltk.probability, for related usage.
The following presents 15 code examples of probability.FreqDist, ordered by popularity by default.
Example 1: Do_alpha
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def Do_alpha(self):
"""The observed disagreement for the alpha coefficient.
The alpha coefficient, unlike the other metrics, uses this rather than
observed agreement.
"""
total = 0.0
for i, itemdata in self._grouped_data('item'):
label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
total += float(nj * nl) * self.distance(l, j)
ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
log.debug("Observed disagreement: %f", ret)
return ret
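As a quick illustration of the per-item counting step above, here is a minimal, self-contained sketch; the annotation records are hypothetical:

from nltk.probability import FreqDist

# Hypothetical codings for one item: three annotators, two labels.
itemdata = [{'labels': 'pos'}, {'labels': 'pos'}, {'labels': 'neg'}]
label_freqs = FreqDist(x['labels'] for x in itemdata)
print(label_freqs['pos'], label_freqs['neg'])  # 2 1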
Example 2: _freq_threshold
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def _freq_threshold(self, fdist, threshold):
"""
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
"""
# We assume that there is more data below the threshold than above it
# and so create a new FreqDist rather than working in place.
res = FreqDist()
num_removed = 0
for tok in fdist:
count = fdist[tok]
if count < threshold:
num_removed += 1
else:
res[tok] += count
res[None] += num_removed
return res
#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
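The thresholding logic above can be exercised on its own; a minimal sketch with made-up counts and threshold:

from nltk.probability import FreqDist

fdist = FreqDist({'the': 10, 'of': 7, 'rare': 1, 'hapax': 1})
res = FreqDist()
num_removed = 0
for tok, count in fdist.items():
    if count < 2:  # threshold = 2
        num_removed += 1
    else:
        res[tok] += count
res[None] += num_removed
print(res.most_common())  # [('the', 10), ('of', 7), (None, 2)]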
Example 3: from_words
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
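Assuming a standard NLTK 3 install, the finder built above is typically used with a scoring function; the choice of PMI here is illustrative:

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = "the quick brown fox jumps over the lazy dog".split()
finder = BigramCollocationFinder.from_words(words, window_size=3)
print(finder.nbest(BigramAssocMeasures.pmi, 5))  # five highest-PMI bigrams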
Example 4: binary_stump
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def binary_stump(feature_name, feature_value, labeled_featuresets):
label = FreqDist(label for (featureset, label)
in labeled_featuresets).max()
# Find the best label for each value.
pos_fdist = FreqDist()
neg_fdist = FreqDist()
for featureset, label in labeled_featuresets:
if featureset.get(feature_name) == feature_value:
pos_fdist[label] += 1
else:
neg_fdist[label] += 1
decisions = {}
default = label
# But hopefully we have observations!
if pos_fdist.N() > 0:
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
if neg_fdist.N() > 0:
default = DecisionTreeClassifier(neg_fdist.max())
return DecisionTreeClassifier(label, feature_name, decisions, default)
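The FreqDist(...).max() idiom used above simply picks the majority label; a minimal sketch with hypothetical featuresets:

from nltk.probability import FreqDist

labeled_featuresets = [({'color': 'red'}, 'pos'),
                       ({'color': 'blue'}, 'neg'),
                       ({'color': 'red'}, 'pos')]
majority = FreqDist(label for _, label in labeled_featuresets).max()
print(majority)  # 'pos'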
Example 5: calculate_ngram_diversity
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def calculate_ngram_diversity(corpus):
"""
Calculates unigram and bigram diversity
Args:
corpus: tokenized list of sentences sampled
Returns:
uni_diversity: distinct-1 score
bi_diversity: distinct-2 score
"""
bigram_finder = BigramCollocationFinder.from_words(corpus)
bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N
dist = FreqDist(corpus)
uni_diversity = len(dist) / len(corpus)
return uni_diversity, bi_diversity
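On a flattened token list the two ratios work out as below; the tokens are made up:

from nltk.collocations import BigramCollocationFinder
from nltk.probability import FreqDist

tokens = "i am fine i am good".split()
finder = BigramCollocationFinder.from_words(tokens)
print(len(finder.ngram_fd) / finder.N)      # distinct-2: 4 bigram types / 6 tokens
print(len(FreqDist(tokens)) / len(tokens))  # distinct-1: 4 word types / 6 tokens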
Example 6: calculate_entropy
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def calculate_entropy(corpus):
"""
Calculates diversity in terms of entropy (using unigram probability)
Args:
corpus: tokenized list of sentences sampled
Returns:
ent: entropy on the sample sentence list
"""
fdist = FreqDist(corpus)
total_len = len(corpus)
ent = 0
for k, v in fdist.items():
p = v / total_len
ent += -p * np.log(p)
return ent
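Worked through on a tiny sample, the unigram entropy (in nats) comes out as follows:

import numpy as np
from nltk.probability import FreqDist

tokens = ["a", "a", "b", "c"]
fdist = FreqDist(tokens)
ent = -sum((v / len(tokens)) * np.log(v / len(tokens)) for v in fdist.values())
print(ent)  # 0.5*ln(2) + 2 * 0.25*ln(4) ≈ 1.0397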
Example 7: __init__
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def __init__(self, n, vocabulary, unknown="<UNK>"):
"""
n is the size of the ngram
"""
if n < 1:
raise ValueError("ngram size must be greater than or equal to 1")
self.n = n
self.unknown = unknown
self.padding = {
"pad_left": True,
"pad_right": True,
"left_pad_symbol": "<s>",
"right_pad_symbol": "</s>"
}
self.vocabulary = vocabulary
self.allgrams = defaultdict(ConditionalFreqDist)
self.ngrams = FreqDist()
self.unigrams = FreqDist()
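The rest of the model class is not shown here; a minimal sketch of how the two count tables set up in this constructor are presumably filled during training (the trigram is hypothetical):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist, FreqDist

allgrams = defaultdict(ConditionalFreqDist)
ngrams = FreqDist()

context, word = ('<s>', 'the'), 'cat'  # one observed trigram
ngrams[(context, word)] += 1           # joint count
allgrams[3][context][word] += 1        # conditional count, keyed by ngram size
print(allgrams[3][('<s>', 'the')]['cat'])  # 1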
Example 8: transform
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def transform(self, documents):
words = []
docs = []
for document in documents:
docs.append(document)
for para in document:
for sent in para:
for token, tag in sent:
words.append(token)
counts = FreqDist(words)
self.reduced = set(
w for w in words if counts[w] > self.min and counts[w] < self.max
)
return [
' '.join(self.normalize(doc)) for doc in docs
]
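The frequency-band filter at the heart of transform can be tried in isolation; the min/max values are made up:

from nltk.probability import FreqDist

words = ["the", "the", "the", "cat", "sat", "sat"]
counts = FreqDist(words)
min_count, max_count = 1, 3
reduced = {w for w in counts if min_count < counts[w] < max_count}
print(reduced)  # {'sat'}: 'the' is too frequent, 'cat' too rare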
Example 9: Do_alpha
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def Do_alpha(self):
"""The observed disagreement for the alpha coefficient.
The alpha coefficient, unlike the other metrics, uses this rather than
observed agreement.
"""
total = 0.0
for i, itemdata in self._grouped_data('item'):
label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
total += float(nj * nl) * self.distance(l, j)
ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
log.debug("Observed disagreement: %f", ret)
return ret
Example 10: _freq_threshold
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def _freq_threshold(self, fdist, threshold):
"""
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
"""
# We assume that there is more data below the threshold than above it
# and so create a new FreqDist rather than working in place.
res = FreqDist()
num_removed = 0
        for tok, count in fdist.items():
            if count < threshold:
                num_removed += 1
            else:
                res[tok] += count
        res[None] += num_removed
return res
#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example 11: from_words
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. By default, bigrams must be contiguous.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
            raise ValueError("Specify window_size at least 2")
        for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
            try:
                # Stop the window at the next occurrence of w1 so that
                # repeated left words are not double-counted.
                window = window[:list(window).index(w1, 1)]
            except ValueError:
                pass
            wfd[w1] += 1
for w2 in set(window[1:]):
if w2 is not None:
                    bfd[(w1, w2)] += 1
return cls(wfd, bfd)
Example 12: binary_stump
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()
# Find the best label for each value.
pos_fdist = FreqDist()
neg_fdist = FreqDist()
for featureset, label in labeled_featuresets:
if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
else:
            neg_fdist[label] += 1
    # Note: unlike the newer variant above, this version assumes both
    # branches were observed; .max() raises on an empty FreqDist.
    decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    default = DecisionTreeClassifier(neg_fdist.max())
return DecisionTreeClassifier(label, feature_name, decisions, default)
Example 13: cal_Distinct
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def cal_Distinct(corpus):
"""
Calculates unigram and bigram diversity
Args:
corpus: tokenized list of sentences sampled
Returns:
uni_diversity: distinct-1 score
bi_diversity: distinct-2 score
"""
bigram_finder = BigramCollocationFinder.from_words(corpus)
bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N
dist = FreqDist(corpus)
uni_diversity = len(dist) / len(corpus)
return uni_diversity, bi_diversity
Example 14: sample_relations_top_n
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def sample_relations_top_n(graph, context, type_):
num_total_words = len(context)
dist = FreqDist(context)
for node in graph:
node = build_score_per_layer(node, dist, num_total_words)
for node in graph:
node = calc_top_n_score_by_level(node)
for i, node in enumerate(graph):
graph[i] = prune_graph_by_top_n_softmax(node)
selected_paths = select_paths(graph)
paths = build_subpaths(selected_paths)
final_paths = list(paths for paths, _ in itertools.groupby(paths))
random.shuffle(final_paths)
return final_paths
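The graph helpers (build_score_per_layer, calc_top_n_score_by_level, and so on) are project-specific and not shown; FreqDist's role here is just a unigram count table over the context, which presumably feeds the per-node scores. A rough sketch of that lookup:

from nltk.probability import FreqDist

context = "relations are scored by unigram frequency in the context".split()
dist = FreqDist(context)
p = dist["unigram"] / len(context)  # relative frequency of one context word
print(p)  # 1/9 ≈ 0.111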
Example 15: get_stop_words_1
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def get_stop_words_1(data, num_stop_words):
total_words = []
for d in data:
total_words.extend(d["ques"])
total_words.extend(d["answer1"])
for d_i in d["summary"]:
total_words.extend(d_i)
fdist = FreqDist(total_words)
stop_words = fdist.most_common(num_stop_words)
stop_words = [t[0] for t in stop_words]
pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
filtered_stop_words = []
for p in stop_words:
if p not in pronoun_list:
filtered_stop_words.append(p)
return filtered_stop_words
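A quick run of the most_common-based extraction on toy data shows the pronoun filter in action:

from nltk.probability import FreqDist

tokens = "he said that the cat and the dog ran and ran he he".split()
fdist = FreqDist(tokens)
top = [w for w, _ in fdist.most_common(3)]   # ['he', 'the', 'and']
pronouns = {"he", "she", "him", "her", "his", "them", "their", "they"}
print([w for w in top if w not in pronouns])  # ['the', 'and']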