This article compiles typical usage examples of the Python method nltk.probability.FreqDist. If you have been wondering what nltk.probability.FreqDist does, or how and where to use it, the curated examples below should help; you can also explore the containing module, nltk.probability, for related usage.
The following presents 15 code examples of probability.FreqDist, ordered by popularity by default.
Example 1: Do_alpha
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def Do_alpha(self):
"""The observed disagreement for the alpha coefficient.
The alpha coefficient, unlike the other metrics, uses this rather than
observed agreement.
"""
total = 0.0
for i, itemdata in self._grouped_data('item'):
label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
total += float(nj * nl) * self.distance(l, j)
ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
log.debug("Observed disagreement: %f", ret)
return ret
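As a quick illustration of the per-item counting step above, here is a minimal, self-contained sketch; the annotation records are hypothetical:

from nltk.probability import FreqDist

# Hypothetical codings for one item: three annotators, two labels.
itemdata = [{'labels': 'pos'}, {'labels': 'pos'}, {'labels': 'neg'}]
label_freqs = FreqDist(x['labels'] for x in itemdata)
print(label_freqs['pos'], label_freqs['neg'])  # 2 1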
Example 2: _freq_threshold
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def _freq_threshold(self, fdist, threshold):
"""
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
"""
# We assume that there is more data below the threshold than above it
# and so create a new FreqDist rather than working in place.
res = FreqDist()
num_removed = 0
for tok in fdist:
count = fdist[tok]
if count < threshold:
num_removed += 1
else:
res[tok] += count
res[None] += num_removed
return res
#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
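The thresholding logic above can be exercised on its own; a minimal sketch with made-up counts and threshold:

from nltk.probability import FreqDist

fdist = FreqDist({'the': 10, 'of': 7, 'rare': 1, 'hapax': 1})
res = FreqDist()
num_removed = 0
for tok, count in fdist.items():
    if count < 2:  # threshold = 2
        num_removed += 1
    else:
        res[tok] += count
res[None] += num_removed
print(res.most_common())  # [('the', 10), ('of', 7), (None, 2)]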
Example 3: from_words
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
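Assuming a standard NLTK 3 install, the finder built above is typically used with a scoring function; the choice of PMI here is illustrative:

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = "the quick brown fox jumps over the lazy dog".split()
finder = BigramCollocationFinder.from_words(words, window_size=3)
print(finder.nbest(BigramAssocMeasures.pmi, 5))  # five highest-PMI bigrams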
Example 4: binary_stump
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def binary_stump(feature_name, feature_value, labeled_featuresets):
label = FreqDist(label for (featureset, label)
in labeled_featuresets).max()
# Find the best label for each value.
pos_fdist = FreqDist()
neg_fdist = FreqDist()
for featureset, label in labeled_featuresets:
if featureset.get(feature_name) == feature_value:
pos_fdist[label] += 1
else:
neg_fdist[label] += 1
decisions = {}
default = label
# But hopefully we have observations!
if pos_fdist.N() > 0:
decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
if neg_fdist.N() > 0:
default = DecisionTreeClassifier(neg_fdist.max())
return DecisionTreeClassifier(label, feature_name, decisions, default)
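The FreqDist(...).max() idiom used above simply picks the majority label; a minimal sketch with hypothetical featuresets:

from nltk.probability import FreqDist

labeled_featuresets = [({'color': 'red'}, 'pos'),
                       ({'color': 'blue'}, 'neg'),
                       ({'color': 'red'}, 'pos')]
majority = FreqDist(label for _, label in labeled_featuresets).max()
print(majority)  # 'pos'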
Example 5: calculate_ngram_diversity
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def calculate_ngram_diversity(corpus):
"""
Calculates unigram and bigram diversity
Args:
corpus: tokenized list of sentences sampled
Returns:
uni_diversity: distinct-1 score
bi_diversity: distinct-2 score
"""
bigram_finder = BigramCollocationFinder.from_words(corpus)
bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N
dist = FreqDist(corpus)
uni_diversity = len(dist) / len(corpus)
return uni_diversity, bi_diversity
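On a flattened token list the two ratios work out as below; the tokens are made up:

from nltk.collocations import BigramCollocationFinder
from nltk.probability import FreqDist

tokens = "i am fine i am good".split()
finder = BigramCollocationFinder.from_words(tokens)
print(len(finder.ngram_fd) / finder.N)      # distinct-2: 4 bigram types / 6 tokens
print(len(FreqDist(tokens)) / len(tokens))  # distinct-1: 4 word types / 6 tokens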
Example 6: calculate_entropy
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def calculate_entropy(corpus):
"""
Calculates diversity in terms of entropy (using unigram probability)
Args:
corpus: tokenized list of sentences sampled
Returns:
ent: entropy on the sample sentence list
"""
fdist = FreqDist(corpus)
total_len = len(corpus)
ent = 0
for k, v in fdist.items():
p = v / total_len
ent += -p * np.log(p)
return ent
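Worked through on a tiny sample, the unigram entropy (in nats) comes out as follows:

import numpy as np
from nltk.probability import FreqDist

tokens = ["a", "a", "b", "c"]
fdist = FreqDist(tokens)
ent = -sum((v / len(tokens)) * np.log(v / len(tokens)) for v in fdist.values())
print(ent)  # 0.5*ln(2) + 2 * 0.25*ln(4) ≈ 1.0397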
Example 7: __init__
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def __init__(self, n, vocabulary, unknown="<UNK>"):
"""
n is the size of the ngram
"""
if n < 1:
raise ValueError("ngram size must be greater than or equal to 1")
self.n = n
self.unknown = unknown
self.padding = {
"pad_left": True,
"pad_right": True,
"left_pad_symbol": "<s>",
"right_pad_symbol": "</s>"
}
self.vocabulary = vocabulary
self.allgrams = defaultdict(ConditionalFreqDist)
self.ngrams = FreqDist()
self.unigrams = FreqDist()
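The rest of the model class is not shown here; a minimal sketch of how the two count tables set up in this constructor are presumably filled during training (the trigram is hypothetical):

from collections import defaultdict
from nltk.probability import ConditionalFreqDist, FreqDist

allgrams = defaultdict(ConditionalFreqDist)
ngrams = FreqDist()

context, word = ('<s>', 'the'), 'cat'  # one observed trigram
ngrams[(context, word)] += 1           # joint count
allgrams[3][context][word] += 1        # conditional count, keyed by ngram size
print(allgrams[3][('<s>', 'the')]['cat'])  # 1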
Example 8: transform
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def transform(self, documents):
words = []
docs = []
for document in documents:
docs.append(document)
for para in document:
for sent in para:
for token, tag in sent:
words.append(token)
counts = FreqDist(words)
self.reduced = set(
w for w in words if counts[w] > self.min and counts[w] < self.max
)
return [
' '.join(self.normalize(doc)) for doc in docs
]
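The frequency-band filter at the heart of transform can be tried in isolation; the min/max values are made up:

from nltk.probability import FreqDist

words = ["the", "the", "the", "cat", "sat", "sat"]
counts = FreqDist(words)
min_count, max_count = 1, 3
reduced = {w for w in counts if min_count < counts[w] < max_count}
print(reduced)  # {'sat'}: 'the' is too frequent, 'cat' too rare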
Example 9: Do_alpha
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def Do_alpha(self):
"""The observed disagreement for the alpha coefficient.
The alpha coefficient, unlike the other metrics, uses this rather than
observed agreement.
"""
total = 0.0
for i, itemdata in self._grouped_data('item'):
label_freqs = FreqDist(x['labels'] for x in itemdata)
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
total += float(nj * nl) * self.distance(l, j)
ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
log.debug("Observed disagreement: %f", ret)
return ret
Example 10: _freq_threshold
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def _freq_threshold(self, fdist, threshold):
"""
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
"""
# We assume that there is more data below the threshold than above it
# and so create a new FreqDist rather than working in place.
res = FreqDist()
num_removed = 0
        for tok, count in fdist.items():
            if count < threshold:
                num_removed += 1
            else:
                res[tok] += count
        res[None] += num_removed
return res
#////////////////////////////////////////////////////////////
#{ Orthographic data
#////////////////////////////////////////////////////////////
Example 11: from_words
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. By default, bigrams must be contiguous.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
            raise ValueError("Specify window_size at least 2")
        for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
            try:
                # Stop the window at the next occurrence of w1 so that
                # repeated left words are not double-counted.
                window = window[:list(window).index(w1, 1)]
            except ValueError:
                pass
            wfd[w1] += 1
for w2 in set(window[1:]):
if w2 is not None:
                    bfd[(w1, w2)] += 1
return cls(wfd, bfd)
Example 12: binary_stump
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def binary_stump(feature_name, feature_value, labeled_featuresets):
    label = FreqDist([label for (featureset, label)
                      in labeled_featuresets]).max()
# Find the best label for each value.
pos_fdist = FreqDist()
neg_fdist = FreqDist()
for featureset, label in labeled_featuresets:
if featureset.get(feature_name) == feature_value:
            pos_fdist[label] += 1
else:
            neg_fdist[label] += 1
    # Note: unlike the newer variant above, this version assumes both
    # branches were observed; .max() raises on an empty FreqDist.
    decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
    default = DecisionTreeClassifier(neg_fdist.max())
return DecisionTreeClassifier(label, feature_name, decisions, default)
Example 13: cal_Distinct
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def cal_Distinct(corpus):
"""
Calculates unigram and bigram diversity
Args:
corpus: tokenized list of sentences sampled
Returns:
uni_diversity: distinct-1 score
bi_diversity: distinct-2 score
"""
bigram_finder = BigramCollocationFinder.from_words(corpus)
bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N
dist = FreqDist(corpus)
uni_diversity = len(dist) / len(corpus)
return uni_diversity, bi_diversity
Example 14: sample_relations_top_n
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def sample_relations_top_n(graph, context, type_):
num_total_words = len(context)
dist = FreqDist(context)
for node in graph:
node = build_score_per_layer(node, dist, num_total_words)
for node in graph:
node = calc_top_n_score_by_level(node)
for i, node in enumerate(graph):
graph[i] = prune_graph_by_top_n_softmax(node)
selected_paths = select_paths(graph)
paths = build_subpaths(selected_paths)
final_paths = list(paths for paths, _ in itertools.groupby(paths))
random.shuffle(final_paths)
return final_paths
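The graph helpers (build_score_per_layer, calc_top_n_score_by_level, and so on) are project-specific and not shown; FreqDist's role here is just a unigram count table over the context, which presumably feeds the per-node scores. A rough sketch of that lookup:

from nltk.probability import FreqDist

context = "relations are scored by unigram frequency in the context".split()
dist = FreqDist(context)
p = dist["unigram"] / len(context)  # relative frequency of one context word
print(p)  # 1/9 ≈ 0.111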
Example 15: get_stop_words_1
# Required import: from nltk import probability [as alias]
# Or: from nltk.probability import FreqDist [as alias]
def get_stop_words_1(data, num_stop_words):
total_words = []
for d in data:
total_words.extend(d["ques"])
total_words.extend(d["answer1"])
for d_i in d["summary"]:
total_words.extend(d_i)
fdist = FreqDist(total_words)
stop_words = fdist.most_common(num_stop_words)
stop_words = [t[0] for t in stop_words]
pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
filtered_stop_words = []
for p in stop_words:
if p not in pronoun_list:
filtered_stop_words.append(p)
return filtered_stop_words
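A quick run of the most_common-based extraction on toy data shows the pronoun filter in action:

from nltk.probability import FreqDist

tokens = "he said that the cat and the dog ran and ran he he".split()
fdist = FreqDist(tokens)
top = [w for w, _ in fdist.most_common(3)]   # ['he', 'the', 'and']
pronouns = {"he", "she", "him", "her", "his", "them", "their", "they"}
print([w for w in top if w not in pronouns])  # ['the', 'and']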