This article collects typical usage examples of the Python class nltk.probability.ConditionalFreqDist. If you have been wondering what ConditionalFreqDist is for and how to use it, the curated examples below should help.
The following 15 code examples of the ConditionalFreqDist class are shown, ordered roughly by popularity.
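Before diving into the examples, here is a minimal sketch (not taken from any of the projects below) of how ConditionalFreqDist is typically constructed and queried, assuming a recent NLTK 3 installation; the words and conditions are made up for illustration:

from nltk.probability import ConditionalFreqDist

# Build a distribution of word frequencies conditioned on word length.
words = ["the", "cat", "sat", "on", "the", "mat"]
cfd = ConditionalFreqDist((len(w), w) for w in words)

print(cfd.conditions())   # e.g. [3, 2]
print(cfd[3]["the"])      # 2 -- count of "the" under condition 3
print(cfd[3].max())       # most frequent 3-letter word
cfd[2]["of"] += 1         # distributions can also be updated in place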
Example 1: cDist
def cDist(self, params):
    """Return a conditional frequency distribution (conditioned on part of speech),
    built from the filtered_words loaded in loadData."""
    president = params["president"]
    speech = params["speech"]
    if self.president == "All presidents":
        pipeline = [{"$match": {"type": speech}}, {"$project": {"tags": "$filtered_speech_tags"}}]
    else:
        pipeline = [
            {"$match": {"name": president, "type": speech}},
            {"$project": {"tags": "$filtered_speech_tags"}},
        ]
    tags = []
    for i in self.col.aggregate(pipeline):
        tags.extend(i["tags"])
    cfdist = ConditionalFreqDist()  # conditioned on pos_tag
    for word, tag in tags:
        condition = tag  # specify the condition to group frequencies by
        cfdist[condition][word] += 1
    VB = MLEProbDist(cfdist.get("VBP"))
    NN = MLEProbDist(cfdist.get("NN"))
    JJ = MLEProbDist(cfdist.get("JJ"))
    return VB, NN, JJ  # return verbs, nouns, adjectives
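As a follow-up to Example 1, this hedged sketch shows how an MLEProbDist built from one condition of a ConditionalFreqDist can be queried; the word/tag pairs are invented for illustration and are not part of the original project:

from nltk.probability import ConditionalFreqDist, MLEProbDist

cfdist = ConditionalFreqDist()
for word, tag in [("run", "VBP"), ("run", "VBP"), ("walk", "VBP"), ("dog", "NN")]:
    cfdist[tag][word] += 1

vb = MLEProbDist(cfdist["VBP"])
print(vb.prob("run"))   # 2/3 -- maximum-likelihood estimate within the VBP condition
print(vb.max())         # "run" -- the most likely sample under that condition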
Example 2: _setSelectedPOSTags
def _setSelectedPOSTags(self):
    buff = self._loadData('selective_pos.bin')
    if buff:
        self.selective_pos = buff
        return
    # First get all (word, tag) pairs in the corpus
    sentences = brown.tagged_sents(simplify_tags=True)
    self.selected_tags = ["ADJ", "ADV", "CNJ"]
    self.selective_pos = ConditionalFreqDist()
    temp_dist = ConditionalFreqDist()
    for sentence in sentences:
        for (word, tag) in sentence:
            if tag in self.selected_tags:
                temp_dist[tag].inc(str(word).lower())
    # Now keep only the words with frequency > 4
    for category in temp_dist.conditions():
        fredist = temp_dist[category]
        for key in fredist.keys():
            if fredist[key] > 4:
                self.selective_pos[category].inc(key)
    self._saveData('selective_pos.bin', self.selective_pos)
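Example 2 uses the older NLTK API (simplify_tags=True and FreqDist.inc). On current NLTK 3 releases roughly the same counts can be gathered with the universal tagset and item assignment; a minimal sketch, assuming the Brown corpus is downloaded (note the simplified tag CNJ corresponds to CONJ in the universal tagset):

from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

selected_tags = {"ADJ", "ADV", "CONJ"}
temp_dist = ConditionalFreqDist()
for sentence in brown.tagged_sents(tagset="universal"):
    for word, tag in sentence:
        if tag in selected_tags:
            temp_dist[tag][word.lower()] += 1

# Keep only words seen more than 4 times per tag.
selective_pos = ConditionalFreqDist()
for tag in temp_dist.conditions():
    for word, count in temp_dist[tag].items():
        if count > 4:
            selective_pos[tag][word] = count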
Example 3: readFormatedData
def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = Set([])
    #dict2 = Set([])
    for tuple in formatedData:
        words = tuple[0].split(' ')
        count = int(tuple[1])
        #unigramFd.inc(words[0])
        #unigramFd.inc(words[1])
        #bigramFd.inc((words[0], words[1]), count)
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]].inc(word2, count)
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1.samples():
            cBigramFd2[w2].inc(w1, bigram_w1[w2])
    return cBigramFd1, cBigramFd2  #, dict1, dict2
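Example 3 builds forward and backward bigram distributions with the pre-NLTK-3 inc() API. The same pattern with the current API, sketched over a toy token list (the sentence is made up for illustration):

from nltk import bigrams
from nltk.probability import ConditionalFreqDist

tokens = "the cat sat on the mat".split()
forward = ConditionalFreqDist()    # condition = first word, sample = second word
backward = ConditionalFreqDist()   # condition = second word, sample = first word
for w1, w2 in bigrams(tokens):
    forward[w1][w2] += 1
    backward[w2][w1] += 1

print(forward["the"].most_common())   # e.g. [('cat', 1), ('mat', 1)]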
Example 4: high_information_words
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from a set of words (for efficiency).
    :param labeled_words: list of 2-tuples [(label, words)]
        label -> a classification label (pos / neg)
        words -> a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score for a word to count as a most informative word
    :return: a set of high-information words
    """
    print "Counting Word Frequencies"
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()
    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()
    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in labeled_word_fq[label].iteritems():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
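A hedged usage sketch for Example 4, assuming the NLTK movie_reviews corpus is downloaded; the two labels and the corpus choice are illustrative, not part of the original project:

from nltk.corpus import movie_reviews

labeled_words = [
    ("pos", movie_reviews.words(categories=["pos"])),
    ("neg", movie_reviews.words(categories=["neg"])),
]
best = high_information_words(labeled_words, min_score=10)
print(len(best))   # the vocabulary shrinks to the most label-discriminating words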
Example 5: get_high_information_words
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()
    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1
    n_words_total = label_word_freq_dist.N()
    high_info_words = set()
    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)
        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example 6: high_information_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd.inc(word)
            label_word_fd[label].inc(word)
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].iteritems():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example 7: Ae_kappa
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
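Example 7 is a method of NLTK's nltk.metrics.agreement.AnnotationTask, where self.data holds the coder/item/label records. It is normally reached through the public API rather than called directly; a minimal sketch with made-up annotations:

from nltk.metrics.agreement import AnnotationTask

# Each triple is (coder, item, label).
data = [
    ("c1", "item1", "pos"), ("c2", "item1", "pos"),
    ("c1", "item2", "neg"), ("c2", "item2", "pos"),
]
task = AnnotationTask(data=data)
print(task.kappa())                 # kappa agreement between the two coders
print(task.Ae_kappa("c1", "c2"))    # expected agreement used inside kappa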
Example 8: _train
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table
    based on the given training data.  In particular, for each
    context ``c`` in the training data, set
    ``_context_to_tag[c]`` to the most frequent tag for that
    context.  However, exclude any contexts that are already
    tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0
    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()
    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
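Example 8 is ContextTagger._train from NLTK itself; it runs indirectly when a context tagger such as UnigramTagger is constructed. A short sketch of how the cutoff parameter reaches this code, assuming the Brown corpus is downloaded:

from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents(categories="news")[:3000]
# cutoff=2: a word -> tag mapping is kept only if its best tag was seen more than twice.
tagger = UnigramTagger(train_sents, cutoff=2, verbose=True)
print(tagger.tag("the jury said it did".split()))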
Example 9: __init__
def __init__(self, r, name, cond_samples=None):
    self._r = r
    self._name = name
    ConditionalFreqDist.__init__(self, cond_samples)
    # Initialize self._fdists for all matching keys
    for key in self._r.keys(encode_key('%s:*' % name)):
        condition = key.split(':')[1]
        self[condition]  # calls self.__getitem__(condition)
Example 10: __init__
def __init__(self, r, name, cond_samples=None):
    self._r = r
    self._name = name
    ConditionalFreqDist.__init__(self, cond_samples)
    for key in self._r.keys(encode_key('%s:*' % name)):
        condition = key.split(b':')[1].decode()
        self[condition]  # calls self.__getitem__(condition)
Example 11: words_by_followers
def words_by_followers(category):
    """Given a category from the Brown corpus, lowercase everything
    and return a frequency distribution whose keys are words and whose
    counts are the number of different contexts each word can appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
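Example 11 relies on a brown_bigrams helper that is not shown. A hedged sketch of what it presumably does (lowercased bigrams from one Brown category, as the docstring implies), followed by a usage line:

from nltk import bigrams
from nltk.corpus import brown

def brown_bigrams(category):
    # Presumed helper: lowercased bigrams for the given Brown category.
    return list(bigrams(w.lower() for w in brown.words(categories=category)))

fdist = words_by_followers("news")
print(fdist.most_common(10))   # words with the largest number of distinct preceding words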
Example 12: _train
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context,
    # calculate the entropy.  Only include contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        # total_tags = float(sum(dd.values()))
        # tags_probs = [(t, dd[t] / total_tags) for t in dd.keys()]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Example 13: __init__
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger.  The tagger must be trained
    before it is used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: indicates whether the POS tagger is trained or not
    :type Trained: boolean
    :param N: beam search degree (see above)
    :type N: (int)
    :param C: capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging.

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability.

    N value is the number of possible solutions to maintain
    while tagging.  A good value for this is 1000.

    C is a boolean value which specifies whether or not to use
    the capitalization of the word as additional information
    for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger.
    '''
    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained
    self._unk = unk
    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
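The distributions created in Example 13 are filled in later by TnT.train(). A rough, simplified sketch of how such counts are typically accumulated (the real NLTK implementation also handles sentence boundaries and the capitalization flag); the method name _sketch_train is hypothetical:

def _sketch_train(self, tagged_sentences):
    # Simplified sketch of how TnT-style counts could be accumulated.
    for sent in tagged_sentences:
        history = ("BOS", "BOS")              # padded tag history
        for word, tag in sent:
            self._wd[word][tag] += 1          # word -> tag emission counts
            self._uni[tag] += 1               # unigram tag counts
            self._bi[history[1]][tag] += 1    # bigram transition counts
            self._tri[history][tag] += 1      # trigram transition counts
            history = (history[1], tag)
        if sent:
            self._eos[sent[-1][1]]["EOS"] += 1    # sentence-final tag counts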
Example 14: validate_pcfg_generate
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()
    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()
    for p in productions:
        cfd[p.lhs()].inc(p.rhs())
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
Example 15: __init__
def __init__(self, load_from_disk=True):
    self._corpus = reuters.words()
    self._unigram_fd = FreqDist()
    self._bigram_cfd = ConditionalFreqDist()
    self._trigram_cfd = ConditionalFreqDist()
    self._quadgram_cfd = ConditionalFreqDist()
    self._unigram_pd = None
    self._bigram_cpd = None
    self._trigram_cpd = None
    self._quadgram_cpd = None
    if load_from_disk:
        self._load_models()
    else:
        self._train()
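Example 15 only sets up the distributions; its _train method is not shown. A hedged sketch of how the bigram, trigram, and quadgram conditional distributions might be filled from the corpus tokens, conditioning each n-gram on its first n-1 words (the method name _train_sketch is hypothetical):

from nltk import ngrams

def _train_sketch(self):
    tokens = list(self._corpus)
    for w in tokens:
        self._unigram_fd[w] += 1
    for w1, w2 in ngrams(tokens, 2):
        self._bigram_cfd[w1][w2] += 1
    for w1, w2, w3 in ngrams(tokens, 3):
        self._trigram_cfd[(w1, w2)][w3] += 1
    for w1, w2, w3, w4 in ngrams(tokens, 4):
        self._quadgram_cfd[(w1, w2, w3)][w4] += 1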