This article collects typical usage examples of the ConditionalFreqDist.conditions method from Python's nltk.probability module. If you have been wondering what ConditionalFreqDist.conditions does, how to call it, or where to find concrete examples, the curated code samples below should help. You can also read further about the class this method belongs to, nltk.probability.ConditionalFreqDist.

Fifteen code examples of ConditionalFreqDist.conditions are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
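Before diving into the examples, a minimal sketch (not taken from the examples below, just an illustration with made-up data) shows what conditions() gives you: a list of the conditions observed so far, each of which indexes an ordinary FreqDist.

from nltk.probability import ConditionalFreqDist

# Build a ConditionalFreqDist directly from (condition, sample) pairs.
pairs = [("pos", "good"), ("pos", "great"), ("pos", "good"),
         ("neg", "bad"), ("neg", "awful")]
cfd = ConditionalFreqDist(pairs)

print(cfd.conditions())                      # e.g. ['pos', 'neg']
for condition in cfd.conditions():
    # cfd[condition] is a plain FreqDist of the samples seen under that condition
    print(condition, cfd[condition].most_common(2))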
Example 1: high_information_words

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist


def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from a set of words, for EFFICIENCY.

    :param labeled_words: list of 2-tuples [(label, words)], where
        label is a classification label (pos / neg) and
        words is a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score a word needs to be kept as a MOST INFORMATIVE WORD
    :return: a set of highly informative words
    """
    print("Counting Word Frequencies")
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()
    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()
    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)
        # Python 3: dict.iteritems() no longer exists, use items()
        for word, n_ii in labeled_word_fq[label].items():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
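A hedged usage sketch for the function above; the labeled word lists are invented, and min_score is lowered so even this tiny sample produces output:

labeled_words = [
    ("pos", ["good", "great", "good", "fun", "fun"]),
    ("neg", ["bad", "awful", "boring", "bad"]),
]
print(high_information_words(labeled_words, min_score=1))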
Example 2: Ae_kappa

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def Ae_kappa(self, cA, cB):
    # Method of an annotation-task class: self.I is the set of items and
    # self.data holds records with 'coder', 'item' and 'labels' keys.
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
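This method looks like the one in NLTK's inter-annotator agreement code (nltk.metrics.agreement.AnnotationTask). If that is the class you are after, a minimal sketch of using it directly, with made-up annotations, would be:

from nltk.metrics.agreement import AnnotationTask

# (coder, item, label) triples, invented for illustration
data = [("c1", "item1", "pos"), ("c2", "item1", "pos"),
        ("c1", "item2", "neg"), ("c2", "item2", "pos")]
task = AnnotationTask(data=data)
print(task.kappa())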
Example 3: get_high_information_words

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
from collections import defaultdict

from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist


def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # lwords maps each label to a collection of word lists
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()
    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1
    n_words_total = label_word_freq_dist.N()
    high_info_words = set()
    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)
        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example 4: readFormatedData

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
from nltk.probability import ConditionalFreqDist


def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = set()
    #dict2 = set()
    for pair in formatedData:          # each item is ("word1 word2", count)
        words = pair[0].split(' ')
        count = int(pair[1])
        #unigramFd[words[0]] += 1
        #unigramFd[words[1]] += 1
        #bigramFd[(words[0], words[1])] += count
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        # NLTK 3 removed FreqDist.inc(); increment the counter directly
        cBigramFd1[words[0]][word2] += count
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        # FreqDist.samples() is also gone; iterating a FreqDist yields its samples
        for w2 in bigram_w1:
            cBigramFd2[w2][w1] += bigram_w1[w2]
    return cBigramFd1, cBigramFd2  #, dict1, dict2
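A quick sketch of calling the function above with made-up (bigram, count) rows; the low-frequency bigram collapses to "unknown" as in the code:

formatedData = [("the cat", "7"), ("the dog", "3"), ("a cat", "6")]
forward_cfd, backward_cfd = readFormatedData(formatedData)
print(forward_cfd["the"].most_common())   # counts of second words given "the"
print(backward_cfd["cat"].most_common())  # counts of first words given "cat"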
Example 5: high_information_words

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist


def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            # FreqDist.inc() was removed in NLTK 3; use item assignment instead
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example 6: _setSelectedPOSTags

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def _setSelectedPOSTags(self):
    # Assumes `from nltk.corpus import brown` at module level.
    buff = self._loadData('selective_pos.bin')
    if buff:
        self.selective_pos = buff
        return
    # First get all (word, tag) pairs in the corpus.
    # NLTK 3 replaced simplify_tags=True with tagset='universal'; in the
    # universal tagset the old simplified tag 'CNJ' is spelled 'CONJ'.
    sentences = brown.tagged_sents(tagset='universal')
    self.selected_tags = ["ADJ", "ADV", "CONJ"]
    self.selective_pos = ConditionalFreqDist()
    temp_dist = ConditionalFreqDist()
    for sentence in sentences:
        for (word, tag) in sentence:
            if tag in self.selected_tags:
                temp_dist[tag][str(word).lower()] += 1
    # Now keep the words with frequency greater than 4
    for category in temp_dist.conditions():
        fredist = temp_dist[category]
        for key in fredist.keys():
            if fredist[key] > 4:
                self.selective_pos[category][key] += 1
    self._saveData('selective_pos.bin', self.selective_pos)
Example 7: _train

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table
    based on the given training data.  In particular, for each
    context ``c`` in the training data, set
    ``_context_to_tag[c]`` to the most frequent tag for that
    context.  However, exclude any contexts that are already
    tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
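This _train is the trainer behind NLTK's context taggers (ContextTagger in nltk.tag.sequential); you normally never call it yourself. A sketch of triggering it indirectly by building a UnigramTagger, assuming the brown corpus has been downloaded:

from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents(categories="news")[:1000]
tagger = UnigramTagger(train_sents, cutoff=2, verbose=True)   # _train() runs here
print(tagger.tag(["The", "dog", "barked"]))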
Example 8: _train

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob[tag] += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, calculate the
    # entropy and keep only contexts whose entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        # total_tags = float(sum(dd.values()))
        # tags_probs = [(t, dd[t] / total_tags) for t in dd.keys()]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print(list(most_high.keys()))
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
Example 9: validate_pcfg_generate

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def validate_pcfg_generate(grammar):
    # pcfg_generate, makeLhrProbDict and KL_Divergence are helpers defined
    # elsewhere in the original project; MLEProbDist comes from nltk.probability.
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()
    for i in range(1000):          # plain range avoids the numpy dependency of np.arange
        tree = pcfg_generate(grammar)
        productions += tree.productions()
    for p in productions:
        cfd[p.lhs()][p.rhs()] += 1
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print("KL_Divergence for %s = %f" % (c, div))
Example 10: _train

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()
    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the contexts_to_tags table -- for each context, keep every tag
    # seen more than `cutoff` times together with its count.
    for context in useful_contexts:
        #best_tag = fd[context].max()
        for (tag, hits) in fd[context].items():
            if hits > cutoff:
                self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                self._contexts_to_tags[context][tag] = hits
                hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._contexts_to_tags)   # this variant fills _contexts_to_tags
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
Example 11: sum_category_word_scores

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.probability import ConditionalFreqDist, FreqDist


def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1
    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
    return scores
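A short usage sketch for the function above, with invented categorized words and NLTK's chi-squared association measure as the scoring function:

from nltk.metrics import BigramAssocMeasures

categorized_words = [("pos", ["good", "great", "good"]),
                     ("neg", ["bad", "awful"])]
scores = sum_category_word_scores(categorized_words, BigramAssocMeasures.chi_sq)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3])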
Example 12: significantWords

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

import nltk
from nltk.probability import ConditionalFreqDist, FreqDist


def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """
    Use the chi-squared test on the bigram contingency table to measure
    the association of a token with its sentiment.

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: lower bound for a word to count as significant
    ratio: pos/neg ratio, used to determine the sentiment of a word

    Returns
    -------
    significant_words: a 3-key dict of word sets
    """
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and token not in stopping_words:
                freq_dist[token] += 1
                label_freq_dist[label][token] += 1
    n_xx = label_freq_dist.N()
    for label in label_freq_dist.conditions():
        for word, n_ii in label_freq_dist[label].items():
            n_xi = label_freq_dist[label].N()
            n_ix = freq_dist[word]
            n_oi = n_xi - n_ii
            n_io = n_ix - n_ii
            n_oo = n_xx - n_oi - n_io - n_ii
            chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) \
                / ((n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
            if chisq > min_chisq and n_ii > 10:
                significant_words['total'] |= set([word])
                if float(n_ii) / n_ix > ratio and (n_ix - n_ii) > 1:
                    significant_words[label] |= set([word])
    return significant_words
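A usage sketch for significantWords: it needs the NLTK stopword list, and because of the internal n_ii > 10 frequency threshold only real, review-sized corpora will produce non-empty sets; the toy documents below merely show the expected input shape.

import nltk
nltk.download("stopwords", quiet=True)

untagged_docs = [(["great", "plot", "great", "acting"], "pos"),
                 (["dull", "plot", "awful", "acting"], "neg")]
print(significantWords(untagged_docs, min_chisq=1))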
Example 13: sum_category_word_scores

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.probability import ConditionalFreqDist, FreqDist


def sum_category_word_scores(categorized_words, score_fn):
    # word frequencies across all categories
    word_fd = FreqDist()
    # conditional frequencies, keyed by category
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            word_fd[word] += 1
            category_word_fd[category][word] += 1
    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
    # return the accumulated scores
    return scores
Example 14: high_information_words

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist


def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # gathers the most frequently occurring features to improve classification
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Example 15: unicode

# Required module: from nltk.probability import ConditionalFreqDist [as alias]
# Or: from nltk.probability.ConditionalFreqDist import conditions [as alias]
# This snippet is truncated: it starts inside a try block that walks a list of
# links, counting word bigrams in cfd (a ConditionalFreqDist) and unigrams in
# global_fd (a FreqDist), and it assumes `import sqlite3 as lite` earlier in
# the script. Messages are translated from Portuguese, and unicode() becomes
# str() for Python 3.
            word = str(word)
            cfd[prev_word][word] += 1
            global_fd[word] += 1
            prev_word = word
    except:
        print("a link failed...")

print("finished the probability distributions")
print("building the lists in the SQLite format")
global_frequencies = []
for word in sorted(global_fd.keys()):
    global_frequencies.append((word, global_fd[word]))
conditional_frequencies = []
for condition in sorted(cfd.conditions()):
    for word in sorted(cfd[condition].keys()):
        if condition:
            conditional_frequencies.append((condition, word, cfd[condition][word]))
print("starting the database part")
con = lite.connect("words-pt.db")
with con:
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS _1_gram")
    cur.execute("CREATE TABLE _1_gram(word TEXT, count INT)")
    cur.executemany("INSERT INTO _1_gram VALUES(?, ?)", tuple(global_frequencies))
    cur.execute("DROP TABLE IF EXISTS _2_gram")