This article collects typical usage examples of the Python method nltk.probability.ConditionalFreqDist. If you are wondering what probability.ConditionalFreqDist is for, or how exactly to use it, the curated code examples below may help. You can also explore further usage examples from the module it belongs to, nltk.probability.
The following presents 14 code examples of probability.ConditionalFreqDist, sorted by popularity by default.
Example 1: weighted_kappa_pairwise
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968
"""
total = 0.0
label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
for x in self.data
if x['coder'] in (cA, cB))
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
De = total / (max_distance * pow(len(self.I), 2))
log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
Do = self.Do_Kw_pairwise(cA, cB)
ret = 1.0 - (Do / De)
return ret
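For context, weighted_kappa_pairwise is defined on nltk.metrics.agreement.AnnotationTask, which is built from (coder, item, label) triples. A minimal usage sketch with made-up annotation data (not taken from the example above) might look like this:

from nltk.metrics.agreement import AnnotationTask

# Toy data: (coder, item, label) triples invented for illustration.
toy_data = [
    ('c1', 'item1', 'A'), ('c2', 'item1', 'A'),
    ('c1', 'item2', 'B'), ('c2', 'item2', 'A'),
    ('c1', 'item3', 'B'), ('c2', 'item3', 'B'),
]
task = AnnotationTask(data=toy_data)
print(task.kappa())                               # unweighted Cohen's kappa
print(task.weighted_kappa_pairwise('c1', 'c2'))   # the method shown above (binary distance by default)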
Example 2: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, n, vocabulary, unknown="<UNK>"):
"""
n is the size of the ngram
"""
if n < 1:
raise ValueError("ngram size must be greater than or equal to 1")
self.n = n
self.unknown = unknown
self.padding = {
"pad_left": True,
"pad_right": True,
"left_pad_symbol": "<s>",
"right_pad_symbol": "</s>"
}
self.vocabulary = vocabulary
self.allgrams = defaultdict(ConditionalFreqDist)
self.ngrams = FreqDist()
self.unigrams = FreqDist()
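The constructor above only sets up empty counters; the training/update logic is not shown in this example. A minimal sketch of how a ConditionalFreqDist is typically filled with ngram counts could be:

from nltk.probability import ConditionalFreqDist
from nltk.util import ngrams

tokens = ["<s>", "the", "cat", "sat", "</s>"]
cfd = ConditionalFreqDist()
for w1, w2 in ngrams(tokens, 2):
    cfd[(w1,)][w2] += 1             # condition on the (n-1)-gram context
print(cfd[("the",)].most_common())  # [('cat', 1)]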
Example 3: weighted_kappa_pairwise
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968
"""
total = 0.0
label_freqs = ConditionalFreqDist(
(x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
)
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
De = total / (max_distance * pow(len(self.I), 2))
log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
Do = self.Do_Kw_pairwise(cA, cB)
ret = 1.0 - (Do / De)
return ret
Example 4: pynlpir_feature
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def pynlpir_feature(number):  # select `number` feature words
normalWords = []
advWords = []
    for items in read_file('ad/normal.txt'):  # flatten the list of lists into a flat word list
for item in items:
normalWords.append(item)
for items in read_file('ad/advertise.txt'):
for item in items:
advWords.append(item)
    word_fd = FreqDist()  # frequency of every word across both corpora
    cond_word_fd = ConditionalFreqDist()  # word frequencies within normal texts and within advertisement texts
for word in normalWords:
word_fd[word] += 1
cond_word_fd['normal'][word] += 1
for word in advWords:
word_fd[word] += 1
cond_word_fd['adv'][word] += 1
    normal_word_count = cond_word_fd['normal'].N()  # number of tokens in normal texts
    adv_word_count = cond_word_fd['adv'].N()  # number of tokens in advertisement texts
total_word_count = normal_word_count + adv_word_count
    word_scores = {}  # maps each word to its information score
for word, freq in word_fd.items():
        # chi-squared statistic for the 'normal' class; mutual information or other measures would also work
normal_score = BigramAssocMeasures.chi_sq(cond_word_fd['normal'][word],
(freq, normal_word_count),
total_word_count)
        adv_score = BigramAssocMeasures.chi_sq(cond_word_fd['adv'][word],
                                               (freq, adv_word_count),
                                               total_word_count)  # likewise for the 'adv' class
        # a word's information score is the sum of its 'normal' and 'adv' chi-squared statistics
word_scores[word] = normal_score + adv_score
    # sort words by information score in descending order; `number` is the feature
    # dimensionality and can be tuned until results are optimal
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    # Chi-squared test: χ² = Σ(Oi − Ei)² / Ei ~ χ²(k−1), summing over i = 1..k,
    # where Oi is the observed count and Ei the expected count;
    # the null hypothesis is rejected when the statistic exceeds the critical value
best_words = set([w for w, s in best_vals])
return dict([(word, True) for word in best_words])
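A possible follow-up, assuming the ad/normal corpora and pynlpir_feature above are available (the feature count of 1000 is arbitrary, and the training-data construction is only sketched): the returned {word: True} dict can serve as a feature vocabulary for an NLTK classifier.

from nltk.classify import NaiveBayesClassifier

best_words = pynlpir_feature(1000)  # top 1000 words by chi-squared score (arbitrary choice)

def doc_features(words):
    # binary bag-of-words features restricted to the selected vocabulary
    return {w: True for w in words if w in best_words}

# train_set = [(doc_features(words), label), ...]   # built from labelled documents
# classifier = NaiveBayesClassifier.train(train_set)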
Example 5: Ae_kappa
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def Ae_kappa(self, cA, cB):
Ae = 0.0
nitems = float(len(self.I))
label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
for k in label_freqs.conditions():
Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
return Ae
Example 6: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
self._key = key
self._tokens = tokens
if context_func:
self._context_func = context_func
else:
self._context_func = self._default_context
if filter:
tokens = [t for t in tokens if filter(t)]
self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
for i, w in enumerate(tokens))
self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
for i, w in enumerate(tokens))
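This constructor appears to be nltk.text.ContextIndex, which indexes words by their surrounding contexts. A small usage sketch, assuming the Brown corpus has been downloaded, could be:

from nltk.text import ContextIndex
from nltk.corpus import brown

idx = ContextIndex(brown.words(categories='news')[:5000], key=lambda w: w.lower())
print(idx.similar_words('money')[:10])   # words that occur in similar contexts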
Example 7: create_word_scores
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def create_word_scores():
#creates lists of all positive and negative words
posWords = []
negWords = []
with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
for i in posSentences:
posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
posWords.append(posWord)
with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
for i in negSentences:
negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
negWords.append(negWord)
posWords = list(itertools.chain(*posWords))
negWords = list(itertools.chain(*negWords))
    #build frequency distribution of all words and then frequency distributions of words within positive and negative labels
word_fd = FreqDist()
cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1              # FreqDist.inc() was removed in NLTK 3
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
#finds the number of positive and negative words, as well as the total number of words
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
#builds dictionary of word scores based on chi-squared test
word_scores = {}
    for word, freq in word_fd.items():  # iteritems() exists only in Python 2
pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
return word_scores
#finds word scores
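A typical continuation (assuming RT_POLARITY_POS_FILE and RT_POLARITY_NEG_FILE point to the rt-polarity data files, and with an arbitrary cutoff of 1000) selects the highest-scoring words as the feature vocabulary:

word_scores = create_word_scores()
best = sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)[:1000]
best_words = {w for w, s in best}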
Example 8: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
self._key = key
self._tokens = tokens
if context_func:
self._context_func = context_func
else:
self._context_func = self._default_context
if filter:
tokens = [t for t in tokens if filter(t)]
self._word_to_contexts = CFD(
(self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
)
self._context_to_words = CFD(
(self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
)
Example 9: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, ngram_text=None):
"""Creates a new NgramCounter.
If `ngram_text` is specified, counts ngrams from it, otherwise waits for
`update` method to be called explicitly.
        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
:type ngram_text: Iterable(Iterable(tuple(str))) or None
"""
self._counts = defaultdict(ConditionalFreqDist)
self._counts[1] = self.unigrams = FreqDist()
if ngram_text:
self.update(ngram_text)
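A minimal usage sketch of nltk.lm.NgramCounter (available in NLTK 3.4+), counting everygrams over toy pre-tokenized sentences:

from nltk.lm import NgramCounter
from nltk.util import everygrams

sents = [["a", "b", "c"], ["a", "c", "d"]]
counter = NgramCounter(everygrams(s, max_len=2) for s in sents)
print(counter[1]["a"])             # unigram count of "a"
print(counter[2][("a",)]["b"])     # count of the bigram ("a", "b")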
Example 10: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, unk=None, Trained=False, N=1000, C=False):
'''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
:param unk: instance of a POS tagger, conforms to TaggerI
:type unk:(TaggerI)
:param Trained: Indication that the POS tagger is trained or not
:type Trained: boolean
:param N: Beam search degree (see above)
:type N:(int)
:param C: Capitalization flag
:type C: boolean
Initializer, creates frequency distributions to be used
for tagging
_lx values represent the portion of the tri/bi/uni taggers
to be used to calculate the probability
N value is the number of possible solutions to maintain
while tagging. A good value for this is 1000
C is a boolean value which specifies to use or
not use the Capitalization of the word as additional
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
'''
self._uni = FreqDist()
self._bi = ConditionalFreqDist()
self._tri = ConditionalFreqDist()
self._wd = ConditionalFreqDist()
self._eos = ConditionalFreqDist()
self._l1 = 0.0
self._l2 = 0.0
self._l3 = 0.0
self._N = N
self._C = C
self._T = Trained
self._unk = unk
# statistical tools (ignore or delete me)
self.unknown = 0
self.known = 0
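A brief usage sketch of the TnT tagger (nltk.tag.tnt), assuming the Penn Treebank sample corpus has been downloaded; the training-set size is arbitrary:

from nltk.tag import tnt
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = tnt.TnT(N=1000, C=False)
tagger.train(train_sents)
print(tagger.tag("the cat sat on the mat".split()))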
Example 11: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
        a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max()
hits = fd[context][best_tag]
if hits > cutoff:
self._context_to_tag[context] = best_tag
hit_count += hits
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print("[Trained Unigram tagger:", end=' ')
print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning))
######################################################################
#{ Tagger Classes
######################################################################
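_train above is invoked from the constructors of ContextTagger subclasses such as UnigramTagger; a sketch of how it gets exercised, assuming the treebank sample corpus is installed (verbose=True triggers the statistics printout at the end of _train):

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

tagger = UnigramTagger(treebank.tagged_sents()[:2000], cutoff=1, verbose=True)
print(tagger.tag(["the", "dog", "barked"]))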
Example 12: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
:param verbose: Not used
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None:
continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
if self.backoff is None or tag != self.backoff.tag_one(
tokens, index, tags[:index]
):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max() # Remove
weighted_tags = [(k, v/sum(fd[context].values())) for k, v in fd[context].items()]
hits = fd[context][best_tag] #INT
if hits > cutoff:
self._context_to_tag[context] = weighted_tags
hit_count += hits
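The weighted_tags list built in this variant is simply the per-context relative frequency of each tag. A tiny illustration with a hypothetical context:

from nltk.probability import ConditionalFreqDist

fd = ConditionalFreqDist()
for tag in ["NN", "NN", "VB"]:
    fd["dog"][tag] += 1
weighted = [(t, c / fd["dog"].N()) for t, c in fd["dog"].items()]
print(weighted)                # [('NN', 0.666...), ('VB', 0.333...)]
print(fd["dog"].freq("NN"))    # the same ratio, via FreqDist.freq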
Example 13: _train
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
"""
Initialize this ContextTagger's ``_context_to_tag`` table
based on the given training data. In particular, for each
context ``c`` in the training data, set
``_context_to_tag[c]`` to the most frequent tag for that
context. However, exclude any contexts that are already
tagged perfectly by the backoff tagger(s).
The old value of ``self._context_to_tag`` (if any) is discarded.
:param tagged_corpus: A tagged corpus. Each item should be
        a list of (word, tag) tuples.
:param cutoff: If the most likely tag for a context occurs
fewer than cutoff times, then exclude it from the
context-to-tag table for the new tagger.
"""
token_count = hit_count = 0
# A context is considered 'useful' if it's not already tagged
# perfectly by the backoff tagger.
useful_contexts = set()
# Count how many times each tag occurs in each context.
fd = ConditionalFreqDist()
for sentence in tagged_corpus:
tokens, tags = zip(*sentence)
for index, (token, tag) in enumerate(sentence):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
if context is None: continue
fd[context].inc(tag)
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
tag != self.backoff.tag_one(tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
# out what the most likely tag is. Only include contexts that
# we've seen at least `cutoff` times.
for context in useful_contexts:
best_tag = fd[context].max()
hits = fd[context][best_tag]
if hits > cutoff:
self._context_to_tag[context] = best_tag
hit_count += hits
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
backoff = 100 - (hit_count * 100.0)/ token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print "[Trained Unigram tagger:",
print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning)
######################################################################
#{ Tagger Classes
######################################################################
Example 14: __init__
# Required module import: from nltk import probability [as alias]
# Or: from nltk.probability import ConditionalFreqDist [as alias]
def __init__(self, unk=None, Trained=False, N=1000, C=False):
'''
Construct a TnT statistical tagger. Tagger must be trained
before being used to tag input.
:param unk: instance of a POS tagger, conforms to TaggerI
:type unk:(TaggerI)
:param Trained: Indication that the POS tagger is trained or not
:type Trained: boolean
:param N: Beam search degree (see above)
:type N:(int)
:param C: Capitalization flag
:type C: boolean
Initializer, creates frequency distributions to be used
for tagging
_lx values represent the portion of the tri/bi/uni taggers
to be used to calculate the probability
N value is the number of possible solutions to maintain
while tagging. A good value for this is 1000
C is a boolean value which specifies to use or
not use the Capitalization of the word as additional
information for tagging.
NOTE: using capitalization may not increase the accuracy
of the tagger
'''
self._uni = FreqDist()
self._bi = ConditionalFreqDist()
self._tri = ConditionalFreqDist()
self._wd = ConditionalFreqDist()
self._eos = ConditionalFreqDist()
self._l1 = 0.0
self._l2 = 0.0
self._l3 = 0.0
self._N = N
self._C = C
self._T = Trained
self._unk = unk
# statistical tools (ignore or delete me)
self.unknown = 0
self.known = 0