This article collects typical code examples of the Python method nltk.probability.FreqDist.keys. If you have been wondering what exactly FreqDist.keys does, how to call it, or what real-world uses of it look like, the hand-picked examples below should help. You can also look further into usage examples of the enclosing class, nltk.probability.FreqDist.
The text below shows 15 code examples of the FreqDist.keys method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
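Most of these examples come from Python 2-era projects built against the NLTK 2 API, where FreqDist.keys() returned samples sorted from most to least frequent and counts were added with inc(). In NLTK 3, FreqDist is a subclass of collections.Counter: keys() follows plain dict ordering, inc() is gone, and most_common() is the usual replacement for the keys()[:n] idiom seen below. A minimal sketch of the NLTK 3 equivalents (the token list is made up):

from nltk.probability import FreqDist

tokens = "the cat sat on the mat with the cat".split()   # hypothetical tokens
fd = FreqDist(tokens)
fd["cat"] += 1                                # NLTK 3 spelling of fd.inc("cat")
top3 = [w for w, _ in fd.most_common(3)]      # NLTK 3 spelling of fd.keys()[:3]
print(fd.N(), fd.freq("the"), top3)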

Example 1: __FreqFromCorpus
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def __FreqFromCorpus(self):
    r"""
    Extract the frequencies from the corpus.
    """
    print "Computing bigrams..."
    bi = FreqDist(bigrams(self.words))
    print "Computing FreqDist..."
    wfr = FreqDist(self.words)
    print "Processing queue..."
    print
    tot = len(bi.keys())
    i = 0
    for eles in bi.keys():
        a = wfr[eles[0]]
        b = wfr[eles[1]]
        ab = bi[eles]
        N = wfr.N()
        try:
            self.__col_logl.append(nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood(a, b, ab, N))
            print "element %d / %d \t -> \tlog-likelihood of %s %s \t\t -> %f" % (i, tot, eles[0], eles[1], self.__col_logl[-1])
        except UnicodeEncodeError:
            # swallow any encoding errors
            pass
        i += 1
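Example 1 reaches into PunktTrainer's private _col_log_likelihood to score each bigram. If the goal is just a log-likelihood ranking of collocations, NLTK's public collocation API covers the same ground; a rough sketch, assuming self.words is simply a flat list of tokens:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = "this is a small sample of words and this is another sample".split()
finder = BigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)   # [((w1, w2), score), ...]
print(scored[:3])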

Example 2: pmi
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def pmi(features):
    '''
    Compute the PMI value for all features
    '''
    dic = FreqDist()
    dic_pos = FreqDist()
    pos = 0.0
    N = 0.0
    for i, feature in enumerate(features):
        N = N + 1
        for f in feature:
            if f[-1] == 1:
                pos = pos + 1
                for t in f[:-3]:
                    dic_pos.inc(t)
                    dic.inc(t)
            else:
                for t in f[:-3]:
                    dic.inc(t)
    # add-one smoothing: inflate both totals by the vocabulary size
    N = N + len(dic.keys())
    pos = pos + len(dic.keys())
    pmi_pos = {}
    for t in dic.keys():
        pmi_pos[t] = np.log(float((dic_pos[t] + 1) * N) / float((dic[t] + 1) * pos))
    pmi_pos = dict(sorted(pmi_pos.items(), key=itemgetter(1)))
    return pmi_pos
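What Example 2 computes is an add-one-smoothed PMI of each token with the positive class: log((pos_count(t) + 1) * N / ((count(t) + 1) * pos)), with N and pos inflated by the vocabulary size. Note that dic.inc(t) is the NLTK 2 spelling; with NLTK 3 or a plain collections.Counter the same counting is dic[t] += 1 or dic.update(tokens). A rough Counter-based sketch of the same score, under the simplifying assumption that the input is a list of (tokens, label) pairs with label 1 for the positive class:

from collections import Counter
import numpy as np

def smoothed_pmi(labelled_docs):
    dic, dic_pos = Counter(), Counter()
    n_docs = pos_docs = 0.0
    for tokens, label in labelled_docs:
        n_docs += 1
        dic.update(tokens)
        if label == 1:
            pos_docs += 1
            dic_pos.update(tokens)
    n_docs += len(dic)        # add-one smoothing, mirroring the example above
    pos_docs += len(dic)
    return {t: np.log((dic_pos[t] + 1.0) * n_docs / ((dic[t] + 1.0) * pos_docs)) for t in dic}

print(smoothed_pmi([(["good", "fun"], 1), (["bad", "fun"], 0)]))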

Example 3: wordprefixsuffixsubstringsprobdist
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def wordprefixsuffixsubstringsprobdist():
    for w in englishdicttxt:
        wtok = w.split()
        if len(wtok) > 0:
            computeprefixessuffixessubstrings(wtok[0])
            wordlist.append(wtok[0])
    #prefixf=open("WordPrefixesProbabilities.txt","w")
    #suffixf=open("WordSuffixesProbabilities.txt","w")
    prefixdict = FreqDist(prefixes)
    suffixdict = FreqDist(suffixes)
    substringsdict = FreqDist(suffixes)   # NOTE: probably intended to count the substrings list rather than suffixes again
    totalprefixes = sum(prefixdict.values())
    totalsuffixes = sum(suffixdict.values())
    totalsubstrings = sum(substringsdict.values())
    for pk, pv in zip(prefixdict.keys(), prefixdict.values()):
        prefixprobdict[pk] = float(pv) / float(totalprefixes)
    for pk, pv in zip(suffixdict.keys(), suffixdict.values()):
        suffixprobdict[pk] = float(pv) / float(totalsuffixes)
    for pk, pv in zip(substringsdict.keys(), substringsdict.values()):
        substringsprobdict[pk] = float(pv) / float(totalsubstrings)
    #json.dump(prefixprobdict,prefixf)
    #json.dump(suffixprobdict,suffixf)
    #print "prefix probabilities:",prefixprobdict
    #print "suffix probabilities:",suffixprobdict
    return (prefixprobdict, suffixprobdict, substringsprobdict)
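Example 3 normalizes each FreqDist into a probability dictionary by hand; FreqDist already exposes exactly this as freq(), which returns a sample's count divided by N(). A minimal sketch with made-up prefixes:

from nltk.probability import FreqDist

prefixes = ["un", "re", "un", "pre", "re", "re"]                  # hypothetical data
prefixdict = FreqDist(prefixes)
prefixprobdict = {pk: prefixdict.freq(pk) for pk in prefixdict}   # each value is count / total
print(prefixprobdict)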

Example 4: get_most_common_ngrams
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_most_common_ngrams(self, n, nb_ngrams=None):
    """
    Compute and return the set of the most common ngrams in the documents.
    This set is cached inside the object.

    Args:
        n: The number of grams. Must be a positive integer.
        nb_ngrams: The number of ngrams to return, i.e. quantifying the 'most'.

    Returns:
        A list of the most common ngrams.
    """
    try:
        # return cached value
        return self._most_common_ngrams[n]
    except KeyError:
        pass
    # compute all ngrams
    all_ngrams = []
    for document in self.training_set:
        all_ngrams.extend(self.compute_ngrams(document, n))
    # get the frequency of each ngram
    freq = FreqDist(ngram for ngram in all_ngrams)
    # store and return the nb_ngrams most common ngrams, or all of them
    if nb_ngrams:
        self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
    else:
        self._most_common_ngrams[n] = freq.keys()
    return self._most_common_ngrams[n]
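The two freq.keys() lines above rely on NLTK 2 returning keys in descending-frequency order; under NLTK 3 they would hand back an essentially arbitrary (insertion) order. A self-contained sketch of the core of the same logic, with nltk.util.ngrams standing in for the class's own compute_ngrams (an assumption about what that helper does):

from nltk.probability import FreqDist
from nltk.util import ngrams

def most_common_ngrams(documents, n, nb_ngrams=None):
    freq = FreqDist(g for doc in documents for g in ngrams(doc, n))
    return [g for g, _ in freq.most_common(nb_ngrams)]   # most_common(None) returns everything, sorted

docs = [["the", "cat", "sat"], ["the", "cat", "ran"]]
print(most_common_ngrams(docs, 2, nb_ngrams=2))          # [('the', 'cat'), ('cat', 'sat')]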

Example 5: __init__
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first
        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """
        for ii in [x.lower() for x in words if x.lower() not in self._stop \
                   and len(x) >= self._min_length]:
            self._counts.inc(ii)

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        if len(self._counts) > self._cut_first + size:
            return self._counts.keys()[self._cut_first:(size + self._cut_first)]
        else:
            return self._counts.keys()[:size]

Example 6: get_bot_nouns_verbs
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_bot_nouns_verbs(pos_tags, tagmap, n):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for (word, tag) in pos_tags:
        if tagmap[tag] == "VERB" and word not in funcwords and wn.synsets(word):
            fdVerb.inc(word)
        elif tagmap[tag] == "NOUN" and word not in funcwords and wn.synsets(word):
            fdNoun.inc(word)
    return (fdNoun.keys()[::-1][:n], fdVerb.keys()[::-1][:n])
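The [::-1][:n] slice in Example 6 only returns the n least frequent nouns and verbs because NLTK 2's keys() were sorted from most to least frequent (hence, presumably, the "bot" for "bottom" in the name). With NLTK 3 the same idea needs an explicit sort on the counts; a minimal sketch:

from nltk.probability import FreqDist

fd = FreqDist("b a n a n a s".split())     # hypothetical counts
n = 2
least_common = [w for w, _ in sorted(fd.items(), key=lambda kv: kv[1])[:n]]
print(least_common)                        # the two rarest samples: ['b', 's']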

Example 7: get_all_nouns_verbs
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_all_nouns_verbs(tok_sents, tagmap):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for sent in tok_sents:
        for tup in sent:
            if tagmap[tup[2]] == "VERB" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdVerb.inc(tup[1])
            elif tagmap[tup[2]] == "NOUN" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdNoun.inc(tup[1])
    return (fdNoun.keys(), fdVerb.keys())

Example 8: ExtractorOfWords
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class ExtractorOfWords():

    def __init__(self, pos_words, neg_words, type_of_Feature_extractor=0):
        self.pos_words_training = reduce(lambda words, review: words + review.words(), pos_words, [])
        self.neg_words_training = reduce(lambda words, review: words + review.words(), neg_words, [])
        if type_of_Feature_extractor == 1:
            formated_pos_words_training = self.Feature_extractor1(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor1(self.neg_words_training)
        elif type_of_Feature_extractor == 2:
            formated_pos_words_training = self.Feature_extractor2(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor2(self.neg_words_training)
        elif type_of_Feature_extractor == 3:
            formated_pos_words_training = self.Feature_extractor3(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor3(self.neg_words_training)
        elif type_of_Feature_extractor == 4:
            formated_pos_words_training = self.Feature_extractor4(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor4(self.neg_words_training)
        else:
            formated_pos_words_training = self.pos_words_training
            formated_neg_words_training = self.neg_words_training
        self.pos_words_freqdist = FreqDist(formated_pos_words_training)
        self.neg_words_freqdist = FreqDist(formated_neg_words_training)

    # Extract the n most frequent words
    def Extraxt_n_most_Freq_Words(self, n):
        return self.pos_words_freqdist.keys()[:n], self.neg_words_freqdist.keys()[:n]

    # List all words whose number of occurrences is above *number_count*
    def Extraxt_words_above_count(self, number_count):
        return ([word for word, count in self.pos_words_freqdist.iteritems() if count > number_count],
                [word for word, count in self.neg_words_freqdist.iteritems() if count > number_count])

    # Porter stemmer
    def Feature_extractor1(self, in_list):
        ps = PorterStemmer()
        return [ps.stem(w) for w in in_list]

    # lowercase versions of all the words
    def Feature_extractor2(self, in_list):
        return [w.lower() for w in in_list]

    # Replace all number tokens with "NUM"
    def Feature_extractor3(self, in_list):
        return ["NUM" if w.isdigit() else w for w in in_list]

    # combination of filters 1 and 2
    def Feature_extractor4(self, in_list):
        return [w.lower() for w in in_list if w.isalpha() and w.lower() not in stopwords.words('english')]

Example 9: _train
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, calculate the
    # entropy. Only keep contexts whose entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        # total_tags = float(sum(dd.values()))
        # tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)

Example 10: demo_similar
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def demo_similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                          filter=lambda x: x.isalpha(),
                                                          key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    while 1:
        word = raw_input('Enter a Chinese word such as "開心" (type 0 to exit): ')
        print "word='" + word + "'"
        if word == '0': break
        word = word.decode('utf-8')
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
        else:
            print "No matches"

Example 11: getsimilar
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def getsimilar(self, word, num=20):
    """
    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]  # list of words
        # print tokenwrap(words)
        return words
    else:
        print "No matches"
        return None
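Examples 10 and 11 both re-implement what nltk.text already ships: Text.similar() and ContextIndex.similar_words() build the same FreqDist over words that share contexts with the query word and report the most frequent ones. A minimal sketch using the public API (assumes the gutenberg corpus has been downloaded):

import nltk

tokens = nltk.corpus.gutenberg.words("austen-emma.txt")
text = nltk.Text(tokens)
text.similar("happy", 10)                        # prints up to 10 distributionally similar words

idx = nltk.text.ContextIndex(tokens, key=lambda s: s.lower())
print(idx.similar_words("happy", 10))            # same idea, returned as a list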

Example 12: tokenize_clean
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def tokenize_clean(text):
    """Return the alphabetic, non-stopword word types from the tokenized text."""
    tokens = word_tokenize(text.lower())
    fdist = FreqDist(tokens)
    words = [w.lower() for w in fdist.keys()
             if w not in stopwords.words('english') and w.isalpha()]
    return words

Example 13: MyMarkovModel
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
class MyMarkovModel(MarkovModel):

    def __init__(self, order):
        self.order = order
        self.filename = NGRAM_FILES[self.order]
        if 3 >= self.order >= 2:
            self.backoff = MyMarkovModel(order - 1)
            self.cfd = ConditionalFreqDist()
            self.charset = self.backoff.charset
            for ngram, count in self.get_data():
                context, char = tuple(ngram[:-1]), ngram[-1]
                self.cfd[context][char] = count
        elif self.order == 1:
            self.backoff = None
            self.n = 0
            self.fd = FreqDist()
            for char, count in self.get_data():
                self.fd[char] = count
            self.charset = set(self.fd.keys())
        else:
            raise NotImplementedError

    def get_data(self):
        with open(self.filename) as fp:
            for line in fp.readlines():
                ngram, count = line.lower().split()
                count = int(count)
                yield ngram, count

Example 14: get_term_freq_dict
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    # Lemmatize it
    word_freq = {}
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        # If it exists in word_freq, add the value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
        # Else, assign the value
        else:
            word_freq[lemmatize_term] = val
    return word_freq

Example 15: text_to_vector
# Required module: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import keys [as alias]
def text_to_vector(docs, metric):
    """ Create a frequency-based feature vector from text.

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []           # List of TF distributions per document
    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)
    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)
    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
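fd.inc(word) and doc_freqs.update(fd.samples()) are again NLTK 2 spellings (under NLTK 3 they would be fd[word] += 1 and doc_freqs.update(fd.keys())), but the TF-IDF weighting itself is unchanged: the relative term frequency times log(num_docs / document_frequency). A small self-contained sketch of that weighting for one document, mirroring the TF_IDF branch above:

import math
from nltk.probability import FreqDist

def tfidf_weights(doc_tokens, doc_freqs, num_docs):
    """doc_freqs maps each word to the number of documents containing it."""
    fd = FreqDist(doc_tokens)
    return {w: fd.freq(w) * math.log(float(num_docs) / doc_freqs[w]) for w in fd}

doc_freqs = {"cat": 2, "sat": 1}                 # hypothetical document frequencies
print(tfidf_weights(["cat", "sat", "cat"], doc_freqs, num_docs=2))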