This article collects typical usage examples of the Python method nltk.FreqDist.inc. If you have been wondering what FreqDist.inc does, how to call it, or how it is used in practice, the curated code samples below may help. You can also read further about the class it belongs to, nltk.FreqDist.
The following shows 15 code examples of FreqDist.inc, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
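A note before the examples: inc belongs to the older NLTK 2.x API. In NLTK 3, FreqDist became a subclass of collections.Counter and inc was removed, so on a current install the equivalent calls look roughly like the sketch below (assuming NLTK 3+).

from nltk import FreqDist

fd = FreqDist()
fd['the'] += 1                # replaces fd.inc('the')
fd.update(['a', 'a', 'b'])    # replaces repeated fd.inc(...) calls
print(fd.most_common(2))      # replaces fd.keys()[:2], which NLTK 2.x kept frequency-sorted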
Example 1: category_by_pos
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    # count the 1-, 2- and 3-character suffixes of every word in the Brown corpus
    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)
    classifier = NaiveBayesClassifier.train(train_set)
    print 'Naive Bayes %f' % classify.accuracy(classifier, test_set)
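Example 1 relies on another NLTK 2.x detail: FreqDist.keys() returned samples sorted by decreasing frequency, which is what makes suffix_fdist.keys()[:100] the 100 most common suffixes. Under NLTK 3+ (assumed environment) the same selection would be written as:

common_suffixes = [suffix for suffix, count in suffix_fdist.most_common(100)]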
Example 2: process
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def process(f, return_tokens=True, return_freqdist=True):
    """
    Process deals data.
    Splits the text into sentences; the FreqDist is incremented during tokenization.
    Uses PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals often contain domain names, so those are deliberately not split up.
    :rtype: FreqDist, list() of str
    :param f: input file with one deal per line
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    stop_words = set(stopwords.words('english'))   # build the filter sets once,
    punctuation = set(string.punctuation)           # not once per token
    for line in sentences:
        t = []
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in stop_words and word not in punctuation:
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
Example 3: dotranslate
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def dotranslate(sent, parser, tdop):
    # todo: tokenize the sentence by maximizing unigram probabilities
    # in the training corpus, to detect multiword units
    sent = sent.split()

    # parse the sentence with bitpar, which gives an n-best list
    try:
        parsetrees1 = list(parser.nbest_parse(sent))
    except Exception as e:
        parsetrees1 = []
        print "parsing failed", e
        return (), {}

    # undo binarization and the auxiliary POS tags introduced to accommodate bitpar:
    parsetrees = FreqDist()
    for tree in parsetrees1:
        tree.un_chomsky_normal_form()
        parsetrees.inc(removeforcepos(tree).freeze(), count=tree.prob())

    # for each parse tree, get a list of translations
    resultfd = {}
    for m, tree in enumerate(parsetrees):
        print "parse tree", tree
        for nn, (result, prob) in enumerate(
                tdop.get_mlt_deriv_multi(tree, smoothing=True, verbose=False)):
            if not result:
                continue
            key = (undecorate_with_ids(result).freeze(),
                   sum(1 if "@" in a.node else 0 for a in result.subtrees()))
            resultfd[key] = resultfd.get(key, 0.0) + prob
    return parsetrees, resultfd
Example 4: __init__
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
class Index:
    """
    The Index class stores an index for a document.
    """
    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist is None:
            self._freq_dist = FreqDist()
            for term in self.terms():    # terms() is defined elsewhere in the class
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            self.index(self._document)   # lazily rebuild the index
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
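The excerpt above never shows the terms() helper that index() iterates over, so using the class requires an assumption about how the document is tokenized. A hypothetical usage sketch with a whitespace-based terms():

# hypothetical subclass supplying the terms() helper the excerpt assumes but does not show
class WhitespaceIndex(Index):
    def terms(self):
        return self._document.lower().split()

idx = WhitespaceIndex()
idx.index("the quick brown fox jumps over the lazy dog")
print idx.freq("the")   # 2
print idx.tf("the")     # 2/9, roughly 0.22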
Example 5: proto
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def proto(self, num, language, authors, token_vocab, token_df, lemma_vocab,
          pos_vocab, synset_vocab, stemmer):
    d = Document()
    assert language == self.lang

    if self._id:
        d.id = self._id
    else:
        d.id = num
    d.language = language
    d.title = self.title.strip()
    num_sentences = max(self._sentences) + 1

    tf_token = FreqDist()
    for ii in self.tokens():
        tf_token.inc(ii)

    for ii in xrange(num_sentences):
        s = d.sentences.add()
        for jj in self._sentences[ii]:
            w = s.words.add()
            w.token = token_vocab[jj.word]
            w.lemma = lemma_vocab[jj.lemma]
            w.pos = pos_vocab[jj.pos]
            w.relation = pos_vocab[jj.rel]
            w.parent = jj.parent
            w.offset = jj.offset
            w.tfidf = token_df.compute_tfidf(jj.word,
                                             tf_token.freq(jj.word))
    return d
Example 6: word_fdist_single
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def word_fdist_single(address, exclude=excludes(), corpus=inaugural):
    fd = FreqDist()
    for word in corpus.words(address):
        if word.lower() not in exclude:
            fd.inc(word.lower())
    return fd
Example 7: sent_length_fdist_single
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def sent_length_fdist_single(address, exclude=excludePuncts(), corpus=inaugural):
    fd = FreqDist()
    for sent in corpus.sents(address):
        nopunct_sent = [word for word in sent if word not in exclude]
        fd.inc(len(nopunct_sent))
    return fd
Example 8: sent_length_fdist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def sent_length_fdist(address_list, exclude=excludePuncts(), corpus=inaugural):
    total_fd = FreqDist()
    for address in address_list:
        fd = sent_length_fdist_single(address, exclude, corpus)
        for length in fd.keys():
            total_fd.inc(length, fd[length])
    return total_fd
Example 9: content_FreqDist_generator
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def content_FreqDist_generator(articles_list):
    # get the FreqDist of all articles
    all_fdist = FreqDist()
    for article in articles_list:
        for key, value in article.content_freqDist().iteritems():
            all_fdist.inc(key, value)
    return all_fdist
Example 10: word_fdist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
    total_fd = FreqDist()
    for address in address_list:
        fd = word_fdist_single(address, exclude, corpus)
        for word in fd.keys():
            total_fd.inc(word, fd[word])
    return total_fd
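Examples 8, 9 and 10 all merge one FreqDist into another by looping over its keys and calling inc with a count. Under NLTK 3+ (assumed environment), FreqDist inherits Counter semantics, so each merge collapses to a single update; a sketch of Example 10 rewritten that way:

def word_fdist(address_list, exclude=excludes(), corpus=inaugural):
    total_fd = FreqDist()
    for address in address_list:
        # update() adds the counts from the per-address distribution
        total_fd.update(word_fdist_single(address, exclude, corpus))
    return total_fd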
Example 11: __extract_level_words
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def __extract_level_words(self, levels_db, level, values):
    words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
    most_freq_words = {}
    for value in values:
        fdist = FreqDist()
        for word_dist in levels_db[level][value]:
            fdist.inc(word_dist[0], count=word_dist[1])
        most_freq_words[value] = fdist.items()[:words_number_per_value]
    return most_freq_words
Example 12: kneser_ney
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def kneser_ney(self, context, word):
    """
    Return the log probability of a word given a context, using
    Kneser-Ney backoff.
    """
    bgram = (context, word)
    unigram_freq = FreqDist()
    theta = self._kn_concentration
    vocabulary = 1.0 / len(self._vocab_freq.keys())  # uniform 1/|V| base distribution (must be float division)
    discount_delta = self._kn_discount
    unigram_T = len(self._context_freq.keys())
    bigram_T = self._context_freq[context]

    # build unigram-restaurant counts from the distinct bigram types
    for i in self._gram_freq:
        unigram_freq.inc(i[1])

    # Unigram restaurant
    # C_0,x
    count_unirest_wordTable = unigram_freq[word]
    # C_0,.
    count_unirest_allTable = unigram_freq.N()
    # u-Bigram restaurant
    # C_u,x
    count_birest_wordTable = self._gram_freq[bgram]
    # C_u,.
    count_birest_allTable = self._context_freq[context]

    existingTable_numer = count_birest_wordTable - discount_delta
    existingTable_denom = theta + count_birest_allTable
    existingTable = existingTable_numer / existingTable_denom
    if existingTable < 0:
        existingTable = 0

    newTable_numer = theta + (bigram_T * discount_delta)
    newTable_denom = theta + count_birest_allTable
    newTable = newTable_numer / newTable_denom

    back_a_numer = count_unirest_wordTable - discount_delta
    back_a_denom = count_unirest_allTable + theta
    back_a = back_a_numer / back_a_denom
    if back_a < 0:
        back_a = 0

    back_b_numer = theta + (unigram_T * discount_delta)
    back_b_denom = count_unirest_allTable + theta
    back_b = back_b_numer / back_b_denom
    back_b = back_b * vocabulary

    result = existingTable + (newTable * (back_a + back_b))
    return lg(result)
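Read directly off the code above (a sketch of the computed quantity, with notation chosen here rather than taken from the source), the value whose logarithm is returned is

\[
P(w \mid u) = \frac{\max(c_{uw}-\delta,\,0)}{\theta + c_{u\cdot}}
+ \frac{\theta + \delta T_u}{\theta + c_{u\cdot}}
\left( \frac{\max(c'_{w}-\delta,\,0)}{\theta + c'_{\cdot}}
+ \frac{\theta + \delta T}{\theta + c'_{\cdot}} \cdot \frac{1}{|V|} \right)
\]

where c_{uw} and c_{u.} are the bigram-restaurant counts, c'_{w} and c'_{.} are the unigram-restaurant counts built in the loop, T_u and T are the table counts the code uses, delta the discount, theta the concentration, and |V| the vocabulary size.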
Example 13: __getTimelineFeatures
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def __getTimelineFeatures(self, timeline):
    logger.info(u"Get timeline features")
    tweets = []
    self.__changePhase(PHASE["GET_TIMELINE_URLS"])
    for t in timeline:
        try:
            tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
        except:
            logger.exception(u"Error: \"" + unicode(t) + u"\"")
            raise ValueError(t)
        logger.debug(u"Tweet:" + unicode(tweet))
        tweets.append(tweet)

    urls = []
    ti = 0
    for tweet in tweets:
        for url in tweet.urls():
            self.__breakIfStopped()
            self.__urlResolver.addUrlToQueue(url)
            urls.append(url)
        logger.info(u"Tweet:" + unicode(tweet))
        ti += 1
        self.__proc = 100 * float(ti) / float(len(tweets))

    # categories
    self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
    url2labels = {}
    ui = 0
    for url in urls:
        self.__breakIfStopped()
        if not url.isError():
            logger.debug(u"Classify " + unicode(url.getUrl()))
            url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
        ui += 1
        self.__proc = 100 * float(ui) / float(len(urls))

    labelsFreq = FreqDist()
    for labels in url2labels.values():
        for label in labels:
            labelsFreq.inc(label)
    self.__catFreq = labelsFreq.items()
    logger.info(u"Categories: " + unicode(labelsFreq.items()))
    labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items()
                        if item[0] not in ['short', 'medium', 'long']]

    # normalization
    labelsFreqValues = {label: float(freq) / float(max([f for l, f in labelsFreqValues]))
                        for label, freq in labelsFreqValues}
    logger.info(u"Category factors: " + unicode(labelsFreqValues))

    # languages
    langFreq = FreqDist()
    for u in urls:
        langFreq.inc(u.lang())
    self.__langFreq = langFreq.items()
    logger.info(u"Languages: " + unicode(langFreq.items()))

    return labelsFreqValues
Example 14: train_supervised
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done by collecting frequencies of
    transitions between states, symbol observations while within each
    state, and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    @return: the trained model
    @rtype: HiddenMarkovModelTagger
    @param labelled_sequences: the training data, a set of
        labelled sequences of observations
    @type labelled_sequences: list
    @param kwargs: may include an 'estimator' parameter, a function taking
        a C{FreqDist} and a number of bins and returning a C{ProbDistI};
        otherwise an MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))
    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
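The docstring above describes an optional 'estimator' keyword. A minimal sketch of supplying a smoothed estimator instead of the MLE default (the trainer and labelled_sequences names here are placeholders, not part of the example):

from nltk.probability import LidstoneProbDist

# Lidstone smoothing with gamma = 0.1; any callable taking (freqdist, bins) works
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins)
tagger = trainer.train_supervised(labelled_sequences, estimator=estimator)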
Example 15: handle
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import inc [as alias]
def handle(self, *args, **options):
    fdist = FreqDist()
    print "Analyzing raw data"

    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]

    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)

    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)

    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)

    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print