This article collects typical usage examples of the Python method nltk.FreqDist.update. If you have been wondering what FreqDist.update does, how to call it, or what real-world code that uses it looks like, the curated examples below should help. You can also read more about the class it belongs to, nltk.FreqDist.
The following shows 15 code examples of FreqDist.update, ordered by popularity by default.
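Before the individual examples, here is a minimal sketch of what FreqDist.update does: it adds the counts of an iterable of tokens (or of another counter-like mapping) to the distribution in place. The sentences below are invented for illustration, and word_tokenize assumes the NLTK 'punkt' tokenizer data has been downloaded.

from nltk import FreqDist
from nltk.tokenize import word_tokenize

fd = FreqDist()
for sentence in ["the cat sat on the mat", "the dog sat"]:
    # update() adds these token counts to the existing distribution
    fd.update(word_tokenize(sentence))

print(fd.most_common(3))  # e.g. [('the', 3), ('sat', 2), ('cat', 1)]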
Example 1: findBestWords
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}
    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
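A quick usage sketch for Example 1. The toy categories and the call below are not from the original project; they assume findBestWords and the names it references (FreqDist, ConditionalFreqDist, BigramAssocMeasures) are already imported.

# Toy labelled data, invented for illustration.
wordsInCategories = [
    ('pos', ['good', 'great', 'good', 'fine']),
    ('neg', ['bad', 'awful', 'bad', 'poor']),
]
best = findBestWords(wordsInCategories, max_words=4)
print(best)  # the 4 highest-scoring discriminative words, as a set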
Example 2: get_stats
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_stats(self, output_fname):
    # document-frequency counts (computed here but not written out below)
    fd = FreqDist()
    for text in self.texts:
        fd.update(set(text))

    fh = open(output_fname, 'w')
    text = Text(self.paragraph_tokens)
    fdist = FreqDist(text)
    for (w, f) in fdist.iteritems():
        print >> fh, "%s\t%i" % (w, f)
    fh.close()
Example 3: get_ngrams
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_ngrams(fileLines, n, pos_tag_dict):
    # Get n-gram counts for the corpus
    tokens = []
    ngram_counts = FreqDist()
    for excerpt in fileLines:
        ngram_counts_exp = get_ngram_counts_per_excerpt(excerpt, n, pos_tag_dict)
        for ngram in ngram_counts_exp:
            if ngram in ngram_counts:
                val = ngram_counts[ngram]
            else:
                val = 0
            ngram_counts[ngram] = val + ngram_counts_exp[ngram]
        # Note: update() adds the per-excerpt counts a second time, so combined
        # with the manual accumulation above each n-gram ends up double-counted.
        ngram_counts.update(ngram_counts_exp)
    return ngram_counts
Example 4: check_svc_bef_aft
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after a bus service
    # check the freq of words before and after a word (number) that is not a bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if command == 'before_svc':
                        if k > 0:  # bus svc doesn't appear at the first position of the sentence
                            text = text + split_first[k - 1].lower() + ' '  # take the word before
                            print i, k, split_first[k]
            if command == 'after_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if command == 'after_svc':
                        if k < len(split_second) - 1:
                            text = text + split_first[k + 1].lower() + ' '  # take the word after
            if command == 'before_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k > 0:  # the number doesn't appear at the first position of the sentence
                        text = text + split_first[k - 1].lower() + ' '
            if command == 'after_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k < len(split_second) - 1:  # the number doesn't appear at the last position of the sentence
                        text = text + split_first[k + 1].lower() + ' '

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
Example 5: get_vocab
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def get_vocab(series, addtional_tokens=[], top=None):
    """
    Extract the vocabulary from an array of sentences, optionally adding extra
    tokens to the vocabulary and keeping only the top n most frequent words.
    :param series: array of sentences
    :param addtional_tokens: additional tokens to include in the vocabulary
    :param top: number of most frequent words to include in the vocabulary
    :return: a map from each word to its numeric id, and the reverse map
    """
    # Note: binding rev_vocab to the mutable default argument means repeated calls
    # without an explicit addtional_tokens share and grow the same list.
    rev_vocab = addtional_tokens
    freq_vocab = FreqDist()
    for s in tqdm(series):
        freq_vocab.update(word_tokenize(decontracted(s)))
    print("Original vocab size %s" % len(freq_vocab))

    all_words_sorted = sorted(freq_vocab, key=freq_vocab.get, reverse=True)
    top_words = all_words_sorted[:top]

    rev_vocab += top_words
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
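A hypothetical call for Example 5. The sentences and special tokens are made up; decontracted is a text-cleaning helper from the original project that is not shown here and is assumed to simply expand contractions.

sentences = ["I can't stop reading", "reading is fun"]
vocab, rev_vocab = get_vocab(sentences, addtional_tokens=['<PAD>', '<UNK>'], top=50)
print(vocab['<PAD>'], vocab['reading'])  # numeric ids assigned to the tokens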
Example 6: term_freq_all
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def term_freq_all(path, name):
    file = path + '/' + name
    fdist = FreqDist()
    list_line = []
    with open(file) as f:
        for line in f:
            split_line = line.split('\t')
            words = nltk.word_tokenize(split_line[1].decode('utf-8').lower().strip())
            fdist.update(words)
            print split_line[0]

            # list_stem = []
            # for token in words:
            #     # st = LancasterStemmer()
            #     # try:
            #     #     list_stem.append(st.stem(token).decode('utf-8'))
            #     # except:
            #     #     print (split_line[0])
            #
            #     st = PorterStemmer()
            #     try:
            #         list_stem.append(st.stem(token).decode('utf-8'))
            #     except:
            #         print (split_line[0])
            # fdist.update(list_stem)
            # print (line)

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))

    stop = stopwords.words('english')
    for value in fdist.most_common(15000):
        # if (value[0] not in stop and (len(value[0]) >= 4)):
        if (value[0] not in stop):
            print (str(value[0].encode('utf-8')) + '\t' + str(value[1]))
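A hypothetical call for Example 6; the directory and file name are invented. The function expects each line of the input file to contain at least two tab-separated fields, with an identifier first and the text second.

# Each input line is expected to look like: "<post_id>\t<text>..."
term_freq_all('/path/to/forum_dump', 'posts.tsv')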
Example 7: token_aft
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def token_aft(list_line, command):
    # check the token after a label; the label belongs to the command ('svc', 'road', 'busstop')
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of labels for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of the token after a bus service
                if int(split_second[k]) == 1:  # means bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position of the sentence
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k + 1].lower())  # take the token after the label
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '
                        # if stem_word == 'sd' or stem_word == 'dd':
                        #     print list_line[i]

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        # print value[0], '\t', value[1]
        print value[0]
    print text
Example 8: term_freq_time
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def term_freq_time(first, last):
    ## get posts from the sgforums dump created within the given time window
    db = MySQLdb.connect(host="localhost",    # your host, usually localhost
                         user="root",         # your username
                         passwd="ducthong",   # your password
                         db="sgforums_singaporebuses")  # name of the database

    # you must create a Cursor object. It will let
    # you execute all the queries you need
    cur = db.cursor()

    # Use all the SQL you like
    sql = "select p.post_id, s.createdAtSecond, p.summary from posts_filter p, posts_createatsecond s where p.post_id = s.post_id and s.createdAtSecond >= " \
          + str(first) + " and s.createdAtSecond <= " + str(last) + " order by s.createdAtSecond;"
    cur.execute(sql)  # run the query against the posts tables

    fdist = FreqDist()
    for row in cur.fetchall():
        post_id = str(row[0])
        createdAtSecond = str(row[1])
        summary = unicode(str(row[2]), errors='ignore')
        # print (post_id + '\t' + createdAtSecond + '\t' + summary)
        words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # try:
        #     words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # except:
        #     print (post_id + '\t' + summary)
        fdist.update(words)
    cur.close()

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))

    stop = stopwords.words('english')
    for value in fdist.most_common(200):
        if (value[0] not in stop and len(value[0]) >= 3):
            print (str(value[0]).encode('utf-8') + '\t' + str(value[1]))
Example 9: check_bef_aft_roadBusStop
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def check_bef_aft_roadBusStop(list_line, command):
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break
            if command == 'bef_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'aft_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'bef_busstop':
                try:
                    if int(split_second[k]) == 3:  # take busstop
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1
            if command == 'aft_busstop':
                try:
                    if int(split_second[k]) == 3:  # take busstop
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1
                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
Example 10: ngrams
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import FreqDist

sentence = 'this is a foo bar sentences and i want to ngramize it this this'

# n = 3
# list_grams = ngrams(sentence.split(), n)
#
# for grams in list_grams:
#     string = ''
#     for value in grams:
#         string = string + ' ' + value
#     print (string.strip())

fdist = FreqDist()
tokens = word_tokenize(str(sentence))
fdist.update(tokens)
for value in fdist.most_common():
    print value

i = 11
for i in range(0, 10):
    i = i + 2
print 'testing'

text = 'Mount Batten Rd Haig Rd Sims Ave'
split_text = text.split('Rd')
for value in split_text:
    print value
Example 11: __init__
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
class BiWordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()

    """
    Processes statuses. (For information on how the different data structures
    are set up, look at the comments for the getters.)
    """
    def wordprocess(self):
        lengths = dict()
        row = 0
        for status in self._statuses[1:]:
            row += 1
            print row
            user = status[0]
            filtered_status = status[1].translate(string.maketrans("", ""), string.punctuation)
            tokens = pattern_split.split(filtered_status.lower())
            filtered_tokens = [w for w in tokens if w not in stopwordslist and w not in filterlist]
            bitokens = nltk.bigrams(filtered_tokens)
            if status[5] == '+':
                self.fdistneuro.update(bitokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(bitokens)

    def neuro_word_frequency(self):
        vocneuro = self.fdistneuro.keys()
        highvocneuro = vocneuro[:300]
        return highvocneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            if self.fdistneuro[w] >= 5:
                self.highneuro[w] = self.fdistneuro[w]
        print self.highneuro.items()
        print self.highneuro.keys()
        return self.highneuro.keys()
Example 12: token_bef
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
def token_bef(list_line, command):
    # check the token before a label; the label belongs to the command ('svc', 'road', 'busstop')
    port = PorterStemmer()
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0
        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of labels for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of the token before a bus service
                if int(split_second[k]) == 1:  # means bus svc
                    if k > 0:  # bus svc doesn't appear at the first position of the sentence
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k - 1].lower())
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '

        elif command == 'road':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 2:  # means road
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before
                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 2:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

        elif command == 'busstop':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 3:  # means bus stop
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before
                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 3:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
        # print value[0]
    print text
Example 13: while
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
import sys
from nltk import FreqDist

# Assumed setup (not shown in the original excerpt): a read buffer size and an
# empty frequency distribution that collects the vocabulary counts.
BUF_SIZE = 2 ** 20
vocab = FreqDist()

in_str = sys.stdin.read(BUF_SIZE)
rest = ''
read_count = 0

while (rest + in_str).strip() != '':
    read_count += 1
    if read_count % 100 == 0:
        sys.stderr.write('.')
        sys.stderr.flush()

    tokens = (rest + in_str).split()
    rest = tokens.pop()
    if not tokens:
        # note: rest is a plain string here, so update() counts its characters;
        # vocab.update([rest]) was probably intended
        vocab.update(rest)
        break
    else:
        vocab.update(tokens)
    in_str = sys.stdin.read(BUF_SIZE)

print

for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
    if i > len(vocab.values()):
        break
    # relies on the old NLTK 2 FreqDist, whose values() are ordered by decreasing count
    print "vocab size %7d - cutoff = %d" % (i, vocab.values()[i])
Example 14: FreqDist
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
all_words = FreqDist(w.lower() for w in train_set_words).keys()

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(item[0])
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score
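The excerpt ends once word_scores is filled; a possible continuation (not part of the original code) mirrors Example 1 and keeps only the highest-scoring words as the feature vocabulary. The cut-off of 1000 is an arbitrary choice for illustration.

best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:1000]
best_words = set(w for w, s in best)  # hypothetical 1000-word feature set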
Example 15: parse_ngram_order
# Required import: from nltk import FreqDist [as alias]
# Alternatively: from nltk.FreqDist import update [as alias]
if word_limit:
    logging.info('Word limit %d' % word_limit)

order = parse_ngram_order(opts.ngram_order)
logging.info('Char n-gram order (%d, %d)' % order)
cutoff = opts.min_count

corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

tf = FreqDist()
df = FreqDist()
n_docs = 0
for text in corpus:
    n_docs += 1
    tf.update(text)        # term frequencies
    df.update(set(text))   # document frequencies

print "###TOTAL###\t%d\t%d" % (tf.N(), n_docs)

# relies on the old NLTK FreqDist, whose items() are ordered by decreasing count
for token, freq in tf.items():
    if freq < cutoff:
        break
    print "%s\t%d\t%d\t%.6f" % (token, freq, df[token], math.log(float(n_docs)/df[token]))