This article collects typical usage examples of the Python method nltk.probability.FreqDist.update. If you are wondering how to call FreqDist.update, what it does, or what real code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples for the class that defines the method, nltk.probability.FreqDist.
A total of 15 code examples of FreqDist.update are shown below, sorted by popularity by default.
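Before the examples, here is a minimal sketch of the method itself (my own illustration, not taken from the examples below). In NLTK 3, FreqDist subclasses collections.Counter, so update accepts either an iterable of samples or another FreqDist/mapping and adds the counts in place:

from nltk.probability import FreqDist

fd = FreqDist("the cat sat on the mat".split())
fd.update(["the", "dog"])             # iterable: each sample's count is incremented by one
fd.update(FreqDist(["cat", "cat"]))   # mapping/FreqDist: counts are added
print(fd["the"], fd["cat"])           # -> 3 3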
Example 1: buildCategoryDictionary
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
Example 2: high_words
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
"""
Example 3: text_to_vector
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def text_to_vector(docs, metric):
    """Create a frequency-based feature vector from text.

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []           # List of TF distributions per document.

    # Create a freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)   # FreqDist.inc() is the NLTK 2 API (NLTK 3: fd[word] += 1)
        doc_freqs.update(fd.samples())  # FreqDist.samples() is the NLTK 2 API (NLTK 3: fd.keys())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such metric: %s" % metric)
        matrix[:,i] = v
    return matrix
Example 4: evaluate_html
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist

    logging.info("\tEvaluating HTML")

    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
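A rough usage sketch for the example above (my own, not from the original page): judging by the keys accessed in evaluate_html, html_conf is a plain dict whose 'title' and 'h1' values act as repetition weights for terms found in those tags. The variable names below are hypothetical.

html_conf = {
    'usehtml': True,  # keys inferred from evaluate_html above
    'title': 3,       # count <title> terms three times
    'h1': 2,          # count <h1> terms twice
}
fdist = evaluate_html(page_source, html_conf)  # page_source: a raw HTML string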
Example 5: text_to_dict
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def text_to_dict(docs, metric):
    """Create dictionaries of term frequencies based on documents.

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []           # List of TF distributions per document.

    # Create a freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)   # NLTK 2 API (NLTK 3: fd[word] += 1)
        doc_freqs.update(fd.samples())  # NLTK 2 API (NLTK 3: fd.keys())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print ' dict', str(i)+'/'+str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such metric: %s" % metric)
        dicts.append(d)
    return dicts
Example 6: reduce_text
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])
    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))
    return words, bigrams
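reduce_text is written as a pairwise reducer over (words, bigrams) tuples, so a plausible way to drive it (an assumption on my part; the original page does not show the call site) is with functools.reduce or the reduce step of a map-reduce framework:

from functools import reduce
from nltk.probability import FreqDist

# Hypothetical per-document partial results: (list of words, list of bigram tuples)
partials = [
    (["nltk", "freqdist"], [("nltk", "freqdist")]),
    (["nltk", "update"], [("nltk", "update")]),
]
total_words, total_bigrams = reduce(reduce_text, partials)
print(total_words.most_common(2))  # e.g. [('nltk', 2), ('freqdist', 1)]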
Example 7: analyze
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def analyze(data, out_dir):
    summary = {}
    freq = FreqDist()
    sentence_length = defaultdict(list)
    year_freq_dist = defaultdict(FreqDist)
    year_dist = defaultdict(int)
    year_month_dist = defaultdict(int)
    year_quarter_dist = defaultdict(int)
    has_date = no_date = sentences = words = 0

    for year, date_str, title, text in data:
        date = parsedate(date_str)
        logger.debug('%s -> %s' % (date_str, str(date)))
        freq.update(ngram_phrases(text, 3))
        if date:
            # strftime can't handle years before 1900, so use isoformat instead
            year_str = date.isoformat()[:4]
            year_mo_str = date.isoformat()[:7]
            has_date += 1
        else:
            no_date += 1
            year_str = ''
            year_mo_str = ''
        if year_str:
            year_range = get_year_range(year_str)
            sentence_length[year_range].extend(sentence_lengths(text))
            year_freq_dist[year_range].update(ngram_phrases(text, 3))
        year_dist[year] += 1
        if year_mo_str:
            year_month_dist[year_mo_str] += 1
            year_quarter_dist[year_quarter(year_mo_str)] += 1
        sentences += count_sentences(text)
        words += count_words(text)

    logger.debug('Documents with a valid date: %d  Documents without a valid date: %d' % (has_date, no_date))
    logger.debug('Total # Sentences: %d' % sentences)
    logger.debug('Total # Words: %d' % words)

    generate_dict_csv(['year', 'cnt'], year_dist, os.path.join(out_dir, 'year-data.csv'))
    generate_dict_csv(['yearmo', 'cnt'], year_month_dist, os.path.join(out_dir, 'year-mo-data.csv'))
    generate_dict_csv(['yearq', 'cnt'], year_quarter_dist, os.path.join(out_dir, 'year-quarter-data.csv'))
    generate_stream_js(year_freq_dist, os.path.join(out_dir, 'stream-data.json'))
    generate_cloud_csv(year_freq_dist, os.path.join(out_dir, 'year-phrase-data.csv'))
    generate_sentence_length_csv(sentence_length, os.path.join(out_dir, 'data-sentence-lengths.csv'))
Example 8: updateCategoryDictionary
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()
    tmpDict = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except:
        newDict = buildCategoryDictionary(category)
        return newDict
    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
Example 9: cnc
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """Given a list of phrases, run the C-value algorithm and return a dictionary of (phrase, C-value) pairs."""
    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))  # NLTK 2 API (NLTK 3: fd[tuple(phrase)] += 1)

    # phrase -> C-value(phrase)
    phrase_scores = {}
    # phrase -> num occurrences(phrase)
    phrase_frequencies = FreqDist()
    # phrase -> (t(phrase), c(phrase))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(), key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1
    return phrase_scores, phrase_frequencies
Example 10: reduce_tweets
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])
    words = FreqDist(t1[1])
    words.update(t2[1])
    places = FreqDist(t1[2])
    places.update(t2[2])
    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])
    return tags, words, places, bigrams
Example 11: word_count
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def word_count(drug=None, limit=None, pos_filter=False, lemma=True):
    """Scans comment texts (from drug_mentions.texts) for the selected drug
    and calculates the most common words.

    KWARGS:
        drug: string or None.
            Drug selector. Allows three cases:
            * None: scrape all comments in the database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
              a drug, not referencing a specific drug.
            * [drug name]: comments referencing a specific drug.
            Default None. Passed to drug_mentions.texts.
        limit: int or None.
            Optional limit on SQL queries retrieved by drug_mentions.texts.
            Defaults to None (returns all hits).
        pos_filter: boolean.
            Passed to tokenize(); set True to use part-of-speech filtering.
        lemma: boolean.
            Passed to tokenize(); set True to use lemmatization.

    RETURNS:
        freq: nltk.probability.FreqDist object.
            Frequency distribution of words from comments.

    RAISES:
        ValueError: for an invalid drug name.
    """
    try:
        texts = dm.texts(drug=drug, limit=limit)
    except ValueError:
        raise ValueError('Invalid drug name.')

    freq = FreqDist()
    for text in texts:
        freq.update(tokenize(text, drug, pos_filter=pos_filter, lemma=lemma))
    return freq
Example 12: buildGoogleUnigram
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def buildGoogleUnigram():
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"
    unigramFiles = os.listdir(DirPrefix)
    unigramFiles = list(map(lambda _fileName: DirPrefix + _fileName, unigramFiles))
    masterUnigram = FreqDist()

    with multiprocessing.Pool(8, initializer=initProcess) as ProcessPool:
        resAsync = ProcessPool.map_async(_buildUnigram, unigramFiles)
        results = resAsync.get()
        ProcessPool.join()

    print("all jobs finished, building master unigram")
    for freqdist in results:
        masterUnigram.update(freqdist)

    with open("PickledData/GoogleUnigram.pickle", 'wb') as pklFile:
        pickle.dump(masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL)
    return
Example 13: AddAlphaBigramModel
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
class AddAlphaBigramModel():
    def __init__(self, alpha=0.1):
        self.vocabulary = set()
        self.V = 0
        self.bigrams = ConditionalFreqDist([])
        self.unigrams = FreqDist([])
        self.alpha = alpha

    def train(self):
        self.vocabulary = set()
        this_bigrams = []
        self.unigrams = FreqDist([])
        for fileid in gutenberg.fileids():
            for sentence in gutenberg.sents(fileid):
                words = ["<s>"] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>"]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)
        self.bigrams = ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)

    def bigram_prob(self, w1, w2):
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def unigram_prob(self, w):
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def __contains__(self, w):
        return w in self.vocabulary
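A quick usage sketch (mine, not part of the original example; it assumes the NLTK Gutenberg corpus has been downloaded and that wordRE is the word-matching regex the class expects):

import math, re
from nltk import bigrams
from nltk.corpus import gutenberg
from nltk.probability import FreqDist, ConditionalFreqDist

wordRE = re.compile(r"\w")  # hypothetical stand-in for the regex assumed by train()

model = AddAlphaBigramModel(alpha=0.1)
model.train()                          # counts unigrams/bigrams over the Gutenberg corpus
print(model.bigram_prob("of", "the"))  # log P("the" | "of") with add-alpha smoothing
print("whale" in model)                # vocabulary lookup via __contains__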
Example 14: process_documents
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
def process_documents(path, html_conf):
    logging.info("Using documents from \"" + path + "\" directory")
    if path[-1] != "/":
        path += "/"

    documents = {}
    allterms = {}
    listing = os.listdir(path)
    allfreq = FreqDist()

    # retrieving document content - discarding structure
    logging.info("Processing files...")
    for infile in listing:
        logging.info("\tReading document " + infile)
        raw_doc = open(path + infile, 'r').read()
        nonhtml_doc = nltk.clean_html(raw_doc)  # NLTK 2 API; removed in NLTK 3
        word_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', raw_doc))
        terms_list = [x.lower() for x in word_list if x.lower() not in stopwords.words('english')]
        stemmes = steming(terms_list)
        for stem in stemmes:
            allterms[stem] = 0
        fdist = FreqDist(word.lower() for word in stemmes)
        allfreq.update(word.lower() for word in stemmes)
        htmldist = evaluate_html(raw_doc.lower(), html_conf)
        fdist.update(htmldist)
        allfreq.update(htmldist)
        documents[infile] = {'docname': infile, 'terms': stemmes, 'tf': fdist, 'tfidf': None}

    for key, doc in documents.iteritems():
        doctfidf = compute_tfidf(doc, documents)
        documents[key]['tfidf'] = dict(allterms.items() + doctfidf.items())

    return documents, allfreq
Example 15: EditDistanceFinder
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import update [as alias]
class EditDistanceFinder():
    def __init__(self):
        self.char_probs = ConditionalProbDist([], MLEProbDist)
        self.bichar_freqs = ConditionalFreqDist([])
        self.transp_freqs = FreqDist()
        self.DOWN, self.LEFT, self.DIAG, self.DOUBLE_DIAG = range(4)
        self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4)

    def train(self, fname):
        misspellings = []
        for line in open(fname):
            line = line.strip()
            if not line: continue
            w1, w2 = line.split(",")
            misspellings.append((w1.strip(), w2.strip()))
        last_alignments = None
        done = False
        while not done:
            print("Iteration")
            alignments, bigrams = self.train_alignments(misspellings)
            self.train_costs(alignments, bigrams)
            done = (alignments == last_alignments)
            last_alignments = alignments

    def train_alignments(self, misspellings):
        alignments = []
        self.bichar_freqs = FreqDist()
        for error, corrected in misspellings:
            distance, this_alignments = self.align(corrected, error)
            alignments += this_alignments
            bigrams = [corrected[i:i+2] for i in range(len(corrected)-1)]
            self.bichar_freqs.update(bigrams)
        return alignments, bigrams

    def train_costs(self, alignments, bigrams):
        add_one_aligns = [(a, b) for a in string.ascii_lowercase for b in string.ascii_lowercase]
        single_aligns = [(a, b) for a, b in alignments if len(a) < 2]
        char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns)
        self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist)
        double_aligns = [a for a, b in alignments if len(a) >= 2]
        self.transp_freqs = FreqDist(double_aligns)

    def align(self, w1, w2, verbose=False):
        M = len(w1) + 1
        N = len(w2) + 1
        table = numpy.zeros((M, N))
        backtrace = numpy.zeros((M, N))
        for i in range(1, M):
            w1_char = w1[i-1]
            table[i, 0] = table[i-1, 0] + self.del_cost(w1_char)
            backtrace[i, 0] = self.DOWN
        for j in range(1, N):
            w2_char = w2[j-1]
            backtrace[0, j] = self.LEFT
            table[0, j] = table[0, j-1] + self.ins_cost(w2_char)
        for i in range(1, M):
            w1_char = w1[i-1]
            for j in range(1, N):
                w2_char = w2[j-1]
                this_del = table[i-1, j] + self.del_cost(w1_char)
                this_ins = table[i, j-1] + self.ins_cost(w2_char)
                this_sub = table[i-1, j-1] + self.sub_cost(w1_char, w2_char)
                if j > 1 and i > 1 and w1[i-1] == w2[j-2] and w1[i-2] == w2[j-1] and w1[i-1] != w1[i-2]:
                    this_transp = table[i-2, j-2] + self.transp_cost(w1_char, w2_char)
                else:
                    this_transp = 999999
                min_cost = min(this_del, this_ins, this_sub, this_transp)
                table[i, j] = min_cost
                if this_sub == min_cost:
                    backtrace[i, j] = self.DIAG
                elif this_transp == min_cost:
                    backtrace[i, j] = self.DOUBLE_DIAG
                elif this_ins == min_cost:
                    backtrace[i, j] = self.LEFT
                else:  # delete
                    backtrace[i, j] = self.DOWN
        alignments = []
        i = M - 1
        j = N - 1
        while (j or i):
            this_backtrace = backtrace[i, j]
            if this_backtrace == self.DIAG:  # substitution
                alignments.append((w1[i-1], w2[j-1]))
                i -= 1
                j -= 1
            elif this_backtrace == self.DOUBLE_DIAG:
                alignments.append((w1[i-2:i], w2[j-2:j]))
# ......... remainder of this example omitted in the original .........