This article collects typical usage examples of the Python method nltk.probability.FreqDist.freq. If you are wondering what FreqDist.freq does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.
The following presents 15 code examples of the FreqDist.freq method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
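Before diving into the examples, here is a minimal sketch of what FreqDist.freq itself computes: for a sample x, fd.freq(x) returns the relative frequency fd[x] / fd.N(), i.e. the count of x divided by the total number of observed outcomes, and 0.0 for unseen samples. The tiny corpus below is made up purely for illustration.

from nltk.probability import FreqDist

fd = FreqDist(['a', 'b', 'a', 'c'])  # toy data: 4 outcomes, 3 distinct samples
print(fd['a'])       # absolute count of 'a': 2
print(fd.N())        # total number of outcomes: 4
print(fd.freq('a'))  # relative frequency: 2 / 4 = 0.5
print(fd.freq('z'))  # unseen sample: 0.0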
Example 1: statsText
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def statsText(text, words):
    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1
    # Loop over the words in fdist and see if you can find them among the keys of the words list. Since some words
    # in the words list also carry a wildcard * at the end to denote anything after the initial word, we use a regex
    # to match those rather than matching on equality; e.g. wrong* will match wrong, wrongful, wrongfully, wronged etc.
    frequencies = []
    for word in words:
        if '*' in word:  # if the word has a *, we need to compare it with each item in fdist...
            wordRegEx = word.replace('*', '.*')  # make it suitable for a regular expression...
            for k in fdist:
                m = re.match(wordRegEx, k)
                if m:
                    frequencies.append((word, fdist.freq(m.group())))
        else:
            frequencies.append((word, fdist.freq(word)))
    return frequencies
Example 2: text_to_vector
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def text_to_vector(docs, metric):
    """Create a frequency-based feature vector from text.
    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # distribution over how many documents each word appears in
    tf_dists = []  # list of TF distributions, one per document
    # Create a frequency distribution for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)
    # Build the feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
Example 3: text_to_dict
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def text_to_dict(docs, metric):
    """Create dictionaries of term frequencies based on documents.
    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # distribution over how many documents each word appears in
    tf_dists = []  # list of TF distributions, one per document
    # Create a frequency distribution for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    num_docs = len(docs)
    # Build the dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print(' dict %d/%d' % (i, len(tf_dists)))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd:
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd:
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
Example 4: fun14
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def fun14():
    """counting other things"""
    # print([len(w) for w in text1])
    fdist1 = FreqDist([len(w) for w in text1])
    # print(fdist1.keys())
    # print(fdist1.items())
    # count of words of length 3 => 50223
    print(fdist1[3])
    # the most common word length
    print(fdist1.max())
    # relative frequency of length 3, roughly 20%
    print(fdist1.freq(3))
Example 5: get_best_answers
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def get_best_answers(self, passage_list, q):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tAnswer Processing", q.id_q)
    empty = passage_list == []
    logger.info("%s:\t\tAnswer Extraction", q.id_q)
    answer_list = []
    for passage in passage_list:
        a = passage.find_answer(q)
        if a.is_successful():
            answer_list.append(a)
    if not answer_list:
        return ([], empty)
    logger.info("%s:\t\tAnswer Filtering", q.id_q)
    # Obtain answer frequency
    fd = FreqDist(answer_list)
    # Normalize frequencies
    normalize = fd.freq(fd.max())
    # Modify scores by frequency
    for answer in answer_list:
        answer.score = int(answer.score * (fd.freq(answer) / normalize))
    # Sort answers by score
    answer_list.sort(key=lambda x: x.score, reverse=True)
    # Filter bad answers
    try:
        threshold = int(MyConfig.get("answer_filtering", "threshold"))
    except:
        logger = logging.getLogger("qa_logger")
        logger.error("answer quality threshold not found")
        threshold = 50
    answer_list = filter(lambda x: x.score > threshold, answer_list)
    final_answers = []
    for a in answer_list:
        if a not in final_answers:
            final_answers.append(a)
        if len(final_answers) == 3:
            break
    return (final_answers, empty)
Example 6: zipfity
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def zipfity(lst):
    unigram = FreqDist()
    for sent in lst:
        for word in sent:
            unigram[word.lower()] += 1  # the task didn't specify case handling, so everything is lowercased
    sorted_unigram = sorted(unigram, key=unigram.get, reverse=True)
    top10 = sorted_unigram[:10]
    most_freq = unigram.freq(top10[0])
    count = 1
    print('{0:7s}{1:10s}{2:10s}'.format('word', 'obs.freq(%) ', 'zipf-law(%)'))
    print('----------------------------')
    for word in top10:
        print('{0:7s}{1:10.2f}{2:10.2f}'.format(word, unigram.freq(word) * 100, (most_freq / count) * 100))
        count += 1
Example 7: statsText
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def statsText(text, words):
    fdist = FreqDist()
    # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1
    return [(k, fdist.freq(k)) for k in words]
Example 8: generate_weight_dictionary
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def generate_weight_dictionary(self, service, words):
    df = open(self.dictionary.get_dict_service_file_name(service), "w+")
    t = Text(words)
    freq_dist = FreqDist(t)
    for w in freq_dist:
        weight = 100 * freq_dist.freq(w)
        df.write(w + helper.results_field_separator + str(weight) + "\n")
    df.close()
Example 9: main
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str, help='file to produce frequency distribution for')
    args = argparser.parse_args()
    # toker = WhitespaceTokenizer()
    f = open(args.file)
    text = f.read()
    print(text)
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
Example 10: freq_lema_ngrams
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def freq_lema_ngrams(list_monograms, list_lemas):
    fdist1 = FreqDist(list_monograms)
    # fdist2 = FreqDist(list_lemas)
    vocabulary1 = fdist1.keys()  # distinct values
    frec_grams = []
    for tag in vocabulary1:
        temp1 = []
        for i in range(len(list_monograms)):
            if list_monograms[i] == tag:
                temp1.append(list_lemas[i])
        temp2 = set(temp1)
        frec_grams.append([tag, fdist1[tag], fdist1.freq(tag), '-'.join(temp2)])
    frec_grams_sort = sorted(frec_grams, key=itemgetter(1), reverse=True)
    return frec_grams_sort
Example 11: _entity_ranking
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def _entity_ranking(self, entities):
    if len(entities) == 0:
        return "", "", int(0)
    # Obtain frequency of entities
    entities_freq = FreqDist(entities)
    # Our answer is the sample with the greatest number of outcomes
    exact = entities_freq.max()
    # Our window is empty because this algorithm generates exact answers
    window = ""
    # Our score is the entity frequency
    score = int(entities_freq.freq(exact) * 1000)
    return exact, window, score
Example 12: create_enhanced_dale_chall_list
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def create_enhanced_dale_chall_list(self):
    # list of sites used to create the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia', 'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress', 'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney', 'BBC', 'Livejasmin', 'Craigslist', 'Ask']
    # bring all privacy texts into one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])
    # get the words of this corpus into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))
    # open the Dale-Chall word list
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')
    # build a corpus from the words of the 20 privacy policies, dropping all words on the Dale-Chall list of easy words
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())
    # create a frequency distribution of the remaining words
    fdist = FreqDist(new_corpus)
    # plot it
    fdist.plot(80, cumulative=True)
    # collect the words that make up 33% of the words not on the Dale-Chall list (cumulative), most frequent first
    most_frequ = []
    cum_percentage = 0.0
    for sample, _ in fdist.most_common():
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break
    # write those into a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
Example 13: next
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def next(self, s, method=MOST_LIKELY):
    # Pick a transition leaving state s and return a state that would
    # likely follow. The next state is chosen according to the method
    # specified. The default is to choose and return the most likely
    # transition state.
    # determine all states adjacent to s
    transitions = self._adjacentVertices[s]
    freqDist = FreqDist()
    # determine the weights of the edges between state s and all adjacent states
    for state in transitions:
        freqDist[state] += 1
    if method == MarkovChain.MOST_LIKELY:
        return freqDist.max()
    elif method == MarkovChain.LEAST_LIKELY:
        # NLTK provides no built-in method to return the minimum of a
        # frequency distribution, so we take the last entry of the samples
        # sorted in decreasing order of frequency.
        return freqDist.most_common()[-1][0]
    else:
        # choose a real number between 0 and 1
        x = uniform(0, 1)
        # choose the next state based on the weights of the edges; randomness plays a part here
        for i in range(len(transitions)):
            probability = freqDist.freq(transitions[i])
            if x < probability:
                return transitions[i]
            x = x - probability
        raise RuntimeError("Error in MarkovChain.next(). Did not find next state.")
Example 14: get_content_avg_entropy
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
def get_content_avg_entropy(self):
    '''
    :return: avg entropy of text/<mime> parts for multipart bodies
    '''
    n = 0
    txt_avg_ent = INIT_SCORE
    # todo: make n-grams
    tokens_list = tuple(self.pattern.get_stemmed_tokens())
    # logger.debug(tokens_list)
    for tokens in tokens_list:
        # logger.debug(tokens)
        n += 1
        freqdist = FreqDist(tokens)
        probs = [freqdist.freq(l) for l in freqdist]
        txt_avg_ent += -sum([p * math.log(p, 2) for p in probs])
    # logger.debug(n)
    if n != 0:
        txt_avg_ent = txt_avg_ent / n
    return txt_avg_ent
Example 15: listdir
# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import freq [as alias]
unigrams_path = reu_path + unigramsFrom
# count word length frequencies
for f in listdir(samples_path):
    if isfile(join(samples_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "w")
        thisfile = open(samples_path + f).read()
        tokens = tokenize(thisfile)
        fd_words = FreqDist([len(w) for w in tokens])
        for a in range(1, 21):
            output.write(str(a) + '\t' + str(fd_words.freq(a)) + '\n')
        count_20 = 0
        # count words of length 20+
        for w in tokens:
            if len(w) >= 20:
                count_20 += 1
        output.write("20+\t" + str(count_20 / len(fd_words)) + '\n')
# count POS tag frequencies
for f in listdir(unigrams_path):
    if isfile(join(unigrams_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "a")
        thisfile = open(unigrams_path + f).read()
        tokens = tokenize(thisfile)