This article collects typical usage examples of the Python nltk.FreqDist class. If you have been wondering what FreqDist is for, how to use it, or what working FreqDist code looks like, the curated class examples below should help.
The following presents 15 code examples of the FreqDist class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
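Before the project-specific examples, here is a minimal, self-contained sketch of the core FreqDist API under NLTK 3 (the sample text and variable names are ours, not taken from any example below). Note that several of the examples that follow were written against Python 2 and the older NLTK 2 API, where FreqDist.keys()/items() came back sorted by decreasing frequency and counts were updated with fd.inc(word); NLTK 3 uses most_common() and fd[word] += 1 instead.

from nltk import FreqDist

# Made-up sample tokens (illustration only); any iterable of hashable items works.
tokens = "the quick brown fox jumps over the lazy dog the fox".split()

fdist = FreqDist(tokens)        # count token frequencies
print(fdist.most_common(3))     # e.g. [('the', 3), ('fox', 2), ('quick', 1)]
print(fdist['fox'])             # count of a single token -> 2
print(fdist.N())                # total number of tokens counted -> 11
print(fdist.B())                # number of distinct tokens (bins) -> 8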
Example 1: top_words_from_corpus
def top_words_from_corpus(self, num_words, test_name):
    corpus_tokens = []
    for i in self.corpus_vars["corpus_member_ids"]:
        title = 'document_' + str(i)
        doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
        corpus_tokens += doc_tokens
    top_words = []
    fdist_corpus = FreqDist(corpus_tokens)
    # NLTK 2 behaviour: items() is sorted by decreasing frequency (NLTK 3 would use most_common())
    fdist_list = fdist_corpus.items()
    if test_name == "Function Word PCA":
        function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                # keep only function words, then stop once num_words survive the filter
                tagged_top = nltk.pos_tag(top_words)
                for j, k in tagged_top:
                    if k not in function_pos:
                        top_words.remove(j)
                if len(top_words) == num_words:
                    break
    elif test_name == "Burrows's Delta":
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                break
    return top_words
Example 2: get_hosts
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    # Your code here
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2:
                    if pair[0] != ' ' and pair[1] != ' ':
                        pairs.append((pair[0].lower().replace('\'', '\"').strip(' '), pair[1].lower().replace('\'', '\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    if len(pairs_freq.most_common(10)[0][0][0].split(' ')) < 2:
        hosts.append(pairs_freq.most_common(10)[1][0][0])
        hosts.append(pairs_freq.most_common(10)[1][0][1])
    else:
        hosts.append(pairs_freq.most_common(10)[0][0][0])
        hosts.append(pairs_freq.most_common(10)[0][0][1])
    return hosts
Example 3: make_cutOff
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets; bottomCutOff
    and topCutOff are integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []
    for item in fd.items()[:topCutOff]:
        # append most common words
        thrownOut.append(item)
    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # append good words
            newVocab.append(item[0])
        else:
            # append uncommon words
            thrownOut.append(item)
    print 'Cutoffs made...'
    return newVocab, thrownOut
Example 4: main
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Natural Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Example 5: get_word_features
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
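A note on the helper above: it returns FreqDist.keys(), which under NLTK 2 came back sorted by decreasing frequency, while under NLTK 3 it does not. A hedged NLTK 3 sketch (not from the original project) that preserves the frequency ordering:

from nltk import FreqDist

def get_word_features(wordlist):
    # most_common() yields (word, count) pairs sorted by decreasing frequency
    fdist = FreqDist(wordlist)
    return [word for word, count in fdist.most_common()]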
Example 6: __init__
def __init__(self, num_topics, alpha_topic=1.0, alpha_word=1.0,
             max_tables=50000, sanity_check=False, initialize=False,
             report_filename="topic_history.txt"):
    self.max_tables = max_tables
    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID.
    self.initialize_index()
    self._words = FreqDist()
    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word
    self._num_updates = 0
    self._report = None
    if report_filename:
        self._report = open(report_filename, 'w')
    self.num_topics = num_topics
    self._topics = [FreqDist() for x in xrange(num_topics)]
    # the sanity_check flag is for testing only.
    if initialize and sanity_check == True:
        self.deterministic_seed()
    elif initialize:
        self.initialize_topics()
Example 7: __init__
class Index:
    """
    The Index class stores an index for a document.
    """

    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist == None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)  # NLTK 2 API; NLTK 3 would use self._freq_dist[term] += 1

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist == None:
            self.index()
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index()
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index()
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
Example 8: create_word_freq
def create_word_freq(db):
    db = getattr(db, "Posts")
    #client.command("CREATE CLASS concepted EXTENDS E")
    client.command("DELETE EDGE concepted")
    #client.command('create property frequency.freq string')
    #client.command("DELETE VERTEX frequency")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    c = [c.name for c in concept]
    for d in data:
        if not 'Body' in d:
            display = ''
        else:
            display = cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        i = fdist.most_common()
        for k in i:
            if k[0].lower() in c:
                try:
                    client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(), d['_id'], k[1]))
                except:
                    continue
Example 9: process
def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.
    Splits text into sentences. FreqDist is incremented from tokenization.
    Using PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals are also about domain names. Not intending to split it up
    :rtype : FreqDist, list() of str
    :param f: Input file with a deal per line
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in set(stopwords.words('english')) and word not in set(string.punctuation):
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
Example 10: process_tweets
def process_tweets(hashtag, addl_stops=[]):
    count = 0
    good_count = 0
    words_to_plot = []
    # Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname, 'r') as data_file:
                data = data_file.read()
                # Parse raw string since json.load() approach wasn't working
                data = data.split("\n\x00,")
                for tweet in data:
                    count += 1
                    # Tweets have a well-defined structure, so we can parse them
                    # manually (even though the JSON approach would be cleaner)
                    text = tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                    # Skip tweets that contain Unicode
                    if text.find('\u') >= 0:
                        continue
                    else:
                        good_count += 1
                        # Tokenize and count word frequency, ignoring case
                        words = word_tokenize(text)
                        clean_words = [w.lower() for w in words if not w.lower() in set(stops+addl_stops)]
                        words_to_plot = words_to_plot + clean_words
    # Create frequency histogram of 50 most common words and print summary of activity
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
Example 11: featureset
def featureset(sample):
    comment, label = sample
    features = {}
    # tags = map(lambda statement: map(lambda (w,t):t, statement), comment)
    words = map(lambda statement: map(lambda (w,t):w, statement), comment)
    words = sum(words, [])
    # tags = sum(tags, [])
    size_ = sum([len(word) for word in words])
    features['stmt_len'] = len(words)/float(len(comment))
    features['word_len'] = size_/float(len(words))
    features['size'] = size_
    # tags_dist = FreqDist(sum(tags, []))
    # for tag in TAGS:
    #     features[tag] = tags_dist.get(tag, 0)
    dist = FreqDist([word.lower() for word in words])
    # num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
    # features['prob_stop_words'] = num_stop_words/len(words)
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0)/float(len(words))
    features['alwayson'] = 1.0
    for language in LANGUAGES:
        for i in range(1, n+1):
            word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
            # features['s_sim_%d_%s' % (i, language)] = w_s_sim
    return (features, label)
Example 12: posAnalysis
def posAnalysis(collection):
    reviews = collection.find(timeout=False)
    __reportProgress.counter = 0
    skip = 1
    for rev in reviews:
        if skip % 200 == 0:
            print 'skip'+str(skip)
        __reportProgress()
        if rev.has_key('tags'):
            skip += 1
            if rev['tags'].has_key('NN'):
                continue
        sents = sent_tokenize(rev['text'])
        tokens = [word for sent in sents for word in word_tokenize(sent)]
        pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
        tag_fd = FreqDist(tag for (word, tag) in pos)
        tags = dict()
        for (key, value) in tag_fd.items():
            k = key.replace('$', 'S')
            out = key.translate(string.maketrans("", ""), string.punctuation)
            if len(out) > 0:
                tags[k] = value
        collection.update({'_id': rev['_id']}, {"$set": {"tags": tags}})
Example 13: getTopNFreqWords
def getTopNFreqWords(textArr, N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords = []
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
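For context, a tiny usage sketch of the helper above (the token list is made up for illustration):

tokens = "a b a c a b".split()
print(getTopNFreqWords(tokens, 2))   # ['a', 'b']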
Example 14: category_by_movie
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
                 mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # note: accuracy is reported on the training set here; test_set is left unused
    print classify.accuracy(classifier, train_set)
Example 15: cleaner
def cleaner(filename):
    textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename), 'r')
    text = []
    all_dates = []
    complete_text = []
    words_list = []
    nodes = []
    for line in textfile:
        datetime, chat = line.split('-')
        date, time = datetime.split(',')
        loc = chat.find(':')
        #if len(chat.split(':'))==3:
        #    print chat
        user, text = chat[:loc], chat[loc+2:]
        text = text.replace("\n", '')
        words = text.split(' ')
        for i in words:
            words_list.append(i)
        complete_text.append(text)
        nodes.append(user)
        all_dates.append(date)
    #print set(nodes)
    #print set(all_dates)
    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
    textfile.close()