This article collects typical usage examples of the Python method nltk.FreqDist.most_common. If you are wondering what exactly FreqDist.most_common does, how to use it, or where to find real-world usage, then the hand-picked code examples below may help. You can also explore further usage examples of the class this method belongs to, nltk.FreqDist.
The following shows 15 code examples of FreqDist.most_common, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
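Before the examples, here is a minimal, self-contained sketch of what FreqDist.most_common returns (the token list is made up for illustration):
from nltk import FreqDist

# FreqDist counts hashable items; most_common(n) returns the n highest-count
# (item, count) pairs, ordered from most to least frequent
tokens = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "the", "fox"]
fdist = FreqDist(tokens)
print(fdist.most_common(3))  # [('the', 3), ('fox', 2), ...] -- the third entry is one of the count-1 words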
Example 1: find_politician_names
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def find_politician_names(debate_soup_dict):
    for debate in debate_soup_dict.keys():
        raw = debate_soup_dict[debate]["soup"].get_text()
        # raw = raw.replace("\\\", "")
        raw = raw.replace("\\", "")
        raw = raw.replace(".", ". ")
        raw = raw.replace("?", "? ")
        raw = raw.replace("!", "! ")
        raw = raw.replace("  ", " ")
        raw = raw.replace("[applause]", "")
        raw = raw.replace("[crosstalk]", "")
        # strip laughter markers in their different spellings
        raw = raw.replace("[laughter]", "")
        raw = raw.replace("[Laughter]", "")
        raw = raw.replace("(LAUGHTER)", "")
        tokens = nltk.word_tokenize(raw)
        speech = nltk.Text(tokens)
        sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
        sents = sent_detector.tokenize(raw.strip())
        # find candidate names: the most commonly repeated first words of sentences, excluding common words
        colon_names = []
        dumbWords = stopwords.words("english")
        for sent in sents:
            if ":" in sent:
                sent = sent.split(":")
                possible_name = sent[0]
                if len(possible_name) < 25:
                    colon_names.append(possible_name)
        fdist1 = FreqDist(colon_names)
        fdist2 = FreqDist(sents)
        mostFreq = fdist1.most_common(1)[0][1]
        if mostFreq > 20:
            debate_soup_dict[debate]["names"] = fdist1.most_common(10)
        else:
            debate_soup_dict[debate]["names"] = fdist2.most_common(10)
Example 2: get_hosts
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    # Your code here
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2:
                    if pair[0] != ' ' and pair[1] != ' ':
                        pairs.append((pair[0].lower().replace('\'', '\"').strip(' '),
                                      pair[1].lower().replace('\'', '\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    if len(pairs_freq.most_common(10)[0][0][0].split(' ')) < 2:
        hosts.append(pairs_freq.most_common(10)[1][0][0])
        hosts.append(pairs_freq.most_common(10)[1][0][1])
    else:
        hosts.append(pairs_freq.most_common(10)[0][0][0])
        hosts.append(pairs_freq.most_common(10)[0][0][1])
    return hosts
Example 3: get_top_followings
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def get_top_followings(screen_name):
    # authorize twitter, initialize tweepy
    api = TwitterGrabber.initialise_api(0)
    print(api.get_status)
    # initialize a list to hold all the tweepy Tweets
    all_tweets = []
    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    # get the user object
    # user = api.get_user(screen_name=screen_name)
    # print(user.lists_subscriptions)
    # save most recent tweets
    all_tweets.extend(new_tweets)
    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # print("getting tweets before %s" % oldest)
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        # save most recent tweets
        all_tweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(all_tweets)))
    tweet_text = []
    for tweet in all_tweets:
        tweet_text.append(tweet.text)
    content = []
    retweets = []
    for tweet in tweet_text:
        words = word_tokenize(tweet, 'english')
        content.extend(strip_words(words))
        if words[0] == 'RT':
            retweets.append(words[2])
    tweet_distribution = FreqDist(retweets)
    print(tweet_distribution.most_common(20))
    a = follow_description(api, tweet_distribution.most_common(20), screen_name)
    return a
Example 4: follow_description
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def follow_description(api, friend_list, screen_name):
    the_list = []
    all_tags = []
    for friend in friend_list:
        username = friend[0]
        frequency = friend[1]
        print(username)
        try:
            user = api.get_user(screen_name=username)
            for list_obj in user.lists_memberships(screen_name=username, count=50):
                for w in list_obj.name.lower().split(" "):
                    # print(w)
                    all_tags.append(w)
        except TweepError as err:
            print(err.reason)
            break
    # print(all_tags)
    the_list_name = strip_words(all_tags)
    the_list_dist = FreqDist(the_list_name)
    # for w in the_list_dist:
    #     print('***' + str(w))
    print(the_list_dist.most_common(20))
    return the_list_dist.most_common(20)
Example 5: get_monograms_freqdist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def get_monograms_freqdist(tokens):
    freq_dist = FreqDist(tokens)
    # print FreqDist.N(freq_dist)
    print 'Returned monograms'
    print freq_dist.most_common(10)
    temp_list = freq_dist.most_common(100)
    temp_dict = dict((item[0], item[1]) for item in temp_list)
    ordered_freq_dist = OrderedDict(sorted(temp_dict.items(), key=lambda x: x[1], reverse=True))
    return ordered_freq_dist
Example 6: get_trigrams_freqdist
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def get_trigrams_freqdist(tokens):
    tri_grams = trigrams(tokens)
    print 'Returned trigrams'
    freq_dist_trigrams = FreqDist(tri_grams)
    print freq_dist_trigrams.most_common(10)
    freq_dist_trigrams_new = dict()
    for item in freq_dist_trigrams.items():
        temp_str = item[0]
        temp_key = temp_str[0] + ' ' + temp_str[1] + ' ' + temp_str[2]
        freq_dist_trigrams_new[temp_key] = item[1]
    freq_dist_trigrams_new = OrderedDict(sorted(freq_dist_trigrams_new.items(), key=lambda x: x[1], reverse=True))
    return freq_dist_trigrams_new
Example 7: getTopNFreqWords
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def getTopNFreqWords(textArr, N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords = []
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
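For reference, a small usage sketch of the helper above; the token list and N value are made up for illustration, and the function is assumed to be in scope:
from nltk import FreqDist

sample_tokens = ["to", "be", "or", "not", "to", "be", "that", "is", "the", "question"]
# "to" and "be" each occur twice, so they are the top 2 words
print(getTopNFreqWords(sample_tokens, 2))  # ['to', 'be']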
Example 8: get_list_dists_for
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def get_list_dists_for(member_id):
    print(member_id, file=sys.stderr)
    # cursor.execute(get_listcount_for_member, [member_id])
    # mlistcount = cursor.fetchone()[0]
    cursor.execute(get_listinfo_for_member, [member_id])
    tstrout = ''
    rows = cursor.fetchall()
    for row in rows:
        c_line = str(row)
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))
        if len(c_line):
            parsed_text = f.parse(c_line, True)
            strout = ''
            if parsed_text is not None:
                for item in s.items():
                    # print(1, str(item[1]))
                    word = []
                    for word in item[1]:
                        strout = strout + ' ' + word
            if len(strout):
                tstrout = tstrout + ' ' + strout
    # print(tstrout)
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)
    return str(member_id) + " on " + str(len(rows)) + " lists: " + str(the_list_dist.most_common(20))
Example 9: create_word_freq
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def create_word_freq(db):
    db = getattr(db, "Posts")
    # client.command("CREATE CLASS concepted EXTENDS E")
    client.command("DELETE EDGE concepted")
    # client.command('create property frequency.freq string')
    # client.command("DELETE VERTEX frequency")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    c = [c.name for c in concept]
    for d in data:
        if 'Body' not in d:
            display = ''
        else:
            display = cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        i = fdist.most_common()
        for k in i:
            if k[0].lower() in c:
                try:
                    client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(), d['_id'], k[1]))
                except:
                    continue
Example 10: generate_ngrams_profile
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def generate_ngrams_profile(self, text, profile_size, min_size=2, max_size=3):
    """
    Reads the incoming text, generates all possible N-grams with sizes ranging
    between min_size and max_size, and counts the occurrences of all N-grams.
    Parameters
    ----------
    text : unicode
    profile_size : int
    min_size : int, optional (default=2)
    max_size : int, optional (default=3)
    Returns
    -------
    ngram_profile : list of (ngram, count) tuples for the profile_size most frequent N-grams
    """
    raw_ngrams = []
    text = self.sanitize_text(text)
    for n in range(min_size, max_size + 1):
        for ngram in ngrams(text, n):
            raw_ngrams.append(''.join(unicode(i) for i in ngram))
    fdist = FreqDist(raw_ngrams)
    ngram_profile = fdist.most_common(n=profile_size)
    return ngram_profile
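For comparison, a standalone sketch of the same character n-gram profiling idea outside the class; the lowercasing step is only a stand-in for the sanitize_text call above and is an assumption here:
from nltk import FreqDist
from nltk.util import ngrams

def ngram_profile(text, profile_size, min_size=2, max_size=3):
    # collect every character n-gram with size between min_size and max_size
    raw_ngrams = []
    text = text.lower()  # simplified stand-in for the sanitize_text step above
    for n in range(min_size, max_size + 1):
        for gram in ngrams(text, n):
            raw_ngrams.append(''.join(gram))
    # keep only the profile_size most frequent n-grams as (ngram, count) pairs
    return FreqDist(raw_ngrams).most_common(n=profile_size)

print(ngram_profile("natural language processing", 5))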
Example 11: analyzeTitles
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the', 'a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])
    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"
    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Example 12: cleaner
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def cleaner(filename):
    textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename), 'r')
    text = []
    all_dates = []
    complete_text = []
    words_list = []
    nodes = []
    for line in textfile:
        datetime, chat = line.split('-')
        date, time = datetime.split(',')
        loc = chat.find(':')
        # if len(chat.split(':')) == 3:
        #     print chat
        user, text = chat[:loc], chat[loc+2:]
        text = text.replace("\n", '')
        words = text.split(' ')
        for i in words:
            words_list.append(i)
        complete_text.append(text)
        nodes.append(user)
        all_dates.append(date)
    # print set(nodes)
    # print set(all_dates)
    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
    textfile.close()
Example 13: transmit_vocabulary
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian',
                 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/' + t_token + '.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '[' + ','.join(cats_vocabulary_elements) + ']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token ' + t_token)
    os.remove('csv/' + t_token + '.csv')
Example 14: find_names
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def find_names(self):
    """creates a frequency distribution of the
    most common names in the texts"""
    names_list = LIST_OF_NAMES
    name_tokens = [w for w in self.tokens if w in names_list]
    fd = FreqDist(name_tokens)
    return fd.most_common(50)
Example 15: setUpOwnSubjectStopWords
# Required import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import most_common [as alias]
def setUpOwnSubjectStopWords():
    for topic in topics_table_noun_only_title:
        # only limiting it to a specified length
        # might want to look into the numeric part
        all_description = [ds for ds in topics_table_noun_only_description[topic] if len(ds) > 5]
        all_topics = [topics for topics in topics_table_noun_only_title[topic] if len(topics) > 5]
        fdist_description = FreqDist(all_description)
        fdist_topics = FreqDist(all_topics)
        ten_most_common_descr = fdist_description.most_common(10)
        ten_most_common_topic = fdist_topics.most_common(10)
        built_topic_stop_words[topic] = [word for word, freq in ten_most_common_descr]
        built_topic_stop_words[topic].extend([word for word, freq in ten_most_common_topic])
    # here we set up the top 5-10 words (we need to look into the data more to find
    # the hard margin of a good numerical value to stop at, but for simplicity's sake we
    # pick 5 for now; let's see how our accuracy changes when we change the most frequent words)
    for topic in built_topic_stop_words:
        print built_topic_stop_words[topic]
        print "\n"