This article collects typical usage examples of the Python method nltk.FreqDist.keys. If you have been wondering what FreqDist.keys does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the class the method belongs to, nltk.FreqDist.
Below are 15 code examples of FreqDist.keys, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
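Before reading the examples, here is a minimal sketch of what FreqDist.keys returns. Many snippets below slice the result, e.g. fdist.keys()[:50]; that idiom comes from NLTK 2 / Python 2, where keys() returned a list sorted by decreasing frequency. In NLTK 3, FreqDist is a Counter subclass, keys() returns an unordered dict view, and the frequency-sorted equivalent is most_common(n). The sample sentence below is made up purely for illustration.

from nltk import FreqDist

tokens = "the cat sat on the mat the cat".split()  # made-up sample text
fdist = FreqDist(tokens)

print(list(fdist.keys()))    # NLTK 3: unique tokens, not ordered by frequency
print(fdist.most_common(2))  # frequency-sorted pairs, e.g. [('the', 3), ('cat', 2)]
# On NLTK 2 / Python 2, fdist.keys()[:2] returned the same top-2 tokens.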
Example 1: prepare_pos_features
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def prepare_pos_features(Language_model_set, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    nouns_dist = FreqDist(lists[0])
    top_nouns = nouns_dist.keys()[:200]
    verbs_dist = FreqDist(lists[1])
    top_verbs = verbs_dist.keys()[:200]
    advs_dist = FreqDist(lists[2])
    top_advs = advs_dist.keys()[:100]
    prep_dist = FreqDist(lists[3])
    top_preps = prep_dist.keys()[:100]
    adjs_dist = FreqDist(lists[4])
    top_adjs = adjs_dist.keys()[:200]
    out = open(output_file, 'w')
    for n in top_nouns:
        out.write('NN' + n + '\n')
    for v in top_verbs:
        out.write('VV' + v + '\n')
    for av in top_advs:
        out.write('ADV' + av + '\n')
    for p in top_preps:
        out.write('PREP' + p + '\n')
    for aj in top_adjs:
        out.write('ADJ' + aj + '\n')
Example 2: main
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Natural Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Example 3: handle
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def handle(self, *args, **options):
    fdist = FreqDist()
    print "Analyzing raw data"
    limit = 10
    if args:
        raw_datas = RawData.objects.filter(pk__in=args)
    else:
        raw_datas = RawData.objects.all()[:limit]
    tagged_data = []
    for raw_data in raw_datas:
        words = nltk.word_tokenize(raw_data.data)
        tagged_data.extend(nltk.pos_tag(words))
        for word in words:
            word = word.strip()
            if word:
                fdist.inc(word)
    print "Analyzed %s items" % len(raw_datas)
    print
    print "Top word: %s" % fdist.max()
    print
    print "Top 10 words"
    for word in fdist.keys()[:10]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Bottom 10 words"
    for word in fdist.keys()[-10:]:
        times = fdist[word]
        print " -- %s occurred %s times" % (word, times)
    print
    print "Words occurring between 50-100 times"
    words = [word for word in fdist.keys() if fdist[word] >= 50 and fdist[word] <= 100]
    print ", ".join(words)
    cfdist = ConditionalFreqDist()
    for (word, tag) in tagged_data:
        cfdist[tag].inc(word)
    print "Most popular noun: %s" % cfdist["NN"].max()
    print
    print "Top 50 nouns"
    for word in cfdist["NN"].keys()[:50]:
        times = cfdist["NN"][word]
        print " -- %s occurred %s times" % (word, times)
    print
Example 4: entropy
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def entropy(tokens):
    """
    Get the Shannon entropy of a document using its token distribution.
    :param tokens: A document represented as a list of tokens.
    :return: The Shannon entropy of the token distribution, in bits.
    """
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    # Convert raw counts into relative frequencies (probabilities).
    for key in frq.keys():
        frq[key] /= doc_len
    ent = 0.0
    for key in frq.keys():
        ent += frq[key] * math.log(frq[key], 2)
    ent = -ent
    return ent
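As a quick sanity check of the formula H = -sum(p * log2 p): with eight tokens drawn equally from four types, each probability is 0.25, so the entropy should come out to 2 bits. The call below is a hypothetical usage sketch of the function above and assumes Python 3 division semantics (under Python 2 the in-place division would need float(doc_len)).

sample = ["a", "b", "c", "d", "a", "b", "c", "d"]  # made-up tokens: four types, equal counts
print(entropy(sample))  # expected: 2.0, i.e. -4 * 0.25 * log2(0.25)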
Example 5: draw_word2vec
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def draw_word2vec():
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"
    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"
    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    tokens = fdist.keys()[1:500]
    print tokens
    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:
            tokens_has_vectors.append(token)
    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))
    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
Example 6: compress_term_matrix
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def compress_term_matrix(matrix, words):
    initials = [item[0] for item in words]
    fdist = FreqDist(initials)
    letterindices = []
    for letter in sorted(fdist.keys()):
        letterindices.append((letter, fdist[letter]))
    indexmatrix = []
    start = 0
    for letter, occ in letterindices:
        newocc = occ / 5
        print letter, " ", occ
        print " range: ", start, " ", start + occ, " ", newocc
        indexes = np.random.random_integers(start, start + occ, newocc)
        indexmatrix.append((letter, indexes.tolist()))
        start = start + occ
    allindices = []
    for _, v in indexmatrix:
        allindices.extend(v)
    smatrix = matrix[allindices, :]
    return indexmatrix, smatrix
Example 7: find_abbreviations
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist
    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)
    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])
    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)
    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
Example 8: parse
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def parse(filename):
    outfilename = filename + ".freq"
    entry_string = open(filename, 'r').read()
    # convert to lower case
    entry_string = entry_string.lower()
    # remove punctuation
    for c in string.punctuation:
        entry_string = entry_string.replace(c, " ")
    # remove everything except letters and spaces
    entry_string = re.sub("[^a-z ]", " ", entry_string)
    # strip out multiple spaces
    entry_string = re.sub(r'\s+', r' ', entry_string)
    # make the string into a list and remove stopwords from it
    entry_string_split = entry_string.split()
    entry_string_no_stopwords = remove_stopwords(entry_string_split)
    fd = FreqDist(entry_string_no_stopwords)
    fout = open(outfilename, "w")
    sys.stdout.write(outfilename + "\n")
    fout.write(" ".join(fd.keys()))
    fout.close()
Example 9: category_by_movie
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
                 mr.fileids(c)]
    random.shuffle(documents)
    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
Example 10: category_by_pos
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])
    common_suffixes = suffix_fdist.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)
    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBayes %f' % classify.accuracy(classifier, test_set)
Example 11: top
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def top(self, tokens, lowest_rank=50):
    ''' A list of the most frequent (non-stopword) tokens '''
    from operator import itemgetter
    content = self.words(tokens)
    fdist = FreqDist(content)
    vocab = iter(fdist.keys())
    # Forget all previous ranking
    self.lower_words = {}
    frequency = 0
    while frequency < lowest_rank:
        try:
            word = vocab.next()
        except StopIteration:
            break
        word_lower = word.lower()
        if word_lower in self.lower_words:
            self.lower_words[word_lower] = self.lower_words[word_lower] + fdist[word]
        else:
            self.lower_words[word_lower] = fdist[word]
        frequency = frequency + 1
    # return sorted(self.lower_words, key=itemgetter(1), reverse=True)
    return map(itemgetter(0), sorted(self.lower_words.items(), key=itemgetter(1), reverse=True))
Example 12: main
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)
    allsent = ''
    for f in pdata:
        allsent += f[3]
    all_words = FreqDist(w.lower()
                         for w in word_tokenize(allsent)
                         if w not in stopwords.words('english'))
    global top_words
    top_words = all_words.keys()[:500]
    # pdata = getParseData()
    featdata = featureAggregator(pdata)
    print featdata[:10]
Example 13: get_word_features
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
Example 14: features
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def features(word_list):
    freq = FreqDist(word_list)
    f = freq.keys()
    return {
        'biology': 'biolog' in word_list,
        'engineering': 'engin' in word_list,
        'animal': 'anim' in word_list,
        'behavior': 'behavy' in word_list,
        'chemistry': 'chem' in word_list,
        'health': 'heal' in word_list,
        'physics': 'phys' in word_list,
        'math': 'math' in word_list,
        'plant': 'plant' in word_list,
        'earth': 'earth' in word_list,
        'biochemistry': 'biochem' in word_list,
        'social': 'soc' in word_list,
        'planet': 'planet' in word_list,
        'temperature': 'temperature' in word_list,
        'blood': 'blood' in word_list,
        'tube': 'tube' in word_list,
        'pyschology': 'pyscholog' in word_list,
        'protein': 'protein' in word_list,
        'gene': 'gen' in word_list,
        'most_0': f[0],
        'most_1': f[1],
        'most_2': f[2],
        'most_3': f[3],
        'most_4': f[4],
        'most_5': f[5],
        'most_6': f[6],
        'most_7': f[7],
    }
Example 15: bag_of_words
# Required module import: from nltk import FreqDist [as alias]
# Or: from nltk.FreqDist import keys [as alias]
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list