本文整理汇总了Python中nltk.corpus.brown.words函数的典型用法代码示例。如果您正苦于以下问题：Python words函数的具体用法？Python words怎么用？Python words使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了words函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_clusterer
def test_clusterer(self):
    """Check that k-means separates 'government' from 'mystery' texts.

    Up to 10 file ids from each of the Brown categories 'government'
    and 'mystery' are added to ``self.folder`` as ``Document`` objects,
    and the clusterer is asked for two clusters.

    The clusterer depends on random initial conditions, so results can
    differ between runs. To compensate, the original author requests a
    large number of iterations (50). The success condition is that the
    first cluster contains at most 2 documents that do not belong to
    its dominant category.
    """
    def add_documents(file_ids):
        # Create one Document per corpus file and collect the new UIDs.
        uids = []
        for file_id in file_ids:
            text = " ".join(brown.words(file_id))
            self.folder.invokeFactory('Document', file_id, text=text)
            uids.append(self.folder[file_id].UID())
        return uids

    clusterer = KMeans()
    government_ids = brown.fileids(categories='government')[:10]
    mystery_ids = brown.fileids(categories='mystery')[:10]
    government_uids = add_documents(government_ids)
    mystery_uids = add_documents(mystery_ids)

    result = clusterer.clusterize(2, 50, repeats=50)
    cluster1 = set(result[0])
    # Whichever category dominates the first cluster, the smaller of the
    # two differences is the number of "foreign" documents in it.
    missed = min(len(cluster1 - set(government_uids)),
                 len(cluster1 - set(mystery_uids)))
    # assertTrue replaces the deprecated failUnless alias, which was
    # removed from unittest in Python 3.12.
    self.assertTrue(
        missed <= 2,
        "%d documents fell in the wrong cluster" % missed)
示例2: _build_wordset
def _build_wordset(clazz, obscurity_limit):
    """Return CMUdict entries, optionally limited to the commonest words.

    Missing corpora are downloaded on demand.

    Args:
        clazz: Unused by this function body.
        obscurity_limit: If ``None``, every CMUdict entry is returned.
            Otherwise the entries are ranked by how often their first
            element (lower-cased) occurs in the Brown corpus, most
            frequent first, and only the first ``obscurity_limit``
            entries are returned.

    Returns:
        A list of CMUdict entries.
    """
    # Imports are deferred to keep module loading fast in the main case
    # (per the original author's note).
    from nltk import FreqDist
    from nltk.corpus import brown
    from nltk.corpus import cmudict

    def ensure_available(probe, package, label):
        # Touch the corpus; if NLTK cannot find it, download it.
        try:
            probe()
        except LookupError:
            print("%s corpus not found. Downloading..." % label)
            from nltk import download
            download(package)
            print("[Done]")

    # The probes are lambdas so that attribute access on the corpus
    # object happens inside the try block of ensure_available.
    ensure_available(lambda: cmudict.entries(), 'cmudict', 'CMUDict')
    if obscurity_limit is None:
        return list(cmudict.entries())

    ensure_available(lambda: brown.words(), 'brown', 'Brown')
    freqs = FreqDist(w.lower() for w in brown.words())
    ranked = sorted(cmudict.entries(),
                    key=lambda entry: freqs[entry[0].lower()],
                    reverse=True)
    return ranked[:obscurity_limit]
示例3: get_type_token_ratio
def get_type_token_ratio(category=''):
    """Return the type-to-token ratio of the Brown corpus text.

    Args:
        category: Brown category to restrict to. The empty string
            (default) selects the entire corpus.

    Returns:
        The number of distinct tokens divided by the total number of
        tokens, as a float.
    """
    if category == '':
        text = brown.words()  # the entire corpus
    else:
        text = brown.words(categories=category)  # the given category only
    # Force float division: under Python 2, int / int would truncate the
    # ratio (which is always below 1) to 0.
    return float(len(set(text))) / len(text)
示例4: load_brown_freq_ratios
def load_brown_freq_ratios():
    """Return the relative frequency of every lower-cased Brown word.

    Returns:
        A dict mapping each lower-cased word to its count divided by
        the total number of words in the corpus.
    """
    brown_freqdist = nltk.FreqDist(w.lower() for w in brown.words())
    # N() is the total number of counted samples, i.e. the corpus word
    # count, so the corpus does not have to be scanned a second time.
    num_words = float(brown_freqdist.N())
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    return {word: number / num_words
            for word, number in brown_freqdist.items()}
示例5: get_vocabulary_size
def get_vocabulary_size(category=''):
    """Return the number of distinct tokens in the Brown corpus text.

    Args:
        category: Brown category to restrict to. The empty string
            (default) selects the entire corpus.

    Returns:
        The size of the set of tokens in the selected text.
    """
    whole_corpus = category == ''
    tokens = brown.words() if whole_corpus else brown.words(categories=category)
    distinct = set(tokens)
    return len(distinct)
示例6: get_prob_word_in_category
def get_prob_word_in_category(word, category=''):
    """Return the probability of ``word`` appearing in the Brown text.

    Args:
        word: The token to look for (matched exactly, case-sensitive).
        category: Brown category to restrict to. The empty string
            (default) selects the entire corpus.

    Returns:
        The number of occurrences of ``word`` divided by the total
        number of tokens, as a float.
    """
    if category == '':
        text = brown.words()  # the entire corpus
    else:
        text = brown.words(categories=category)  # the given category only
    # Force float division: under Python 2, int / int would truncate
    # every probability below 1 to 0.
    return float(text.count(word)) / len(text)
示例7: Automated_Readability_Index
def Automated_Readability_Index(section):
    """Compute the Automated Readability Index of a Brown category.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    Args:
        section: Brown category name (or names) passed as ``categories``.

    Returns:
        The ARI score as a float.
    """
    tokens = brown.words(categories=section)
    sents = len(brown.sents(categories=section))
    words = len(tokens)
    # Sum the token lengths directly. Joining the tokens with spaces and
    # measuring the string would count one separator per word and
    # inflate the characters-per-word term by almost exactly 1.
    # NOTE(review): punctuation tokens still count as words and
    # characters, as before - confirm whether they should be filtered.
    letters = sum(len(token) for token in tokens)
    uw = letters / float(words)
    us = words / float(sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
示例8: get_top_n_words
def get_top_n_words(n, category=''):
    """Return the ``n`` most frequent tokens of the Brown corpus text.

    Args:
        n: How many tokens to return (applied as a slice bound).
        category: Brown category to restrict to. The empty string
            (default) selects the entire corpus.

    Returns:
        A list of tokens ordered from most to least frequent.
    """
    if category == '':
        text = brown.words()  # the entire corpus
    else:
        text = brown.words(categories=category)  # the given category only
    fdist = FreqDist(text)
    # most_common() gives an explicit frequency ordering. With a
    # Counter-based FreqDist, keys() is not ordered by frequency and,
    # on Python 3, is a view that cannot be sliced.
    return [word for word, _ in fdist.most_common()[:n]]
示例9: fetchCorpus
def fetchCorpus():
    """Build a POS-tagged sample of five Brown categories.

    The first ``CORPUS_SIZE`` tokens of each category are concatenated
    in a fixed order and tagged with ``nltk.pos_tag``.

    Returns:
        A ``(corpus, categories)`` pair: the tagged corpus, and a list
        of the distinct values found at index 1 of its items.
    """
    sections = ("news", "editorial", "reviews", "lore", "hobbies")
    tokens = brown.words(categories=sections[0])[:CORPUS_SIZE]
    for section in sections[1:]:
        tokens = tokens + brown.words(categories=section)[:CORPUS_SIZE]
    corpus = nltk.pos_tag(tokens)
    categories = list({item[1] for item in corpus})
    return corpus, categories
示例10: print_brown
def print_brown():
    """Print a short tour of the Brown corpus.

    Prints the category list, the 'news' words, the words of file
    'cg22', the sentences of 'news' and 'reviews', and finally the
    count of several modal verbs in the lower-cased 'news' text.
    """
    from nltk.corpus import brown
    # print(...) with a single argument behaves the same on Python 2
    # and 3; the former print statements were Python 2 only.
    print(brown.categories())
    print(brown.words(categories='news'))
    print(brown.words(fileids=['cg22']))
    print(brown.sents(categories=['news', 'reviews']))
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        # Format into one string so the output stays "word: count" on
        # both Python versions.
        print('%s: %s' % (m, fdist[m]))
示例11: pre_processor
def pre_processor(grams=3):
    """Encode the first two Brown files as integer n-gram arrays.

    The vocabulary is the set of tokens in the first two files. Each
    word is registered in the module-level ``wordDic`` (word -> index)
    and ``posiDic`` (index -> word) mappings. The first file is encoded
    into ``(x1, y1)`` and the second into ``(x2, y2)``.

    Args:
        grams: n-gram size. Each row of an ``x`` array holds the
            indices of ``grams - 1`` consecutive tokens, and the
            matching ``y`` row holds the index of the token after them.

    Returns:
        ``(vocabulary, x1, y1, x2, y2)``, where ``vocabulary`` is a
        list and the other four are integer numpy arrays.
    """
    context = grams - 1

    def blank():
        # Zero-row arrays, used when a file is not available.
        return (np.zeros(shape=(0, context), dtype=int),
                np.zeros(shape=(0, 1), dtype=int))

    def encode(file_id):
        # Turn one corpus file into (context rows, next-word rows).
        tokens = brown.words(file_id)
        # NOTE(review): this row count stops two windows short of the
        # end of the text; kept exactly as in the original.
        rows = len(tokens) - grams - 1
        contexts = np.zeros(shape=(rows, context), dtype=int)
        targets = np.zeros(shape=(rows, 1), dtype=int)
        for start in range(rows):
            contexts[start] = [wordDic[tokens[k]]
                               for k in range(start, start + context)]
            targets[start] = [wordDic[tokens[start + context]]]
        return contexts, targets

    file_ids = brown.fileids()

    vocabulary = set()
    for file_id in file_ids[:2]:
        vocabulary = vocabulary.union(set(brown.words(file_id)))
    vocabulary = list(vocabulary)
    for position, word in enumerate(vocabulary):
        wordDic[word] = position
        posiDic[position] = word

    x1, y1 = encode(file_ids[0]) if len(file_ids) > 0 else blank()
    x2, y2 = encode(file_ids[1]) if len(file_ids) > 1 else blank()
    return vocabulary, x1, y1, x2, y2
示例12: print_corpus_info
def print_corpus_info(categories, stopwords):
    """Print scores for the whole Brown corpus and for each category.

    For each text, ``print_scores`` is called with the full token list
    and with the token list after removing stopwords.

    Args:
        categories: Iterable of Brown category names to report on.
        stopwords: Collection of tokens to exclude from the second list.
    """
    # Build the lookup set once. Testing membership against a list for
    # every corpus token would be a linear scan per token.
    stopword_set = frozenset(stopwords)

    def report(words):
        # Materialise the tokens, filter the stopwords, print the scores.
        tokens = list(words)
        no_stopwords = [w for w in tokens if w not in stopword_set]
        print_scores(tokens, no_stopwords)

    print("Corpus name: Brown Corpus")
    report(brown.words())
    for category in categories:
        print("Category:", category)
        report(brown.words(categories=category))
示例13: syn
def syn():
    """Return two distinct WordNet lemma names found via random Brown words.

    Two random Brown tokens are looked up in WordNet repeatedly, taking
    the first lemma name of the first synset of each. A pair is
    accepted when the names differ, each is at least 4 characters long,
    and neither contains an underscore after its first character.

    Returns:
        A ``(word, word2)`` tuple of lemma names.
    """
    # Create the corpus view once instead of twice per iteration.
    words = brown.words()

    def random_lemma_name():
        # First lemma name of the first synset of a random token, or
        # None when WordNet has no entry for that token.
        token = words[random.randint(1, 1000000)].lower()
        synsets = wordnet.synsets(token)
        if not synsets:
            return None
        # Support both the attribute-style API (older NLTK) and the
        # method-style API (NLTK 3). Previously the method-style API
        # raised TypeError, which a blanket "except Exception" swallowed
        # on every iteration, so the loop never terminated.
        lemmas = synsets[0].lemmas
        if callable(lemmas):
            lemmas = lemmas()
        if not lemmas:
            return None
        name = lemmas[0].name
        return name() if callable(name) else name

    def usable(name):
        return name is not None and not (name.find("_") > 0 or len(name) < 4)

    while True:
        word = random_lemma_name()
        word2 = random_lemma_name()
        if usable(word) and usable(word2) and word != word2:
            return (word, word2)
示例14: plot_word_counts
def plot_word_counts():
    """Plot a histogram of token frequencies in the Brown 'news' text.

    Builds a frequency distribution over the 'news' category and shows
    a 3000-bin histogram of the per-token counts, with both axes
    limited to the range 0-500.
    """
    corpus_news = brown.words(categories='news')
    fdist = FreqDist(corpus_news)
    # Materialise the counts: on Python 3, values() is a dict view
    # rather than a sequence.
    counts = list(fdist.values())
    plt.hist(counts, bins=3000)
    # Annotate the graph. This call was misspelled "xlablel", which
    # raised AttributeError before the plot could be shown.
    plt.xlabel('Frequency of occurences')
    plt.ylabel('Freqency of words in that bucket')
    plt.axis([0, 500, 0, 500])
    plt.show()
示例15: compare
def compare(self, file):
    """Count distinct words shared by a file and two Brown documents.

    Args:
        file: Path of the text file to compare.

    Returns:
        The number of distinct whitespace-separated words in ``file``
        that also occur in Brown files 'cc17' and 'ca16'.
    """
    word_list = set(str(a) for a in brown.words(fileids=['cc17', 'ca16']))
    # Read the file in a single call, in read-only text mode. The
    # previous "r+b" mode required write permission, read one byte per
    # call, and produced bytes that "".join() rejects on Python 3.
    with open(file, "r") as f:
        text = f.read()
    w = set(text.split())
    occurencies = len(word_list & w)
    return occurencies