This article collects typical usage examples of the Python function nltk.corpus.brown.categories. If you have been wondering what the categories function does, how to use it, or want worked examples, the hand-picked code samples below may help.
There are 15 code examples of the categories function shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
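For orientation before the examples, calling the function directly shows what it returns; the Brown corpus ships with 15 genre labels:

>>> from nltk.corpus import brown
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion',
 'reviews', 'romance', 'science_fiction']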
Example 1: print_brown
def print_brown():
    import nltk
    from nltk.corpus import brown
    # basic corpus access: categories, words by category, words by file, sentences
    print(brown.categories())
    print(brown.words(categories='news'))
    print(brown.words(fileids=['cg22']))
    print(brown.sents(categories=['news', 'reviews']))
    # count modal verbs in the 'news' category
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print(m + ':', fdist[m])
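With the corpus installed, the final loop prints counts close to those quoted in the NLTK book for the 'news' category (exact figures can vary with the corpus version):

can: 94
could: 87
may: 93
might: 38
must: 53
will: 389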
Example 2: get_training_test_sentences
def get_training_test_sentences(self):
    # tagged sentences from every Brown category, split 90/10 into train/test
    self.news_text = brown.words(categories=brown.categories())
    self.news_tagged_sentences = brown.tagged_sents(categories=brown.categories())
    size = int(len(self.news_tagged_sentences) * .9)
    brown_train = self.news_tagged_sentences[:size]
    brown_test = self.news_tagged_sentences[size:]
    self.train_sents = brown_train
    self.test_sents = brown_test
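One plausible use of such a split is training and scoring an NLTK tagger. This standalone sketch reproduces the method's 90/10 split outside the (unshown) class; the choice of UnigramTagger is an assumption, not part of the original snippet:

import nltk
from nltk.corpus import brown

tagged = brown.tagged_sents(categories=brown.categories())
size = int(len(tagged) * .9)
train_sents, test_sents = tagged[:size], tagged[size:]
tagger = nltk.UnigramTagger(train_sents)
print(tagger.evaluate(test_sents))  # accuracy on the held-out 10%; .accuracy() in newer NLTK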
Example 3: build_all_brown
def build_all_brown(subset_size=None):
    # assumes module-level: import pickle; from nltk.corpus import brown
    documents = []
    categories = []
    all_categories = set()
    try:
        fileids = brown.fileids()
        for fileid in fileids:
            if subset_size:
                if len(all_categories) > subset_size:
                    break
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]
            documents.append(words)
            categories.append(category)
            all_categories.add(category)
        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]
        documents = [" ".join(d) for d in documents]
    except LookupError:
        # we don't have the Brown corpus via nltk on this machine
        try:
            with open("brown_docs_cats.pickle", "rb") as f:  # pickles must be read in binary mode
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")
        # documents = [' '.join(d) for d in documents]
    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model
    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """
    return documents, categories
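A hedged usage sketch; the vectorizer step simply mirrors the commented-out block above and assumes scikit-learn is available:

documents, categories = build_all_brown(subset_size=5)
print(len(documents), len(set(categories)))

from sklearn.feature_extraction.text import CountVectorizer
doc_vectorizer = CountVectorizer()
doc_vec = doc_vectorizer.fit_transform(documents)
print(doc_vec.shape)  # (n_documents, vocabulary_size)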
Example 4: import_brown_pos
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                sent = (simplify_tag(t) for t in sent)  # simplify_tag is defined elsewhere in the module
            norm = [nltk.tuple2str(t) for t in sent]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if counter % 100 == 0:
                    print("importing %s of %s sentences..." % (counter, total),
                          file=log)
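Note that `ds` is a nathan.core.Dataspace handle and simplify_tag is assumed to come from the surrounding module; neither is part of NLTK. A minimal driver, lifted from the docstring:

from nathan.core import Dataspace
ds = Dataspace()
import_brown_pos(ds, silent=True)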
Example 5: ex11
def ex11():
    import nltk
    from nltk.corpus import brown
    modals = set(["can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought"])
    cfd = nltk.ConditionalFreqDist(
        (genre, modal)
        for genre in brown.categories()
        for modal in [w.lower() for w in brown.words(categories=genre) if w.lower() in modals])
    cfd.plot()
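cfd.plot() requires matplotlib and a display; to inspect the same counts in a terminal, tabulating is a reasonable substitute:

cfd.tabulate(samples=sorted(modals))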
Example 6: brown_diversity
def brown_diversity():
    """calculate and display a lexical diversity score (tokens per word type) for each Brown corpus category"""
    cfd = nltk.ConditionalFreqDist((category, word)
                                   for category in brown.categories()
                                   for word in brown.words(categories=category))
    print("{0:15s} {1:10s}".format("CATEGORY", "DIVERSITY"))
    for category in cfd.conditions():
        # N() counts tokens; B() counts the distinct word types observed
        print("{0:15s} {1:10f}".format(category, cfd[category].N() / cfd[category].B()))
Example 7: fun08
def fun08():
    """tabulate modal verb counts across six Brown genres"""
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
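On a standard Brown install the table comes out close to the one printed in the NLTK book (figures may differ slightly across corpus versions); Examples 9 and 12 below produce the same output:

                 can could  may might must will
           news   93   86   66   38   50  389
       religion   82   59   78   12   54   71
        hobbies  268   58  131   22   83  264
science_fiction   16   49    4   12    8   16
        romance   74  193   11   51   45   43
          humor   16   30    8    8    9   13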
Example 8: makeWordSet
def makeWordSet(args=None):
    '''Use the Brown and Reuters corpora to see how many distinct words are used'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
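Both corpora must be downloaded first (nltk.download('brown') and nltk.download('reuters')); then:

from nltk.corpus import brown, reuters
vocab = makeWordSet()
print(len(vocab))  # size of the combined Brown + Reuters vocabulary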
Example 9: exercise_brown2
def exercise_brown2():
    """conditional frequency distribution over Brown genres"""
    cfd = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)
    )
    genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]
    modals = ["can", "could", "may", "might", "must", "will"]
    cfd.tabulate(conditions=genres, samples=modals)
Example 10: ex16
def ex16():
    import operator
    from nltk.corpus import brown
    lex_div = {}
    for category in brown.categories():
        words = brown.words(categories=category)
        ld = len(words) / len(set(words))  # tokens per word type
        print(category, ld)
        lex_div[category] = ld
    print(sorted(lex_div.items(), key=operator.itemgetter(1)))
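Since the score is tokens per type, the ascending sort lists the categories with the most varied vocabulary first. The operator import can also be avoided with a lambda key:

print(sorted(lex_div.items(), key=lambda kv: kv[1]))  # equivalent sort without operator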
Example 11: exercise_brown
def exercise_brown():
    # print the categories of the Brown corpus
    print(brown.categories())
    # print the words in the 'news' category
    print(brown.words(categories="news"))
    # print the text of file 'cg22'
    print(brown.words(fileids=["cg22"]))
    # print sentences
    print(brown.sents(categories=["news", "reviews"]))
    # compare modal verb usage across genres
    # get the text
    news_text = brown.words(categories="news")
    # build a word frequency distribution
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    # define the list of modal verbs
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print(m + ":", fdist[m])
Example 12: print_modals
def print_modals():
    import nltk
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
Example 13: test_sentences
def test_sentences(categories=None):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    if not categories:
        categories = brown.categories()  # use all of the Brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)  # use the last k sentences for testing
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:]
    return sents
Example 14: training_sentences
def training_sentences(use=1.0, categories=None):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    if not categories:
        categories = brown.categories()  # use all of the Brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        n_train = int((1 - TEST_PROPORTION) * use * total) - 1  # use the first n sentences for training
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:n_train]
    return sents
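Both helpers assume a module-level TEST_PROPORTION constant, and the simplify_tags keyword is NLTK 2-era (NLTK 3 replaced it with tagset='universal'). A hedged driver with an assumed split value:

TEST_PROPORTION = 0.1  # assumption; the snippets do not define it

train = training_sentences(use=0.5, categories=['news'])
test = test_sentences(categories=['news'])
print(len(train), len(test))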
Example 15: init_corpus
def init_corpus():
    print('init corpus.. ', end='')
    global categories, category_sentences
    categories = brown.categories()
    half_cat = int(len(categories) * 0.5)
    categories = categories[:half_cat]  # keep only the first half of the category list
    for category in categories:
        sents = brown.tagged_sents(categories=category)
        category_sentences[category] = sents
    print('done')
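The function fills module-level globals; a minimal sketch of the assumed surrounding state:

from nltk.corpus import brown

categories = []
category_sentences = {}  # assumed module-level dict

init_corpus()
print(categories)               # first half of the Brown genre list
print(len(category_sentences))  # one tagged-sentence list per kept category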