This article collects typical usage examples of the nltk.corpus.reuters.categories function in Python. If you are wondering what the categories function does, how to call it, or what real-world uses look like, the hand-picked code examples below should help.
Fifteen code examples of the categories function are shown below, ordered by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python code samples.
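Before the examples, here is a minimal sketch of the underlying NLTK API (it assumes the corpus data has been fetched once, e.g. with nltk.download('reuters')):

from nltk.corpus import reuters

print(len(reuters.fileids()))               # all document ids, prefixed 'training/' or 'test/'
print(len(reuters.categories()))            # all topic labels (90 in the ApteMod split)
print(reuters.categories('training/9865'))  # labels attached to one document
print(reuters.fileids('barley'))            # documents attached to one label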
Example 1: load_data

from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()
    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}  # expects a module-level `labels` list
    return data
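For orientation, a minimal usage sketch of Example 1; it assumes the module-level `labels` list that the function reads through `globals()` has been defined, for instance as the full category list:

from nltk.corpus import reuters

labels = reuters.categories()    # the global that load_data() expects
data = load_data()
print(data['x_train'].shape)     # (num_train_docs, vocabulary_size)
print(data['y_train'].shape)     # (num_train_docs, num_categories)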
Example 2: print_reuters

def print_reuters():
    from nltk.corpus import reuters
    # print(reuters.fileids())
    # print(reuters.categories())
    print(reuters.categories('training/9865'))
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    print(reuters.fileids(['barley', 'corn']))
Example 3: __init__

def __init__(self):
    # assumes: from nltk.corpus import reuters
    # print reuters categories
    print("reuters categories")
    print(reuters.categories())
    # TODO this is probably bad
    print("getting nodes")
    self.nodes = database.get_all_nodes()   # `database` is a project module, not part of nltk
    print("training classifier")
    self.classifier = DocumentClassifier()  # project-specific classifier
Example 4: explore_categories

from nltk.corpus import reuters

def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    # Print every pair of non-overlapping categories whose combined size lies in
    # (min_len, max_len) and whose smaller class makes up more than `percentage` of the pair.
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    if min_len < (l1 + l2) < max_len and min(l1, l2) / float(l1 + l2) > percentage:
                        print(cat, cat2, l1 + l2, min(l1, l2) / float(l1 + l2))
Example 5: get_test_set

def get_test_set():
    # `re` is the Reuters corpus (e.g. `from nltk.corpus import reuters as re`);
    # `distribution` is a project helper that counts items grouped by the key function,
    # and `itemgetter` comes from the operator module.
    single_categories = [(id, re.categories(id)[0])
                         for id in re.fileids()
                         if len(re.categories(id)) == 1]
    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0]
                       for x in single_cat_list
                       if x[1] < 600 and x[1] > 200]
    return [pair for pair in single_categories if pair[1] in used_categories]
Example 6: get_target

def get_target(self):
    # assumes: import numpy as np; from nltk.corpus import reuters
    # cat1 vs. cat2
    if len(self.categories) > 1:
        target = [[cat for cat in reuters.categories(fileid) if cat in self.categories][0]
                  for fileid in self.fileids]
    # cat1 vs. not cat1
    else:
        target = [1 if self.categories[0] in reuters.categories(fileid) else 0
                  for fileid in self.fileids]
    self.classes, target = np.unique(target, return_inverse=True)
    return target
Example 7: create_tfidf_data

from nltk.corpus import reuters

def create_tfidf_data(docs, categories, n=None):
    """
    Create a [(label, [words])] structure by parsing the documents.

    :param docs: list of Reuters document ids
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]
    cat_num = {}; i = 1
    for c in categories:
        cat_num[c] = i
        i += 1
    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))  # project helper mapping a class number to an SVM label
            corpus.append(reuters.raw(d).lower())
    return y, corpus
Example 8: reuters_high_info_words

from nltk.corpus import reuters
from nltk.metrics import BigramAssocMeasures

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    # `high_information_words` is a project helper that keeps only the most
    # informative words per label according to score_fn.
    labeled_words = []
    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))
    return high_information_words(labeled_words, score_fn=score_fn)
Example 9: get_testset_trainset_nltk_reuters

def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    # keep only documents that belong to exactly one category
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    # `f2c` maps a file id to its first category (see Example 11)
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
Example 10: collection_stats

from nltk.corpus import reuters

def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")
    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # Documents in a category
    category_docs = reuters.fileids("acq")
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)
    # Raw document
    print(reuters.raw(document_id))
Example 11: f2c

def f2c(corpus, fileName):
    # return the first category of a file from either the movie_reviews or the reuters corpus
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
Example 12: import_reuters_files

import sys
from nltk.corpus import reuters

def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_reuters_files(ds, silent=True)
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total),
                      file=log)
Example 13: format_data

from nltk.corpus import reuters

def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        # keep only the categories of interest (a list comprehension also works on Python 3)
        current_categories = [c for c in reuters.categories(d) if c in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
Example 14: makeWordSet

from nltk.corpus import brown, reuters

def makeWordSet(args=None):
    '''Build the set of distinct words used in the Brown and Reuters corpora.'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
Example 15: __iter__

def __iter__(self):
    """Generator over raw documents while collecting ordered structured info."""
    # assumes: from nltk.corpus import reuters
    for n, reutersid in enumerate(reuters.fileids()):  # 'training|test/xxxx'
        dataset, _ = reutersid.split('/')              # extract dataset
        if self.dataset in dataset:                    # yield only the requested dataset
            if self.categories is not None:
                top_category = reuters.categories(reutersid)[0]            # grab the first category only
                self.category_mask.append(self.categories[top_category])  # n-th doc -> class id
            yield reuters.raw(reutersid)               # return the raw document
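Finally, a hedged sketch of how a streaming class built around the `__iter__` of Example 15 might be consumed. The class name `ReutersStream` and its constructor are assumptions; only the attributes the method actually uses (`dataset`, `categories`, `category_mask`) come from the original code:

from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer

class ReutersStream:
    # Hypothetical wrapper around the __iter__ method shown in Example 15.
    def __init__(self, dataset='training', categories=None):
        self.dataset = dataset        # 'training' or 'test'
        self.categories = categories  # dict mapping category name -> class id, or None
        self.category_mask = []       # class id of each yielded document, in order

    def __iter__(self):
        for reutersid in reuters.fileids():
            dataset, _ = reutersid.split('/')
            if self.dataset in dataset:
                if self.categories is not None:
                    top_category = reuters.categories(reutersid)[0]
                    self.category_mask.append(self.categories[top_category])
                yield reuters.raw(reutersid)

category_ids = {cat: i for i, cat in enumerate(reuters.categories())}
stream = ReutersStream('training', category_ids)
X = TfidfVectorizer().fit_transform(stream)  # the vectorizer consumes the generator
y = stream.category_mask                     # class ids aligned with the rows of X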