当前位置: 首页>>代码示例>>Python>>正文


Python reuters.categories函数代码示例

本文整理汇总了Python中nltk.corpus.reuters.categories函数的典型用法代码示例。如果您正苦于以下问题:Python categories函数的具体用法?Python categories怎么用?Python categories使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了categories函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: load_data

def load_data(config=None):
    """
    Load the Reuters dataset.

    Parameters
    ----------
    config : dict, optional
        Unused; kept for API compatibility. (Was a mutable default
        ``{}``, replaced by ``None`` to avoid the shared-default pitfall.)

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    # Fit TF-IDF on the training split only; the test split is transformed
    # with the same vocabulary to avoid leakage.
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    # Multi-label targets: one binary indicator column per category.
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}  # module-level `labels` expected
    return data
开发者ID:MartinThoma,项目名称:algorithms,代码行数:32,代码来源:reuters.py

示例2: print_reuters

def print_reuters():
    """Demonstrate the Reuters corpus API: category and fileid lookups."""
    from nltk.corpus import reuters
    # print(reuters.fileids())
    # print(reuters.categories())
    print(reuters.categories('training/9865'))
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    # Fixed typo: 'barely' -> 'barley' (there is no 'barely' category,
    # so the original lookup could not return the intended documents).
    print(reuters.fileids(['barley', 'corn']))
开发者ID:Paul-Lin,项目名称:misc,代码行数:8,代码来源:toturial.py

示例3: __init__

 def __init__(self):
     """Log the Reuters categories, load all graph nodes, and build the
     document classifier."""
     # Print the available Reuters corpus categories (uses print() so the
     # snippet runs on both Python 2 and 3).
     print("reuters categories")
     print(reuters.categories())
     # TODO this is probably bad: loads every node into memory up front
     print("getting nodes")
     self.nodes = database.get_all_nodes()
     print("training classifier")
     self.classifier = DocumentClassifier()
开发者ID:nathanjordan,项目名称:bernstein,代码行数:9,代码来源:classifier.py

示例4: explore_categories

def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    """Print pairs of disjoint Reuters categories that could form a balanced
    two-class dataset.

    A pair is printed when the categories share no documents, their combined
    document count lies strictly between ``min_len`` and ``max_len``, and the
    smaller category holds more than ``percentage`` of the total.

    :param max_len: exclusive upper bound on the combined document count
    :param min_len: exclusive lower bound on the combined document count
    :param percentage: minimum fraction contributed by the smaller category
    """
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:  # visit each unordered pair exactly once
                # Disjoint categories only (no document in both).
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    total = l1 + l2
                    balance = float(min(l1, l2)) / float(total)
                    # Original wrapped the *comparison* in float(), i.e.
                    # float(bool); it only worked via truthiness. Fixed here.
                    if min_len < total < max_len and balance > percentage:
                        print(cat, cat2, total, balance)
开发者ID:verasazonova,项目名称:textsim,代码行数:9,代码来源:reuters.py

示例5: get_test_set

def get_test_set():
    """Return (fileid, category) pairs for single-category documents whose
    category contains between 201 and 599 such documents."""
    single_categories = []
    for doc_id in re.fileids():
        cats = re.categories(doc_id)
        if len(cats) == 1:  # keep only unambiguous documents
            single_categories.append((doc_id, cats[0]))

    # distribution(...) presumably yields (category, count) pairs — counts
    # are filtered to the 201..599 band.
    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [category
                       for category, count in single_cat_list
                       if 200 < count < 600]

    return [pair for pair in single_categories if pair[1] in used_categories]
开发者ID:simone-trubian,项目名称:blog-posts,代码行数:11,代码来源:clustering.py

示例6: get_target

    def get_target(self):
        """Build the integer target vector for ``self.fileids``.

        With several categories configured, each document is labelled by the
        first of its Reuters categories found in ``self.categories``; with a
        single category it becomes a 1/0 membership label. The distinct
        labels are stored in ``self.classes`` and the returned array holds
        indices into it (via ``np.unique(..., return_inverse=True)``).
        """
        if len(self.categories) > 1:
            # cat1 vs. cat2: first matching category wins
            target = []
            for fileid in self.fileids:
                matches = [cat for cat in reuters.categories(fileid)
                           if cat in self.categories]
                target.append(matches[0])
        else:
            # cat1 vs. not cat1
            wanted = self.categories[0]
            target = [1 if wanted in reuters.categories(fileid) else 0
                      for fileid in self.fileids]
        self.classes, target = np.unique(target, return_inverse=True)
        return target
开发者ID:verasazonova,项目名称:textsim,代码行数:12,代码来源:reuters.py

示例7: create_tfidf_data

def create_tfidf_data(docs, categories, n=None):
    """
    Build a ``(labels, corpus)`` pair from Reuters documents.

    :param docs: list of Reuters document ids
    :param categories: category names to keep
    :param n: optional cap on the number of documents used
    :return: (y, corpus) — SVM-encoded labels and lower-cased raw texts
    """
    if n:
        docs = docs[:n]

    # Map each category name to a 1-based numeric id (replaces the manual
    # counter loop with enumerate).
    cat_num = {c: i for i, c in enumerate(categories, start=1)}

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]  # a document's first category decides
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())

    return y, corpus
开发者ID:BugliL,项目名称:SVNexercise,代码行数:25,代码来源:mainSGD.py

示例8: reuters_high_info_words

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
	"""Return the high-information words over all Reuters categories,
	scored with ``score_fn`` (chi-squared by default)."""
	labeled_words = [(category, reuters.words(categories=[category]))
	                 for category in reuters.categories()]
	return high_information_words(labeled_words, score_fn=score_fn)
开发者ID:RomanZacharia,项目名称:python_text_processing_w_nltk2_cookbook,代码行数:7,代码来源:featx.py

示例9: get_testset_trainset_nltk_reuters

def get_testset_trainset_nltk_reuters():
    """Split single-category Reuters documents into train/test sets, keeping
    only categories with more than 20 documents on each side.

    Side effects: fills the module-level ``cat_num_docs`` (category ->
    training-doc count) and ``categories_file_name_dict`` (category -> its
    document ids, training docs first).

    :return: [filtered_trainset, filtered_testset]
    """
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    # Keep only documents that belong to exactly one category.
    clean_files = [f for f in reuters.fileids()
                   if len(reuters.categories(fileids=f)) == 1]
    # startswith instead of fixed-width slicing (f[:5], f[:9]).
    testset = [f for f in clean_files if f.startswith('test/')]
    trainset = [f for f in clean_files if f.startswith('training/')]
    # Sets give O(1) membership tests in the per-category loop below,
    # instead of O(n) list scans per fileid.
    train_lookup = set(trainset)
    test_lookup = set(testset)
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in train_lookup]
        li_te = [f for f in reuters.fileids(categories=cat) if f in test_lookup]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
开发者ID:genf,项目名称:Naive-Bayes-Document-Classifier,代码行数:16,代码来源:Preprocessor.py

示例10: collection_stats

def collection_stats():
	"""Print basic statistics about the Reuters collection: document and
	category counts, plus a sample document's words and raw text."""
	# List of documents
	documents = reuters.fileids()
	print("{} documents".format(len(documents)))

	train_docs = [doc for doc in documents if doc.startswith("train")]
	print("{} total train documents".format(len(train_docs)))

	test_docs = [doc for doc in documents if doc.startswith("test")]
	print("{} total test documents".format(len(test_docs)))

	# List of categories
	categories = reuters.categories()
	print("{} categories".format(len(categories)))

	# Documents in a category
	category_docs = reuters.fileids("acq")

	# Words for a document
	document_id = category_docs[0]
	document_words = reuters.words(category_docs[0])
	print(document_words)

	# Raw document
	print(reuters.raw(document_id))

示例11: f2c

def f2c(corpus, fileName):
    """Return the first category of ``fileName``: from movie_reviews when
    ``corpus`` is 'mr', otherwise from reuters."""
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as corpus_reader
    else:
        from nltk.corpus import reuters as corpus_reader
    return corpus_reader.categories(fileids=fileName)[0]
开发者ID:genf,项目名称:Naive-Bayes-Document-Classifier,代码行数:7,代码来源:Filename_To_Cat.py

示例12: import_reuters_files

def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`.

    Every file is inserted as a handle tagged with its categories
    ("@category"), linked under a "#reuters" root; each sentence of the
    file is inserted lower-cased and linked to the file's handle. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_reuters_files(ds, silent=True)

    :param ds: nathan.core.Dataspace the corpus is inserted into
    :param silent: when False, progress is reported every 10 files
    :param log: stream that progress messages are written to
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        # one "@category" tag per Reuters category of this file
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            # normalize: sentences are stored lower-cased
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), 
                    file=log)

示例13: format_data

def format_data(docs, all_categories):
    """Return ``(y, corpus)`` for the documents whose categories intersect
    ``all_categories``: the label is the first matching category, the corpus
    entry is the lower-cased raw text.

    :param docs: list of Reuters document ids
    :param all_categories: category names to keep
    :return: (y, corpus)
    """
    y = []
    corpus = []
    for d in docs:
        # A list comprehension instead of filter(): on Python 3, filter()
        # returns a lazy iterator that is always truthy and not indexable,
        # so both the `if` test and `[0]` below would misbehave.
        current_categories = [c for c in reuters.categories(d)
                              if c in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
开发者ID:BugliL,项目名称:SVNexercise,代码行数:8,代码来源:main.py

示例14: makeWordSet

def makeWordSet(args=None):
    '''Collect the set of distinct words used across every Brown and
    Reuters category.'''
    word_set = set()
    for category in brown.categories():
        word_set |= set(brown.words(categories=category))
    for category in reuters.categories():
        word_set |= set(reuters.words(categories=category))
    return word_set
开发者ID:divanshugarg,项目名称:Kaggle-Projects-Stuff,代码行数:8,代码来源:wordCheck.py

示例15: __iter__

 def __iter__(self):
     """ Generator of docs while collecting ordered structured info.

     Yields the raw text of each fileid whose split matches self.dataset;
     when self.categories is set, also appends each yielded doc's class id
     (keyed by its first category) to self.category_mask, in yield order.
     """
     # Removed the unused enumerate() counter the original carried.
     for reutersid in reuters.fileids():          # 'training|test/xxxx'
         dataset, _ = reutersid.split('/')        # extract dataset
         if self.dataset in dataset:              # yield only filtered dataset
             if self.categories is not None:
                 top_category = reuters.categories(reutersid)[0]            # grab first category only
                 self.category_mask.append(self.categories[top_category])   # n-th doc -> classid
             yield reuters.raw(reutersid)         # return raw document
开发者ID:lum4chi,项目名称:IR,代码行数:9,代码来源:reuterscorpus.py


注:本文中的nltk.corpus.reuters.categories函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。