This article collects typical usage examples of the Python function nltk.corpus.reuters.fileids. If you are unsure exactly what fileids does or how to use it, the selected code examples below may help.
Fifteen code examples of the fileids function are shown, ordered by popularity by default.
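Before turning to the examples, here is a minimal sketch of what reuters.fileids itself returns: a list of document identifiers such as 'training/9865' or 'test/14826', optionally filtered by category (the 'barley' category below is only an illustration). It assumes the corpus has been fetched with nltk.download('reuters').

from nltk.corpus import reuters

all_ids = reuters.fileids()              # every document id, e.g. 'training/9865' or 'test/14826'
barley_ids = reuters.fileids('barley')   # only documents tagged with the 'barley' category
print(len(all_ids), len(barley_ids))
print(all_ids[:3])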
Example 1: tfidf
import math
from nltk.corpus import reuters

def tfidf(word, wordCount):
    docCount = len(reuters.fileids())
    wordCountCorpus = 0
    count = 0
    for doc in reuters.fileids():
        count = count + 1
        present = 0
        for word2 in reuters.words(doc):
            if word.lower() == word2.lower():
                present = 1
                break
        if present == 1:
            wordCountCorpus = wordCountCorpus + 1
        # Only the first 200 documents are scanned when estimating document frequency
        if count == 200:
            break
    tf = wordCount
    idf = math.log(docCount / (1 + wordCountCorpus))
    tfidf = tf * idf
    return tfidf
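Building on the definition above, a minimal call might look like the following; the word 'oil' and the choice of the first file id are purely illustrative, and wordCount is taken to be the raw term frequency of the word in that single document:

doc_id = reuters.fileids()[0]
word = "oil"
word_count = sum(1 for w in reuters.words(doc_id) if w.lower() == word)  # term frequency in this document
print(tfidf(word, word_count))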
Example 2: import_reuters_files
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total),
                      file=log)
Example 3: collection_stats
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")
    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # Documents in a category
    category_docs = reuters.fileids("acq")
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)
    # Raw document
    print(reuters.raw(document_id))
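For orientation, on the standard NLTK distribution of the Reuters corpus (the ApteMod split) the counts printed by this function should come out roughly as below; the last two lines depend on whichever document comes first in the 'acq' category:

collection_stats()
# 10788 documents
# 7769 total train documents
# 3019 total test documents
# 90 categories
# [...words of the first 'acq' document...]
# ...raw text of that document...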
Example 4: __init__
def __init__(self, categories=None, lower=True):
    if categories is None or len(categories) == 1:
        self.fileids = reuters.fileids()
    else:
        self.fileids = reuters.fileids(categories)
    self.categories = categories
    self.lower = lower
Example 5: print_reuters
def print_reuters():
    from nltk.corpus import reuters
    # print reuters.fileids()
    # print reuters.categories()
    print reuters.categories('training/9865')
    print reuters.categories(['training/9865', 'training/9880'])
    print reuters.fileids('barley')
    print reuters.fileids(['barley', 'corn'])
Example 6: explore_categories
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    # Print pairs of disjoint categories whose combined size and balance fall within the given bounds
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    if ((l1 + l2) > min_len) and ((l1 + l2) < max_len) and (float(min(l1, l2)) / float(l1 + l2) > percentage):
                        print cat, cat2, l1 + l2, float(min(l1, l2)) / float(l1 + l2)
Example 7: generateTextList
def generateTextList(category, size, normalize=False):
    i = 0
    text = []
    while i < size and i < len(reuters.fileids(category)):
        if not normalize:
            text.insert(i, reuters.words(reuters.fileids(category)[i]))
        else:
            text.insert(i, getNormalizedText(reuters.words(reuters.fileids(category)[i])))
        i += 1
    return text
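A quick illustrative call with normalize left at False, so the getNormalizedText helper (not shown in the excerpt) is not needed; the 'grain' category and the size of 3 are arbitrary choices:

texts = generateTextList('grain', 3)
print(len(texts))        # 3
print(texts[0][:10])     # first ten tokens of the first 'grain' document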
Example 8: create_token_stream
def create_token_stream():
    """
    A function that creates a token stream based on the nltk reuters corpus.
    A token stream is a list of (termID, docID) tuples.
    """
    token_stream = []
    docID = 1
    termID = 1
    print "Creating token stream..."
    for fileid in reuters.fileids('barley'):
        for term in reuters.words(fileid):
            # Strip punctuation from the word and make lower case
            term = remove_punct_from_word(term).lower()
            # Check to make sure word is not "" and term is not a number
            if len(term) > 0 and not is_number(term):
                stemmed_term = stem().stem_word(term)
                if stemmed_term not in terms:
                    terms[stemmed_term] = termID
                    termID += 1
                new_token = (terms[stemmed_term], docID)
                token_stream.append(new_token)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1
    return token_stream
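Example 8 relies on module-level terms and docs dictionaries and on helpers (remove_punct_from_word, is_number, stem) that are not part of the excerpt. The stand-ins below are assumptions sketched for completeness, not the original code; in particular, the stem_word call suggests an older NLTK stemmer API, so a thin adapter around PorterStemmer is used:

import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import reuters

terms = {}   # term -> termID, filled in by create_token_stream
docs = {}    # docID -> fileid, filled in by create_token_stream

def remove_punct_from_word(word):
    # Assumed behaviour: drop punctuation characters from a single token
    return ''.join(ch for ch in word if ch not in string.punctuation)

def is_number(s):
    # Assumed behaviour: treat tokens such as '3', '4.5' or '1,000' as numbers
    try:
        float(s.replace(',', ''))
        return True
    except ValueError:
        return False

class _Stemmer(PorterStemmer):
    # The snippet calls stem_word(), which newer NLTK releases no longer provide;
    # this adapter maps it onto the current stem() method.
    def stem_word(self, word):
        return self.stem(word)

def stem():
    return _Stemmer()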
Example 9: load_data
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
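Note that load_data reads a module-level labels variable via globals()["labels"], which the excerpt does not define. Assuming it is simply the list of corpus categories, and with the scikit-learn/NLTK imports the function expects, a call might look like this:

from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

labels = reuters.categories()   # assumption: the module exposes its label list this way
data = load_data()
print(data['x_train'].shape, data['y_train'].shape)   # (n_train_docs, n_features) and (n_train_docs, n_labels)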
Example 10: run
def run():
    """Import the Reuters Corpus which contains 10,788 news articles"""
    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]
    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]
    # Preprocess documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]
    # Remove single-occurrence words
    docs = remove_infrequent_words(tokenized_docs)
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print topic
Example 11: get_testset_trainset_nltk_reuters
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
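The f2c helper and the two global dictionaries are not shown in the excerpt. Since clean_files keeps only single-category documents, f2c presumably maps a file id to that single category; the stand-in below is an assumption consistent with that reading, not the original code:

from nltk.corpus import reuters

categories_file_name_dict = {}
cat_num_docs = {}

def f2c(corpus, fileid):
    # Assumed behaviour: return the (single) Reuters category of a document.
    return reuters.categories(fileids=fileid)[0]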
Example 12: create_dictionary_index_reuters
def create_dictionary_index_reuters():
    """
    A function that creates a dictionary with terms as keys
    and postings lists as values for the nltk reuters corpus.
    """
    idx = {}
    docs = {}
    docID = 1
    for fileid in reuters.fileids():
        for word in reuters.words(fileid):
            if not is_number(word):
                # Strip punctuation from the word and make lower case
                word = remove_punct_from_word(word).lower()
                # Check to make sure word is not ""
                if len(word) > 0:
                    # Check to see if word already is in index
                    if word in idx:
                        # Check to see if docID is not already present for word
                        if docID not in idx[word]:
                            idx[word].append(docID)
                    # Otherwise add word and docID in array to index
                    else:
                        idx[word] = []
                        idx[word].append(docID)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1
    size = 0
    for k in idx.iterkeys():
        size += os.sys.getsizeof(k)
        size += os.sys.getsizeof(idx[k])
    # print "size of original dictionary is:", size
    return idx
Example 13: get_reuters_ids_cnt
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model.

    The reuters, stopwords, and english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum vocabulary size for the returned corpus
    remove_top_n: int
        remove the top n most frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of lists of word ids for each document
    doc_cnt: list
        list of lists of word counts for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in xrange(num_doc)]
    return get_ids_cnt(corpus, max_voca, remove_top_n)
Example 14: get_reuters_cnt_ids
def get_reuters_cnt_ids(num_doc=100, max_voca=10000):
    '''To get test data for training a model.
    The reuters corpus should be installed in nltk_data: nltk.download()
    '''
    file_list = reuters.fileids()
    docs = list()
    freq = Counter()
    for i in range(num_doc):
        doc = reuters.words(file_list[i])
        freq.update(doc)
        docs.append(doc)
    voca = [key for key, val in freq.most_common(max_voca)]
    voca_dic = dict()
    voca_list = list()
    for word in voca:
        voca_dic[word] = len(voca_dic)
        voca_list.append(word)
    doc_ids = list()
    doc_cnt = list()
    for doc in docs:
        words = set(doc)
        ids = np.array([int(voca_dic[word]) for word in words if word in voca_dic])
        cnt = np.array([int(doc.count(word)) for word in words if word in voca_dic])
        doc_ids.append(ids)
        doc_cnt.append(cnt)
    return np.array(voca_list), doc_ids, doc_cnt
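Assuming Counter comes from collections and numpy is imported as np, as the snippet implies, a small illustrative call:

from collections import Counter
import numpy as np
from nltk.corpus import reuters

voca_list, doc_ids, doc_cnt = get_reuters_cnt_ids(num_doc=10, max_voca=1000)
print(len(voca_list), len(doc_ids))      # vocabulary size and number of documents
print(doc_ids[0][:5], doc_cnt[0][:5])    # first few word ids and counts for document 0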
Example 15: preProcess
def preProcess():
    print 'PreProcess Reuters Corpus'
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()
    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)
    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print "Normalised %d documents" % (docs)
                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")
                    except UnicodeDecodeError:
                        bad += 1
    print "Normalised %d documents" % (docs)
    print "Skipped %d bad documents" % (bad)
    print 'Finished building train file ' + Paths.texts_clean
    end_time = time.time()
    print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
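This last snippet depends on a Paths configuration object and a Tokenizer class that the excerpt does not include. The stand-ins below are assumptions only, meant to show the minimal shape such helpers would need; the output locations and the tokenization strategy are made up for illustration:

import os
from nltk.tokenize import word_tokenize   # requires nltk.download('punkt')

class Paths:
    # Assumed output layout; the original Paths class is not shown.
    base = 'data/reuters/'
    text_index = os.path.join(base, 'fileids.txt')
    texts_clean = os.path.join(base, 'texts_clean.txt')
    reuter_test = os.path.join(base, 'texts_test.txt')

class Tokenizer:
    # Assumed stand-in: lower-cased word tokenization via NLTK.
    def tokenize(self, text):
        return [t.lower() for t in word_tokenize(text)]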