This article collects typical usage examples of the Python function nltk.corpus.reuters.fileids. If you are unsure exactly what fileids does or how to use it, the selected code examples below may help.
Fifteen code examples of the fileids function are shown, ordered by popularity by default.
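Before turning to the examples, here is a minimal sketch of what reuters.fileids itself returns: a list of document identifiers such as 'training/9865' or 'test/14826', optionally filtered by category (the 'barley' category below is only an illustration). It assumes the corpus has been fetched with nltk.download('reuters').

from nltk.corpus import reuters

all_ids = reuters.fileids()              # every document id, e.g. 'training/9865' or 'test/14826'
barley_ids = reuters.fileids('barley')   # only documents tagged with the 'barley' category
print(len(all_ids), len(barley_ids))
print(all_ids[:3])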
Example 1: tfidf
import math
from nltk.corpus import reuters

def tfidf(word, wordCount):
    docCount = len(reuters.fileids())
    wordCountCorpus = 0
    count = 0
    for doc in reuters.fileids():
        count = count + 1
        present = 0
        for word2 in reuters.words(doc):
            if word.lower() == word2.lower():
                present = 1
                break
        if present == 1:
            wordCountCorpus = wordCountCorpus + 1
        # Only the first 200 documents are scanned when estimating document frequency
        if count == 200:
            break
    tf = wordCount
    idf = math.log(docCount / (1 + wordCountCorpus))
    tfidf = tf * idf
    return tfidf
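Building on the definition above, a minimal call might look like the following; the word 'oil' and the choice of the first file id are purely illustrative, and wordCount is taken to be the raw term frequency of the word in that single document:

doc_id = reuters.fileids()[0]
word = "oil"
word_count = sum(1 for w in reuters.words(doc_id) if w.lower() == word)  # term frequency in this document
print(tfidf(word, word_count))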
Example 2: import_reuters_files
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total),
                      file=log)
Example 3: collection_stats
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")
    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # Documents in a category
    category_docs = reuters.fileids("acq")
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)
    # Raw document
    print(reuters.raw(document_id))
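For orientation, on the standard NLTK distribution of the Reuters corpus (the ApteMod split) the counts printed by this function should come out roughly as below; the last two lines depend on whichever document comes first in the 'acq' category:

collection_stats()
# 10788 documents
# 7769 total train documents
# 3019 total test documents
# 90 categories
# [...words of the first 'acq' document...]
# ...raw text of that document...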
Example 4: __init__
def __init__(self, categories=None, lower=True):
    if categories is None or len(categories) == 1:
        self.fileids = reuters.fileids()
    else:
        self.fileids = reuters.fileids(categories)
    self.categories = categories
    self.lower = lower
Example 5: print_reuters
def print_reuters():
    from nltk.corpus import reuters
    # print reuters.fileids()
    # print reuters.categories()
    print reuters.categories('training/9865')
    print reuters.categories(['training/9865', 'training/9880'])
    print reuters.fileids('barley')
    print reuters.fileids(['barley', 'corn'])
Example 6: explore_categories
def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    # Print pairs of disjoint categories whose combined size and balance fall within the given bounds
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    if ((l1 + l2) > min_len) and ((l1 + l2) < max_len) and (float(min(l1, l2)) / float(l1 + l2) > percentage):
                        print cat, cat2, l1 + l2, float(min(l1, l2)) / float(l1 + l2)
Example 7: generateTextList
def generateTextList(category, size, normalize=False):
    i = 0
    text = []
    while i < size and i < len(reuters.fileids(category)):
        if not normalize:
            text.insert(i, reuters.words(reuters.fileids(category)[i]))
        else:
            text.insert(i, getNormalizedText(reuters.words(reuters.fileids(category)[i])))
        i += 1
    return text
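A quick illustrative call with normalize left at False, so the getNormalizedText helper (not shown in the excerpt) is not needed; the 'grain' category and the size of 3 are arbitrary choices:

texts = generateTextList('grain', 3)
print(len(texts))        # 3
print(texts[0][:10])     # first ten tokens of the first 'grain' document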
Example 8: create_token_stream
def create_token_stream():
    """
    A function that creates a token stream based on the nltk reuters corpus.
    A token stream is a list of (termID, docID) tuples.
    """
    token_stream = []
    docID = 1
    termID = 1
    print "Creating token stream..."
    for fileid in reuters.fileids('barley'):
        for term in reuters.words(fileid):
            # Strip punctuation from the word and make lower case
            term = remove_punct_from_word(term).lower()
            # Check to make sure word is not "" and term is not a number
            if len(term) > 0 and not is_number(term):
                stemmed_term = stem().stem_word(term)
                if stemmed_term not in terms:
                    terms[stemmed_term] = termID
                    termID += 1
                new_token = (terms[stemmed_term], docID)
                token_stream.append(new_token)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1
    return token_stream
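Example 8 relies on module-level terms and docs dictionaries and on helpers (remove_punct_from_word, is_number, stem) that are not part of the excerpt. The stand-ins below are assumptions sketched for completeness, not the original code; in particular, the stem_word call suggests an older NLTK stemmer API, so a thin adapter around PorterStemmer is used:

import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import reuters

terms = {}   # term -> termID, filled in by create_token_stream
docs = {}    # docID -> fileid, filled in by create_token_stream

def remove_punct_from_word(word):
    # Assumed behaviour: drop punctuation characters from a single token
    return ''.join(ch for ch in word if ch not in string.punctuation)

def is_number(s):
    # Assumed behaviour: treat tokens such as '3', '4.5' or '1,000' as numbers
    try:
        float(s.replace(',', ''))
        return True
    except ValueError:
        return False

class _Stemmer(PorterStemmer):
    # The snippet calls stem_word(), which newer NLTK releases no longer provide;
    # this adapter maps it onto the current stem() method.
    def stem_word(self, word):
        return self.stem(word)

def stem():
    return _Stemmer()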
Example 9: load_data
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
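Note that load_data reads a module-level labels variable via globals()["labels"], which the excerpt does not define. Assuming it is simply the list of corpus categories, and with the scikit-learn/NLTK imports the function expects, a call might look like this:

from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

labels = reuters.categories()   # assumption: the module exposes its label list this way
data = load_data()
print(data['x_train'].shape, data['y_train'].shape)   # (n_train_docs, n_features) and (n_train_docs, n_labels)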
Example 10: run
def run():
    """Import the Reuters Corpus which contains 10,788 news articles"""
    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]
    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]
    # Preprocess documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]
    # Remove single-occurrence words
    docs = remove_infrequent_words(tokenized_docs)
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print topic
Example 11: get_testset_trainset_nltk_reuters
def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
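The f2c helper and the two global dictionaries are not shown in the excerpt. Since clean_files keeps only single-category documents, f2c presumably maps a file id to that single category; the stand-in below is an assumption consistent with that reading, not the original code:

from nltk.corpus import reuters

categories_file_name_dict = {}
cat_num_docs = {}

def f2c(corpus, fileid):
    # Assumed behaviour: return the (single) Reuters category of a document.
    return reuters.categories(fileids=fileid)[0]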
Example 12: create_dictionary_index_reuters
def create_dictionary_index_reuters():
    """
    A function that creates a dictionary with terms as keys
    and postings lists as values for the nltk reuters corpus.
    """
    idx = {}
    docs = {}
    docID = 1
    for fileid in reuters.fileids():
        for word in reuters.words(fileid):
            if not is_number(word):
                # Strip punctuation from the word and make lower case
                word = remove_punct_from_word(word).lower()
                # Check to make sure word is not ""
                if len(word) > 0:
                    # Check to see if word already is in index
                    if word in idx:
                        # Check to see if docID is not already present for word
                        if docID not in idx[word]:
                            idx[word].append(docID)
                    # Otherwise add word and docID in array to index
                    else:
                        idx[word] = []
                        idx[word].append(docID)
        # Add to docs dictionary mapping docID to file
        docs[docID] = fileid
        docID += 1
    size = 0
    for k in idx.iterkeys():
        size += os.sys.getsizeof(k)
        size += os.sys.getsizeof(idx[k])
    # print "size of original dictionary is:", size
    return idx
Example 13: get_reuters_ids_cnt
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model.

    The reuters, stopwords, and english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum vocabulary size for the returned corpus
    remove_top_n: int
        remove the top n most frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of lists of word ids for each document
    doc_cnt: list
        list of lists of word counts for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in xrange(num_doc)]
    return get_ids_cnt(corpus, max_voca, remove_top_n)
Example 14: get_reuters_cnt_ids
def get_reuters_cnt_ids(num_doc=100, max_voca=10000):
    '''To get test data for training a model.
    The reuters corpus should be installed in nltk_data: nltk.download()
    '''
    file_list = reuters.fileids()
    docs = list()
    freq = Counter()
    for i in range(num_doc):
        doc = reuters.words(file_list[i])
        freq.update(doc)
        docs.append(doc)
    voca = [key for key, val in freq.most_common(max_voca)]
    voca_dic = dict()
    voca_list = list()
    for word in voca:
        voca_dic[word] = len(voca_dic)
        voca_list.append(word)
    doc_ids = list()
    doc_cnt = list()
    for doc in docs:
        words = set(doc)
        ids = np.array([int(voca_dic[word]) for word in words if word in voca_dic])
        cnt = np.array([int(doc.count(word)) for word in words if word in voca_dic])
        doc_ids.append(ids)
        doc_cnt.append(cnt)
    return np.array(voca_list), doc_ids, doc_cnt
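Assuming Counter comes from collections and numpy is imported as np, as the snippet implies, a small illustrative call:

from collections import Counter
import numpy as np
from nltk.corpus import reuters

voca_list, doc_ids, doc_cnt = get_reuters_cnt_ids(num_doc=10, max_voca=1000)
print(len(voca_list), len(doc_ids))      # vocabulary size and number of documents
print(doc_ids[0][:5], doc_cnt[0][:5])    # first few word ids and counts for document 0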
Example 15: preProcess
def preProcess():
    print 'PreProcess Reuters Corpus'
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()
    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)
    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print "Normalised %d documents" % (docs)
                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")
                    except UnicodeDecodeError:
                        bad += 1
    print "Normalised %d documents" % (docs)
    print "Skipped %d bad documents" % (bad)
    print 'Finished building train file ' + Paths.texts_clean
    end_time = time.time()
    print '(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time)
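This last snippet depends on a Paths configuration object and a Tokenizer class that the excerpt does not include. The stand-ins below are assumptions only, meant to show the minimal shape such helpers would need; the output locations and the tokenization strategy are made up for illustration:

import os
from nltk.tokenize import word_tokenize   # requires nltk.download('punkt')

class Paths:
    # Assumed output layout; the original Paths class is not shown.
    base = 'data/reuters/'
    text_index = os.path.join(base, 'fileids.txt')
    texts_clean = os.path.join(base, 'texts_clean.txt')
    reuter_test = os.path.join(base, 'texts_test.txt')

class Tokenizer:
    # Assumed stand-in: lower-cased word tokenization via NLTK.
    def tokenize(self, text):
        return [t.lower() for t in word_tokenize(text)]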