This article collects typical usage examples of the Python method nltk.corpus.reuters.fileids. If you have been wondering what reuters.fileids does and how to use it, the curated code examples below may help. You can also explore the other methods exposed by the nltk.corpus.reuters corpus reader.
Three code examples of reuters.fileids are shown below, sorted by popularity by default.
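As a quick orientation before the examples: reuters.fileids() returns the document IDs of the Reuters corpus bundled with NLTK. Below is a minimal sketch, assuming the corpus data has already been downloaded; the printed values depend on your local NLTK data.

from nltk.corpus import reuters
# one-time setup if the corpus is missing:
# import nltk; nltk.download('reuters')

ids = reuters.fileids()
print(len(ids))    # number of documents in the corpus
print(ids[:2])     # IDs look like 'test/14826' or 'training/1'
# fileids can also be filtered by category:
print(reuters.fileids('money-fx')[:2])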
Example 1: get_corpus_text
# Required import: from nltk.corpus import reuters
# Alternatively: from nltk.corpus.reuters import fileids
from nltk.corpus import reuters

def get_corpus_text():
    '''
    Return the raw text of the Reuters corpus, one string per document.
    '''
    return [" ".join(reuters.words(fid)) for fid in reuters.fileids()]
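A possible invocation of Example 1 (a sketch; the exact counts depend on the installed corpus):

docs = get_corpus_text()
print(len(docs))      # one string per fileid
print(docs[0][:80])   # first 80 characters of the first document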
Example 2: gen_financial_top_words
# Required import: from nltk.corpus import reuters
# Alternatively: from nltk.corpus.reuters import fileids
import os
import json
import operator

import nltk
from nltk.corpus import reuters

def gen_financial_top_words(maxN=40000):  # generate vocabulary based on Reuters news
    if os.path.isfile('./input/topWords.json'):
        return  # vocabulary file already exists
    stopwords = set(nltk.corpus.stopwords.words('english'))  # set lookup beats rebuilding the list per word
    wordCnt = {}
    for field in reuters.fileids():
        for word in reuters.words(field):
            word = unify_word(word)  # project-specific normalization helper
            if word in stopwords:
                continue
            wordCnt[word] = wordCnt.get(word, 0) + 1
    # keep only the maxN most frequent words
    sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
    wordCnt = dict(sorted_wordCnt[:maxN])
    with open('./input/topWords.json', 'w') as fout:
        json.dump(wordCnt, fout, indent=4)
Author: WayneDW | Project: Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction | Lines: 17 | Source file: preprocessing.py
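A hypothetical usage sketch for Example 2; the ./input directory and the unify_word helper come from the original project and are assumed to exist:

import json

gen_financial_top_words(maxN=40000)  # writes ./input/topWords.json on the first run
with open('./input/topWords.json') as fin:
    topWords = json.load(fin)
print(len(topWords))  # at most 40000 word-count entries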
Example 3: get_reuters_data
# Required import: from nltk.corpus import reuters
# Alternatively: from nltk.corpus.reuters import fileids
import operator

from nltk.corpus import reuters

def get_reuters_data(n_vocab):
    # return variables
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    word_idx_count = {0: float('inf'), 1: float('inf')}  # infinite counts keep START/END from being pruned
    tag = 0
    for field in reuters.fileids():
        sentence = reuters.words(field)
        tokens = [unify_word(t) for t in sentence]  # project-specific normalization helper
        for t in tokens:
            if t not in word2idx:
                word2idx[t] = current_idx
                idx2word.append(t)
                current_idx += 1
            idx = word2idx[t]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
        sentence_by_idx = [word2idx[t] for t in tokens]
        sentences.append(sentence_by_idx)
        tag += 1
        print(tag)  # progress indicator, one line per document
    # restrict vocab size to the n_vocab most frequent words
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'UNKNOWN' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx
    # map old indices to new indices; pruned words become UNKNOWN
    sentences_small = []
    for sentence in sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map.get(idx, unknown) for idx in sentence]
            sentences_small.append(new_sentence)
    return sentences_small, word2idx_small
Author: WayneDW | Project: Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction | Lines: 48 | Source file: word_embedding.py
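And a sketch of how Example 3 might be called (it prints progress for every document, so expect verbose output):

sentences, word2idx = get_reuters_data(n_vocab=2000)
print(len(word2idx))       # n_vocab entries plus 'UNKNOWN'
print(sentences[0][:10])   # first ten word indices of the first document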