本文整理汇总了Python中nltk.corpus.reuters.fileids方法的典型用法代码示例。如果您正苦于以下问题:Python reuters.fileids方法的具体用法?Python reuters.fileids怎么用?Python reuters.fileids使用的例子?那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.reuters的用法示例。
在下文中一共展示了reuters.fileids方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_corpus_text
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import fileids [as 别名]
def get_corpus_text():
    """Return the Reuters corpus as a list of strings, one per file id.

    Each string is built by joining the tokens that ``reuters.words``
    yields for that file with single spaces, so it is a re-joined token
    stream rather than the file's original text.
    """
    documents = []
    for file_id in reuters.fileids():
        tokens = reuters.words(file_id)
        documents.append(" ".join(tokens))
    return documents
示例2: gen_financial_top_words
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import fileids [as 别名]
def gen_financial_top_words(maxN=40000):
    """Generate ./input/topWords.json from Reuters news word frequencies.

    If the output file already exists the function returns immediately.
    Otherwise every token of every Reuters file is normalized with
    ``unify_word``, English stop words are skipped, the remaining words are
    counted, and the ``maxN`` most frequent ones are written as a JSON
    object mapping word -> count.

    Args:
        maxN: Maximum number of words to keep in the output file.

    Returns:
        None. The result is the file written to disk.
    """
    out_path = './input/topWords.json'
    if os.path.isfile(out_path):
        return  # already generated; nothing to do

    # Fetch the stop words once and keep them in a set: asking the corpus
    # reader for the list again on every token, and scanning it linearly,
    # repeats the same work for each word in the corpus.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))

    word_cnt = {}
    for file_id in reuters.fileids():
        for word in reuters.words(file_id):
            word = unify_word(word)
            if word in stop_words:
                continue
            word_cnt[word] = word_cnt.get(word, 0) + 1

    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    ranked = sorted(word_cnt.items(), key=operator.itemgetter(1), reverse=True)
    top_words = dict(ranked[:maxN])

    with open(out_path, 'w') as fout:
        json.dump(top_words, fout, indent=4)
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:17,代码来源:preprocessing.py
示例3: get_reuters_data
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import fileids [as 别名]
def get_reuters_data(n_vocab):
    """Encode the Reuters corpus as index sequences over a capped vocabulary.

    Every Reuters file is treated as one "sentence": its tokens are
    normalized with ``unify_word`` and mapped to integer ids. The
    ``n_vocab`` most frequent ids are then renumbered 0..k-1 in descending
    frequency order, and every other id is replaced by a single UNKNOWN id.

    'START' and 'END' are pre-registered with an infinite count, so they
    always sort first and count toward ``n_vocab``.

    Progress is printed as a side effect: the running file number while
    reading, then one "word count" line per kept vocabulary entry.

    Args:
        n_vocab: Number of vocabulary entries to keep (before UNKNOWN is
            appended).

    Returns:
        A tuple ``(sentences_small, word2idx_small)`` where
        ``sentences_small`` is a list of lists of new word ids (files with
        one token or fewer are dropped) and ``word2idx_small`` maps each
        kept word, plus 'UNKNOWN', to its new id.
    """
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    word_idx_count = {0: float('inf'), 1: float('inf')}

    for file_number, file_id in enumerate(reuters.fileids(), 1):
        sentence_by_idx = []
        for token in reuters.words(file_id):
            token = unify_word(token)
            if token not in word2idx:
                # The next free id is always the current vocabulary size.
                word2idx[token] = len(idx2word)
                idx2word.append(token)
            idx = word2idx[token]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx.append(idx)
        sentences.append(sentence_by_idx)
        print(file_number)

    # Restrict vocab size: keep the n_vocab most frequent ids.
    sorted_word_idx_count = sorted(
        word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    idx_new_idx_map = {}
    for new_idx, (idx, count) in enumerate(sorted_word_idx_count[:n_vocab]):
        word = idx2word[idx]
        # Pre-format into one string so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3); the
        # original `print word, count` is a SyntaxError on Python 3.
        print('%s %s' % (word, count))
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx

    # Let 'UNKNOWN' be the last token, one past the kept ids.
    unknown = len(idx_new_idx_map)
    word2idx_small['UNKNOWN'] = unknown

    # Map old ids to new ids; anything outside the kept vocab is UNKNOWN.
    sentences_small = [
        [idx_new_idx_map.get(idx, unknown) for idx in sentence]
        for sentence in sentences
        if len(sentence) > 1
    ]
    return sentences_small, word2idx_small
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:48,代码来源:word_embedding.py