本文整理汇总了 Python 中 nltk.corpus.reuters.words 方法的典型用法代码示例。如果您正苦于以下问题：Python reuters.words 方法的具体用法？Python reuters.words 怎么用？Python reuters.words 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.corpus.reuters 的用法示例。
下文共展示了 reuters.words 方法的 11 个代码示例，这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞，您的评价将有助于系统推荐出更好的 Python 代码示例。
示例1: word_ngrams_concat_tf5000_l2_w_holdout
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_ngrams_concat_tf5000_l2_w_holdout(headlines, bodies):
    """
    Bag-of-words features: headline vector and body vector placed side by side.

    A vocabulary is requested from ``create_word_ngram_vocabulary`` (unigram
    range, ``max_features=5000``, ``include_holdout=True``). Headlines and
    bodies are then vectorized separately against that fixed vocabulary
    (``use_idf=False``, l2 norm, English stop words) and the two dense
    matrices are joined column-wise.

    :param headlines: headline texts, passed to the vectorizer as-is
    :param bodies: body texts, passed to the vectorizer as-is
    :return: dense array with the headline columns followed by the body columns
    """
    def vectorize(texts, vocab):
        # A fresh vectorizer per text collection, pinned to the shared vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts).toarray()

    vocabulary = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000,
                                              lemmatize=False, use_idf=False,
                                              term_freq=True, norm='l2',
                                              include_holdout=True)
    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)
    return np.concatenate([head_matrix, body_matrix], axis=1)
示例2: get_corpus_text
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def get_corpus_text():
    '''
    Collect the Reuters corpus as a list with one string per file id.

    Each string is the sequence returned by ``reuters.words`` for that file,
    joined with single spaces.
    '''
    documents = []
    for file_id in reuters.fileids():
        tokens = reuters.words(file_id)
        documents.append(" ".join(tokens))
    return documents
示例3: latent_dirichlet_allocation_25
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_25(headlines, bodies):
    """
    Thin wrapper: run ``topic_models.latent_dirichlet_allocation_cos`` with 25 topics.

    The call disables idf weighting, enables term frequency, and passes
    ``include_holdout=False`` and ``incl_unlbled_test=False``.

    According to the original notes, the helper is an sklearn LDA built on the
    5000 most important words and returns cosine distances between the topic
    representations of headline and body; that logic lives in ``topic_models``,
    not in this function.

    Links:
    https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text
    """
    settings = dict(
        n_topics=25,
        include_holdout=False,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=False,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **settings)
示例4: latent_dirichlet_allocation_300
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_300(headlines, bodies):
    """
    Thin wrapper: run ``topic_models.latent_dirichlet_allocation_cos`` with 300 topics.

    The call disables idf weighting, enables term frequency, and passes
    ``include_holdout=False`` and ``incl_unlbled_test=False``.

    According to the original notes, the helper is an sklearn LDA built on the
    5000 most important words and returns cosine distances between the topic
    representations of headline and body; that logic lives in ``topic_models``,
    not in this function.

    Links:
    https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text
    """
    settings = dict(
        n_topics=300,
        include_holdout=False,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=False,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **settings)
示例5: latent_dirichlet_allocation_25_holdout_unlbled_test
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_25_holdout_unlbled_test(headlines, bodies):
    """
    Thin wrapper: run ``topic_models.latent_dirichlet_allocation_cos`` with 25 topics,
    with both ``include_holdout`` and ``incl_unlbled_test`` switched on.

    The call disables idf weighting and enables term frequency.

    According to the original notes, the helper is an sklearn LDA built on the
    5000 most important words (term frequencies over train + test + holdout +
    unlabeled test data, i.e. with bleeding) and returns cosine distances
    between the topic representations of headline and body; that logic lives
    in ``topic_models``, not in this function.

    Links:
    https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text
    """
    settings = dict(
        n_topics=25,
        include_holdout=True,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=True,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **settings)
示例6: word_unigrams_5000_concat_tf_l2_holdout
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_unigrams_5000_concat_tf_l2_holdout(headlines, bodies):
    """
    Bag-of-words features for headline and body, concatenated column-wise.

    The vocabulary (unigrams, at most 5000 entries, English stop words
    removed) is learned from the headline/body pairs returned by
    ``word_ngrams.get_head_body_tuples(include_holdout=True)``. The given
    headlines and bodies are then vectorized separately against that
    vocabulary with l2 norm and the two dense matrices are joined.

    Note that the vocabulary pass uses ``use_idf=False`` while the feature
    vectorizers are created with ``use_idf=True``.

    :param headlines: headline texts to turn into features
    :param bodies: body texts to turn into features
    :return: dense array with the headline columns followed by the body columns
    """
    def join_pairs(heads, texts):
        # One "headline body" string per pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocab):
        # Fresh vectorizer per collection, restricted to the learned vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts).toarray()

    # Per the original comment: headlines and bodies of train, test and holdout set.
    all_heads, all_bodies = word_ngrams.get_head_body_tuples(include_holdout=True)

    # This fit exists only to obtain the vocabulary of the combined texts.
    vocab_builder = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                    max_features=5000, use_idf=False, norm='l2')
    vocab_builder.fit_transform(join_pairs(all_heads, all_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)
    return np.concatenate([head_matrix, body_matrix], axis=1)
示例7: word_ngrams_concat_tf5000_l2_w_holdout_and_test
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_ngrams_concat_tf5000_l2_w_holdout_and_test(headlines, bodies):
    """
    Bag-of-words features for headline and body, concatenated column-wise.

    The vocabulary (unigrams, at most 5000 entries, English stop words
    removed) is learned from the pairs returned by
    ``get_head_body_tuples(include_holdout=True)`` extended in place with the
    pairs from ``get_head_body_tuples_test()``. List lengths are printed
    before and after the extension. The given headlines and bodies are then
    vectorized separately (``use_idf=True``, l2 norm) against that vocabulary.

    :param headlines: headline texts to turn into features
    :param bodies: body texts to turn into features
    :return: dense array with the headline columns followed by the body columns
    """
    def join_pairs(heads, texts):
        # One "headline body" string per pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocab):
        # Fresh vectorizer per collection, restricted to the learned vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts).toarray()

    all_heads, all_bodies = get_head_body_tuples(include_holdout=True)
    test_heads, test_bodies = get_head_body_tuples_test()

    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of heads: " + str(len(all_heads)))
    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of bodies: " + str(len(all_bodies)))

    # In-place extension, so the lists handed back by the helper grow as well.
    all_heads.extend(test_heads)
    all_bodies.extend(test_bodies)

    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of heads after ext: " + str(len(all_heads)))
    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of bodies after ext: " + str(len(all_bodies)))

    # This fit exists only to obtain the vocabulary of the combined texts.
    vocab_builder = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                    max_features=5000, use_idf=True, norm='l2')
    vocab_builder.fit_transform(join_pairs(all_heads, all_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)
    return np.concatenate([head_matrix, body_matrix], axis=1)
示例8: gen_financial_top_words
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def gen_financial_top_words(maxN=40000):  # generate corpus based on Reuters news
    """
    Write the most frequent Reuters words to ``./input/topWords.json``.

    Does nothing if that file already exists. Otherwise every token of every
    Reuters file is passed through ``unify_word``, English stop words are
    skipped, the remaining words are counted, and the ``maxN`` highest counts
    are dumped as a JSON object mapping word to count (most frequent first).

    :param maxN: maximum number of words to keep
    :return: None
    """
    # Guard clause: the output acts as a cache, never regenerate it.
    if os.path.isfile('./input/topWords.json'):
        return

    # Build the stop-word lookup once; asking the corpus reader for the list
    # on every token repeats the same work and scans a list each time.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    wordCnt = {}
    for field in reuters.fileids():
        for word in reuters.words(field):
            word = unify_word(word)
            if word in stop_words:
                continue
            wordCnt[word] = wordCnt.get(word, 0) + 1

    # Highest counts first; the sort is stable, so ties keep first-seen order.
    ranked = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
    top_words = dict(ranked[:maxN])

    with open('./input/topWords.json', 'w') as fout:
        json.dump(top_words, fout, indent=4)
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:17,代码来源:preprocessing.py
示例9: word_unigrams_5000_concat_tf_l2_holdout_unlbled_test
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_unigrams_5000_concat_tf_l2_holdout_unlbled_test(headlines, bodies):
    """
    Bag-of-words features for headline and body, concatenated column-wise.

    The vocabulary (unigrams, at most 5000 entries, English stop words
    removed) is learned from the pairs returned by
    ``word_ngrams.get_head_body_tuples(include_holdout=True)`` extended in
    place with the pairs from
    ``word_ngrams.get_head_body_tuples_unlbled_test()``. The given headlines
    and bodies are then vectorized separately (``use_idf=True``, l2 norm)
    against that vocabulary and the two dense matrices are joined.

    :param headlines: headline texts to turn into features
    :param bodies: body texts to turn into features
    :return: dense array with the headline columns followed by the body columns
    """
    def join_pairs(heads, texts):
        # One "headline body" string per pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocab):
        # Fresh vectorizer per collection, restricted to the learned vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts).toarray()

    # Per the original comment: headlines and bodies of train, test and holdout set.
    all_heads, all_bodies = word_ngrams.get_head_body_tuples(include_holdout=True)

    # Comment out for clean ablation tests:
    # adds the unlabeled test data to the texts the vocabulary is learned from.
    unlabeled_heads, unlabeled_bodies = word_ngrams.get_head_body_tuples_unlbled_test()
    all_heads.extend(unlabeled_heads)
    all_bodies.extend(unlabeled_bodies)

    # This fit exists only to obtain the vocabulary of the combined texts.
    vocab_builder = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                    max_features=5000, use_idf=True, norm='l2')
    vocab_builder.fit_transform(join_pairs(all_heads, all_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)
    return np.concatenate([head_matrix, body_matrix], axis=1)
#'PPDB' paraphrase database
#hungarian alignment score
#computing score of each word of headline with each word of body - very resource-hungry
示例10: latent_dirichlet_allocation
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation(headlines, bodies):
    """
    Cosine distance between the LDA topic vectors of each headline and its body.

    An LDA model with 25 topics is fitted on the term vectors of the bodies;
    the same model then transforms the headlines. For every row the cosine
    distance between headline and body topic vector is computed.

    References:
    https://pypi.python.org/pypi/lda on bottom see suggestions like MALLET, hca
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    :param headlines: headline texts
    :param bodies: body texts, row-aligned with ``headlines``
    :return: list with one single-element list (the cosine distance) per headline
    """
    def print_top_words(model, feature_names, n_top_words):
        # Debug helper: list the highest-weighted words of every topic.
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            top_indices = topic.argsort()[:-n_top_words - 1:-1]
            print(", ".join([feature_names[i] for i in top_indices]))
        print()

    def combine_head_and_body(headlines, bodies):
        # One "headline body" string per pair.
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    def get_features(vocab):
        head_vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        head_terms = head_vectorizer.fit_transform(headlines)

        body_vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        body_terms = body_vectorizer.fit_transform(bodies)

        # Calculates the n most important topics of the bodies. Each topic
        # contains all words, ordered by importance; the more important words
        # of a topic a body contains, the higher its value for that topic.
        # NOTE(review): newer scikit-learn releases name this parameter
        # n_components -- confirm against the installed version.
        lda_body = LatentDirichletAllocation(n_topics=25, learning_method='online',
                                             random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation: fit and transform body")
        started = time()
        body_topics = lda_body.fit_transform(body_terms)
        print("done in %0.3fs." % (time() - started))

        print("latent_dirichlet_allocation: transform head")
        # Reuse the LDA trained on the bodies for the headlines: if headline
        # and body share topics, their vectors should be similar.
        head_topics = lda_body.transform(head_terms)

        # print_top_words(lda_body, body_vectorizer.get_feature_names(), 100)

        print('latent_dirichlet_allocation: calculating cosine distance between head and body')
        distances = []
        for row, head_row in enumerate(head_topics):
            # cosine_distances is fed 2-D input (original note: 1d array is deprecated)
            head_vector = np.array(head_row).reshape((1, -1))
            body_vector = np.array(body_topics[row]).reshape((1, -1))
            distance = cosine_distances(head_vector, body_vector).flatten()
            distances.append(distance.tolist())
        return distances

    vocabulary = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000,
                                              lemmatize=False, term_freq=True, norm='l2')
    return get_features(vocabulary)
示例11: get_reuters_data
# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def get_reuters_data(n_vocab):
    """
    Encode the Reuters corpus as lists of word indices over a limited vocabulary.

    Every file of the corpus is treated as one sentence. Tokens are normalised
    with ``unify_word`` and indexed in order of first appearance, then only
    the ``n_vocab`` most frequent entries are kept and re-indexed from 0 in
    descending order of count. Everything else maps to the extra ``'UNKNOWN'``
    index, which comes right after the kept entries. A running file counter
    and each kept ``word count`` pair are printed along the way.

    :param n_vocab: number of vocabulary entries to keep (START/END included)
    :return: tuple ``(sentences_small, word2idx_small)``: the re-indexed
             sentences (those with at most one token are dropped) and the
             mapping from word to new index
    """
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    # Infinite counts put START and END at the front of the frequency ranking.
    word_idx_count = {0: float('inf'), 1: float('inf')}

    for tag, field in enumerate(reuters.fileids(), 1):
        tokens = [unify_word(t) for t in reuters.words(field)]
        sentence_by_idx = []
        for t in tokens:
            if t not in word2idx:
                # The next free index always equals the current vocabulary size.
                word2idx[t] = len(idx2word)
                idx2word.append(t)
            idx = word2idx[t]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx.append(idx)
        sentences.append(sentence_by_idx)
        print(tag)  # progress: number of files processed so far

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    idx_new_idx_map = {}
    for new_idx, (idx, count) in enumerate(sorted_word_idx_count[:n_vocab]):
        word = idx2word[idx]
        # Was a Python 2 print statement (a SyntaxError on Python 3); this
        # form prints the same "word count" text on either interpreter.
        print("%s %s" % (word, count))
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx

    # let 'unknown' be the last token
    unknown = len(idx_new_idx_map)
    word2idx_small['UNKNOWN'] = unknown

    # map old idx to new idx, skipping sentences with fewer than two tokens
    sentences_small = [
        [idx_new_idx_map.get(idx, unknown) for idx in sentence]
        for sentence in sentences
        if len(sentence) > 1
    ]
    return sentences_small, word2idx_small
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:48,代码来源:word_embedding.py