当前位置: 首页>>代码示例>>Python>>正文


Python reuters.words方法代码示例

本文整理汇总了Python中nltk.corpus.reuters.words方法的典型用法代码示例。如果您正苦于以下问题:Python reuters.words方法的具体用法?Python reuters.words怎么用?Python reuters.words使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.corpus.reuters的用法示例。


在下文中一共展示了reuters.words方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: word_ngrams_concat_tf5000_l2_w_holdout

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_ngrams_concat_tf5000_l2_w_holdout(headlines, bodies):
    """
    Build concatenated headline/body bag-of-words features.

    A unigram vocabulary is requested from ``create_word_ngram_vocabulary``
    (``max_features=5000``, ``include_holdout=True``). Headlines and bodies
    are then vectorized separately against that fixed vocabulary with an
    l2-normalized ``TfidfVectorizer`` that has idf switched off, and the two
    dense matrices are joined column-wise.

    :param headlines: headline texts handed to ``TfidfVectorizer.fit_transform``
    :param bodies: body texts handed to ``TfidfVectorizer.fit_transform``;
        NOTE(review): presumably aligned row-by-row with ``headlines`` --
        the column-wise concatenation needs equal row counts
    :return: dense array, headline feature columns followed by body feature columns
    """
    def vectorize(texts, vocabulary):
        # One fresh vectorizer per text collection, restricted to the given vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary, use_idf=False,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts)

    vocabulary = create_word_ngram_vocabulary(ngram_range=(1,1), max_features=5000,
                                              lemmatize=False, use_idf=False, term_freq=True, norm='l2',
                                              include_holdout=True)

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)

    # Sparse results are densified before being placed side by side.
    return np.concatenate([head_matrix.toarray(), body_matrix.toarray()], axis=1)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:27,代码来源:feature_engineering.py

示例2: get_corpus_text

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def get_corpus_text():
    '''
    Return the Reuters corpus as a list of strings, one entry per file id.

    Each entry is the token sequence of one file (``reuters.words(fid)``)
    joined with single spaces.
    '''
    documents = []
    for file_id in reuters.fileids():
        documents.append(" ".join(reuters.words(file_id)))
    return documents
开发者ID:inikdom,项目名称:rnn-speech,代码行数:7,代码来源:setuptextcorpus.py

示例3: latent_dirichlet_allocation_25

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_25(headlines, bodies):
    """
    Compute headline/body LDA features with 25 topics.

    Thin wrapper that forwards to ``topic_models.latent_dirichlet_allocation_cos``
    with ``n_topics=25``, ``use_idf=False``, ``term_freq=True`` and with both
    ``include_holdout`` and ``incl_unlbled_test`` switched off.

    NOTE(review): the helper is not visible here; per the upstream description
    it uses sklearn LDA on the 5000 most important words and returns cosine
    distances between the topic models of headline and body -- confirm
    against ``topic_models``.

    Links:
        https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
        https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
        https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    :param headlines: headlines, passed through unchanged
    :param bodies: bodies, passed through unchanged
    :return: whatever ``latent_dirichlet_allocation_cos`` returns
    """
    lda_options = dict(
        n_topics=25,
        include_holdout=False,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=False,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **lda_options)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:14,代码来源:feature_engineering.py

示例4: latent_dirichlet_allocation_300

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_300(headlines, bodies):
    """
    Compute headline/body LDA features with 300 topics.

    Thin wrapper that forwards to ``topic_models.latent_dirichlet_allocation_cos``
    with ``n_topics=300``, ``use_idf=False``, ``term_freq=True`` and with both
    ``include_holdout`` and ``incl_unlbled_test`` switched off.

    NOTE(review): the helper is not visible here; per the upstream description
    it uses sklearn LDA on the 5000 most important words and returns cosine
    distances between the topic models of headline and body -- confirm
    against ``topic_models``.

    Links:
        https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
        https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
        https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    :param headlines: headlines, passed through unchanged
    :param bodies: bodies, passed through unchanged
    :return: whatever ``latent_dirichlet_allocation_cos`` returns
    """
    lda_options = dict(
        n_topics=300,
        include_holdout=False,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=False,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **lda_options)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:14,代码来源:feature_engineering.py

示例5: latent_dirichlet_allocation_25_holdout_unlbled_test

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation_25_holdout_unlbled_test(headlines, bodies):
    """
    Compute headline/body LDA features with 25 topics, including holdout and unlabeled test data.

    Thin wrapper that forwards to ``topic_models.latent_dirichlet_allocation_cos``
    with ``n_topics=25``, ``use_idf=False``, ``term_freq=True`` and with both
    ``include_holdout`` and ``incl_unlbled_test`` switched on.

    NOTE(review): the helper is not visible here; per the upstream description
    it uses sklearn LDA on the 5000 most important words (train + test +
    holdout + unlabeled test term frequencies, i.e. "bleeding") and returns
    cosine distances between the topic models of headline and body --
    confirm against ``topic_models``.

    Links:
        https://pypi.python.org/pypi/lda, bottom see suggestions like MALLET, hca
        https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
        https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    :param headlines: headlines, passed through unchanged
    :param bodies: bodies, passed through unchanged
    :return: whatever ``latent_dirichlet_allocation_cos`` returns
    """
    lda_options = dict(
        n_topics=25,
        include_holdout=True,
        use_idf=False,
        term_freq=True,
        incl_unlbled_test=True,
    )
    return topic_models.latent_dirichlet_allocation_cos(headlines, bodies, **lda_options)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:14,代码来源:feature_engineering.py

示例6: word_unigrams_5000_concat_tf_l2_holdout

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_unigrams_5000_concat_tf_l2_holdout(headlines, bodies):
    """
    Build concatenated headline/body unigram features over a 5000-word vocabulary.

    The vocabulary is learned from the head/body pairs returned by
    ``word_ngrams.get_head_body_tuples(include_holdout=True)`` (each pair
    joined into one text, English stop words removed, at most 5000 features).
    ``headlines`` and ``bodies`` are then vectorized separately against that
    vocabulary with l2 normalization, and the dense results are concatenated
    column-wise.

    NOTE(review): the function name says "tf", yet the two feature
    vectorizers are built with ``use_idf=True`` (only the vocabulary-building
    vectorizer uses ``use_idf=False``) -- confirm this is intended.

    :param headlines: headline strings to featurize
    :param bodies: body strings to featurize
    :return: dense array, headline feature columns followed by body feature columns
    """

    def join_pairs(heads, texts):
        # One combined document per (headline, body) pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocabulary):
        # Fresh vectorizer per collection, restricted to the shared vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts)

    # Headlines and bodies used only for learning the vocabulary.
    corpus_heads, corpus_bodies = word_ngrams.get_head_body_tuples(include_holdout=True)

    # Fit on the combined texts purely to obtain the vocabulary mapping.
    vocab_builder = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_features=5000, use_idf=False,
                                    norm='l2')
    vocab_builder.fit_transform(join_pairs(corpus_heads, corpus_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)

    return np.concatenate([head_matrix.toarray(), body_matrix.toarray()], axis=1)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:38,代码来源:feature_engineering.py

示例7: word_ngrams_concat_tf5000_l2_w_holdout_and_test

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_ngrams_concat_tf5000_l2_w_holdout_and_test(headlines, bodies):
    """
    Build concatenated headline/body unigram features over a 5000-word vocabulary.

    The vocabulary is learned from the pairs returned by
    ``get_head_body_tuples(include_holdout=True)`` extended in place with the
    pairs from ``get_head_body_tuples_test()``; list lengths are printed
    before and after the extension. ``headlines`` and ``bodies`` are then
    vectorized separately against that vocabulary (idf enabled, l2 norm) and
    the dense results are concatenated column-wise.

    :param headlines: headline strings to featurize
    :param bodies: body strings to featurize
    :return: dense array, headline feature columns followed by body feature columns
    """

    def join_pairs(heads, texts):
        # One combined document per (headline, body) pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocabulary):
        # Fresh vectorizer per collection, restricted to the shared vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts)

    corpus_heads, corpus_bodies = get_head_body_tuples(include_holdout=True)
    test_heads, test_bodies = get_head_body_tuples_test()

    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of heads: " + str(len(corpus_heads)))
    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of bodies: " + str(len(corpus_bodies)))
    # The returned lists are extended in place with the test data.
    corpus_heads.extend(test_heads)
    corpus_bodies.extend(test_bodies)
    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of heads after ext: " + str(len(corpus_heads)))
    print("word_ngrams_concat_tf5000_l2_w_holdout_and_test length of bodies after ext: " + str(len(corpus_bodies)))

    # Fit on the combined texts purely to obtain the vocabulary mapping.
    vocab_builder = TfidfVectorizer(ngram_range=(1,1), stop_words='english', max_features=5000, use_idf=True,
                                    norm='l2')
    vocab_builder.fit_transform(join_pairs(corpus_heads, corpus_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)

    return np.concatenate([head_matrix.toarray(), body_matrix.toarray()], axis=1)
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:43,代码来源:feature_engineering.py

示例8: gen_financial_top_words

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def gen_financial_top_words(maxN=40000): # generate corpus based on Reuters news
    """
    Write the most frequent Reuters corpus words to ``./input/topWords.json``.

    Every token of every Reuters file is normalized with ``unify_word``,
    English stop words are skipped, and the remaining words are counted. The
    ``maxN`` words with the highest counts are dumped as a JSON object
    mapping word -> count. If the output file already exists, nothing is done.

    :param maxN: maximum number of words written to the file
    :return: None
    """
    output_path = './input/topWords.json'
    if os.path.isfile(output_path):
        # Already generated on an earlier run.
        return

    # Fetch the stop-word list once, not once per token, and keep it in a
    # set so each membership test is a hash lookup instead of a list scan.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    wordCnt = {}
    for field in reuters.fileids():
        for word in reuters.words(field):
            word = unify_word(word)
            if word in stop_words:
                continue
            wordCnt[word] = wordCnt.get(word, 0) + 1

    # Highest counts first; the sort is stable, so ties keep first-seen order.
    sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
    top_words = dict(sorted_wordCnt[:maxN])
    with open(output_path, 'w') as fout:
        json.dump(top_words, fout, indent=4)
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:17,代码来源:preprocessing.py

示例9: word_unigrams_5000_concat_tf_l2_holdout_unlbled_test

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def word_unigrams_5000_concat_tf_l2_holdout_unlbled_test(headlines, bodies):
    """
    Build concatenated headline/body unigram features over a 5000-word vocabulary.

    The vocabulary is learned from the pairs returned by
    ``word_ngrams.get_head_body_tuples(include_holdout=True)`` extended in
    place with the pairs from ``word_ngrams.get_head_body_tuples_unlbled_test()``
    (the "bleeding" setup described upstream). ``headlines`` and ``bodies``
    are then vectorized separately against that vocabulary (l2 norm) and the
    dense results are concatenated column-wise.

    NOTE(review): the function name says "tf", yet every vectorizer here is
    built with ``use_idf=True`` -- confirm this is intended.

    :param headlines: headline strings to featurize
    :param bodies: body strings to featurize
    :return: dense array, headline feature columns followed by body feature columns
    """

    def join_pairs(heads, texts):
        # One combined document per (headline, body) pair.
        return [head + " " + text for head, text in zip(heads, texts)]

    def vectorize(texts, vocabulary):
        # Fresh vectorizer per collection, restricted to the shared vocabulary.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary, use_idf=True,
                                     norm="l2", stop_words='english')
        return vectorizer.fit_transform(texts)

    # Headlines and bodies used only for learning the vocabulary.
    corpus_heads, corpus_bodies = word_ngrams.get_head_body_tuples(include_holdout=True)

    # Comment out the next three statements for clean ablation tests: they add
    # the unlabeled test data to the texts the vocabulary is learned from.
    unlabeled_heads, unlabeled_bodies = word_ngrams.get_head_body_tuples_unlbled_test()
    corpus_heads.extend(unlabeled_heads)
    corpus_bodies.extend(unlabeled_bodies)

    # Fit on the combined texts purely to obtain the vocabulary mapping.
    vocab_builder = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_features=5000, use_idf=True,
                                    norm='l2')
    vocab_builder.fit_transform(join_pairs(corpus_heads, corpus_bodies))
    vocabulary = vocab_builder.vocabulary_

    head_matrix = vectorize(headlines, vocabulary)
    body_matrix = vectorize(bodies, vocabulary)

    return np.concatenate([head_matrix.toarray(), body_matrix.toarray()], axis=1)



#'PPDB' paraphrase database
#hungarian alignment score
#computing score of each word of headline with each word of body - very resource-hungry 
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:51,代码来源:feature_engineering.py

示例10: latent_dirichlet_allocation

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def latent_dirichlet_allocation(headlines, bodies):
    """
    Compute the cosine distance between the LDA topic vectors of each headline and body.

    A 25-topic LDA model is fitted on the body term vectors and then applied
    to the headline term vectors; row ``i`` of the result holds the cosine
    distance between headline ``i`` and body ``i`` in topic space.

    :param headlines: headline texts
    :param bodies: body texts; must supply at least as many rows as ``headlines``
    :return: list of one-element lists, one cosine distance per headline
    """
    # https://pypi.python.org/pypi/lda on bottom see suggestions like MALLET, hca
    # https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    # https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    def print_top_words(model, feature_names, n_top_words):
        # Debug helper: list the n_top_words highest-weighted features of every topic.
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            top_indices = topic.argsort()[:-n_top_words - 1:-1]
            print(", ".join([feature_names[i] for i in top_indices]))
        print()

    def combine_head_and_body(headlines, bodies):
        # One combined document per (headline, body) pair; currently not called.
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    vocab = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000, lemmatize=False, term_freq=True,
                                         norm='l2')

    head_vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    head_terms = head_vectorizer.fit_transform(headlines)

    body_vectorizer = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    body_terms = body_vectorizer.fit_transform(bodies)

    # The topic model is learned from the bodies only.
    # NOTE(review): newer scikit-learn releases name this keyword
    # ``n_components`` -- confirm against the installed version.
    topic_model = LatentDirichletAllocation(n_topics=25, learning_method='online', random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation: fit and transform body")
    started = time()
    body_topics = topic_model.fit_transform(body_terms)
    print("done in %0.3fs." % (time() - started))

    print("latent_dirichlet_allocation: transform head")
    # Reuse the body-trained model on the headlines: a headline and body that
    # share topics should end up with similar topic vectors.
    head_topics = topic_model.transform(head_terms)

    # Debugging aid: print_top_words(topic_model, body_vectorizer.get_feature_names(), 100)

    print('latent_dirichlet_allocation: calculating cosine distance between head and body')
    distances = []
    for row, head_topic_row in enumerate(head_topics):
        # cosine_distances expects 2-D input, hence the (1, -1) reshape.
        head_vector = np.array(head_topic_row).reshape((1, -1))
        body_vector = np.array(body_topics[row]).reshape((1, -1))
        pair_distance = cosine_distances(head_vector, body_vector).flatten()
        distances.append(pair_distance.tolist())
    return distances
开发者ID:UKPLab,项目名称:coling2018_fake-news-challenge,代码行数:58,代码来源:feature_engineering.py

示例11: get_reuters_data

# 需要导入模块: from nltk.corpus import reuters [as 别名]
# 或者: from nltk.corpus.reuters import words [as 别名]
def get_reuters_data(n_vocab):
    """
    Index the Reuters corpus and restrict it to the ``n_vocab`` most frequent tokens.

    Every file of the corpus is treated as one "sentence": its tokens are
    normalized with ``unify_word`` and mapped to integer ids. The ids are then
    re-numbered by descending frequency, keeping only the first ``n_vocab``
    entries (``START`` and ``END`` always rank first); every other token is
    mapped to a trailing ``UNKNOWN`` id. Progress (file counter, then each
    kept word with its count) is printed to stdout.

    :param n_vocab: number of vocabulary entries to keep, ``START``/``END`` included
    :return: tuple ``(sentences_small, word2idx_small)`` -- the re-indexed
        sentences with more than one token, and the word -> new id mapping
        including ``'UNKNOWN'``
    """
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    # Infinite counts guarantee START and END sort ahead of every real token.
    word_idx_count = {0: float('inf'), 1: float('inf')}

    for tag, field in enumerate(reuters.fileids(), 1):
        sentence_by_idx = []
        for token in reuters.words(field):
            t = unify_word(token)
            if t not in word2idx:
                # The next free id is always the current vocabulary size.
                word2idx[t] = len(idx2word)
                idx2word.append(t)
            idx = word2idx[t]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            sentence_by_idx.append(idx)
        sentences.append(sentence_by_idx)
        print(tag)

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    idx_new_idx_map = {}
    for new_idx, (idx, count) in enumerate(sorted_word_idx_count[:n_vocab]):
        word = idx2word[idx]
        # Formatted so the output is "word count" on Python 2 and Python 3
        # alike (the former `print word, count` statement is a SyntaxError on 3).
        print("%s %s" % (word, count))
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx

    # let 'unknown' be the last token
    unknown = len(idx_new_idx_map)
    word2idx_small['UNKNOWN'] = unknown

    # map old idx to new idx, dropping sentences with fewer than two tokens
    sentences_small = [
        [idx_new_idx_map.get(idx, unknown) for idx in sentence]
        for sentence in sentences
        if len(sentence) > 1
    ]

    return sentences_small, word2idx_small
开发者ID:WayneDW,项目名称:Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction,代码行数:48,代码来源:word_embedding.py


注:本文中的nltk.corpus.reuters.words方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。