

Python movie_reviews.words Method Code Examples

This article collects typical usage examples of the Python method nltk.corpus.movie_reviews.words. If you are wondering what movie_reviews.words does or how to use it in practice, the curated examples below should help. You can also explore further usage examples from its parent module, nltk.corpus.movie_reviews.


Below are 12 code examples of the movie_reviews.words method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.

Example 1: extract_bigram_feats

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False), ('contains(police - prevented)', False)]
    """
    features = {}
    # materialize the document's contiguous bigrams once, then test membership
    document_bigrams = set(nltk.bigrams(document))
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in document_bigrams
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#//////////////////////////////////////////////////////////// 
Author: Thejas-1, Project: Price-Comparator, Lines: 27, Source: util.py
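
In practice, the `bigrams` argument is usually mined from the corpus itself. Below is a minimal sketch that ranks contiguous bigrams from movie_reviews.words() with NLTK's collocation finder and feeds the result to extract_bigram_feats (assumes the corpus is downloaded; the frequency threshold and variable names are illustrative):

import nltk
from nltk.corpus import movie_reviews
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# rank contiguous bigrams by pointwise mutual information
finder = BigramCollocationFinder.from_words(movie_reviews.words())
finder.apply_freq_filter(5)  # illustrative threshold: drop rare bigrams
top_bigrams = finder.nbest(BigramAssocMeasures.pmi, 100)

document = list(movie_reviews.words(movie_reviews.fileids('pos')[0]))
features = extract_bigram_feats(document, top_bigrams)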

Example 2: extract_unigram_feats

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features 
Author: Thejas-1, Project: Price-Comparator, Lines: 25, Source: util.py
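
A quick negation-aware sketch (illustrative values; mark_negation is defined in this same module, see Example 4):

doc = "I didn't like this movie .".split()
# with handle_negation=True, 'like' is first rewritten to 'like_NEG',
# so the plain unigram 'like' is reported as absent
extract_unigram_feats(doc, ['like', 'movie'], handle_negation=True)
# {'contains(like)': False, 'contains(movie)': False}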

Example 3: load_movie_reviews

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def load_movie_reviews():

    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except LookupError:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. '
              'We will first download the necessary corpus (this is a one-time download '
              'that might take a little while).')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():

        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):

            # each review is an NLTK corpus view whose items are the review's words
            review_words = movie_reviews.words(fileid)

            # join the tokens back into a single review string
            review_text = ' '.join(review_words)

            review_dictionary = {
                'text': review_text,
                'sentiment': pretty_category_name
            }

            raw_data.append(review_dictionary)

    return raw_data 
Author: ClimbsRocks, Project: empythy, Lines: 43, Source: utils.py
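
A usage sketch (the standard corpus holds 2000 reviews, split evenly between positive and negative; the output comments are illustrative):

reviews = load_movie_reviews()
print(len(reviews))               # 2000
print(reviews[0]['sentiment'])    # 'negative' or 'positive'
print(reviews[0]['text'][:60])    # first 60 characters of the review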

Example 4: mark_negation

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Append _NEG suffix to words that appear in the scope between a negation
    and a punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param shallow: if True, the method will modify the original document in place.
    :param double_neg_flip: if True, double negation is considered affirmation
        (we activate/deactivate negation scope every time we find a negation).
    :return: if `shallow == True` the method will modify the original document
        and return it. If `shallow == False` the method will return a modified
        document, leaving the original unmodified.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
    """
    if not shallow:
        document = deepcopy(document)
    # check if the document is labeled. If so, do not consider the label.
    labeled = document and isinstance(document[0], (tuple, list))
    if labeled:
        doc = document[0]
    else:
        doc = document
    neg_scope = False
    for i, word in enumerate(doc):
        if NEGATION_RE.search(word):
            if not neg_scope or (neg_scope and double_neg_flip):
                neg_scope = not neg_scope
                continue
            else:
                doc[i] += '_NEG'
        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
            neg_scope = not neg_scope
        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
            doc[i] += '_NEG'

    return document 
Author: Thejas-1, Project: Price-Comparator, Lines: 41, Source: util.py
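
mark_negation also accepts a labeled document, i.e. a (words, label) tuple; a short sketch:

labeled = ("I didn't like it .".split(), 'neg')
mark_negation(labeled)
# (['I', "didn't", 'like_NEG', 'it_NEG', '.'], 'neg')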

Example 5: word_feats

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def word_feats(words):
    return dict([(word, True) for word in words]) 
Author: merqurio, Project: twitter_trolls, Lines: 4, Source: haters.py
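
A hypothetical one-liner exercising it on a single review (assumes the corpus is downloaded):

from nltk.corpus import movie_reviews
feats = word_feats(movie_reviews.words(movie_reviews.fileids('neg')[0]))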

Example 6: find_features

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
from nltk.tokenize import word_tokenize

def find_features(document, word_features):
    # word_features is a list of (feature-dict, label) tuples; test whether
    # each feature word occurs among the document's tokens
    words = set(word_tokenize(document["text"]))
    features = {}
    for t in word_features:
        for w in t[0].keys():
            features[w] = (w in words)

    return features 
Author: merqurio, Project: twitter_trolls, Lines: 10, Source: haters.py

Example 7: demo_movie_reviews

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = n_instances // 2

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances) 
Author: SignalMedia, Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, Lines: 54, Source: util.py
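
A sketch of a typical invocation, passing a classifier's train method as the trainer (the output filename is illustrative):

from nltk.classify import NaiveBayesClassifier

demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200,
                   output='movie_reviews_results.md')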

Example 8: extract_features

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def extract_features(words):
    return dict([(word, True) for word in words]) 
Author: PacktPublishing, Project: Artificial-Intelligence-with-Python, Lines: 4, Source: sentiment_analyzer.py

Example 9: demo_subjectivity

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = n_instances // 2

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
Author: Thejas-1, Project: Price-Comparator, Lines: 61, Source: util.py
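
Analogously, a sketch for the subjectivity demo; with save_analyzer=True the trained SentimentAnalyzer is pickled to sa_subjectivity.pickle (the instance count is illustrative):

from nltk.classify import NaiveBayesClassifier

analyzer = demo_subjectivity(NaiveBayesClassifier.train,
                             save_analyzer=True, n_instances=1000)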

Example 10: demo_movie_reviews

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from sentiment_analyzer import SentimentAnalyzer

    if n_instances is not None:
        n_instances = n_instances // 2

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances) 
Author: jarrellmark, Project: neighborhood_mood_aws, Lines: 54, Source: util.py

Example 11: demo_subjectivity

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = n_instances // 2

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
Author: jarrellmark, Project: neighborhood_mood_aws, Lines: 61, Source: util.py

Example 12: getFeatures

# Required import: from nltk.corpus import movie_reviews [as alias]
# Or: from nltk.corpus.movie_reviews import words [as alias]
from nltk.corpus import movie_reviews, stopwords

# NOTE: `utils` and `dv` (presumably a scikit-learn DictVectorizer) are
# module-level names defined elsewhere in the source project.
def getFeatures(numWordsToUse):
    # stopwords are common words that occur so frequently as to be useless for NLP
    stopWords = set(stopwords.words('english'))

    # read in all the words of each movie review, and its associated sentiment
    reviewDocuments = []
    sentiment = []

    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            reviewWords = movie_reviews.words(fileid)

            cleanedReview = []
            for word in reviewWords:
                if word not in stopWords:
                    cleanedReview.append(word)

            reviewDocuments.append(cleanedReview)
            if category == 'pos':
                sentiment.append(1)
            elif category == 'neg':
                sentiment.append(0)
            else:
                print('We are not sure what this category is: ' + category)

    global popularWords
    formattedReviews, sentiment, popularWords = utils.nlpFeatureEngineering(
            reviewDocuments, sentiment, 50, numWordsToUse, 'counts'
        )


    # transform list of dictionaries into a sparse matrix
    sparseFeatures = dv.fit_transform(formattedReviews)

    return sparseFeatures, sentiment 
开发者ID:ClimbsRocks,项目名称:nlpSentiment,代码行数:38,代码来源:nltkMovieReviews.py
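
A hypothetical call site, following the source project's conventions (numWordsToUse caps the vocabulary size; the shape comment is illustrative):

sparseFeatures, sentiment = getFeatures(numWordsToUse=3000)
print(sparseFeatures.shape)   # (2000, vocabulary size)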


Note: The nltk.corpus.movie_reviews.words method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.