本文整理汇总了 Python 中 nltk.corpus.movie_reviews.words 方法的典型用法代码示例。如果您正苦于以下问题:Python movie_reviews.words 方法的具体用法?Python movie_reviews.words 怎么用?Python movie_reviews.words 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.corpus.movie_reviews 的用法示例。
在下文中一共展示了movie_reviews.words方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_bigram_feats
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the bigrams in `bigrams`. This extractor function only
    considers contiguous bigrams, i.e. pairs of adjacent tokens (the same pairs
    that `nltk.bigrams` yields).

    :param document: a list of (hashable) words/tokens.
    :param bigrams: a list of bigrams (pairs of tokens) whose presence/absence
        has to be checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    # Collect the adjacent token pairs once, so that every queried bigram is a
    # constant-time set lookup instead of a fresh scan over the document.
    tokens = list(document)
    document_bigrams = set(zip(tokens, tokens[1:]))

    features = {}
    for bigr in bigrams:
        # tuple() lets bigrams given as lists be looked up in the set as well.
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = tuple(bigr) in document_bigrams
    return features
#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
示例2: extract_unigram_feats
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    if handle_negation:
        document = mark_negation(document)

    # Build the lookup set a single time rather than once per unigram.
    document_tokens = set(document)

    features = {}
    for word in unigrams:
        features['contains({0})'.format(word)] = word in document_tokens
    return features
示例3: load_movie_reviews
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def load_movie_reviews():
    """
    Load the NLTK Movie Reviews corpus as a list of labelled reviews.

    If the corpus cannot be accessed, it is downloaded with
    `nltk.download('movie_reviews')` before being read.

    :return: a list with one dictionary per review, of the form
        {'text': <review words, each preceded by a space>,
         'sentiment': 'positive' | 'negative'}.
        A category other than 'pos'/'neg' keeps its raw name as the sentiment.
    """
    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except LookupError:
        # NLTK reports a corpus that is not installed with LookupError; any
        # other failure is unexpected and is allowed to propagate.
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. We will first download the necessary corpus (this is a one-time download that might take a little while')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    pretty_category_names = {'pos': 'positive', 'neg': 'negative'}

    raw_data = []
    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():
        # Fall back to the raw name so the label is never stale or undefined.
        pretty_category_name = pretty_category_names.get(category, category)

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):
            # then each review is a NLTK class where each item in that class instance is a word
            review_words = movie_reviews.words(fileid)
            # Every word is prefixed with a single space (so the text starts
            # with a space); join builds the string in one pass.
            review_text = ''.join(' ' + word for word in review_words)
            raw_data.append({
                'text': review_text,
                'sentiment': pretty_category_name,
            })
    return raw_data
示例4: mark_negation
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Tag with a _NEG suffix every word found in the scope that opens at a
    negation and closes at the next clause-level punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param double_neg_flip: if True, double negation is considered affirmation
        (the negation scope is switched on/off every time a negation is found).
    :param shallow: if True, the method will modify the original document in place.
    :return: if `shallow == True` the method will modify the original document
        and return it. If `shallow == False` the method will return a modified
        document, leaving the original unmodified.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
    """
    target = document if shallow else deepcopy(document)

    # A labeled document is a (words, label) pair: only the words are marked.
    is_labeled = target and isinstance(target[0], (tuple, list))
    tokens = target[0] if is_labeled else target

    in_scope = False
    for index, token in enumerate(tokens):
        if NEGATION_RE.search(token):
            if double_neg_flip or not in_scope:
                # The negation word itself toggles the scope and stays unmarked.
                in_scope = not in_scope
            else:
                # A further negation inside an open scope is marked like any word.
                tokens[index] += '_NEG'
        elif in_scope:
            if CLAUSE_PUNCT_RE.search(token):
                # Clause punctuation closes the scope.
                in_scope = False
            else:
                tokens[index] += '_NEG'
    return target
示例5: word_feats
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def word_feats(words):
    """
    Build a bag-of-words feature dictionary.

    :param words: an iterable of hashable words/tokens.
    :return: a dictionary mapping every distinct word to True.
    """
    return {word: True for word in words}
示例6: find_features
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def find_features(document, word_features):
    """
    Build presence/absence features for a single document.

    :param document: a mapping whose "text" entry holds the raw text, which is
        tokenized with `word_tokenize`.
    :param word_features: an iterable of sequences whose first item is a
        dictionary; the keys of those dictionaries are the words to look for.
    :return: a dictionary {word : boolean} telling whether each word occurs
        among the tokens of the document.
    """
    # Tokenize once and keep the tokens in a set: each membership test is then
    # constant time instead of a scan over the whole token list.
    tokens = set(word_tokenize(document["text"]))

    features = {}
    for entry in word_features:
        for word in entry[0]:
            features[word] = word in tokens
    return features
示例7: demo_movie_reviews
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the Movie Reviews dataset.

    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative. When None, every review is used.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        # Half of the requested instances is taken from each class.
        n_instances = int(n_instances/2)

    def labeled_reviews(label):
        # One (token list, label) pair per review file of the given category.
        selected_ids = movie_reviews.fileids(label)[:n_instances]
        return [(list(movie_reviews.words(file_id)), label) for file_id in selected_ids]

    pos_docs = labeled_reviews('pos')
    neg_docs = labeled_reviews('neg')

    # Each class is split on its own so that both the train and the test set
    # keep a balanced, uniform class distribution.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    analyzer = SentimentAnalyzer()
    vocabulary = analyzer.all_words(training_docs)

    # Simple unigram word features
    unigram_feats = analyzer.unigram_word_feats(vocabulary, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Feature-value representation of both datasets
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    classifier = analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = analyzer.evaluate(test_set)

    if not output:
        return
    extractor_names = [extractor.__name__ for extractor in analyzer.feat_extractors]
    output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                    Tokenizer='WordPunctTokenizer', Feats=extractor_names, Results=results,
                    Instances=n_instances)
示例8: extract_features
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def extract_features(words):
    """
    Build a bag-of-words feature dictionary.

    :param words: an iterable of hashable words/tokens.
    :return: a dictionary mapping every distinct word to True.
    """
    return {word: True for word in words}
示例9: demo_subjectivity
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if true, store the SentimentAnalyzer in the pickle
        file 'sa_subjectivity.pickle'.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between
        subjective and objective. When None, every sentence is used.
    :param output: the output file where results have to be reported.
    :return: the trained SentimentAnalyzer.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        # Half of the requested instances is taken from each class.
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    # The vocabulary is collected from negation-marked copies of the training docs.
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    # Plain truthiness test: any true value requests saving, not only `True`.
    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer
示例10: demo_movie_reviews
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and evaluate a classifier on the Movie Reviews dataset.

    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative. When None, every review is used.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from sentiment_analyzer import SentimentAnalyzer

    if n_instances is not None:
        # Half of the requested instances is taken from each class.
        n_instances = int(n_instances/2)

    # Every review becomes a (token list, label) pair; positives are read first.
    pos_docs = []
    for pos_id in movie_reviews.fileids('pos')[:n_instances]:
        pos_docs.append((list(movie_reviews.words(pos_id)), 'pos'))
    neg_docs = []
    for neg_id in movie_reviews.fileids('neg')[:n_instances]:
        neg_docs.append((list(movie_reviews.words(neg_id)), 'neg'))

    # Splitting each class separately keeps the class distribution balanced
    # and uniform in both the train and the test set.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    analyzer = SentimentAnalyzer()

    # Simple unigram word features taken from the training vocabulary
    training_words = analyzer.all_words(training_docs)
    unigram_feats = analyzer.unigram_word_feats(training_words, min_freq=4)
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Feature-value representation of both datasets
    training_set = analyzer.apply_features(training_docs)
    test_set = analyzer.apply_features(testing_docs)

    classifier = analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = analyzer.evaluate(test_set)

    if output:
        extractor_names = []
        for extractor in analyzer.feat_extractors:
            extractor_names.append(extractor.__name__)
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extractor_names, Results=results,
                        Instances=n_instances)
示例11: demo_subjectivity
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if true, store the SentimentAnalyzer in the pickle
        file 'sa_subjectivity.pickle'.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between
        subjective and objective. When None, every sentence is used.
    :param output: the output file where results have to be reported.
    :return: the trained SentimentAnalyzer.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        # Half of the requested instances is taken from each class.
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    # The vocabulary is collected from negation-marked copies of the training docs.
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    # Plain truthiness test: any true value requests saving, not only `True`.
    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer
示例12: getFeatures
# 需要导入模块: from nltk.corpus import movie_reviews [as 别名]
# 或者: from nltk.corpus.movie_reviews import words [as 别名]
def getFeatures(numWordsToUse):
    """
    Build a sparse feature matrix and the sentiment labels for the NLTK movie
    reviews corpus.

    English stop words are removed from every review, the cleaned reviews are
    passed to `utils.nlpFeatureEngineering`, and the formatted reviews it
    returns are vectorized with the module-level `dv` object.

    Side effect: rebinds the module-level global `popularWords`.

    :param numWordsToUse: forwarded to `utils.nlpFeatureEngineering`
        (presumably the number of popular words to keep -- confirm against
        that helper).
    :return: a tuple (sparseFeatures, sentiment); labels are 1 for 'pos'
        reviews and 0 for 'neg' reviews before feature engineering.
    """
    global popularWords

    # stopwords are common words that occur so frequently as to be useless for NLP
    stopWords = set(stopwords.words('english'))
    labelByCategory = {'pos': 1, 'neg': 0}

    # read in all the words of each movie review, and its associated sentiment
    reviewDocuments = []
    sentiment = []
    for category in movie_reviews.categories():
        if category not in labelByCategory:
            # Skip the whole category: appending its reviews without a label
            # would leave reviewDocuments and sentiment misaligned.
            print('We are not sure what this category is: ' + category)
            continue
        label = labelByCategory[category]
        for fileid in movie_reviews.fileids(category):
            reviewWords = movie_reviews.words(fileid)
            reviewDocuments.append([word for word in reviewWords if word not in stopWords])
            sentiment.append(label)

    formattedReviews, sentiment, popularWords = utils.nlpFeatureEngineering(
        reviewDocuments, sentiment, 50, numWordsToUse, 'counts'
    )

    # transform list of dictionaries into a sparse matrix
    sparseFeatures = dv.fit_transform(formattedReviews)
    return sparseFeatures, sentiment