This article collects typical usage examples of the Python method nltk.sentiment.SentimentAnalyzer.bigram_collocation_feats. If you are unsure what SentimentAnalyzer.bigram_collocation_feats does or how to call it, the curated example below may help. You can also look further into the containing class, nltk.sentiment.SentimentAnalyzer, for more details.
The section below shows 1 code example of the SentimentAnalyzer.bigram_collocation_feats method.
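Before the full demo, a minimal sketch of the method in isolation may be useful. The toy documents and the top_n/min_freq values below are illustrative assumptions, not taken from the example that follows:

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_bigram_feats

# Toy tokenized documents (illustrative only): each document is a list of words.
docs = [
    ["this", "movie", "was", "not", "good"],
    ["not", "good", "at", "all"],
    ["really", "not", "good"],
]

analyzer = SentimentAnalyzer()
# Rank bigrams by PMI, keep those occurring at least min_freq times, and return
# the top_n best as a list of (word1, word2) tuples.
top_bigrams = analyzer.bigram_collocation_feats(docs, top_n=5, min_freq=1)
print(top_bigrams)

# The selected bigrams can then be registered as a feature extractor.
analyzer.add_feat_extractor(extract_bigram_feats, bigrams=top_bigrams)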
Example 1: demo_tweets
# Required import: from nltk.sentiment import SentimentAnalyzer [as alias]
# Or: from nltk.sentiment.SentimentAnalyzer import bigram_collocation_feats [as alias]
def demo_tweets(trainer, n_instances=None, output=None):
"""
    Train and test a classifier (e.g. Naive Bayes) on 10000 tweets, tokenized
    using TweetTokenizer.
Features are composed of:
- 1000 most frequent unigrams
- 100 top bigrams (using BigramAssocMeasures.pmi)
:param trainer: `train` method of a classifier.
:param n_instances: the number of total tweets that have to be used for
training and testing. Tweets will be equally split between positive and
negative.
:param output: the output file where results have to be reported.
"""
from nltk.tokenize import TweetTokenizer
from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords
    # Helper functions used below are defined in nltk.sentiment.util.
    from nltk.sentiment.util import (json2csv_preprocess, parse_tweets_set,
                                     split_train_test, extract_unigram_feats,
                                     extract_bigram_feats, output_markdown)
# Different customizations for the TweetTokenizer
tokenizer = TweetTokenizer(preserve_case=False)
# tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
# tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
if n_instances is not None:
n_instances = int(n_instances/2)
fields = ['id', 'text']
positive_json = twitter_samples.abspath("positive_tweets.json")
positive_csv = 'positive_tweets.csv'
json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
negative_json = twitter_samples.abspath("negative_tweets.json")
negative_csv = 'negative_tweets.csv'
json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
train_pos_docs, test_pos_docs = split_train_test(pos_docs)
train_neg_docs, test_neg_docs = split_train_test(neg_docs)
training_tweets = train_pos_docs+train_neg_docs
testing_tweets = test_pos_docs+test_neg_docs
sentim_analyzer = SentimentAnalyzer()
# stopwords = stopwords.words('english')
# all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
all_words = [word for word in sentim_analyzer.all_words(training_tweets)]
# Add simple unigram word features
unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# Add bigram collocation features
bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
top_n=100, min_freq=12)
sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)
training_set = sentim_analyzer.apply_features(training_tweets)
test_set = sentim_analyzer.apply_features(testing_tweets)
classifier = sentim_analyzer.train(trainer, training_set)
# classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
try:
classifier.show_most_informative_features()
except AttributeError:
print('Your classifier does not provide a show_most_informative_features() method.')
results = sentim_analyzer.evaluate(test_set)
if output:
extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
Tokenizer=tokenizer.__class__.__name__, Feats=extr,
Results=results, Instances=n_instances)
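For reference, here is one possible way to invoke the example above, assuming demo_tweets is importable from nltk.sentiment.util as in the NLTK source; the instance count and output filename are arbitrary choices:

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment.util import demo_tweets

# The twitter_samples corpus must be available, e.g. via nltk.download('twitter_samples').
# Uses a balanced set of 200 tweets (100 positive, 100 negative), splits them into
# train and test sets, and appends the evaluation results to a Markdown file.
demo_tweets(NaiveBayesClassifier.train, n_instances=200, output='results.md')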