This article collects typical usage examples of the Python function nltk.corpus.movie_reviews.categories. If you have been wondering what the categories function does, how to call it, or what real usage looks like, the curated code samples below may help.
The following presents 15 code examples of the categories function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
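Before the examples, here is a minimal sketch of the API itself. The corpus and calls are standard NLTK; the commented output values assume the stock movie_reviews download:

import nltk
from nltk.corpus import movie_reviews

# nltk.download('movie_reviews')  # fetch the corpus on first use

print(movie_reviews.categories())                    # ['neg', 'pos']
print(len(movie_reviews.fileids('pos')))             # 1000 positive reviews
print(movie_reviews.categories(fileids=['pos/cv957_8737.txt']))  # ['pos']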
Example 1: prep_reviews_data

def prep_reviews_data(self):  # test the classifier against the movie reviews corpus
    if not self.movie_review_data:
        print('Preparing movie reviews...\n')
        from nltk.corpus import movie_reviews

        docs = [movie_reviews.raw(fileid)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]

        # map 'pos' -> 1 and everything else ('neg') -> -1
        process = lambda x: 1 if x == 'pos' else -1
        labels = [process(category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]

        docs, labels = double_shuffle(docs, labels)
        training, testing = divide_list_by_ratio(docs)
        self.train_labs, self.test_labs = divide_list_by_ratio(labels)

        train_vecs = self.vectorizer.fit_transform(training)
        test_vecs = self.vectorizer.transform(testing)

        if isinstance(self.model, naive_bayes.GaussianNB):
            # GaussianNB expects dense arrays, not sparse matrices
            train_vecs = train_vecs.toarray()
            test_vecs = test_vecs.toarray()

        self.train_vecs = train_vecs
        self.test_vecs = test_vecs
        self.movie_review_data = True
        self.news_market_data = False
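Example 1 leans on two helpers defined elsewhere in its project, double_shuffle and divide_list_by_ratio. A minimal sketch of what they plausibly do, inferred only from the call sites (the bodies and the 80/20 default ratio are assumptions):

import random

def double_shuffle(docs, labels):
    # assumed helper: shuffle two parallel lists while keeping them aligned
    paired = list(zip(docs, labels))
    random.shuffle(paired)
    docs, labels = zip(*paired)
    return list(docs), list(labels)

def divide_list_by_ratio(items, ratio=0.8):
    # assumed helper: split a list into head/tail portions at the given ratio
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]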
Example 2: download_data_if_not_yet

def download_data_if_not_yet():
    """
    Download the movie_reviews data set if it is not already present.
    """
    try:
        # make sure that nltk can find the data
        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
        movie_reviews.categories()
    except LookupError:
        print("Downloading movie_reviews data set, please wait.....")
        nltk.download(
            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
        print("Download data set success.....")
        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
Example 3: createFeatureSet
def createFeatureSet(numOfExamples):
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)[:numOfExamples]]
    # write the documents to file
    with open('documents.txt', 'wb') as f:
        pickle.dump(documents, f)
    ## or read them back from file:
    ## with open('documents.txt', 'rb') as f:
    ##     documents = pickle.load(f)
    random.shuffle(documents)

    all_words = []
    ## for w in movie_reviews.words():
    ##     all_words.append(w.lower())
    ## write the word list to file:
    ## with open('allwords.txt', 'wb') as f:
    ##     pickle.dump(all_words, f)
    # read the precomputed word list from file
    with open('allwords.txt', 'rb') as f:
        all_words = pickle.load(f)

    freqDist = nltk.FreqDist(all_words)
    # print(freqDist.most_common(50))
    word_features = freqDist.most_common(3000)
    featuresets = [(find_features(rev, word_features), category)
                   for (rev, category) in documents]
    return featuresets
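Example 3 calls a find_features helper that is not shown. A plausible minimal sketch, assuming it builds binary "word present" features; note that most_common(3000) returns (word, count) pairs, so the helper has to unpack them:

def find_features(document, word_features):
    # assumed helper: mark which of the top-3000 words appear in the review
    words = set(document)
    return {word: (word in words) for (word, count) in word_features}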
Example 4: main

def main():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features8b(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
Example 5: create_dataset

def create_dataset():
    '''Create a dataset from the movie reviews corpus.'''
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    return documents
Example 6: create_sentiment

def create_sentiment():
    """
    Train a sentiment model and save it as a pickle.
    Input type: None
    Output: Model as pickle
    """
    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him", 'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time", 'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him", 'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ", 'pos'),
        ("He was so nervous shaking all over his voice quivering", 'neg'),
        ("The game looked nice too very cute art style ", 'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement I hope it works out for them aswell", 'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least", 'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers", 'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion", 'pos'),
        ("I want to give him a cookie", 'pos'),
        ("Im getting a copy Im gonna support my indie devs", 'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English", 'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different", 'neg'),
        ("Honored Im 100 sure that was intentional", 'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that", 'neg'),
        ("The confirmation was who was talking not what they were talking about ", 'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference", 'neg'),
        ("Oh god did they warn him that he will get zero reaction", 'neg'),
        ("I really hope so", 'pos'),
        ("Almost as bad as Aisha fucking up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3", 'neg')
    ]

    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
    ]
    random.shuffle(reviews)

    # Hold out the last 100 of the 2,000 reviews (a 95/5 train/test split)
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy on the held-out reviews plus the hand-labeled sentences
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
Example 7: f2c
def f2c(corpus, fileName):
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
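A quick usage sketch for f2c, using a movie_reviews fileid that also appears in Example 9 (the printed result is what categories(fileids=...) should return for that file):

>>> f2c('mr', 'pos/cv957_8737.txt')
'pos'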
Example 8: prepareSentimentClassifier
def prepareSentimentClassifier():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)

    global word_featuresSent
    word_featuresSent = list(all_words.keys())[:3000]

    featuresets = [(findFeaturesSentiment(rev), category)
                   for (rev, category) in documents]
    training_set = featuresets[:1900]
    testing_set = featuresets[1900:]

    sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Classifier accuracy percent:",
          (nltk.classify.accuracy(sentimentClassifier, testing_set)) * 100)
    return sentimentClassifier
Developer: koskinap | Project: Popularity_StyleOfPlay_DS2015_Group3_Soton | Lines: 27 | Source: realTimeMatchAnalyzer.py
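Example 8 depends on a findFeaturesSentiment helper defined elsewhere in that project. By analogy with the document_features functions in Examples 9 and 10, it plausibly looks like this (a reconstruction, not the project's actual code):

def findFeaturesSentiment(document):
    # assumed helper: binary presence features over the global word list
    words = set(document)
    return {word: (word in words) for word in word_featuresSent}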
Example 9: documentClassification

def documentClassification():
    from nltk.corpus import movie_reviews

    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000]  # keys() is not sliceable in Python 3

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
Example 10: category_by_movie

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = list(all_words)[:2000]  # keys() is not sliceable in Python 3

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    # print(document_features(mr.words('pos/cv957_8737.txt')))
    # print(documents[0])
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # the original scored accuracy on train_set; the held-out test_set is
    # what actually measures generalization
    print(classify.accuracy(classifier, test_set))
Example 11: main

def main():
    """
    Sample training run on the movie reviews corpus (Pang & Lee).
    """
    #== load inputs
    documents = np.array([movie_reviews.raw(review_id)
                          for category in movie_reviews.categories()
                          for review_id in movie_reviews.fileids(category)])
    sentiment_scores = np.array([0 if category == 'neg' else 1
                                 for category in movie_reviews.categories()
                                 for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(np.floor(n * 0.8))  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and validation sets according to these indices
    # (the arrays are 1-D, so index them directly rather than with [:, idx])
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print('===== Training the model...')
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print('===== Model trained.')

    #== test efficiency of the model
    print('===== Testing the model...')
    # compute the MSE
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    mse = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in range(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print('== Number of well-classified documents: {0} / {1} ({2}%)'
          .format(valid, n_test, percent))
    print('== Cost value on the test set: %.4f' % mse)
Example 12: data_run
def data_run():
    # print("Preparing Data...")
    labels = movie_reviews.categories()
    labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels]
    high_info_words = set(Params.high_information_words(labeled_words))
    feat_det = lambda words: Params.bag_of_words_in_set(words, high_info_words)
    feats = Train.label_feat_from_corps(movie_reviews, feature_detector=feat_det)
    return Train.split_label_feats(feats)
Example 13: label_docs
def label_docs():
    docs = [(list(movie_reviews.words(fid)), cat)
            for cat in movie_reviews.categories()
            for fid in movie_reviews.fileids(cat)]
    random.seed(42)
    random.shuffle(docs)
    return docs
Example 14: __init__

def __init__(self):
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
    random.shuffle(self.documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    word_features = list(all_words)[:2000]  # keys() is not sliceable in Python 3
Example 15: get_documents

def get_documents():
    """
    Retrieve shuffled movie reviews from NLTK.
    """
    print("Retrieving Movie Reviews\n")
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    # random.shuffle() shuffles in place and returns None; random.sample()
    # with the full length returns a shuffled copy instead
    return random.sample(reviews, len(reviews))
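The puzzled comment in the original snippet is worth spelling out: random.shuffle mutates its argument and returns None, so assigning its result throws the data away. A short illustration:

import random

reviews = ['r1', 'r2', 'r3']
shuffled_copy = random.sample(reviews, len(reviews))  # shuffled copy, original untouched
random.shuffle(reviews)                               # shuffles in place, returns None
# wrong: reviews = random.shuffle(reviews)            # reviews would become None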