本文整理汇总了 Python 中 nltk.corpus.movie_reviews.fileids 函数的典型用法代码示例。如果您正苦于以下问题：Python fileids 函数的具体用法？Python fileids 怎么用？Python fileids 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 fileids 函数的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: load_data
def load_data():
    """Populate the module-level ``negfeats`` and ``posfeats`` lists.

    Every file id in the ``neg`` and ``pos`` categories of ``movie_reviews``
    is turned into a ``(word_feats(words), label)`` pair. Nothing is
    returned; the results are published through the two globals.
    """
    global posfeats, negfeats

    neg_ids = movie_reviews.fileids('neg')
    pos_ids = movie_reviews.fileids('pos')

    def _labelled(file_ids, label):
        # One (feature set, label) pair per review file, in corpus order.
        pairs = []
        for file_id in file_ids:
            words = movie_reviews.words(fileids=[file_id])
            pairs.append((word_feats(words), label))
        return pairs

    negfeats = _labelled(neg_ids, 'neg')
    posfeats = _labelled(pos_ids, 'pos')
示例2: prep_reviews_data
def prep_reviews_data(self):
    """Load the movie_reviews corpus and vectorize it for the classifier.

    Does nothing when ``self.movie_review_data`` is already set. Otherwise
    the raw reviews are labelled (1 for ``pos``, -1 for every other
    category), shuffled, split into training and testing parts and passed
    through ``self.vectorizer``. The results are stored on ``self`` as
    ``train_vecs``, ``test_vecs``, ``train_labs`` and ``test_labs``, and the
    two data-source flags are updated.
    """
    if self.movie_review_data:
        return

    print('Preparing movie reviews...\n')
    from nltk.corpus import movie_reviews

    docs = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            docs.append(movie_reviews.raw(fileid))

    # Labels are generated in the same category/file order as the documents.
    labels = []
    for category in movie_reviews.categories():
        polarity = 1 if category == 'pos' else -1
        for fileid in movie_reviews.fileids(category):
            labels.append(polarity)

    docs, labels = double_shuffle(docs, labels)
    training, testing = divide_list_by_ratio(docs)
    self.train_labs, self.test_labs = divide_list_by_ratio(labels)

    train_vecs = self.vectorizer.fit_transform(training)
    test_vecs = self.vectorizer.transform(testing)
    if isinstance(self.model, naive_bayes.GaussianNB):
        # The GaussianNB model is given the toarray() form of both matrices.
        train_vecs, test_vecs = train_vecs.toarray(), test_vecs.toarray()

    self.train_vecs = train_vecs
    self.test_vecs = test_vecs
    self.movie_review_data = True
    self.news_market_data = False
示例3: category_by_movie
def category_by_movie():
    """Train a Naive Bayes classifier on movie_reviews and print its accuracy.

    Each review is reduced to boolean ``contains(<word>)`` features over the
    2000 most frequent lower-cased words of the corpus. The documents are
    shuffled; the first 100 are held out as the test set and the rest are
    used for training. The accuracy on the held-out set is printed.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # most_common() states the frequency ordering explicitly; slicing
    # keys() relies on an ordering FreqDist no longer guarantees and is not
    # possible on a Python 3 keys view.
    word_features = [word for word, _count in all_words.most_common(2000)]

    def document_features(document):
        # Set membership keeps each of the 2000 lookups O(1).
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Score on the held-out documents; scoring on the training data would
    # overstate the accuracy.
    print(classify.accuracy(classifier, test_set))
示例4: main
def main():
    """Annotate the reviews in ``output.json`` with Naive Bayes sentiment.

    A classifier is trained on the first three quarters of the negative and
    positive movie_reviews files. Each entry of ``data[key]["reviews"]``
    read from ``output.json`` then gets a ``"sentiment"`` dict holding the
    ``pos``/``neg`` probabilities and the predicted label. The annotated
    structure is written to ``out_with_sentiment.json``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division yields integer slice bounds directly.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    classifier = NaiveBayesClassifier.train(trainfeats)

    # Read everything first so the input file is closed before the output
    # file is opened.
    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Build the feature dict once and reuse it for both calls.
            feats = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(feats)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': classifier.classify(feats),
                'negative_probability': prob.prob('neg'),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
示例5: train_with_movie_db
def train_with_movie_db(self):
    """
    Train ``self.classifier`` on the movie_reviews corpus.

    Training possible with movie reviews
    - this does not yield particularly good results

    Sets ``self.use_movie_reviews``. The first three quarters of the
    negative and of the positive files are used for training; the remaining
    quarter is used for the accuracy value passed to ``DLOG``.
    """
    self.use_movie_reviews = True
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    extract = self.feature_extraction_movie_reviews
    negfeats = [(extract(movie_reviews.words(fileids=[f])), "negative")
                for f in negids]
    posfeats = [(extract(movie_reviews.words(fileids=[f])), "positive")
                for f in posids]

    # Floor division: the cutoffs are slice bounds and must be integers
    # whatever division semantics are in effect.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))
    self.classifier = NaiveBayesClassifier.train(trainfeats)
    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # NOTE(review): NLTK's show_most_informative_features() prints its table
    # and returns None, so DLOG presumably receives None here - confirm.
    DLOG(self.classifier.show_most_informative_features())
示例6: evaluate_classifier
def evaluate_classifier(featx):
    """Train and score a Naive Bayes classifier built with ``featx``.

    ``featx`` is called with the word list of each movie_reviews file and
    its result is used as that file's feature set. Three quarters of each
    polarity are used for training and the rest for testing. Accuracy and
    per-label precision/recall are printed, followed by the classifier's
    most informative features.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the slice bounds integral under true division.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: test indices grouped by true label;
    # testsets: the same indices grouped by predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(feats)].add(i)

    def report(caption, value):
        # Single-argument print: identical text with or without the
        # print statement.
        print('%s %s' % (caption, value))

    report('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    report('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    report('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    report('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    report('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
示例7: train
def train(test=False):
    """Train a Naive Bayes classifier on the movie_reviews corpus.

    Args:
        test: when false (the default) the classifier is trained on every
            review and returned. When true, three quarters of each polarity
            are used for training, the accuracy on the remaining quarter
            and the most informative features are printed, and nothing is
            returned.

    Returns:
        The trained classifier when ``test`` is false, otherwise ``None``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    if not test:
        return NaiveBayesClassifier.train(negfeats + posfeats)

    # Floor division keeps the slice bounds integral under true division.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('%s %s' % ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats)))
    classifier.show_most_informative_features()
示例8: median_approach
def median_approach(llimit, ulimit, isphrase, pathname):
    """Compute per-position medians of proximity lists and save them as JSON.

    For the positive and then the negative movie_reviews files in the slice
    ``[llimit:ulimit]``, ``proximity_tagger.medianlist`` is called once per
    file. The column-wise medians of the collected lists (positive first,
    negative second) are written to
    ``train_result/proximity_median_train_result_<isphrase>``.

    Args:
        llimit: start index of the file slice within each category.
        ulimit: end index (exclusive) of the file slice.
        isphrase: forwarded to ``medianlist`` and used in the output name.
        pathname: forwarded unchanged to ``medianlist``.
    """
    import os

    def _collect(category, header, progress_label, polarity_flag):
        # Runs medianlist over one category, printing progress per file.
        print(header)
        file_ids = movie_reviews.fileids(categories=[category])[llimit:ulimit]
        # Use the real number of files so the percentage stays correct when
        # the requested range extends past the corpus.
        total = len(file_ids)
        collected = []
        for index, fid in enumerate(file_ids):
            collected.append(
                proximity_tagger.medianlist(movie_reviews.abspath(fid), isphrase,
                                            index, polarity_flag, pathname))
            done = index + 1
            percent = float(done) * 100 / float(total)
            print(' '.join([progress_label, str(done), '.' * 10, str(percent), '%']))
        return collected

    posmedlist = _collect('pos', '\nNo of +ve reviews trained : ',
                          'Training +ve review ', 0)
    negmedlist = _collect('neg', '\nNo of -ve reviews trained : ',
                          'Training -ve review ', 1)

    # zip(*rows) transposes the per-file lists into per-position columns.
    medians = [
        [numpy.median(column) for column in zip(*posmedlist)],
        [numpy.median(column) for column in zip(*negmedlist)],
    ]

    # os.path.join avoids a hard-coded backslash separator; the context
    # manager closes the file even if json.dump raises.
    result_path = os.path.join('train_result',
                               'proximity_median_train_result_' + str(isphrase))
    with open(result_path, 'w') as f:
        json.dump(medians, f)
示例9: maketrainset
def maketrainset(movie_reviews, tokenizer, stemmer):
    """Build the labelled training set from a categorized corpus.

    Args:
        movie_reviews: corpus object providing ``fileids(category)`` and
            ``words(fileids=[...])``.
        tokenizer: callable invoked as ``tokenizer(words, stemmer)`` for
            each file; its result is used as that file's feature set.
        stemmer: passed through to ``tokenizer`` unchanged.

    Returns:
        A list of ``(features, label)`` pairs: all ``'neg'`` files first,
        then all ``'pos'`` files.
    """
    # Fetch both id lists up front, then tokenize category by category.
    ids_by_label = [(label, movie_reviews.fileids(label)) for label in ('neg', 'pos')]

    trainfeats = []
    for label, file_ids in ids_by_label:
        for file_id in file_ids:
            words = movie_reviews.words(fileids=[file_id])
            trainfeats.append((tokenizer(words, stemmer), label))
    return trainfeats
示例10: evaluate_features
def evaluate_features(self, feature_extractor, N):
    """Train on a stratified split of movie_reviews and print the metrics.

    Args:
        feature_extractor: callable applied to the word list of each
            training document; it must return a dict of features.
        N: forwarded to ``self.stratifiedSplit`` together with the negative
            and positive file ids.

    Returns:
        The classifier returned by ``classifier.train``.

    Side effects: stores ``negative``, ``positive``, ``maintrain``,
    ``maintest`` and ``testClassify`` on ``self``, sets ``q2_1.W`` to the
    training vocabulary size if it is still 0, and prints accuracy plus
    per-label precision, recall and F-measure.
    """
    self.negative = movie_reviews.fileids('neg')  # file ids in the neg category
    self.positive = movie_reviews.fileids('pos')  # file ids in the pos category
    self.maintrain, self.maintest = self.stratifiedSplit(self.negative, self.positive, N)

    lst = []
    # Collect the vocabulary in a set as we go; repeated list concatenation
    # is quadratic and cannot take a Python 3 keys view.
    trainvocabulary = set()
    for doc, lbl in self.maintrain:
        feats = feature_extractor(movie_reviews.words(fileids=[doc]))
        lst.append((feats, lbl))
        trainvocabulary.update(feats.keys())

    if q2_1.W == 0:
        q2_1.W = len(trainvocabulary)

    def report(caption, value):
        # Single-argument print: identical text with or without the
        # print statement.
        print('%s %s' % (caption, value))

    report("no. of features in train:", self.W)
    nb = classifier.train(lst)
    self.testClassify = self.classifyTest(self.maintest, nb, feature_extractor)
    report("accuracy = ", accuracy(self.maintest, self.testClassify))
    for heading, label in (("Negative:", 'neg'), ("Positive:", 'pos')):
        print(heading)
        report(" precision = ", self.calcPrec(label, self.maintest, self.testClassify))
        report(" recall = ", self.calcRecall(label, self.maintest, self.testClassify))
        report(" f measure = ", self.calcFMeasur(label, self.maintest, self.testClassify))
    nb.show_most_informative_features()
    return nb
示例11: main
def main(argv):
    """Classify stdin lines and emit one count record per matching topic.

    A Naive Bayes classifier is trained on all movie_reviews files (labels
    ``'negative'``/``'positive'``). For every line on standard input and
    every topic for which ``subj(line, topic)`` returns something other
    than ``"No match"``, the line
    ``LongValueSum:<text before first ':'>,<subject>,<label>\\t1`` is
    printed. Lines that raise during processing are skipped.

    Args:
        argv: accepted for the caller's convenience; not used.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]
    trainfeats = posfeats + negfeats
    classifier = NaiveBayesClassifier.train(trainfeats)

    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if subjectFull != "No match":
                    print("LongValueSum:" + "" + str(line.split(":")[0]) + "," + subjectFull
                          + "," + classifier.classify(d) + "\t" + "1")
        except Exception:
            # Best effort: a bad line is skipped, but KeyboardInterrupt and
            # SystemExit are no longer swallowed as a bare except would.
            continue
示例12: main
def main():
    """Score three sample reviews under four feature-extraction variants.

    For each sample text and each variant (plain words, punctuation
    removed, stop words removed, lemmatized), ``classification`` is run
    with five weight pairs and its result is passed to the variant's
    scoring function together with the review's words.

    Each variant trains on the features produced by its own extractor. In
    the stop-word variant the review's words are filtered against the
    English stop-word list for all five weight pairs.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
    to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
    to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''
    reviews = [to_review1, to_review2, to_review3]

    def labelled_feats(extractor):
        # (features, label) pairs for every corpus file, negatives first.
        neg = [(extractor(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos = [(extractor(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        return neg, pos

    # The corpus features do not depend on the text under review, so each
    # variant is extracted once instead of once per review.
    # Entries: (title, scoring function, (neg, pos) features, drop stop words?)
    variants = [
        (''' Normal classification ''', calculateScore,
         labelled_feats(word_feats), False),
        (''' Without Punctuations ''', calculateScore_punctuations,
         labelled_feats(word_feats_punctuations), False),
        (''' Without Stop Words ''', calculateScore_stopwords,
         labelled_feats(word_feats_stopwords), True),
        (''' With Lemmatizer ''', calculateScore_lemmatizer,
         labelled_feats(word_feats_lemmatize), False),
    ]
    # The five weight pairs passed as the last two arguments of
    # classification().
    weightings = [(1, 1), (1, 0.95), (0.95, 1), (0.9, 1), (1, 0.9)]
    # Load the stop-word list once; a set gives O(1) membership tests.
    english_stopwords = set(stopwords.words('english'))

    for to_review in reviews:
        to_review_words = to_review.split(" ")
        print("Reviewing %s \n\n\n" % to_review)
        for title, scorer, (negfeats, posfeats), drop_stopwords in variants:
            print("%s \n\n" % title)
            if drop_stopwords:
                words = [each for each in to_review_words
                         if each not in english_stopwords]
            else:
                words = to_review_words
            for first_weight, second_weight in weightings:
                scorer(classification(negfeats, posfeats, first_weight, second_weight),
                       words)
开发者ID:saransh2405,项目名称:sentiment-Analysis-using-Maximum-Entropy-Classification,代码行数:59,代码来源:maxent.py
示例13: __init__
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    """Assemble training data from up to four sources and train the classifier.

    Args:
        train1: include the NLTK movie_reviews corpus (``neg``/``pos``).
        train2: include ``out.txt``; each line starts with ``0`` (negative)
            or ``1`` (positive) and the text begins at the third character.
        train3: include the AFINN-111 word list (tab-separated term and
            integer score; a score below zero is labelled ``neg``,
            otherwise ``pos``).
        train4: include the ``trainning set.txt`` file, whose lines have
            the form ``<label> .:. <text>``; at most 200 ``neutral`` lines
            are used.

    The collected pairs are kept in ``self.trainfeats`` and the trained
    model in ``self.classifier``; its 20 most informative features are
    printed.
    """
    self.trainfeats = []

    if train1:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        self.trainfeats = neg_movies + pos_movies

    if train2:
        negfeats = []
        posfeats = []
        # Context managers close each data file even if parsing fails.
        with open("out.txt", "r") as f:
            for line in f:
                status = line[0]
                texto = line[2:]
                if status == '0':
                    negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
                elif status == '1':
                    posfeats.append((self.word_feats(texto.split(" ")), 'pos'))
        self.trainfeats += negfeats + posfeats

    if train3:
        with open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r') as f:
            for l in f:
                data = l.strip().split('\t')
                # NOTE(review): word_feats receives the raw term string
                # here, not a list of words as in the other branches -
                # confirm this is intended.
                self.trainfeats.append(
                    (self.word_feats(data[0]), 'neg' if int(data[1]) < 0 else 'pos'))

    if train4:
        pos = []
        neutral = []
        neg = []
        with open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r') as f:
            for line in f:
                if line.startswith("pos"):
                    pos.append(line)
                elif line.startswith("neutral"):
                    neutral.append(line)
                elif line.startswith("neg"):
                    neg.append(line)
        print('%d %d %d' % (len(pos), len(neutral), len(neg)))
        # Only the first 200 neutral lines are used.
        total = pos + neutral[:200] + neg
        for line in total:
            data = line.split(' .:. ')
            self.trainfeats.append((self.word_feats(data[1].split()), data[0]))

    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    # show_most_informative_features() prints its own table and returns
    # None, so it is called directly instead of being printed.
    self.classifier.show_most_informative_features(20)
示例14: setup_demo
def setup_demo(lower):
    """Build unigram and bigram corpora for the movie reviews demo.

    Args:
        lower: forwarded as the fifth argument of
            ``create_corpus_from_file_list``.

    Returns:
        A 4-tuple ``(negative corpus, positive corpus, negative bigram
        corpus, positive bigram corpus)``.
    """
    print('%s %s' % ('running movie reviews demo. data dir: ', nltk_movie_reviews_data_root))

    # Real lists rather than map(): each path list is consumed twice below
    # and a lazy map object would be exhausted after the first pass.
    negative_reviews = [nltk_movie_reviews_data_root + x for x in movie_reviews.fileids('neg')]
    positive_reviews = [nltk_movie_reviews_data_root + x for x in movie_reviews.fileids('pos')]

    negative_corpus = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower)
    positive_corpus = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower)
    negative_bigrams = create_corpus_from_file_list(
        negative_reviews, "negative", None, None, lower, wordlist_to_bigrams_dict)
    positive_bigrams = create_corpus_from_file_list(
        positive_reviews, "positive", None, None, lower, wordlist_to_bigrams_dict)

    # NOTE(review): the negative corpora come first in the returned tuple;
    # confirm callers unpack them in this order.
    return (negative_corpus, positive_corpus, negative_bigrams, positive_bigrams)
示例15: __init__
def __init__(self, load=False, loadFile=""):
    """Create the sentiment classifier, either loaded or freshly trained.

    Args:
        load: when true, ``self.loadClassifier(loadFile)`` is called and no
            training takes place.
        loadFile: argument forwarded to ``loadClassifier``.

    When ``load`` is false, a Naive Bayes classifier is trained on every
    ``neg`` and ``pos`` file of the movie_reviews corpus and stored in
    ``self.classifier``.
    """
    if load:
        self.loadClassifier(loadFile)
        return

    # Query the file ids once and reuse them for feature extraction.
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    trainfeats = negfeats + posfeats
    self.classifier = NaiveBayesClassifier.train(trainfeats)