本文整理汇总了Python中nltk.corpus.util.LazyCorpusLoader.categories方法的典型用法代码示例。如果您正苦于以下问题:Python LazyCorpusLoader.categories方法的具体用法?Python LazyCorpusLoader.categories怎么用?Python LazyCorpusLoader.categories使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.util.LazyCorpusLoader的用法示例。
在下文中一共展示了LazyCorpusLoader.categories方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: loadClassifier
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import categories [as 别名]
def loadClassifier(outputdir):
    """Return a (voted_classifier, word_features) pair for sentiment classification.

    Fast path: if both pickles exist under ``pickled_algos/``, load and return
    the previously trained voted classifier and its word-feature list.

    Slow path: train from scratch on the 'criticas_cine' corpus (Spanish movie
    reviews categorized as neg/pos by directory) and evaluate a Naive Bayes
    classifier.

    NOTE(review): this excerpt is truncated by the source page ("code omitted"
    marker at the end), so the slow path's final return statement is not
    visible here. Indentation was also lost in extraction and has been
    reconstructed. ``outputdir`` is unused in the visible portion.

    pickleLoad/pickleDump, find_features, VoteClassifier and Accuracy are
    presumably project helpers defined elsewhere in the file — not shown here.
    """
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle")
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        # Cached branch: both pickles exist, so skip training entirely.
        word_features = pickleLoad("word_features.pickle")
        # classifier = pickleLoad("originalnaivebayes.pickle")
        # MNB_classifier = pickleLoad("MNB_classifier.pickle")
        # BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
        # LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
        # SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
        # LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
        #
        # voted_classifier = VoteClassifier(classifier,
        ##                                  NuSVC_classifier,
        #                                   LinearSVC_classifier,
        #                                   SGDClassifier_classifier,
        #                                   MNB_classifier,
        #                                   BernoulliNB_classifier,
        #                                   LogisticRegression_classifier)
        voted_classifier= pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        # Training branch: lazily bind the categorized plaintext corpus; files
        # are loaded only when first accessed (that is what LazyCorpusLoader does).
        criticas_cine = LazyCorpusLoader(
            'criticas_cine', CategorizedPlaintextCorpusReader,
            r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
            encoding='utf-8')
        # Alternative three-class (neg/neu/pos) corpus, kept for reference:
        # criticas_cine = LazyCorpusLoader(
        #     'criticas_cine_neu', CategorizedPlaintextCorpusReader,
        #     r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
        #     encoding='utf-8')

        # One (word-list, label) pair per corpus file, across all categories.
        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
        #
        # document_pos = [(list(criticas_cine.words(fileid)), "pos")
        #                 for fileid in criticas_cine.fileids("pos")]
        # document_neg = [(list(criticas_cine.words(fileid)), "neg")
        #                 for fileid in criticas_cine.fileids("neg")]
        # document_neu = [(list(criticas_cine.words(fileid)), "neu")
        #                 for fileid in criticas_cine.fileids("neu")]

        # Shuffle so the later [:2000]/[2000:] split is not ordered by category.
        random.shuffle(documents)
        # random.shuffle(document_pos)
        # random.shuffle(document_neg)
        # random.shuffle(document_neu)

        # Lowercased token frequencies over the entire corpus.
        all_words = []
        for w in criticas_cine.words():
            all_words.append(w.lower())
        # for w in criticas_cine.words():
        #     if not is_filtered(w.lower()):
        #         all_words.append(w.lower())
        #
        all_words = nltk.FreqDist(all_words)
        #print (all_words.most_common(50))
        # Filtering by type of word
        # for sample in all_words:

        # NOTE(review): in NLTK 3, FreqDist.keys() is NOT sorted by frequency,
        # so this takes 3000 arbitrary words; all_words.most_common(3000) was
        # probably intended — confirm before relying on these features.
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")

        # Convert each document into a feature dict keyed on word_features.
        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        # featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
        # featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
        # featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]
        # training_set = featuresetpos[:1000]
        # training_set.extend(featuresetneg[:1000])
        # training_set.extend(featuresetneu[:1000])
        # testing_set = featuresetpos[1000:1273]
        # testing_set.extend(featuresetneg[1000:])
        # testing_set.extend(featuresetneu[1000:])
        # pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
        # neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
        # neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]

        # Fixed-size split: first 2000 shuffled documents train, the rest test.
        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]
        classifier = nltk.NaiveBayesClassifier.train(training_set)
        # pickleDump(classifier, "originalnaivebayes.pickle")
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)
        accuracy = Accuracy(classifier,testing_set)
        print(accuracy)
        # order: neu, neg, pos
        # print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
        # print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
        #......... (remainder of this function omitted in the source excerpt) .........
示例2: LazyCorpusLoader
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import categories [as 别名]
# NOTE(review): mid-script excerpt from a Python 2 command-line trainer (note
# the `print` statement below). The opening `if` that pairs with the `elif`
# further down, and the definitions of `args`, `reader_args`, `reader_kwargs`
# and `reader_class`, lie outside this excerpt; original indentation was lost
# in extraction. The first lines presumably sit under an `if args.cat_file:`
# guard — verify against the full script.
# Category information can come from an explicit category file...
reader_kwargs['cat_file'] = args.cat_file
if args.delimiter and args.delimiter != ' ':
reader_kwargs['delimiter'] = args.delimiter
if args.cat_pattern:
reader_args.append(args.cat_pattern)
else:
reader_args.append('.+/.+')
# ...or from a regex over file paths (cat_pattern); '.+/.+' is the fallback.
elif args.cat_pattern:
reader_args.append(args.cat_pattern)
reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)
# Lazily bind the chosen corpus with the reader class selected on the CLI.
categorized_corpus = LazyCorpusLoader(args.corpus, reader_class[args.reader],
*reader_args, **reader_kwargs)
labels = categorized_corpus.categories()
nlabels = len(labels)
if args.trace:
print '%d labels: %s' % (nlabels, labels)
# Classification needs at least two categories; --multi needs more than two.
if not nlabels:
raise ValueError('corpus does not have any categories')
elif nlabels == 1:
raise ValueError('corpus must have more than 1 category')
elif nlabels == 2 and args.multi:
raise ValueError('corpus must have more than 2 categories if --multi is specified')
########################
## text normalization ##
########################
示例3: LazyCorpusLoader
# 需要导入模块: from nltk.corpus.util import LazyCorpusLoader [as 别名]
# 或者: from nltk.corpus.util.LazyCorpusLoader import categories [as 别名]
interrogazioni = LazyCorpusLoader(
'opp_interrogazioni_macro',
CategorizedPlaintextCorpusReader,
r'\d*', cat_file='cats.txt', cat_delimiter=','
)
print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]
print "generating list of documents for each category"
documents = [
(list(interrogazioni.words(fileid)), category)
for category in interrogazioni.categories()
for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)
print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d,c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print "classifier accuracy: ", nltk.classify.accuracy(classifier, test_set)