本文整理汇总了Python中tokenizer.Tokenizer.bag_of_char_ngrams方法的典型用法代码示例。如果您正苦于以下问题:Python Tokenizer.bag_of_char_ngrams方法的具体用法?Python Tokenizer.bag_of_char_ngrams怎么用?Python Tokenizer.bag_of_char_ngrams使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类tokenizer.Tokenizer
的用法示例。
在下文中一共展示了Tokenizer.bag_of_char_ngrams方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_authors
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import bag_of_char_ngrams [as alias]
def test_authors(p, bag_of_words = True, alpha = 0.05, bag_of_char_ngrams = False, ngram_len = 5, set_of_words = False, complexity_features = False,
		print_predictions = True):
	""" Evaluates authorship classifiers over the selected feature sets.

	p is the Preprocessor object holding the path data; it must already have been
	initialized via p.organize_authors().

	Each boolean flag enables one classifier/feature-set pairing:
	  - bag_of_words:        Multinomial Naive Bayes (given alpha) over a bag of words.
	  - bag_of_char_ngrams:  Multinomial Naive Bayes (given alpha) over a bag of
	                         character n-grams of length ngram_len.
	  - set_of_words:        Binarized Multinomial Naive Bayes (given alpha) over a
	                         set of words.
	  - complexity_features: Normalizing Naive Bayes — fits every feature of every
	                         class to its own normal distribution and scores via the pdfs.

	Returns a 4-tuple of score tuples, one per feature set in the order above;
	any disabled feature set contributes None in its slot.
	"""
	authors = p.get_authors()
	# One slot per feature set; a slot is None whenever its flag is off.
	classifiers = (MultinomialNaiveBayes(authors, alpha = alpha) if bag_of_words else None,
			MultinomialNaiveBayes(authors, alpha = alpha) if bag_of_char_ngrams else None,
			BinarizedMultinomialNaiveBayes(authors, alpha = alpha) if set_of_words else None,
			NormalizingNaiveBayes(authors, 8) if complexity_features else None)
	# Feature extractor for each slot, applied to a Tokenizer instance.
	extractors = (lambda tok: tok.bag_of_words(),
			lambda tok: tok.bag_of_char_ngrams(ngram_len),
			lambda tok: tok.bag_of_words(),
			lambda tok: tok.features())
	# Training phase: feed every training document of every author into the
	# enabled classifiers.
	for author in authors:
		for clsf in classifiers:
			if clsf is not None: clsf.add_documents(author, len(p.training_data(author)))
		for data in p.training_data(author):
			tok = Tokenizer(p.file_path(author,data))
			# Slots 0-2 accept raw feature counts directly.
			for idx in range(3):
				if classifiers[idx] is not None:
					classifiers[idx].add_feature_counts(author, extractors[idx](tok))
			# Slot 3 (normalizing NB) requires vectorized features instead.
			if classifiers[3] is not None:
				classifiers[3].add_features(author, classifiers[3].vectorize(tok.features()))
	for clsf in classifiers:
		if clsf is not None: clsf.train()
	# One Tester per enabled classifier, mirroring the slot layout.
	testers = tuple(None if clsf is None else Tester(clsf.get_classes()) for clsf in classifiers)
	# Evaluation phase: classify every test document and record the outcome.
	for author in authors:
		for data in p.test_data(author):
			tok = Tokenizer(p.file_path(author,data, training_data = False))
			predictions = [None, None, None, None]
			for idx, clsf in enumerate(classifiers):
				if clsf is None:
					continue
				predictions[idx] = clsf.most_probable_class(clsf.vectorize(extractors[idx](tok)))
				testers[idx].add_stat(predictions[idx], author)
			if print_predictions: print('predicted:',[pr for pr in predictions if pr is not None],'actual:',author)
	# Score tuple per slot; disabled slots yield None.
	return tuple(tst.scores() if tst is not None else None for tst in testers)