This article collects typical usage examples of Python's nltk.tag.UnigramTagger.evaluate method. If you have been wondering what exactly UnigramTagger.evaluate does, how to call it, or where to find examples of it in use, the curated code samples below may help. You can also explore further usage examples of the containing class, nltk.tag.UnigramTagger.
The following presents 13 code examples of UnigramTagger.evaluate, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
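Before the individual examples, here is a minimal, self-contained sketch of the typical train/evaluate workflow (assuming the NLTK treebank sample is installed; the 3000-sentence split simply follows the convention used in several examples below). Note that recent NLTK releases deprecate evaluate() in favor of the equivalent accuracy() method.
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# train on the first 3000 gold-tagged sentences, hold out the rest
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

tagger = UnigramTagger(train_sents)
print(tagger.evaluate(test_sents))  # token-level accuracy, roughly 0.86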
Example 1: tag_words
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# `treebank` and `UnigramTagger` are assumed to be imported at module level
def tag_words(self, words, sents):
    # Train on the first 3000 treebank sentences and evaluate on the rest.
    # Evaluating on the tagger's own output (as the original snippet did)
    # would trivially report 100% accuracy.
    train_sents = treebank.tagged_sents()[:3000]
    tagger = UnigramTagger(train_sents)
    print(tagger.tag(sents[0]))  # tag one raw sentence
    test_sents = treebank.tagged_sents()[3000:]  # gold-standard test set
    print(tagger.evaluate(test_sents))
Example 2: baseline
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter
    # _UNI is assumed to be defined at module level as the set of universal
    # POS tags, e.g. {'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ',
    # 'DET', 'NUM', 'PRT', 'X', '.'}
    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences
    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))
    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))
    # find the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))
    # create the model: a unigram tagger backing off to the most common tag
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)
    # evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
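A hedged usage sketch for baseline() (assuming _UNI is the universal tagset as noted above, and using the Brown corpus mapped to universal tags):
from nltk.corpus import brown
baseline(brown.tagged_sents(tagset='universal'))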
Example 3: RegexpTagger
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# `patterns`, `train_data`, `test_data` and `tokens` are assumed to be
# defined earlier in the source this snippet was taken from.
from nltk.tag import RegexpTagger

rt = RegexpTagger(patterns)
print(rt.evaluate(test_data))
print(rt.tag(tokens))

## N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))
print(bt.evaluate(test_data))
print(bt.tag(tokens))
print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    # chain the taggers so each one backs off to the previously built one
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)  # closing argument assumed; the original snippet is truncated here
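The chained tagger can then be used and scored like any single tagger; a quick check, using the same assumed train/test data as above:
print(ct.evaluate(test_data))
print(ct.tag(tokens))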
Example 4: print
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger

brown_tagged_sents = brown.tagged_sents(categories='news')
# print(brown_tagged_sents)
# [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...]
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028
brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063
regexp_tagger = RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),  # articles
     (r'.*able$', 'JJ'),               # adjectives
     (r'.*ness$', 'NN'),               # nouns formed from adjectives
     # ... (truncated in the original; a catch-all such as (r'.*', 'NN')
     # typically terminates this pattern list)
    ])
Example 5: postag
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# ... (the function signature and the start of its docstring are omitted here) ...
    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
        deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}

    Note on separate_baseline_data: if False, the training data is reused both for the baseline and
    the rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data.
    It also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """
    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available; print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
        _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline
    # between python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            # pickle requires binary file modes in Python 3
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w', encoding='utf-8') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            f.write('\n'.join(error_list(gold_data, taggedtest)) + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        # compare against the reloaded tagger (the original compared the
        # tagger with itself, which always reports identical results)
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Example 6: print
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# note: the treebank sample has only ~3,900 tagged sentences, so the
# original [:7000] / [2000:] slices overlapped; use a disjoint split
training = treebank.tagged_sents()[:3000]
unitagger = UnigramTagger(training)
testing = treebank.tagged_sents()[3000:]
print(unitagger.evaluate(testing))
Example 7: print
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

# use a disjoint train/test split (see the note in Example 6)
testing = treebank.tagged_sents()[3000:]
training = treebank.tagged_sents()[:3000]
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
Example 8: cltk_pos_cv
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# imports assumed by this snippet
import math
import os
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, tnt

def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # make into a list of sentences
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    # for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        # concatenate the remaining lists into one training set ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled train/test splits to file (as the NLTK trainers expect)
        # local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)
        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))
        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make and evaluate a unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate a bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate a trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate a 1-, 2-, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make tnt tagger
        tnt_tagger = tnt.TnT()
# ... (part of the code is omitted here) ...
Example 9:
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
######## UNIGRAM TAGGER ##########
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set
# to initialize the UnigramTagger class.
# A unigram tagger can be trained by giving it a list of tagged sentences
# at initialization.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# a UnigramTagger can also be built from a hand-crafted context model
tagger = UnigramTagger(model={'Pierre': 'NN'})
print(tagger.tag(treebank.sents()[0]))
Example 10: DefaultTagger
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank
from tag_util import train_sents, test_sents
# train
default_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=default_tagger)
# test
print(tagger.evaluate(test_sents))
# save to pickle
import pickle
with open('unitagger.pkl', 'wb') as output:
pickle.dump(tagger, output)
# load from pickle
with open('unitagger.pkl', 'rb') as data_file:
tagger2 = pickle.load(data_file)
print(tagger2.evaluate(test_sents))
# or nltk.data.load('unitagger.pkl') to load
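A sketch of the nltk.data.load alternative mentioned in the comment above (it assumes unitagger.pkl was saved under an nltk_data directory, e.g. ~/nltk_data/taggers/):
import nltk
tagger3 = nltk.data.load('taggers/unitagger.pkl')
print(tagger3.evaluate(test_sents))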
Example 11: RegexpTagger
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# `tagged_data_list`, `cutoff` and `development_size` are assumed to be
# defined earlier in the source this snippet was taken from.
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger, brill
from nltk.tbl import Template

# Training set
training_data = tagged_data_list[:cutoff]
# Evaluation set
evaluation_data = tagged_data_list[cutoff:development_size]
# print("Data is split!")

# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'), (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaluation_data))

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaluation_data))

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaluation_data))

# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    # ... (the template list is truncated in the original snippet)
]
Example 12: indivUnigram
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
def indivUnigram(bambara, backoff):
    # train a unigram tagger on the corpus object's training sentences
    unigram = UnigramTagger(bambara.train_sents, backoff=backoff)
    print("Unigram accuracy: ", unigram.evaluate(bambara.test_sents))
    return unigram
Example 13: print
# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))
print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))
print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))
print("------------Brill Tagger------------")
print(brillTagger.tag(sent))
print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))
print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff = 3)
print(unigramTagger.evaluate(brown_test_sents))
print("------------Accuracy: Bigram Tagger Trained------------")
print(bigramTagger.evaluate(brown_test_sents))
print("------------Accuracy: Trigram Tagger Trained------------")
print(trigramTagger.evaluate(brown_test_sents))
print("------------Accuracy: Unigram Tagger with backoff enabled. Backoff Chain: UnigramTagger -> DefaultTagger------------")
unigramTagger = UnigramTagger(brown_train_sents, backoff=defaultTagger)
print(unigramTagger.evaluate(brown_test_sents))