

Python UnigramTagger.evaluate Method Code Examples

This article collects typical usage examples of the Python method nltk.tag.UnigramTagger.evaluate. If you are wondering what UnigramTagger.evaluate does, how to use it, or where to find examples of it, the curated code examples below should help. You can also explore further usage examples of the containing class, nltk.tag.UnigramTagger.


Below are 13 code examples of UnigramTagger.evaluate, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
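All the examples share one basic pattern: train a UnigramTagger on a list of gold-tagged sentences, then call evaluate() on a held-out list of tagged sentences to get token-level tagging accuracy. As a minimal, self-contained sketch (the 3000-sentence split point is an arbitrary choice for illustration):

from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# train on the first 3000 tagged sentences, hold out the rest for evaluation
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

tagger = UnigramTagger(train_sents)
# evaluate() re-tags the test sentences and returns token-level accuracy
print(tagger.evaluate(test_sents))

Note that recent NLTK releases deprecate TaggerI.evaluate() in favor of the equivalent accuracy() method; the examples below all use the older name.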

Example 1: tag_words

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
    def tag_words(self, words, sents):
        # train on the first 3000 treebank sentences, hold out the rest
        train_sents = treebank.tagged_sents()[:3000]
        tagger = UnigramTagger(train_sents)
        print(tagger.tag(sents[0]))
        # evaluate() needs gold-tagged sentences, not the tagger's own output
        test_sents = treebank.tagged_sents()[3000:]
        print("accuracy: " + str(tagger.evaluate(test_sents)))
Developer ID: jayvachon, Project: managerisk-reflection-search, Lines of code: 12, Source file: sentiment-analysis.py

Example 2: baseline

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
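    # cutoff=5: a word gets an entry in the unigram model only if its most
    # frequent tag was seen more than 5 times in training; all other words
    # fall through to the DefaultTagger backoff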
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
Developer ID: lrei, Project: xlime_twitter_corpus, Lines of code: 52, Source file: experiment.py

Example 3: RegexpTagger

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
from nltk.tag import RegexpTagger

rt = RegexpTagger(patterns)

print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
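    # each tagger class is trained with the previously built tagger as its
    # backoff, so the returned tagger consults the last class in the list
    # first (here: trigram -> bigram -> unigram)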
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)  # assumed completion: the excerpt ends here; backing off to the regexp tagger above
Developer ID: 000Nelson000, Project: text-analytics-with-python, Lines of code: 33, Source file: pos_tagging.py

Example 4: print

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
brown_tagged_sents = brown.tagged_sents(categories='news')
#print(brown_tagged_sents)
# [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...]
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028

brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [( r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers (escape the decimal point)
    ( r'(The|the|A|a|An|an)$', 'AT'), # articles
    ( r'.*able$', 'JJ'), # adjectives
    ( r'.*ness$', 'NN'), # nouns formed from adj
Developer ID: jzm17173, Project: Learn, Lines of code: 33, Source file: 词性标注器.py

Example 5: postag

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]

#......... part of the code omitted here .........
    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, the training data is reused for both the
    baseline and the rule learner. This is fast and fine for a demo, but is likely to
    generalize worse on unseen data. It also cannot be sensibly used for learning curves
    on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            # pickle requires binary mode
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))


    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if  incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            # write str directly; encoded bytes cannot be written to a text-mode file
            f.write('\n'.join(error_list(gold_data, taggedtest)) + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        # tag with the *reloaded* tagger, otherwise the comparison below is trivially true
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Developer ID: Weiming-Hu, Project: text-based-six-degree, Lines of code: 104, Source file: demo.py

Example 6: print

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
training = treebank.tagged_sents()[:3000]
unitagger = UnigramTagger(training)
testing = treebank.tagged_sents()[3000:]  # held out, disjoint from the training slice
print(unitagger.evaluate(testing))
Developer ID: xenron, Project: sandbox-da-python, Lines of code: 9, Source file: ch4_17.py

Example 7: print

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[3000:]
training = treebank.tagged_sents()[:3000]  # disjoint from the test slice
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
Developer ID: xenron, Project: sandbox-da-python, Lines of code: 11, Source file: ch4_19.py

Example 8: cltk_pos_cv

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
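    # fold size for 10-fold cross-validation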
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)
        
        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)
        
        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)
        
        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate the backoff chain
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)
        
        # make tnt tagger
        tnt_tagger = tnt.TnT()
#......... part of the code omitted here .........
Developer ID: wencanluo, Project: cltk_pos, Lines of code: 103, Source file: pos_cltk_cv.py

Example 9:

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set
# to initialize the UnigramTagger class. A unigram tagger is trained by giving
# it a list of tagged sentences at initialization.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))



tagger = UnigramTagger(model={'Pierre': 'NN'})
# tag() expects a single tokenized sentence, so index into sents() first
print(tagger.tag(treebank.sents()[0]))
Developer ID: bindaasamit, Project: pycode, Lines of code: 22, Source file: tutPosTagging02.py

Example 10: DefaultTagger

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank

from tag_util import train_sents, test_sents

# train
default_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=default_tagger)

# test
print(tagger.evaluate(test_sents))

# save to pickle
import pickle
with open('unitagger.pkl', 'wb') as output:
    pickle.dump(tagger, output)

# load from pickle
with open('unitagger.pkl', 'rb') as data_file:
    tagger2 = pickle.load(data_file)

print(tagger2.evaluate(test_sents))

# or nltk.data.load('unitagger.pkl') to load
Developer ID: anderscui, Project: nlpy, Lines of code: 26, Source file: combining_taggers.py

Example 11: RegexpTagger

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
# Training set
training_data = tagged_data_list[:cutoff]

# Evaluation set
evaluation_data = tagged_data_list[cutoff:development_size]

# print("Data is split!")


# Regular expression tagger
nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'PUNC'), (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaluation_data))

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaluation_data))

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaluation_data))

# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
Developer ID: onuryilmaz, Project: turkish-pos-tagger, Lines of code: 33, Source file: training_tagger.py

Example 12: indivUnigram

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
def indivUnigram(bambara, backoff):
    unigram = UnigramTagger(bambara.train_sents, backoff=backoff)
    print("Unigram accuracy: ", unigram.evaluate(bambara.test_sents))
    return unigram
Developer ID: Batene, Project: Bamanankan, Lines of code: 6, Source file: indivTaggers.py

Example 13: print

# Required import: from nltk.tag import UnigramTagger [as alias]
# Or: from nltk.tag.UnigramTagger import evaluate [as alias]
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))

print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Bigram Tagger Trained------------")
print(bigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Trigram Tagger Trained------------")
print(trigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger with backoff enabled. Backoff Chain: UnigramTagger -> DefaultTagger------------")
unigramTagger = UnigramTagger(brown_train_sents, backoff=defaultTagger)
print(unigramTagger.evaluate(brown_test_sents))
Developer ID: dxr1988, Project: NLTK-Research, Lines of code: 32, Source file: CompareTaggers.py


Note: The nltk.tag.UnigramTagger.evaluate method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license; do not repost without permission.