This article collects typical usage examples of the Python function nltk.corpus.brown.tagged_sents. If you have been wondering what tagged_sents does, how to call it, and what real usage looks like, the curated examples below may help.
The following 14 code examples of tagged_sents are drawn from open-source projects, ordered by popularity by default.
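As a quick orientation before the project examples: tagged_sents returns the corpus as a sequence of sentences, each a list of (word, tag) pairs. Here is a minimal sketch (it assumes the Brown corpus has been downloaded, e.g. via nltk.download('brown')):

from nltk.corpus import brown

# Each sentence is a list of (word, tag) pairs; the categories and fileids
# arguments restrict which part of the corpus is returned.
sents = brown.tagged_sents(categories='news')
print(len(sents))    # number of tagged sentences in the 'news' category
print(sents[0][:5])  # first five (word, tag) pairs of the first sentence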
Example 1: main

import nltk
from nltk.corpus import brown

def main():
    # Simple unigram tagger: hold out the first 100 sentences for testing.
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]
    nn_tagger = nltk.DefaultTagger('NN')
    ut2 = nltk.UnigramTagger(brown_train, backoff=nn_tagger)
    # SimpleUnigramTagger is defined elsewhere in this project.
    simpleUnigramTagger = SimpleUnigramTagger(brown_train, backoff=nn_tagger)
    print('Simple Unigram tagger accuracy: %4.1f%%' % (100.0 * simpleUnigramTagger.evaluate(brown_test)))
    print('Unigram tagger accuracy: %4.1f%%' % (100.0 * ut2.evaluate(brown_test)))

    # Affix tagger with an entropy-based cutoff: 80% train, 10% development, 10% test.
    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[:int(0.8 * len(brown_news_tagged))]
    rest = brown_news_tagged[int(0.8 * len(brown_news_tagged)):]
    brown_development = rest[:int(0.5 * len(rest))]
    brown_test = rest[int(0.5 * len(rest)):]
    affix_tagger = nltk.AffixTagger(brown_train, backoff=nltk.DefaultTagger('NN'), cutoff=2)
    # _train, _H and optimize_parameter are project-specific helpers that
    # patch AffixTagger with an entropy-based training criterion.
    nltk.AffixTagger._train = _train
    nltk.AffixTagger.H = _H
    optcutoff = optimize_parameter()
    print("the optimal cutoff param is: %d" % optcutoff)
    affix_tagger2 = nltk.AffixTagger(brown_train, backoff=nltk.DefaultTagger('NN'), cutoff=optcutoff)
    print('Affix tagger accuracy: %4.1f%%' % (100.0 * affix_tagger.evaluate(brown_test)))
    print('Affix tagger accuracy with entropy cutoff: %4.1f%%' % (100.0 * affix_tagger2.evaluate(brown_test)))
Example 2: __init__

from os.path import exists
from pickle import load, dump
from nltk.corpus import brown

def __init__(self):
    '''initialize and train brill and naive bayes classifiers'''
    # TODO: Fix bug where it loads tagger from calling module dir
    # `file`, `templates`, NaiveBayesTagger and FastBrillTaggerTrainer are
    # defined elsewhere in this project (FastBrillTaggerTrainer is NLTK 2.x API).
    if exists(file):
        input = open(file, 'rb')
        self.classifier = load(input)
        input.close()
        print('Successfully loaded saved classifier')
        return
    self.bayes = NaiveBayesTagger()
    boundary = int(len(brown.tagged_sents()) * 0.8)
    # simplify_tags=True is the pre-NLTK-3 API; newer NLTK uses tagset='universal'.
    train = brown.tagged_sents(simplify_tags=True)[:boundary]
    brill_trainer = FastBrillTaggerTrainer(initial_tagger=self.bayes,
                                           templates=templates,
                                           trace=3,
                                           deterministic=True)
    self.classifier = brill_trainer.train(train, max_rules=10)
    print('Saving Taggers to file: "pos_tagger.pickle"')
    output = open(file, 'wb')
    dump(self.classifier, output, 1)
    output.close()
Example 3: demo

import os
import nltk

def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus. We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus):
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]
    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    # MalletCRF lives in nltk.tag.crf in NLTK 2.x (removed in NLTK 3).
    crf = MalletCRF.train(fd, brown_train,  # '/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print('\nAccuracy: %.1f%%' % (acc * 100))
    print('Sample output:')
    print(textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent=' ', subsequent_indent=' ') + '\n')

    # Clean up
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)
    return crf
Example 4: training_sentences

from nltk.corpus import brown

def training_sentences(use=1.0, categories=[]):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    # TEST_PROPORTION is a module-level constant defined elsewhere in this project.
    if len(categories) == 0:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        n_train = int((1 - TEST_PROPORTION) * use * total) - 1  # use the first n sentences for training
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:n_train]
    return sents
Example 5: test_sentences

def test_sentences(categories=[]):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    if len(categories) == 0:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)  # use the last k sentences for test
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:]
    return sents
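The two helpers above share a module-level TEST_PROPORTION constant whose value is not shown in the excerpt. A minimal usage sketch, assuming TEST_PROPORTION = 0.1 (the actual value is project-specific):

# Hypothetical usage sketch; TEST_PROPORTION's real value is not in the excerpt.
TEST_PROPORTION = 0.1  # hold out the last 10% of each category for testing

train = training_sentences(use=0.5, categories=['news', 'fiction'])
test = test_sentences(categories=['news', 'fiction'])
print(len(train), len(test))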
Example 6: exercise2

import nltk
from nltk.corpus import brown as bn

def exercise2():
    print()
    print("Exercise 2:")
    brown_news_tagged_sents = bn.tagged_sents(categories='news')
    brown_lore_tagged_sents = bn.tagged_sents(categories='lore')
    trigram_tagger = nltk.TrigramTagger(brown_news_tagged_sents)
    # Evaluating on the training category ('news') overstates accuracy;
    # 'lore' shows performance on unseen text.
    brown_news_eval = trigram_tagger.evaluate(brown_news_tagged_sents)
    brown_lore_eval = trigram_tagger.evaluate(brown_lore_tagged_sents)
    print("Evaluation of the trigram tagger on 'News': %f" % brown_news_eval)
    print("Evaluation of the trigram tagger on 'Lore': %f" % brown_lore_eval)
    print()
Example 7: precisionRecall

def precisionRecall():
    def tag_list(tagged_sents):
        return [tag for sent in tagged_sents for (word, tag) in sent]

    def apply_tagger(tagger, corpus):
        return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

    # t2 is a previously trained tagger defined elsewhere in the project.
    gold = tag_list(brown.tagged_sents(categories='editorial'))
    test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
    cm = nltk.ConfusionMatrix(gold, test)
    # pp() was renamed pretty_format() in NLTK 3.
    print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))
Example 8: evaluate

def evaluate(self):
    '''run tests on conll2000 and treebank data'''
    # assumes: from nltk.corpus import treebank, conll2000, brown
    test = treebank.tagged_sents()[:100]
    treebank_result = 100 * self.classifier.evaluate(test)
    test = conll2000.tagged_sents()[:100]
    conll2000_result = 100 * self.classifier.evaluate(test)
    # evaluate on the held-out last 20% of the Brown corpus
    test = brown.tagged_sents()[int(len(brown.tagged_sents()) * 0.8):]
    brown_result = 100 * self.classifier.evaluate(test)
    return (treebank_result, conll2000_result, brown_result)
Example 9: testSet

import random
from nltk.corpus import brown

def testSet():
    # Three alternative train/test splits; each reassignment below
    # overwrites the previous one.
    # 1. Random shuffle, 90/10 split.
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]
    # 2. Split by file, so test sentences come from unseen documents.
    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])
    # 3. Split by genre, the hardest generalization test.
    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
Example 10: get_tagged_tokens

def get_tagged_tokens(self, corpus=TAGGED, testing=False):
    """This tokenizes, segments, and tags all the files in a directory."""
    # build_trainer, tag_token_spans and TAGGED are defined elsewhere
    # in this project.
    if testing:
        # train against a smaller version of the corpus so that it
        # doesn't take years during testing.
        tagger = build_trainer(brown.tagged_sents(categories='news'))
    else:
        tagger = build_trainer(brown.tagged_sents())
    tokens_and_spans = self.tokenize_corpus(corpus)
    tagged_spanned_tokens = tag_token_spans(
        tokens_and_spans,
        tagger,
    )
    return tagged_spanned_tokens
Example 11: exercise1

def exercise1():
    print()
    print("Exercise 1:")
    brown_news_tagged_sents = bn.tagged_sents(categories='news')
    brown_lore_tagged_sents = bn.tagged_sents(categories='lore')
    unigram_tagger = nltk.UnigramTagger(brown_news_tagged_sents)
    brown_news_eval = unigram_tagger.evaluate(brown_news_tagged_sents)
    brown_lore_eval = unigram_tagger.evaluate(brown_lore_tagged_sents)
    print("Evaluation of the unigram tagger on 'News': %f" % brown_news_eval)
    print("Evaluation of the unigram tagger on 'Lore': %f" % brown_lore_eval)
    brown_lore = bn.sents(categories='lore')
    b_lore = unigram_tagger.tag(brown_lore[200])
    print("Tagged words for sentence 200 of the Brown corpus category 'Lore':")
    print(b_lore)
    print()
Example 12: create_tagger

def create_tagger():
    """Train a tagger from the Brown Corpus. This should not be called very
    often; only in the event that the tagger pickle wasn't found."""
    print("Building tagger...")
    train_sents = brown.tagged_sents()
    # These regexes were lifted from the NLTK book tagger chapter.
    t0 = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    print("got t0")
    # Chain unigram -> bigram -> trigram taggers, each backing off to the last.
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
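The docstring above mentions a tagger pickle. A minimal sketch of that caching pattern, assuming a path of 'tagger.pickle' (the file name is illustrative, not taken from the original project):

# Hypothetical caching wrapper around create_tagger(); the path name is
# illustrative, not from the original project.
import os
import pickle

def load_or_create_tagger(path='tagger.pickle'):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)    # reuse the cached tagger
    tagger = create_tagger()         # slow: trains on the full Brown corpus
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    return tagger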
Example 13: read_datas

def read_datas(self):
    # 90/10 train/test split over the full tagged Brown corpus.
    brown_tagged_sentence = brown.tagged_sents()
    brown_sent = brown.sents()  # untagged sentences (unused here)
    size = int(len(brown_tagged_sentence) * 0.9)
    train_set = brown_tagged_sentence[:size]
    test_set = brown_tagged_sentence[size:]
    return (train_set, test_set)
Example 14: ch05_11_train_test_affix_tagger

def ch05_11_train_test_affix_tagger():
    from nltk.corpus import brown
    # map each word in the 'news' category to its most frequent tag
    fd = nltk.FreqDist(brown.words(categories="news"))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
    most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
    # note: the model keys are whole words, while AffixTagger looks up affixes
    affix_tagger = nltk.AffixTagger(model=most_freq_pos)
    print(affix_tagger.evaluate(brown.tagged_sents(categories="editorial")))