本文整理汇总了 Python 中 nltk.corpus.brown.tagged_sents 方法的典型用法代码示例。如果您正苦于以下问题:Python brown.tagged_sents 方法的具体用法?Python brown.tagged_sents 怎么用?Python brown.tagged_sents 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.corpus.brown 的用法示例。
在下文中一共展示了brown.tagged_sents方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: demo
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    """Train a ``TnT`` tagger on Brown sentences and print its output.

    Tagged Brown sentences 200-999 are used for training. Untagged
    sentences 100-119 are then passed to ``tagdata`` and every resulting
    token is printed next to the token at the same position in the tagged
    corpus, with a blank line after each sentence.
    """
    from nltk.corpus import brown

    gold_sents = list(brown.tagged_sents())
    raw_sents = list(brown.sents())

    # Build the tagger and fit it on a slice of the tagged corpus.
    tagger = TnT()
    tagger.train(gold_sents[200:1000])

    # Tag a slice of the untagged corpus.
    predictions = tagger.tagdata(raw_sents[100:120])

    # Show predicted and reference tokens side by side, sentence by sentence.
    # The tagged slice starts at corpus sentence 100, hence start=100.
    for corpus_index, predicted in enumerate(predictions, start=100):
        reference = gold_sents[corpus_index]
        for position, token in enumerate(predicted):
            print(token, '--', reference[position])
        print()
示例2: load_pos
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def load_pos(num_sents):
    """Load normalised tagged sentences from the Brown 'news' category.

    :param num_sents: upper slice bound applied to the 'news' tagged
        sentences.
    :return: a 3-tuple ``(sentences, tags, words)``. ``sentences`` is the
        list of processed sentences, each token rewritten as
        ``(lowercased word, simplified tag)``; ``tags`` is a list of the
        distinct simplified tags; ``words`` is a list of the distinct
        lowercased words.
    """
    from nltk.corpus import brown

    # Matches a lone '*', a '--', or otherwise the leading run of
    # characters up to the first '+', '*' or '-'.
    tag_pattern = re.compile(r'[*]|--|[^+*-]+')
    seen_tags = set()
    seen_words = set()
    result = []

    for sent in brown.tagged_sents(categories='news')[:num_sents]:
        for position, (token, raw_tag) in enumerate(sent):
            token = token.lower()
            seen_words.add(token)
            simple_tag = tag_pattern.match(raw_tag).group()
            seen_tags.add(simple_tag)
            # The sentence list is rewritten in place and then collected.
            sent[position] = (token, simple_tag)
        result.append(sent)

    return result, list(seen_tags), list(seen_words)
示例3: get_pos_tagger
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
    """Build and return the part-of-speech tagger.

    The returned ``RegexpTagger`` tags the indefinite articles as
    ``'ex_quant'`` and every/all as ``'univ_quant'``. Any other token is
    handed to a backoff chain: trigram -> bigram -> unigram taggers
    trained on the Brown 'news' tagged sentences, and finally a set of
    regular-expression rules ending in a catch-all ``'NN'``.

    :return: the outermost ``RegexpTagger`` of the backoff chain.
    """
    from nltk.corpus import brown

    regexp_tagger = RegexpTagger(
        [
            # The decimal point is escaped so that only a literal '.' may
            # separate the integer and fractional digits; an unescaped '.'
            # would also accept tokens such as '1a5' as cardinal numbers.
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN'),  # nouns (default)
        ]
    )
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [
            (r'(A|a|An|an)$', 'ex_quant'),
            (r'(Every|every|All|all)$', 'univ_quant'),
        ],
        backoff=trigram_tagger,
    )

    return main_tagger
示例4: demo
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    """Train a ``tnt.TnT`` tagger on Brown sentences and print its output.

    Tagged Brown sentences 200-999 are used for training. Untagged
    sentences 100-119 are then passed to ``tagdata`` and every resulting
    token is printed next to the token at the same position in the tagged
    corpus, with a blank line after each sentence.

    The Python 2 ``print`` statements were replaced by ``print()`` calls so
    the function is valid Python 3.
    """
    from nltk.tag import tnt
    from nltk.corpus import brown

    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results; the tagged slice starts at corpus sentence 100
    for j, s in enumerate(tagged_data):
        t = sents[j + 100]
        for i, token in enumerate(s):
            print(token, '--', t[i])
        print()
示例5: get_pos_tagger
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
    """Build and return the part-of-speech tagger.

    The result is a ``RegexpTagger`` that tags a/an as ``'ex_quant'`` and
    every/all as ``'univ_quant'``. It backs off to trigram, bigram and
    unigram taggers trained on the Brown 'news' tagged sentences, and
    ultimately to regular-expression rules whose last entry tags
    everything as ``'NN'``.

    :return: the outermost ``RegexpTagger`` of the backoff chain.
    """
    fallback_rules = [
        # '\.' (not '.') so that only a real decimal point is accepted;
        # the unescaped form also matched tokens such as '1a5'.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),    # articles
        (r'.*able$', 'JJ'),                 # adjectives
        (r'.*ness$', 'NN'),                 # nouns formed from adjectives
        (r'.*ly$', 'RB'),                   # adverbs
        (r'.*s$', 'NNS'),                   # plural nouns
        (r'.*ing$', 'VBG'),                 # gerunds
        (r'.*ed$', 'VBD'),                  # past tense verbs
        (r'.*', 'NN'),                      # nouns (default)
    ]
    regexp_tagger = RegexpTagger(fallback_rules)

    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    override_rules = [
        (r'(A|a|An|an)$', 'ex_quant'),
        (r'(Every|every|All|all)$', 'univ_quant'),
    ]
    main_tagger = RegexpTagger(override_rules, backoff=trigram_tagger)

    return main_tagger
示例6: store_pos_tag_dicts
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def store_pos_tag_dicts():
    """Build two word -> tags lookup tables and pickle them to disk.

    The first table maps each lowercased treebank word to a tuple of the
    tags it occurs with in ``treebank.tagged_sents()``. The second does the
    same for the ``tagset='universal'`` tags of both the treebank and Brown
    corpora. Every value of ``states`` is then registered as ``'NNP'`` in
    the first table and ``'NOUN'`` in the second.

    The pair ``(pos_tag_dict, pos_tag_dict_univ)`` is written with pickle
    protocol 2 to ``<mod_path>/data/pos_dicts.pickle``.

    A tag is now only appended when it is not already present for that
    word, including for the ``states`` entries, which were previously
    appended unconditionally and could produce duplicate tags.
    """

    def _add_tag(table, word, tag):
        # Record `tag` for the lowercased `word` unless already present.
        key = word.lower()
        if tag not in table[key]:
            table[key] += (tag,)

    pos_tag_dict = defaultdict(tuple)
    for sent in treebank.tagged_sents():
        for word, tag in sent:
            _add_tag(pos_tag_dict, word, tag)

    pos_tag_dict_univ = defaultdict(tuple)
    penn_tagged_univ = treebank.tagged_sents(tagset='universal')
    brown_tagged_univ = brown.tagged_sents(tagset='universal')
    for text in (penn_tagged_univ, brown_tagged_univ):
        for sent in text:
            for word, tag in sent:
                _add_tag(pos_tag_dict_univ, word, tag)

    # NOTE(review): `states` is defined outside this block; presumably a
    # mapping whose values are place names - confirm against its definition.
    for word in states.values():
        _add_tag(pos_tag_dict, word, 'NNP')
        _add_tag(pos_tag_dict_univ, word, 'NOUN')

    dicts = (pos_tag_dict, pos_tag_dict_univ)
    with open('{}/data/pos_dicts.pickle'.format(mod_path), 'wb') as out_file:
        pickle.dump(dicts, out_file, protocol=2)
示例7: demo
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    """Demonstrate ``TnT`` tagging on the Brown corpus.

    The tagger is trained on tagged Brown sentences 200-999. Untagged
    sentences 100-119 are then tagged, and each output token is printed
    alongside the token at the same position of the tagged corpus; a blank
    line separates sentences.
    """
    from nltk.corpus import brown

    tagged_corpus = list(brown.tagged_sents())
    plain_corpus = list(brown.sents())

    # Instantiate and train the tagger.
    tnt_tagger = TnT()
    tnt_tagger.train(tagged_corpus[200:1000])

    # Run the tagger over unseen, untagged sentences.
    output = tnt_tagger.tagdata(plain_corpus[100:120])

    # Print each produced token beside its counterpart from the corpus.
    first_test_sentence = 100
    for sentence_no, produced in enumerate(output):
        expected = tagged_corpus[sentence_no + first_test_sentence]
        for token_no, produced_token in enumerate(produced):
            print(produced_token, '--', expected[token_no])
        print()
示例8: load_pos
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def load_pos(num_sents):
    """Return cleaned Brown 'news' sentences plus their tag and word sets.

    Each token of the first ``num_sents`` slice of 'news' tagged sentences
    is rewritten as ``(word.lower(), simplified_tag)``, where the
    simplified tag is the prefix of the original tag matched by the regular
    expression below.

    :param num_sents: upper slice bound on the 'news' tagged sentences.
    :return: ``(cleaned_sentences, tags, words)`` - the rewritten
        sentences, a list of the distinct simplified tags and a list of the
        distinct lowercased words.
    """
    from nltk.corpus import brown

    news_sentences = brown.tagged_sents(categories='news')[:num_sents]

    # A '*' or '--' is kept whole; any other tag is cut at the first
    # '+', '*' or '-'.
    prefix_re = re.compile(r'[*]|--|[^+*-]+')
    tags = set()
    vocabulary = set()
    cleaned = []

    for tokens in news_sentences:
        for idx, pair in enumerate(tokens):
            surface, full_tag = pair
            surface = surface.lower()
            vocabulary.add(surface)
            short_tag = prefix_re.match(full_tag).group()
            tags.add(short_tag)
            tokens[idx] = (surface, short_tag)  # overwrite in place
        cleaned.append(tokens)

    return cleaned, list(tags), list(vocabulary)
示例9: demo2
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    """Compare two ``TnT`` taggers (``C=False`` vs ``C=True``) on treebank.

    Both taggers are trained on treebank tagged sentences from index 1100
    onward. Each is then evaluated on ten consecutive 100-sentence slices
    (0-99 ... 900-999). For every slice the accuracy, the known and unknown
    ratios and the accuracy divided by the known ratio are printed, first
    for the ``C=False`` tagger and then for the ``C=True`` tagger. The
    taggers' ``known``/``unknown`` counters are reset after each
    evaluation.
    """
    from nltk.corpus import treebank

    corpus = list(treebank.tagged_sents())

    plain = TnT(N=1000, C=False)
    cased = TnT(N=1000, C=True)
    plain.train(corpus[11 * 100:])
    cased.train(corpus[11 * 100:])

    def report(tagger, heading, fold):
        # Evaluate one tagger on one slice, reset its counters and print.
        accuracy = tagger.evaluate(fold)
        unknown_ratio = float(tagger.unknown) / float(tagger.known + tagger.unknown)
        known_ratio = float(tagger.known) / float(tagger.known + tagger.unknown)
        tagger.unknown = 0
        tagger.known = 0

        print(heading)
        print('Accuracy:', accuracy)
        print('Percentage known:', known_ratio)
        print('Percentage unknown:', unknown_ratio)
        print('Accuracy over known words:', (accuracy / known_ratio))

    for fold_no in range(10):
        start = fold_no * 100
        stop = (fold_no + 1) * 100
        report(plain, 'Capitalization off:', corpus[start:stop])
        report(cased, 'Capitalization on:', corpus[start:stop])
示例10: demo2
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    """Compare two ``tnt.TnT`` taggers (``C=False`` vs ``C=True``) on treebank.

    Both taggers are trained on treebank tagged sentences from index 1100
    onward and scored with ``tag.accuracy`` on ten consecutive 100-sentence
    slices. For each slice the accuracy, the known and unknown ratios and
    the accuracy divided by the known ratio are printed for each tagger,
    and the taggers' ``known``/``unknown`` counters are reset.

    The Python 2 ``print`` statements were replaced by ``print()`` calls so
    the function is valid Python 3.
    """
    from nltk import tag
    from nltk.tag import tnt
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = tnt.TnT(N=1000, C=False)
    s = tnt.TnT(N=1000, C=True)
    t.train(d[11 * 100:])
    s.train(d[11 * 100:])

    for i in range(10):
        fold = d[i * 100:(i + 1) * 100]

        tacc = tag.accuracy(t, fold)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        # Reset the counters so the next evaluation starts from zero.
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = tag.accuracy(s, fold)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
示例11: demo
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    """Train a ``MalletCRF`` tagger on Brown data and print its accuracy.

    Training data are the first ``train_size`` 'news' tagged sentences and
    test data the first ``test_size`` 'editorial' tagged sentences, with
    every tag truncated to its first two characters. After training, the
    accuracy on the test data and the tagging of test sentence 5 are
    printed, and the file named by ``crf.filename`` is deleted.

    The Python 2 ``print`` statements were replaced by ``print()`` calls so
    the function is valid Python 3.

    :param train_size: upper slice bound on the 'news' training sentences.
    :param test_size: upper slice bound on the 'editorial' test sentences.
        ``brown_test[5]`` is indexed, so fewer than six test sentences
        raise ``IndexError``.
    :param java_home: passed to ``nltk.internals.config_java``.
    :param mallet_home: passed to ``nltk.classify.mallet.config_mallet``.
    :return: the trained ``MalletCRF`` object (its file has already been
        removed when this returns).
    """
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus. We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus):
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]

    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    crf = MalletCRF.train(fd, brown_train,  # '/tmp/crf-model',
                          transduction_type='VITERBI')

    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print('\nAccuracy: %.1f%%' % (acc * 100))
    print('Sample output:')
    print(textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent=' ', subsequent_indent=' ') + '\n')

    # Clean up
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)

    return crf
示例12: demo2
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    """Evaluate ``TnT`` with ``C=False`` and ``C=True`` on treebank slices.

    Two taggers are trained on treebank tagged sentences from index 1100
    onward. For each of ten consecutive 100-sentence slices, each tagger in
    turn (``C=False`` first) is evaluated, its ``known``/``unknown``
    counters are turned into ratios and reset to zero, and the accuracy,
    both ratios and accuracy / known ratio are printed under a
    'Capitalization off:' or 'Capitalization on:' heading.
    """
    from nltk.corpus import treebank

    sentences = list(treebank.tagged_sents())

    without_caps = TnT(N=1000, C=False)
    with_caps = TnT(N=1000, C=True)
    without_caps.train(sentences[1100:])
    with_caps.train(sentences[1100:])

    # Evaluation order within each slice: C=False tagger, then C=True.
    runs = (
        ('Capitalization off:', without_caps),
        ('Capitalization on:', with_caps),
    )

    for fold in range(10):
        lower, upper = fold * 100, (fold + 1) * 100
        for heading, tagger in runs:
            accuracy = tagger.evaluate(sentences[lower:upper])
            unknown_ratio = tagger.unknown / (tagger.known + tagger.unknown)
            known_ratio = tagger.known / (tagger.known + tagger.unknown)
            tagger.unknown = 0
            tagger.known = 0

            print(heading)
            print('Accuracy:', accuracy)
            print('Percentage known:', known_ratio)
            print('Percentage unknown:', unknown_ratio)
            print('Accuracy over known words:', (accuracy / known_ratio))
示例13: get_pos_tagger
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
    """Build and return the part-of-speech tagger.

    The outermost tagger is a ``RegexpTagger`` that assigns ``'ex_quant'``
    to a/an and ``'univ_quant'`` to every/all. Everything else falls back
    through trigram, bigram and unigram taggers trained on the Brown
    'news' tagged sentences, and finally to regular-expression rules with
    a catch-all ``'NN'`` default.

    :return: the outermost ``RegexpTagger`` of the backoff chain.
    """
    from nltk.corpus import brown

    regexp_tagger = RegexpTagger(
        [
            # The dot is escaped: an unescaped '.' matches any character,
            # which let tokens such as '1a5' be tagged as cardinal numbers.
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN'),  # nouns (default)
        ]
    )
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger,
    )

    return main_tagger
示例14: demo3
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo3():
    """Run 10-fold cross-validation of ``TnT`` on treebank and on Brown.

    The first 1000 tagged sentences of each corpus are split into ten
    equal folds. In every round one ``TnT(N=1000, C=False)`` tagger per
    corpus is trained on nine folds and evaluated on the held-out fold.
    The summed accuracy, known-word ratio and accuracy / known-word ratio
    are printed per corpus, multiplied by 10 (i.e. the mean over the ten
    folds scaled by 100).

    Fixes relative to the earlier version:

    * the Brown tagger's accuracy over known words is divided by its own
      known ratio (``sp_kn``) rather than the treebank tagger's (``tp_kn``);
    * each label is printed with the statistics of the tagger trained on
      that corpus (the "brown"/"treebank" labels were swapped);
    * the unused unknown-ratio locals were removed.
    """
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # t* accumulators belong to the treebank tagger, s* to the Brown tagger.
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        # Hold out fold i for testing and train on the rest.
        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]
        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

    # s was trained and tested on Brown, t on treebank.
    print("brown: acc over words known:", 10 * sknacc)
    print(" : overall accuracy:", 10 * sallacc)
    print(" : words known:", 10 * sknown)
    print("treebank: acc over words known:", 10 * tknacc)
    print(" : overall accuracy:", 10 * tallacc)
    print(" : words known:", 10 * tknown)
示例15: demo3
# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo3():
    """Run 10-fold cross-validation of ``TnT`` on treebank and on Brown.

    The first 1000 tagged sentences of each corpus are cut into ten equal
    folds. In each round a fresh ``TnT(N=1000, C=False)`` tagger per corpus
    is trained on nine folds and evaluated on the remaining one. For each
    corpus the summed accuracy, known-word ratio and accuracy / known-word
    ratio are printed, multiplied by 10 (the mean over ten folds scaled by
    100).

    Fixes relative to the earlier version:

    * the Brown accuracy over known words now uses the Brown tagger's own
      known ratio instead of the treebank tagger's;
    * the "brown" and "treebank" labels are printed with the statistics of
      the tagger actually trained on that corpus (they were swapped);
    * the unused unknown-ratio locals were removed.
    """
    from nltk.corpus import treebank, brown

    def split_fold(data, fold_size, fold_no):
        # Return (train, test) with fold `fold_no` held out as the test set.
        lo = fold_no * fold_size
        hi = (fold_no + 1) * fold_size
        return data[:lo] + data[hi:], data[lo:hi]

    def run_fold(train, test):
        # Train a fresh tagger; return (accuracy, known ratio) on `test`.
        tagger = TnT(N=1000, C=False)
        tagger.train(train)
        accuracy = tagger.evaluate(test)
        known_ratio = tagger.known / (tagger.known + tagger.unknown)
        return accuracy, known_ratio

    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # t* accumulators belong to the treebank tagger, s* to the Brown tagger.
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        dtrain, dtest = split_fold(d, d10, i)
        etrain, etest = split_fold(e, e10, i)

        tacc, tp_kn = run_fold(dtrain, dtest)
        sacc, sp_kn = run_fold(etrain, etest)

        tknown += tp_kn
        sknown += sp_kn
        tknacc += tacc / tp_kn
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc

    # The s* statistics come from Brown, the t* statistics from treebank.
    print("brown: acc over words known:", 10 * sknacc)
    print(" : overall accuracy:", 10 * sallacc)
    print(" : words known:", 10 * sknown)
    print("treebank: acc over words known:", 10 * tknacc)
    print(" : overall accuracy:", 10 * tallacc)
    print(" : words known:", 10 * tknown)