This page collects typical usage examples of the Python function nltk.corpus.treebank.tagged_sents. If you are wondering what tagged_sents does, how to call it, or what real-world usage looks like, the curated code examples below may help.
The following shows 15 code examples of the tagged_sents function, ordered by popularity by default.
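Before the examples, here is a minimal sketch of the function itself: treebank.tagged_sents() returns the sentences of NLTK's bundled Penn Treebank sample as lists of (word, tag) tuples. The snippet assumes the treebank corpus data has been (or can be) downloaded via nltk.download.

import nltk
from nltk.corpus import treebank

# Fetch the bundled Penn Treebank sample if it is not already present.
nltk.download('treebank', quiet=True)

# Each sentence is a list of (word, tag) tuples.
sents = treebank.tagged_sents()
print(len(sents))        # number of tagged sentences in the sample
print(sents[0][:5])      # first five (word, tag) pairs of the first sentence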
Example 1: load_data
def load_data(self, percentage):
    print("Started Loading the Data")
    # Get the complete data
    data_set = treebank.fileids()
    # Partition the data into train and test data sets
    training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
    testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]
    # What fraction of the training files should actually be used?
    index = int(percentage * len(training_data_fileIds))
    training_data_fileIds = training_data_fileIds[:index]
    tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
    tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)
    tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
    tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)
    # print(len(tagged_training_data1), len(tagged_testing_data1))
    # Untag the data for other uses
    untagged_training_data = [untag(item) for item in tagged_training_data]
    untagged_testing_data = [untag(item) for item in tagged_testing_data]
    print("Data Loaded Successfully. Stats:")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return tagged_training_data, tagged_testing_data, tagged_training_words, tagged_testing_words, untagged_training_data, untagged_testing_data
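A hedged usage note for the example above: load_data is written as a method (it takes self) of a class that is not shown on this page, and it relies on treebank and untag being importable in that module. A minimal, hypothetical way to exercise it:

from nltk.corpus import treebank
from nltk.tag.util import untag

class Loader:                 # hypothetical shell; the original class is not shown
    load_data = load_data     # reuse the function defined above as a method

# Use 50% of the wsj_00xx files for training, all wsj_01xx files for testing.
train_sents, test_sents, _, _, untagged_train, untagged_test = Loader().load_data(0.5)
print(len(train_sents), "training sentences,", len(test_sents), "testing sentences")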
Example 2: demo
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a tagger on them, tests the tagger's accuracy and tags an
    unseen sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print("tagger accuracy (test %i sentences, after training %i):" %
          (size, num_sents - size), maxent_tagger.evaluate(test_sents))
    print("\n\n")
    print("classify unseen sentence: ", maxent_tagger.tag(["This", "is", "so",
                                                           "slow", "!"]))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
Example 3: demo3
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())
    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]
        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc
        # print(i+1, (tacc / tp_kn), i+1, (sacc / sp_kn), i+1, tacc, i+1, sacc)

    # t was trained/tested on treebank (d), s on brown (e).
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
Example 4: get_accuracy
def get_accuracy(self, sentences=None):
    if not sentences:
        test_sents = treebank.tagged_sents()[6000:]
    else:
        test_sents = sentences
    print(self._tagger.evaluate(test_sents))
Example 5: tag_matching
def tag_matching(sequences):
    treebank_sentences = treebank.tagged_sents()
    #treebank_sentences = brown.tagged_sents()
    # Return best count/sequence
    best = (0, None)
    count = 0
    errors = 0
    resultset = []
    for seq in sequences:
        for sent in treebank_sentences:
            for i, word in enumerate(sent):
                if sent[i][1] == seq[0]:
                    try:
                        if sent[i+1][1] == seq[1]:
                            count += 1
                            #if sent[i+2][1] == seq[2]:
                            #    count += 1
                    except IndexError:
                        errors += 1
        if count > best[0]:
            best = (count, seq)
        resultset.append((seq, count, errors))
        count, errors = 0, 0
    return resultset
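A hedged usage sketch for the function above: sequences is assumed to be an iterable of POS-tag bigrams, since only seq[0] and seq[1] are consulted; the bigrams below are purely illustrative.

from nltk.corpus import treebank

# Count how often each illustrative tag bigram occurs in the treebank sample.
bigrams = [("DT", "NN"), ("JJ", "NN"), ("NN", "VBZ")]
for seq, count, errors in tag_matching(bigrams):
    print(seq, "matched", count, "times (", errors, "sentence-boundary errors )")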
Example 6: demo2
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = t.evaluate(d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
Example 7: main
def main():
    ### Globals ###
    regexp_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    training_data = treebank.tagged_sents()
    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)
    # Pickle files must be opened in binary mode.
    unigram_pickler = pickle.Pickler(open("unigram_tagger.bin", "wb"))
    bigram_pickler = pickle.Pickler(open("bigram_tagger.bin", "wb"))
    trigram_pickler = pickle.Pickler(open("trigram_tagger.bin", "wb"))
    unigram_pickler.dump(unigram_tagger)
    bigram_pickler.dump(bigram_tagger)
    trigram_pickler.dump(trigram_tagger)
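A minimal follow-up sketch (not part of the original example) showing how one of the pickled taggers could be loaded back and used; the file name matches the one written above.

import pickle

# Unpickle the trigram tagger written by main() and tag a sentence with it.
with open("trigram_tagger.bin", "rb") as f:
    trigram_tagger = pickle.load(f)

print(trigram_tagger.tag(["Stocks", "rallied", "on", "Friday", "."]))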
Example 8: traintest_bigram_trigram_tagger
def traintest_bigram_trigram_tagger(self):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import treebank
    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]
    print('training BigramTagger')
    bitagger = BigramTagger(train_sents)
    print('evaluating bitagger')
    print(bitagger.evaluate(test_sents))
    print('training TrigramTagger')
    tritagger = TrigramTagger(train_sents)
    print('evaluating tritagger')
    print(tritagger.evaluate(test_sents))
    print('tagging')
Example 9: benchmark_aptagger
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk
    '''
    from itertools import chain
    from nltk.corpus import treebank

    # `tagger` and `time` are module-level names (tagger is presumably the
    # averaged-perceptron tagger being benchmarked).
    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    k = 0
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
        k += 1

    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()

    ncorrect = sum(bool(t == p[1])
                   for t, p in zip(tags, chain.from_iterable(predicted)))
    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
Example 10: create_input_dataset
def create_input_dataset():
    # `wsj` and `no_of_sentences` are module-level globals
    # (wsj is presumably nltk.corpus.treebank or a similar reader).
    print('Loading input')
    input_data = []
    tags = []
    sents = wsj.sents()
    json_file = open('data.json', 'w')
    counter = 0
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)
            # Window of two words before and after the current position,
            # padded with '*' at the sentence boundaries.
            if j > 0:
                temp.append(sents[i][j-1])
            else:
                temp.append('*')
            if j > 1:
                temp.append(sents[i][j-2])
            else:
                temp.append('*')
            temp.append(sents[i][j])
            if j < len_sentence - 1:
                temp.append(sents[i][j+1])
            else:
                temp.append('*')
            if j < len_sentence - 2:
                temp.append(sents[i][j+2])
            else:
                temp.append('*')
            datapoint['wn'] = temp
            datapoint['index'] = j
            datapoint['i'] = counter
            counter += 1
            if prev is None:
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if prev_prev is None:
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]
            prev_prev = prev
            prev = word
            # print(datapoint, word[1])
            datapoint['tag'] = word[1]
            json_file.write(json.dumps(datapoint))
            json_file.write('\n')
            input_data.append(datapoint)
            tags.append(word[1])
    print('Done')
    json_file.close()
    return input_data, tags
Example 11: get_pos_tagger
def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
                 nltk.BigramTagger(train_sents, backoff=
                     nltk.UnigramTagger(train_sents, backoff=
                         nltk.DefaultTagger("NN"))))
    return tagger
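A hedged usage sketch: the function above builds a trigram → bigram → unigram → default-"NN" backoff chain, so contexts unseen by a higher-order model fall back to progressively simpler ones. Assuming nltk and the treebank sample are available in the same session, it could be exercised like this:

import nltk
from nltk.corpus import treebank

tagger = get_pos_tagger()
print(tagger.tag(["The", "quick", "brown", "fox", "jumps", "."]))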
Example 12: _demo_prepare_data
def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
Example 13: make_sentences
def make_sentences():
    dictionary = [k.strip() for k in open("./embeddings/words.lst")]
    ind_lookup = {word: (ind+1) for ind, word in enumerate(dictionary)}
    taglst = [k.strip() for k in open("data/tags.lst")]
    tag_lookup = {word: (ind+1) for ind, word in enumerate(taglst)}
    bracket_rep = {"-LRB-": "(",
                   "-RRB-": ")",
                   "-LSB-": "[",
                   "-RSB-": "]",
                   "-LCB-": "{",
                   "-RCB-": "}"}
    sentences = list(treebank.tagged_sents())
    for i, sent in enumerate(sentences):
        sent = [(item.lower(), tag) for (item, tag) in sent if tag != '-NONE-']
        sent = [(bracket_rep.get(item, item), tag) for (item, tag) in sent]
        sent = [(u'0', tag) if item[0].isdigit() else (item, tag) for (item, tag) in sent]
        sent = [(u"UNKNOWN", tag) if item not in ind_lookup else (item, tag) for (item, tag) in sent]
        # 1 indexed!!!
        sent = [(ind_lookup[item], tag_lookup[tag]) for (item, tag) in sent]
        sentences[i] = sent
    sentences = [i for i in sentences if len(i) > 4]
    print(sum(map(len, sentences)) / float(len(sentences)))
    return sentences
Example 14: split_sents
def split_sents(self, train=0.95, total=3500,
                document_class=TaggedSentence):
    sents = tagged_corpus.tagged_sents()[:total]
    total = len(sents) if total is None else total
    i = int(round(train * total))
    j = i + int(round(total - train * total))
    return (map(document_class, sents[0:i]),
            map(document_class, sents[i:j]))
Example 15: demo
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a tagger on them, tests the tagger's accuracy and tags an
    unseen sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]
    elif corpus.lower() == "cintil":
        print("Loading CINTIL")
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words', 'pos', 'ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger',
                                   'cintil-fixed-reduced.conll', column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        return

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    maxent_tagger.evaluate(test_sents)

    """
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante", "rápido", "!"])
    print "\n\n"
    print "show the 40 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(40)
    """

    fModel = open('test.pkl', "wb")
    pickle.dump(maxent_tagger, fModel, 1)
    fModel.close()