当前位置: 首页>>代码示例>>Python>>正文


Python treebank.tagged_sents函数代码示例

本文整理汇总了Python中nltk.corpus.treebank.tagged_sents函数的典型用法代码示例。如果您正苦于以下问题:Python tagged_sents函数的具体用法?Python tagged_sents怎么用?Python tagged_sents使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了tagged_sents函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: load_data

    def load_data(self, percentage):
        """Load train/test splits of the treebank corpus.

        Files whose id contains ``wsj_00`` form the training pool and files
        whose id contains ``wsj_01`` form the test set. Only the leading
        ``percentage`` fraction of the training files is kept.

        :param percentage: fraction of the training files to use; it is
            multiplied by the number of training files and truncated.
        :return: a 6-tuple ``(tagged training sentences, tagged testing
            sentences, tagged training words, tagged testing words,
            untagged training sentences, untagged testing sentences)``.
        """
        print("Started Loading the Data")

        all_ids = treebank.fileids()
        train_ids = [fid for fid in all_ids if "wsj_00" in str(fid)]
        test_ids = [fid for fid in all_ids if "wsj_01" in str(fid)]

        # Keep only the requested leading share of the training files.
        train_ids = train_ids[:int(percentage * len(train_ids))]

        train_sents = treebank.tagged_sents(fileids=train_ids)
        test_sents = treebank.tagged_sents(fileids=test_ids)

        train_words = treebank.tagged_words(fileids=train_ids)
        test_words = treebank.tagged_words(fileids=test_ids)

        # Tag-free copies of the sentences for other uses.
        plain_train = list(map(untag, train_sents))
        plain_test = list(map(untag, test_sents))

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(train_sents))
        print("Testing Data  Sentences: ", len(test_sents))

        return (train_sents, test_sents, train_words, test_words,
                plain_train, plain_test)
开发者ID:GaddipatiAsish,项目名称:Natural-Language-Processing,代码行数:29,代码来源:Q6_Part1.py

示例2: demo

def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a C{MaxentPosTagger} on them, prints the tagger's accuracy,
    tags an unseen sentence and shows the 10 most informative features.

    The first 10% of the loaded sentences are held out for testing; the
    remainder is used for training. If C{corpus} names an unsupported corpus,
    a message is printed and the function returns without training.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}
    (case-insensitive).

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    corpus_name = corpus.lower()
    if corpus_name == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus_name == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        # Nothing was loaded; continuing would hit an unbound local name.
        return

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    # Report the real training size: the corpus may hold fewer sentences
    # than num_sents, in which case num_sents - size would overstate it.
    accuracy = maxent_tagger.evaluate(test_sents)
    print("tagger accuracy (test %i sentences, after training %i): %s"
          % (size, len(train_sents), accuracy))
    print("\n\n")
    unseen = maxent_tagger.tag(["This", "is", "so", "slow", "!"])
    print("classify unseen sentence:  %s" % (unseen,))
    print("\n\n")
    print("show the 10 most informative features:")
    print(maxent_tagger.classifier.show_most_informative_features(10))
开发者ID:Big-Data,项目名称:nltk-maxent-pos-tagger,代码行数:34,代码来源:mxpost.py

示例3: demo3

def demo3():
    """Compare TnT on treebank and Brown with 10-fold cross-validation.

    The first 1000 tagged sentences of each corpus are split into ten
    consecutive folds. For every fold a fresh ``TnT(N=1000, C=False)``
    tagger is trained on the other nine folds and evaluated on the held-out
    one. Accumulated results are printed per corpus; nothing is returned.

    Names prefixed with ``t``/``d`` refer to treebank, names prefixed with
    ``s``/``e`` refer to Brown.
    """
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())[:1000]
    e = list(brown.tagged_sents())[:1000]

    # Fold size: one tenth of each sample.
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]

        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        # Each tagger's accuracy is normalised by its own known-word ratio.
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc

    # Sums over ten folds, times 10, equal the fold mean times 100.
    # The s* totals belong to Brown and the t* totals to treebank.
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
开发者ID:prz3m,项目名称:kind2anki,代码行数:60,代码来源:tnt.py

示例4: get_accuracy

	def get_accuracy(self, sentences=None):
		"""Print the accuracy of ``self._tagger`` on a set of tagged sentences.

		:param sentences: tagged sentences to evaluate on. When omitted or
			empty, the treebank sentences from index 6000 onwards are used.
		:return: None; the value returned by the tagger's ``evaluate`` is
			printed.
		"""
		if not sentences:
			# NOTE(review): this slice is empty if the installed corpus has
			# 6000 sentences or fewer -- confirm the corpus size.
			test_sents = treebank.tagged_sents()[6000:]
		else:
			test_sents = sentences
		print(self._tagger.evaluate(test_sents))
开发者ID:jayvachon,项目名称:managerisk-reflection-search,代码行数:7,代码来源:sentiment-analysis.py

示例5: tag_matching

def tag_matching(sequences):
    """Count how often each tag pair occurs consecutively in treebank.

    For every ``seq`` in ``sequences``, each tagged treebank sentence is
    scanned for a token tagged ``seq[0]`` that is immediately followed by a
    token tagged ``seq[1]``.

    :param sequences: iterable of indexable tag sequences; only the first
        two elements of each are compared.
    :return: list of ``(seq, count, errors)`` tuples, one per input
        sequence and in input order. ``count`` is the number of matches;
        ``errors`` is the number of ``IndexError``s hit while looking at the
        follower, e.g. when the ``seq[0]`` match is the last token of a
        sentence.
    """
    treebank_sentences = treebank.tagged_sents()

    resultset = []

    for seq in sequences:
        # Both tallies start fresh for every sequence.
        count = 0
        errors = 0
        for sent in treebank_sentences:
            for i, word in enumerate(sent):
                if word[1] != seq[0]:
                    continue
                try:
                    if sent[i + 1][1] == seq[1]:
                        count += 1
                except IndexError:
                    errors += 1
        resultset.append((seq, count, errors))
    return resultset
开发者ID:Liechti,项目名称:exam-ai,代码行数:29,代码来源:syntax_filtering.py

示例6: demo2

def demo2():
    """Contrast TnT with and without capitalization on treebank.

    Two ``TnT(N=1000)`` taggers, one with ``C=False`` and one with
    ``C=True``, are trained on the treebank sentences from index 1100
    onwards. Each is then evaluated on ten consecutive 100-sentence chunks
    taken from the start of the corpus, and the statistics for every chunk
    are printed. Nothing is returned.
    """
    from nltk.corpus import treebank

    sents = list(treebank.tagged_sents())
    train_start = 11 * 100

    plain = TnT(N=1000, C=False)
    capitalized = TnT(N=1000, C=True)
    plain.train(sents[train_start:])
    capitalized.train(sents[train_start:])

    def report(tagger, heading, chunk):
        # Evaluate one chunk, reset the tagger's counters, print the stats.
        accuracy = tagger.evaluate(chunk)
        seen = float(tagger.known + tagger.unknown)
        share_unknown = float(tagger.unknown) / seen
        share_known = float(tagger.known) / seen
        tagger.unknown = 0
        tagger.known = 0

        print(heading)
        print('Accuracy:', accuracy)
        print('Percentage known:', share_known)
        print('Percentage unknown:', share_unknown)
        print('Accuracy over known words:', (accuracy / share_known))

    for fold in range(10):
        lo, hi = fold * 100, (fold + 1) * 100
        report(plain, 'Capitalization off:', sents[lo:hi])
        report(capitalized, 'Capitalization on:', sents[lo:hi])
开发者ID:Arttii,项目名称:TextBlob,代码行数:34,代码来源:tnt.py

示例7: main

def main():
    """Train unigram, bigram and trigram taggers on treebank and pickle them.

    The taggers form a backoff chain: trigram -> bigram -> unigram ->
    regular-expression tagger. Each n-gram tagger is written to its own
    file in the current directory: ``unigram_tagger.bin``,
    ``bigram_tagger.bin`` and ``trigram_tagger.bin``.
    """
    regexp_tagger = nltk.RegexpTagger([
        # The decimal point is escaped so that only a literal "." separates
        # the integer and fractional parts.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ])

    training_data = treebank.tagged_sents()

    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)

    outputs = (
        ("unigram_tagger.bin", unigram_tagger),
        ("bigram_tagger.bin", bigram_tagger),
        ("trigram_tagger.bin", trigram_tagger),
    )
    for filename, tagger in outputs:
        # Pickle data is bytes: write in binary mode, and let the context
        # manager close the file even if dumping fails.
        with open(filename, "wb") as out:
            pickle.dump(tagger, out)
开发者ID:DrDub,项目名称:question-answer-engine,代码行数:27,代码来源:gen_taggers.py

示例8: traintest_bigram_trigram_tagger

 def traintest_bigram_trigram_tagger(self):
     """Train bigram and trigram taggers on treebank and print their accuracy.

     The first 3000 tagged treebank sentences are used for training and the
     remaining ones for evaluation. Neither tagger is given a backoff
     tagger. Progress messages and scores are printed; nothing is returned.
     """
     from nltk.tag import BigramTagger, TrigramTagger
     from nltk.corpus import treebank

     tagged = treebank.tagged_sents()
     train_sents = tagged[:3000]
     test_sents = tagged[3000:]

     print('trainging bigramTagger')
     bitagger = BigramTagger(train_sents)
     print('evaluation bitagger')
     print(bitagger.evaluate(test_sents))

     print('trainging trigram Tagger')
     tritagger = TrigramTagger(train_sents)
     # This score belongs to the trigram tagger, so label it accordingly.
     print('evaluation tritagger')
     print(tritagger.evaluate(test_sents))
     print('tagging')
开发者ID:nicorotstein,项目名称:juan_alberto,代码行数:16,代码来源:linguatools.py

示例9: benchmark_aptagger

def benchmark_aptagger():
    '''
    Time and score the module-level ``tagger`` on the Penn Treebank sample
    shipped with nltk.

    Tokens tagged "-NONE-" are dropped before tagging. The elapsed time,
    throughput and token-level accuracy are printed; nothing is returned.
    '''
    from nltk.corpus import treebank

    # "-NONE-" entries appear to be garbage, so they are filtered out of
    # both the input words and the gold tags.
    text = []
    tags = []
    for sentence in treebank.tagged_sents():
        kept = [ele for ele in sentence if ele[1] != '-NONE-']
        text.append([ele[0] for ele in kept])
        tags.extend(ele[1] for ele in kept)

    started = time.time()
    predicted = tagger.tag_sents(text)
    elapsed = time.time() - started

    # Compare gold tags with the flattened stream of predicted pairs.
    flat_predictions = chain.from_iterable(predicted)
    ncorrect = sum(1 for gold, pair in izip(tags, flat_predictions)
                   if gold == pair[1])

    total = len(tags)
    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        elapsed, total, int(total / elapsed)))
    print("Accuracy: %s" % (float(ncorrect) / total))
开发者ID:seomoz,项目名称:mltk,代码行数:26,代码来源:bench.py

示例10: create_input_dataset

def create_input_dataset():
    """Build per-token feature records from ``wsj`` and dump them as JSON lines.

    The first ``no_of_sentences`` tagged sentences are processed. For every
    token a dict is created with these keys:

    - ``wn``: ``[previous word, word two back, current word, next word,
      word two ahead]``, with ``'*'`` wherever the position falls outside
      the sentence;
    - ``index``: position of the token within its sentence;
    - ``i``: running token counter across all sentences;
    - ``t_minus_one`` / ``t_minus_two``: tags of the previous two tokens,
      ``'*'`` at the start of a sentence;
    - ``tag``: the token's own tag.

    Each record is also written to ``data.json`` as one JSON object per
    line.

    :return: ``(input_data, tags)`` -- the list of records and the parallel
        list of tags.
    """
    print('Loading input')
    input_data = []
    tags = []
    sents = wsj.sents()
    counter = 0
    # The context manager closes the file even if processing fails midway.
    with open('data.json', 'w') as json_file:
        for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
            words = sents[i]  # look the sentence up once, not per token
            len_sentence = len(sentence)
            prev = None
            prev_prev = None
            for j, word in enumerate(sentence):
                window = [
                    words[j - 1] if j > 0 else '*',
                    words[j - 2] if j > 1 else '*',
                    words[j],
                    words[j + 1] if j < len_sentence - 1 else '*',
                    words[j + 2] if j < len_sentence - 2 else '*',
                ]
                datapoint = {
                    'wn': window,
                    'index': j,
                    'i': counter,
                    't_minus_one': '*' if prev is None else prev[1],
                    't_minus_two': '*' if prev_prev is None else prev_prev[1],
                    'tag': word[1],
                }
                counter += 1

                prev_prev = prev
                prev = word

                json_file.write(json.dumps(datapoint))
                json_file.write('\n')
                input_data.append(datapoint)
                tags.append(word[1])
    print('Done')
    return input_data, tags
开发者ID:karthikradhakrishnan96,项目名称:ccbd,代码行数:60,代码来源:memm_pyspark.py

示例11: get_pos_tagger

def get_pos_tagger():
    """Return a trigram tagger trained on treebank with a full backoff chain.

    The chain is trigram -> bigram -> unigram -> default tag ``"NN"``; all
    n-gram taggers are trained on the same treebank sentences.
    """
    train_sents = treebank.tagged_sents()

    # Build the chain from the last fallback up to the tagger returned.
    fallback = nltk.DefaultTagger("NN")
    unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
    bigram = nltk.BigramTagger(train_sents, backoff=unigram)
    return nltk.TrigramTagger(train_sents, backoff=bigram)
开发者ID:kaiyaunchen,项目名称:nlp,代码行数:7,代码来源:ner.py

示例12: _demo_prepare_data

def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    """Split tagged sentences into training, baseline, gold and testing data.

    :param tagged_data: sequence of tagged sentences, or None to load the
        treebank tagged sentences.
    :param train: proportion of the used sentences that goes to training;
        the rest is reserved for testing.
    :param num_sents: maximum number of sentences to use; None, or a value
        not smaller than the data size, means use everything.
    :param randomize: if true, shuffle the sentences first, seeding the
        random generator with the number of sentences so the order is
        reproducible.
    :param separate_baseline_data: if true, the first third of the training
        data becomes the baseline data and is removed from the training
        data; otherwise the baseline data is the training data itself.
    :return: ``(training_data, baseline_data, gold_data, testing_data)``,
        where ``testing_data`` is ``gold_data`` with the tags stripped.
    """
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        # Shuffle a private list copy: random.shuffle works in place, so it
        # would otherwise reorder the caller's data and cannot handle a
        # sequence without item assignment.
        tagged_data = list(tagged_data)
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
开发者ID:osu-ling5802-2016,项目名称:Sigmorphon2016,代码行数:28,代码来源:demo.py

示例13: make_sentences

def make_sentences():
    """Encode the treebank tagged sentences as lists of index pairs.

    Word indices come from ``./embeddings/words.lst`` and tag indices from
    ``data/tags.lst`` (one entry per line, both 1-indexed). Every token is
    normalised as follows:

    1. tokens tagged ``-NONE-`` are dropped;
    2. bracket placeholders such as ``-LRB-`` become the literal bracket,
       every other token is lower-cased;
    3. tokens starting with a digit become ``"0"``;
    4. tokens missing from the word list become ``"UNKNOWN"``.

    Sentences with four or fewer remaining tokens are discarded and the
    mean length of the kept sentences is printed.

    :return: list of sentences, each a list of ``(word_index, tag_index)``.
    :raises KeyError: if a tag, or the ``"0"``/``"UNKNOWN"`` replacement
        token, is absent from the corresponding list file.
    """
    with open("./embeddings/words.lst") as words_file:
        dictionary = [k.strip() for k in words_file]
    ind_lookup = {word: (ind + 1) for ind, word in enumerate(dictionary)}

    with open("data/tags.lst") as tags_file:
        taglst = [k.strip() for k in tags_file]
    tag_lookup = {word: (ind + 1) for ind, word in enumerate(taglst)}

    bracket_rep = {"-LRB-": "(",
                   "-RRB-": ")",
                   "-LSB-": "[",
                   "-RSB-": "]",
                   "-LCB-": "{",
                   "-RCB-": "}"}

    sentences = list(treebank.tagged_sents())
    for i, sent in enumerate(sentences):
        encoded = []
        for item, tag in sent:
            if tag == '-NONE-':
                continue
            # The placeholder keys are upper-case, so look them up before
            # lower-casing; otherwise they can never match.
            item = bracket_rep.get(item.upper(), item.lower())
            if item[0].isdigit():
                item = u'0'
            if item not in ind_lookup:
                item = u"UNKNOWN"
            # 1 indexed!!!
            encoded.append((ind_lookup[item], tag_lookup[tag]))
        sentences[i] = encoded

    sentences = [i for i in sentences if len(i) > 4]
    print(sum(map(len, sentences)) / float(len(sentences)))

    return sentences
开发者ID:ebetica,项目名称:lstm,代码行数:28,代码来源:data_gen.py

示例14: split_sents

 def split_sents(self, train=0.95, total=3500,
                 document_class=TaggedSentence):
     """Split the leading corpus sentences into training and testing parts.

     :param train: fraction of the used sentences assigned to training.
     :param total: maximum number of sentences to use; None means all.
     :param document_class: callable applied to every sentence of both
         parts.
     :return: ``(training, testing)``, each the result of mapping
         ``document_class`` over the corresponding slice.
     """
     sents = tagged_corpus.tagged_sents()[:total]
     # Base the split on the number of sentences actually available: the
     # corpus may hold fewer than the requested total, and the slice above
     # already covers total=None.
     total = len(sents)
     i = int(round(train * total))
     j = i + int(round(total - train * total))
     return (map(document_class, sents[0:i]),
             map(document_class, sents[i:j]))
开发者ID:KechenQin,项目名称:Hidden_Markov_Model,代码行数:8,代码来源:test_hmm.py

示例15: demo

def demo(corpus, num_sents):
    """
    Loads a few sentences from one of the supported corpora, trains a
    C{MaxentPosTagger} on them, evaluates it and pickles the trained tagger
    to C{test.pkl} in the current directory.

    The first 10% of the loaded sentences are held out for evaluation; the
    remainder is used for training. If C{corpus} names an unsupported
    corpus, a message is printed and the function returns without training.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load (case-insensitive): C{brown},
    C{treebank}, C{floresta} or C{cintil}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    corpus_name = corpus.lower()
    if corpus_name == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    elif corpus_name == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus_name == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]

    elif corpus_name == "cintil":
        print("Loading CINTIL")
        column_types = ['words', 'pos', 'ignore']
        # NOTE(review): hard-coded, machine-specific corpus location.
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]

    else:
        print("Please load either the 'brown' or the 'treebank' corpus.")
        # Nothing was loaded; continuing would hit an unbound local name.
        return

    size = int(len(tagged_sents) * 0.1)

    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    # The evaluation result is not reported; the call is kept as in the
    # original flow.
    maxent_tagger.evaluate(test_sents)

    # The context manager closes the model file even if pickling fails.
    with open('test.pkl', "wb") as fModel:
        pickle.dump(maxent_tagger, fModel, 1)
开发者ID:davidsbatista,项目名称:minhash-classifier,代码行数:58,代码来源:mxpost.py


注:本文中的nltk.corpus.treebank.tagged_sents函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。