当前位置: 首页>>代码示例>>Python>>正文


Python brown.tagged_sents方法代码示例

本文整理汇总了Python中nltk.corpus.brown.tagged_sents方法的典型用法代码示例。如果您正苦于以下问题:Python brown.tagged_sents方法的具体用法?Python brown.tagged_sents怎么用?Python brown.tagged_sents使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.corpus.brown的用法示例。


在下文中一共展示了brown.tagged_sents方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: demo

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print(s[i],'--', t[i])
        print() 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:21,代码来源:tnt.py

示例2: load_pos

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def load_pos(num_sents):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(categories='news')[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    cleaned_sentences = []
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols) 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:24,代码来源:hmm.py

示例3: get_pos_tagger

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
        from nltk.corpus import brown
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        #Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:27,代码来源:glue.py

示例4: demo

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    from nltk.tag import tnt
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print s[i],'--', t[i]
        print 
开发者ID:blackye,项目名称:luscan-devel,代码行数:22,代码来源:tnt.py

示例5: get_pos_tagger

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        #Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger 
开发者ID:blackye,项目名称:luscan-devel,代码行数:26,代码来源:glue.py

示例6: store_pos_tag_dicts

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def store_pos_tag_dicts():
    pos_tag_dict = defaultdict(tuple)
    tagged = treebank.tagged_sents()
    for sent in tagged:
        for tup in sent:
            if not tup[1] in pos_tag_dict[tup[0].lower()]:
                pos_tag_dict[tup[0].lower()] += (tup[1],)

    pos_tag_dict_univ = defaultdict(tuple)
    penn_tagged_univ = treebank.tagged_sents(tagset='universal')
    brown_tagged_univ = brown.tagged_sents(tagset='universal')
    for text in [penn_tagged_univ, brown_tagged_univ]:
        for sent in text:
            for tup in sent:
                if not tup[1] in pos_tag_dict_univ[tup[0].lower()]:
                    pos_tag_dict_univ[tup[0].lower()] += (tup[1],)
    for word in states.values():
        pos_tag_dict[word.lower()] += ('NNP',)
        pos_tag_dict_univ[word.lower()] += ('NOUN',)
    dicts = (pos_tag_dict, pos_tag_dict_univ)
    with open('{}/data/pos_dicts.pickle'.format(mod_path), 'wb') as file:
        pickle.dump(dicts, file, protocol=2) 
开发者ID:EFord36,项目名称:normalise,代码行数:24,代码来源:pos_tag_dict.py

示例7: demo

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo():
    from nltk.corpus import brown

    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j + 100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print() 
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:22,代码来源:tnt.py

示例8: load_pos

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def load_pos(num_sents):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(categories='news')[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    cleaned_sentences = []
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)  # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols) 
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:24,代码来源:hmm.py

示例9: demo2

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = t.evaluate(d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known +t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known +s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn)) 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:36,代码来源:tnt.py

示例10: demo2

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    from nltk import tag
    from nltk.tag import tnt
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = tnt.TnT(N=1000, C=False)
    s = tnt.TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known +t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print 'Capitalization off:'
        print 'Accuracy:', tacc
        print 'Percentage known:', tp_kn
        print 'Percentage unknown:', tp_un
        print 'Accuracy over known words:', (tacc / tp_kn)

        sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known +s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print 'Capitalization on:'
        print 'Accuracy:', sacc
        print 'Percentage known:', sp_kn
        print 'Percentage unknown:', sp_un
        print 'Accuracy over known words:', (sacc / sp_kn) 
开发者ID:blackye,项目名称:luscan-devel,代码行数:38,代码来源:tnt.py

示例11: demo

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.  We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus): return [[(w, t[:2]) for (w,t) in sent]
                               for sent in corpus]
    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    crf = MalletCRF.train(fd, brown_train, #'/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w,t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print '\nAccuracy: %.1f%%' % (acc*100)
    print 'Sample output:'
    print textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent='  ', subsequent_indent='  ')+'\n'

    # Clean up
    print 'Clean-up: deleting', crf.filename
    os.remove(crf.filename)

    return crf 
开发者ID:blackye,项目名称:luscan-devel,代码行数:36,代码来源:crf.py

示例12: demo2

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11) * 100 :])
    s.train(d[(11) * 100 :])

    for i in range(10):
        tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)])
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)])
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn)) 
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:36,代码来源:tnt.py

示例13: get_pos_tagger

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def get_pos_tagger(self):
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN'),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
            backoff=trigram_tagger,
        )

        return main_tagger 
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:30,代码来源:glue.py

示例14: demo3

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d)*0.1)
    e10 = int(len(e)*0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i*d10):((i+1)*d10)]
        etest = e[(i*e10):((i+1)*e10)]

        dtrain = d[:(i*d10)] + d[((i+1)*d10):]
        etrain = e[:(i*e10)] + e[((i+1)*e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = float(t.unknown) / float(t.known +t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / tp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc


    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown) 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:63,代码来源:tnt.py

示例15: demo3

# 需要导入模块: from nltk.corpus import brown [as 别名]
# 或者: from nltk.corpus.brown import tagged_sents [as 别名]
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]

        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / tp_kn
        tallacc += tacc
        sallacc += sacc

        # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown) 
开发者ID:V1EngineeringInc,项目名称:V1EngineeringInc-Docs,代码行数:62,代码来源:tnt.py


注:本文中的nltk.corpus.brown.tagged_sents方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。