

Python conll2000.chunked_sents Function Code Examples

This article collects typical usage examples of the Python function nltk.corpus.conll2000.chunked_sents. If you are wondering what chunked_sents does, how to call it, or what working code that uses it looks like, the curated examples below should help.


The following presents 15 code examples of the chunked_sents function, ordered by popularity by default.
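Before diving into the examples: chunked_sents reads the CoNLL-2000 chunking corpus and returns each sentence as an nltk.Tree whose subtrees are the chunks. The first argument selects the corpus file ('train.txt' or 'test.txt'), and chunk_types filters which chunk labels (NP, VP, PP) are kept. A minimal usage sketch, assuming the corpus has been downloaded:

import nltk
from nltk.corpus import conll2000

nltk.download('conll2000')  # fetch the corpus on first use

# Each sentence is an nltk.Tree; with chunk_types=['NP'], only NP
# subtrees are retained and other chunks are flattened to plain tokens.
sent = conll2000.chunked_sents('train.txt', chunk_types=['NP'])[0]
print(sent)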

Example 1: main

import nltk
from nltk.corpus import conll2000


def main():
    # np_tags_fd, print_frequencies and regex_generator are helper functions
    # defined elsewhere in the source file (see the sketch below).
    train_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('train.txt', chunk_types=['NP']))
    # test_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('test.txt', chunk_types=['NP']))
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    fd = np_tags_fd(train_sents)
    print_frequencies(fd, num_results=50)
    # pattern = regex_generator(fd)
    # print(pattern)
    # pattern = r"NP: {<NN>}"

    # Baseline: an empty grammar proposes no chunks at all.
    print(nltk.RegexpParser("").evaluate(test_sents))
    print('')

    # Pattern from the NLTK book: any run of CD/DT/JJ/NN/PRP-style tags.
    pattern_book = r"NP: {<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_book).evaluate(test_sents))
    print('')

    pattern_modified = r"NP: {<(\$)>?<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))
    print('')

    pattern_modified = r"""NP: {<(\$)>?<[CDJNP].*>+}
                               {<W(P|DT)>}"""
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))
Author: mikeholler, Project: CSC499-NLP, Lines: 25, Source: ngram_chunker_exploration.py
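The helpers np_tags_fd, print_frequencies, and regex_generator are not shown in the snippet. A hypothetical reconstruction of the first two, assuming np_tags_fd tallies the POS tags that occur inside NP chunks of the CoNLL-formatted training sentences:

import nltk

def np_tags_fd(conll_sents):
    # Hypothetical: conll_sents yields lists of (word, POS, chunk) triples,
    # as produced by nltk.chunk.tree2conlltags.
    fd = nltk.FreqDist()
    for sent in conll_sents:
        for word, pos, chunk in sent:
            if chunk in ('B-NP', 'I-NP'):
                fd[pos] += 1
    return fd

def print_frequencies(fd, num_results=50):
    # Print the most common POS tags and their counts.
    for pos, count in fd.most_common(num_results):
        print(pos, count)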

Example 2: evaluate

import nltk
from nltk.corpus import conll2000


def evaluate():
    # A sentence in CoNLL IOB format: one "word POS chunk-tag" line per token.
    text = '''
    he PRP B-NP
    accepted VBD B-VP
    the DT B-NP
    position NN I-NP
    of IN B-PP
    vice NN B-NP
    chairman NN I-NP
    of IN B-PP
    Carlyle NNP B-NP
    Group NNP I-NP
    , , O
    a DT B-NP
    merchant NN I-NP
    banking NN I-NP
    concern NN I-NP
    . . O
    '''

    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

    print(conll2000.chunked_sents('train.txt')[99])
    print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
Author: AkiraKane, Project: Python, Lines: 25, Source: c07_information_extraction.py

Example 3: __init__

    def __init__(self):
        # Requires `from nltk.corpus import conll2000` and
        # `from nltk.tag import ClassifierBasedTagger`; npchunk_features is
        # defined elsewhere in the source file (see the sketch below).
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)] for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)
Author: pschulam-attic, Project: SCLE, Lines: 7, Source: chunker.py
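npchunk_features is the feature detector handed to ClassifierBasedTagger; it is not shown in the snippet. A sketch following the NLTK book (chapter 7) — the actual SCLE project may use a richer feature set:

def npchunk_features(sentence, i, history):
    # sentence is a list of (word, POS) pairs; history holds the chunk
    # tags predicted so far for this sentence.
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}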

Example 4: get_noun_phrases_and_named_entities_data

import nltk
from nltk.corpus import conll2000

import BigramChunker  # project-local module (see the sketch below)


def get_noun_phrases_and_named_entities_data(data):
    # Train a bigram chunker on the full CoNLL-2000 corpus (train + test).
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    chunker = BigramChunker.BigramChunker(train_sents + test_sents)

    tagged_data = []
    for sent in data:
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        tagged_data.append(tagged)

    noun_phrases = []
    for tagged_sent in tagged_data:
        tree = chunker.parse(tagged_sent)
        noun_phrases += nltk.chunk.tree2conlltags(tree)

    named_entities = []
    for tagged_sent in tagged_data:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    words = []
    cnt = 0
    for sent in data:
        cnt += 1
        tokens = nltk.word_tokenize(sent)
        for token in tokens:
            words.append((token, cnt))

    # print words
    # print noun_phrases
    # print named_entities

    return (words, noun_phrases, named_entities)
Author: fydlzr, Project: coreference_resolution-1, Lines: 35, Source: mention_extraction.py
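BigramChunker is imported as a project module and not shown here. A sketch of the NLTK book (chapter 7) version, which learns a mapping from POS tags to IOB chunk tags with a bigram tagger and converts the result back into a tree:

import nltk

class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Train on (POS tag, chunk tag) pairs extracted from chunked trees.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        # sentence is a list of (word, POS) pairs.
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), (pos_, chunktag))
                     in zip(sentence, tagged_pos_tags)]
        return nltk.chunk.conlltags2tree(conlltags)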

Example 5: chunk_with_unigram_tagger

def chunk_with_unigram_tagger():
  # Use a unigram tagger to predict each word's IOB chunk tag from its POS tag.
  # UnigramChunker is defined elsewhere in the source file (see the sketch below).
  import nltk
  from nltk.corpus import conll2000
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
  unigram_chunker = UnigramChunker(train_sents)
  print(unigram_chunker.evaluate(test_sents))
  # Show the chunk tag the tagger assigns to each POS tag in the corpus.
  postags = sorted(set(pos for sent in train_sents
                           for (word, pos) in sent.leaves()))
  print(unigram_chunker.tagger.tag(postags))
Author: 447327642, Project: nltk-examples, Lines: 10, Source: ch07.py
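UnigramChunker has the same structure as the BigramChunker sketched under Example 4, with nltk.UnigramTagger swapped in; per the NLTK book, it learns the single most likely chunk tag for each POS tag:

import nltk

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), (pos_, chunktag))
                     in zip(sentence, tagged_pos_tags)]
        return nltk.chunk.conlltags2tree(conlltags)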

Example 6: _load_data

def _load_data():
    # Requires `import sys, nltk` and `from nltk.corpus import conll2000`;
    # license_prompt is a helper from the nlp-architect project.
    try:
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    except Exception:
        if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
            sys.exit(0)
        nltk.download('conll2000')
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    # Transpose each sentence's (word, tag, chunk) triples into parallel
    # (words, tags, chunks) tuples.
    train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
    test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
    return train_data, test_data
Author: cdj0311, Project: nlp-architect, Lines: 13, Source: conll2000.py
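The zip(*...) transposition turns each sentence's (word, POS, chunk) triples into three parallel tuples, a convenient shape for sequence-labeling toolkits. A quick check of the result, assuming the corpus is installed:

import nltk
from nltk.corpus import conll2000

sent = conll2000.chunked_sents('train.txt')[0]
words, tags, chunks = list(zip(*nltk.chunk.tree2conlltags(sent)))
print(words[:3])   # e.g. ('Confidence', 'in', 'the')
print(chunks[:3])  # e.g. ('B-NP', 'B-PP', 'B-NP')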

Example 7: main

def main(convert_func=None):
    # convert_leaf_node and ConsecutiveNPChunker are defined elsewhere in
    # the source file (see the sketch of the chunker below).
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    if convert_func:
        # Transform the leaves of each test sentence.
        print("convert leaf nodes")
        test_sents = [convert_leaf_node(sent, convert_func)
                      for sent in test_sents]
    print("train...")
    chunker = ConsecutiveNPChunker(train_sents)
    print("evaluate...")
    print(chunker.evaluate(test_sents))
Author: xiaohan2012, Project: capitalization-restoration-train, Lines: 13, Source: chunk.py
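ConsecutiveNPChunker is not shown in the snippet. A sketch modeled on the NLTK book (chapter 7) classifier-based chunker, substituting nltk.NaiveBayesClassifier for the book's maxent/megam setup and using a deliberately minimal feature function:

import nltk

def npchunk_features(sentence, i, history):
    # Minimal feature set: the current POS tag only.
    word, pos = sentence[i]
    return {'pos': pos}

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Convert chunk trees to ((word, POS), chunk-tag) training pairs.
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)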

Example 8: exercise3

def exercise3():
    print("Exercise - 3")
    grammar1 = r"""
    NP: {<DT>?<JJ><NNS>}
        {<CD><NNS>}
    """
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])[:100]
    cp1 = nltk.RegexpParser(grammar1)
    res1 = cp1.evaluate(test_sents)
    print("Statistics for the custom chunker")
    print(res1)
    print()

    # The empty grammar is the baseline: it proposes no chunks at all.
    cp2 = nltk.RegexpParser("")
    res2 = cp2.evaluate(test_sents)
    print("Statistics for the baseline chunker")
    print(res2)
    print()

    grammar3 = r"""
    NP: {<DT>?<JJ><NNS>}
        {<CD><NNS>}
        {<DT><NN>}
    """
    cp3 = nltk.RegexpParser(grammar3)
    res3 = cp3.evaluate(test_sents)
    print("Statistics for the custom chunker with the added rule {<DT><NN>}")
    print(res3)
    print()
Author: GirishSrinivas, Project: PythonPrograms, Lines: 30, Source: GirishSrinivas_ch7.py

Example 9: get_noun_phrases_and_named_entities

import nltk
from nltk.corpus import conll2000


def get_noun_phrases_and_named_entities(file_name, start_index, end_index):

    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)
Author: fydlzr, Project: coreference_resolution-1, Lines: 28, Source: mention_extraction.py

Example 10: __init__

    def __init__(self):
        # Requires `import pickle, nltk` and
        # `from nltk.corpus import conll2000, brown`; ConsecutiveNPChunker is
        # defined elsewhere in the source file.
        try:
            tagger = pickle.load(open('nerdb_tagger.pkl', 'rb'))
        except IOError:
            print('failed to load nerdb_tagger, recreating...')
            # Back off from trigrams through bigrams and unigrams to a
            # default 'NN' tag.
            train_sents = conll2000.tagged_sents() + brown.tagged_sents()
            tagger = nltk.DefaultTagger('NN')
            tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
            tagger = nltk.BigramTagger(train_sents, backoff=tagger)
            tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
            pickle.dump(tagger, open('nerdb_tagger.pkl', 'wb'))
            print('done')
        try:
            chunker = pickle.load(open('nerdb_chunker.pkl', 'rb'))
        except IOError:
            print('failed to load nerdb_chunker, recreating...')
            train_sents = conll2000.chunked_sents()
            chunker = ConsecutiveNPChunker(tagger, train_sents)
            pickle.dump(chunker, open('nerdb_chunker.pkl', 'wb'))
            print('done')
        self.chunker = chunker
        self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
        self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
        self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
        self.entity_types = {'PERSON': self.people, 'MOVIE': self.movies}
Author: jamt, Project: IMDBot, Lines: 25, Source: NERDb.py

Example 11: __init__

    def __init__(self):
        # Nearly identical to Example 10 (a fork of the same project);
        # requires `import pickle, nltk` and
        # `from nltk.corpus import conll2000, brown`.
        try:
            tagger = pickle.load(open("nerdb_tagger.pkl", "rb"))
        except IOError:
            print("failed to load nerdb_tagger, recreating...")
            train_sents = conll2000.tagged_sents() + brown.tagged_sents()
            tagger = nltk.DefaultTagger("NN")
            tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
            tagger = nltk.BigramTagger(train_sents, backoff=tagger)
            tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
            pickle.dump(tagger, open("nerdb_tagger.pkl", "wb"))
            print("done")
        try:
            chunker = pickle.load(open("nerdb_chunker.pkl", "rb"))
        except IOError:
            print("failed to load nerdb_chunker, recreating...")
            train_sents = conll2000.chunked_sents()
            chunker = ConsecutiveNPChunker(tagger, train_sents)
            pickle.dump(chunker, open("nerdb_chunker.pkl", "wb"))
            print("done")
        self.chunker = chunker
        self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
        self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
        self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
        self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
        self.numbers = eval(open("numbers.txt").read())
Author: gabsl, Project: IMDBot, Lines: 26, Source: NERDb.py

Example 12: simple_np_bgram

def simple_np_bgram(documents):
    # pos.preprocess is a project-local tokenizer/POS tagger; BigramChunker
    # is the NLTK-book-style chunker (see the sketch under Example 4).
    bgram = BigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(bgram.parse(sent))
        yield buf
Author: juchiyama, Project: bigdata_fall2015, Lines: 7, Source: chunk.py

Example 13: evaluate_chunker

def evaluate_chunker():
  import nltk
  from nltk.corpus import conll2000
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  cp = nltk.RegexpParser("")  # baseline: proposes no chunks
  print(cp.evaluate(test_sents))
  # Naive chunker: any run of C/D/J/N/P-initial tags forms an NP chunk.
  grammar = r"NP: {<[CDJNP].*>+}"
  cp1 = nltk.RegexpParser(grammar)
  print(cp1.evaluate(test_sents))
Author: 447327642, Project: nltk-examples, Lines: 8, Source: ch07.py

Example 14: _build_training_sents

    def _build_training_sents(self):
        # Randomly pick a corpus from the candidate list, then build and
        # return the training sentences the chunkers will use. (The random
        # choice is commented out; CoNLL-2000 is always used at present.)
        corpuses = [(conll2000, 'train.txt'), (conll2002, 'esp.train')]
        # trainer = random.choice(corpuses)
        # train_sents = trainer[0].chunked_sents(trainer[1], chunk_types=['NP'])
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        return train_sents
Author: donama, Project: phraseengine, Lines: 8, Source: __init__.py

Example 15: train_chunker

def train_chunker(filesDir):
    # Create chunked sentences in the CoNLL format. 'train_locations.txt'
    # appears to be a project-specific file registered with the conll2000
    # corpus reader, annotated with custom 'Loc' chunks.
    train_sents = conll2000.chunked_sents('train_locations.txt', chunk_types=['Loc'])

    # Train the chunker with a NaiveBayesClassifier; ConsecutiveNPChunker and
    # combine_features are defined elsewhere in the source file.
    chunker = ConsecutiveNPChunker(train_sents, combine_features, nltk.NaiveBayesClassifier)

    return chunker
Author: chatbotimporved, Project: chatbot, Lines: 8, Source: read_emails.py


Note: the nltk.corpus.conll2000.chunked_sents examples in this article were compiled by 纯净天空 from open-source code hosted on platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors, so consult each project's License before redistributing or reusing the code. Do not reproduce without permission.