This page collects typical usage examples of the nltk.chunk method in Python. If you are wondering how to use Python's nltk.chunk, what it does, or what real code using it looks like, the curated examples below may help. You can also explore further usage examples of the containing module, nltk.
Six code examples of the nltk.chunk method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: extract_candidate_phrases
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
# Also used in this snippet: from nltk.chunk import tree2conlltags; from itertools import groupby
def extract_candidate_phrases(self, sents):
    """
    For a document, parse sentences using our chunker created by
    our grammar, converting the parse tree into a tagged sequence.
    Extract phrases, rejoin with a space, and yield the document
    represented as a list of its keyphrases.
    """
    for sent in sents:
        sent = self.normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(self.chunker.parse(sent))
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]
        for phrase in phrases:
            yield phrase
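Example 1 assumes a self.chunker built elsewhere in the class from a phrase grammar. The standalone sketch below is illustrative only: the grammar string, the chunker variable and the extract_phrases helper are not taken from the original class. It shows how such a chunker can be created with nltk.RegexpParser and how groupby collects runs of tokens whose IOB tag is not 'O' into candidate keyphrases.

# Minimal standalone sketch (illustrative names; requires the punkt and
# averaged_perceptron_tagger NLTK data packages for tokenizing and tagging).
from itertools import groupby
import nltk
from nltk.chunk import tree2conlltags

GRAMMAR = r"KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
chunker = nltk.RegexpParser(GRAMMAR)

def extract_phrases(tagged_sent):
    # tagged_sent is a list of (word, pos) pairs.
    chunks = tree2conlltags(chunker.parse(tagged_sent))
    # Group contiguous non-'O' tokens into candidate keyphrases.
    return [
        " ".join(word for word, pos, chunk in group).lower()
        for key, group in groupby(chunks, lambda term: term[-1] != 'O')
        if key
    ]

sent = nltk.pos_tag(nltk.word_tokenize("Natural language processing enables keyphrase extraction."))
print(extract_phrases(sent))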
Example 2: structure_ne
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
# Also used in this snippet: from nltk import Tree
def structure_ne(self, ne_tree):
    ne = []
    for subtree in ne_tree:
        if type(subtree) == Tree:  # the subtree is a named-entity chunk, i.e. its tag is not "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne.append((ne_string, ne_label))
    return ne
# NLTK named entity recognizer
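structure_ne expects a tree like the one produced by NLTK's built-in named entity chunker. A minimal usage sketch, assuming the relevant NLTK data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) are installed:

import nltk
from nltk import Tree

tokens = nltk.word_tokenize("Barack Obama visited Paris in 2016.")
ne_tree = nltk.ne_chunk(nltk.pos_tag(tokens))  # NE subtrees carry labels such as PERSON or GPE

# Flatten the tree the same way structure_ne does above.
entities = [
    (" ".join(token for token, pos in subtree.leaves()), subtree.label())
    for subtree in ne_tree if isinstance(subtree, Tree)
]
print(entities)  # e.g. [('Barack Obama', 'PERSON'), ('Paris', 'GPE')]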
Example 3: train_merger
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
# Also used in this snippet: from nltk.chunk import tree2conlltags; from nltk.tag import ClassifierBasedTagger
def train_merger(self, train_file_path, test_split=0.1):
    print("Loading Data...")
    file = open(train_file_path, "r", encoding='utf-8')
    file_content = file.read()
    file.close()
    file_content = file_content.split("\n\n")
    data_list = []
    for line in file_content:
        line = nltk.chunk.util.conllstr2tree(line, chunk_types=('NP',), root_label='S')
        if len(line) > 0:
            data_list.append(line)
    # train_sents, test_sents = train_test_split(data_list, test_size=test_split, random_state=91)
    train_sents = data_list
    test_sents = []
    print("Training the model ...")

    # Transform the trees into IOB-annotated sentences [(word, pos, chunk), ...]
    chunked_sents = [tree2conlltags(sent) for sent in train_sents]

    # Transform the triplets into pairs compatible with the tagger interface [((word, pos), chunk), ...]
    def triplets2tagged_pairs(iob_sent):
        return [((word, pos), chunk) for word, pos, chunk in iob_sent]
    chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

    self.feature_detector = self.features
    self.tagger = ClassifierBasedTagger(
        train=chunked_sents,
        feature_detector=self.features)
    token_merger_model = self.tagger
    if len(test_sents) > 0:
        print("evaluating...")
        print(token_merger_model.evaluate(test_sents))
    return token_merger_model
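The data preparation inside train_merger can be tried in isolation. A small sketch, assuming a hand-written CoNLL block (conll_block below is illustrative): it parses the block with nltk.chunk.util.conllstr2tree, flattens it to IOB triplets, and converts those to the ((word, pos), chunk) pairs that ClassifierBasedTagger expects as training input.

import nltk
from nltk.chunk import tree2conlltags

conll_block = """he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
"""

tree = nltk.chunk.util.conllstr2tree(conll_block, chunk_types=('NP',), root_label='S')
triplets = tree2conlltags(tree)                # [(word, pos, chunk), ...]
pairs = [((w, p), c) for w, p, c in triplets]  # [((word, pos), chunk), ...]
print(pairs)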
Example 4: measure_pattern_time_v2
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
# Also used in this snippet: import execnet
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
    global pattern_search_result  # let measure_pattern_search modify the module-level value
    pattern_search_result = search("%s", text_tree)
# print("clip.pattern len(result)=" + str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
# print('pattern_search_time')
def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
    average = sum(runtimes) / len(runtimes)
    # return ''.join(['timeit: #runs=', str(%s), ' ; average=', str(average), ' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
""" % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
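The timing code relies on execnet to run pattern.en (a Python 2 library) inside a separate python2.7 interpreter and to ship the result back over a channel. A minimal round trip showing just that mechanism, assuming execnet is installed and a python2.7 interpreter is on the PATH:

import execnet

gw = execnet.makegateway("popen//python=python2.7")
channel = gw.remote_exec("""
import sys
channel.send(sys.version)  # remote code sees a predefined 'channel' object
""")
print(channel.receive())   # version string reported by the Python 2 interpreter
gw.exit()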
Example 5: write_pattern_v2
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
# Also used in this snippet: import execnet
def write_pattern_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
    for ch in [' ', '?', '+', '*', '.', '[', ']', '~', '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch, '_')
    return string
from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
# measure_pattern_search()  # not defined in this snippet; the search is performed directly above
analyzer_name = 'pattern'  # assumed label for the output filename; not defined in the original snippet
filename = '/tmp/benchmark_' + analyzer_name + '_' + str(size) + "_" + str(len(pattern_search_result)) + '_' + backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
    print>>thefile, item
thefile.close()
channel.send([filename, size, len(pattern_search_result)])
""" % (size, pattern))
    channel.send([])
    return channel.receive()
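The backslash helper above only sanitises the search pattern so it can appear in a filename. A quick standalone check of the same replacement logic (the function name sanitize is illustrative):

def sanitize(string):
    for ch in [' ', '?', '+', '*', '.', '[', ']', '~', '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch, '_')
    return string

print(sanitize("NP? be JJ"))  # -> NP__be_JJ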
Example 6: pyrata2conll
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def pyrata2conll(dictList, **kwargs):
    """
    See 3.1 Reading IOB Format and the CoNLL 2000 Corpus http://www.nltk.org/book/ch07.html
    The output can be used with
    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    """
    if 'raw' in kwargs.keys():
        rawFeatureName = kwargs['raw']
    if 'pos' in kwargs.keys():
        posFeatureName = kwargs['pos']
    if 'chunk' in kwargs.keys():
        chunkFeatureName = kwargs['chunk']
    text = ''
    for e in dictList:
        text += ' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName], '\n'])
    return text
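A hedged usage sketch for pyrata2conll (the token dictionaries and feature names below are illustrative): the resulting CoNLL string can be parsed back into a tree with nltk.chunk.conllstr2tree, as the docstring suggests.

import nltk

tokens = [
    {'raw': 'We',     'pos': 'PRP', 'chunk': 'B-NP'},
    {'raw': 'saw',    'pos': 'VBD', 'chunk': 'O'},
    {'raw': 'the',    'pos': 'DT',  'chunk': 'B-NP'},
    {'raw': 'yellow', 'pos': 'JJ',  'chunk': 'I-NP'},
    {'raw': 'dog',    'pos': 'NN',  'chunk': 'I-NP'},
]
conll_text = pyrata2conll(tokens, raw='raw', pos='pos', chunk='chunk')
tree = nltk.chunk.conllstr2tree(conll_text, chunk_types=['NP'])
print(tree)  # an nltk Tree with NP subtrees for "We" and "the yellow dog"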