This article compiles typical usage examples of the Python nltk.chunk method. If you are wondering what nltk.chunk does, how to call it, or how it is used in practice, the curated code examples below may help. You can also explore further usage examples from the nltk module itself.
The following presents 6 code examples of nltk.chunk, sorted by popularity by default.
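As a quick orientation before the collected examples, here is a minimal stand-alone sketch of the usual nltk.chunk workflow (tokenize, POS-tag, then chunk with a small grammar); the sentence and grammar are illustrative only:

import nltk
from nltk.chunk import RegexpParser

# Requires the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages.
sentence = "The little yellow dog barked at the cat"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))   # [(word, POS), ...]
chunker = RegexpParser("NP: {<DT>?<JJ>*<NN>}")         # simple noun-phrase grammar
tree = chunker.parse(tagged)                           # nltk.tree.Tree with NP subtrees
print(tree)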
Example 1: extract_candidate_phrases
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def extract_candidate_phrases(self, sents):
    """
    For a document, parse sentences using our chunker created by
    our grammar, converting the parse tree into a tagged sequence.
    Extract phrases, rejoin with a space, and yield the document
    represented as a list of its keyphrases.
    """
    # Relies on: from nltk.chunk import tree2conlltags
    #            from itertools import groupby
    for sent in sents:
        sent = self.normalize(sent)
        if not sent:
            continue
        # Convert the chunk tree into (word, pos, chunk) IOB triples.
        chunks = tree2conlltags(self.chunker.parse(sent))
        # Group consecutive tokens that belong to a chunk (tag != 'O')
        # and join each group into a lowercase phrase.
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(chunks, lambda term: term[-1] != 'O')
            if key
        ]
        for phrase in phrases:
            yield phrase
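The method above assumes a self.chunker built from a chunk grammar and a self.normalize step that returns POS-tagged tokens; a minimal sketch of how those pieces might be wired up (the grammar and the normalize logic are assumptions, not taken from the source):

import nltk
from nltk.chunk import RegexpParser, tree2conlltags

# Illustrative keyphrase grammar; the original grammar is not shown in the snippet.
GRAMMAR = "KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
chunker = RegexpParser(GRAMMAR)

def normalize(sent):
    # Tag a raw sentence so the chunker receives (word, POS) pairs.
    return nltk.pos_tag(nltk.word_tokenize(sent))

tagged = normalize("Natural language processing extracts candidate phrases.")
print(tree2conlltags(chunker.parse(tagged)))   # [(word, pos, IOB-chunk), ...]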
Example 2: structure_ne
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def structure_ne(self, ne_tree):
    # Relies on: from nltk.tree import Tree
    ne = []
    for subtree in ne_tree:
        # A subtree is a named-entity chunk, i.e. its tag is not "O".
        if isinstance(subtree, Tree):
            ne_label = subtree.label()
            ne_string = " ".join(token for token, pos in subtree.leaves())
            ne.append((ne_string, ne_label))
    return ne
# Nltk Named Entity Recognizer
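A hedged usage sketch for structure_ne: the ne_tree argument is the kind of tree produced by nltk.ne_chunk over a POS-tagged sentence (the inline loop below just mirrors the method as a plain function for illustration):

import nltk
from nltk.tree import Tree

# Requires the 'maxent_ne_chunker' and 'words' NLTK data packages.
tagged = nltk.pos_tag(nltk.word_tokenize("Barack Obama visited Paris in 2009"))
ne_tree = nltk.ne_chunk(tagged)            # Tree whose subtrees are named entities

entities = []
for subtree in ne_tree:
    if isinstance(subtree, Tree):          # NE chunks are subtrees; 'O' tokens are plain leaves
        entities.append((" ".join(tok for tok, pos in subtree.leaves()), subtree.label()))
print(entities)                            # e.g. [('Barack Obama', 'PERSON'), ('Paris', 'GPE')], labels depend on the model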
Example 3: train_merger
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def train_merger(self, train_file_path, test_split=0.1):
    # Relies on: import nltk
    #            from nltk.chunk import tree2conlltags
    #            from nltk.tag import ClassifierBasedTagger
    print("Loading Data...")
    with open(train_file_path, "r", encoding='utf-8') as file:
        file_content = file.read()
    file_content = file_content.split("\n\n")
    data_list = []
    for line in file_content:
        line = nltk.chunk.util.conllstr2tree(line, chunk_types=('NP',), root_label='S')
        if len(line) > 0:
            data_list.append(line)
    # train_sents, test_sents = train_test_split(data_list, test_size=test_split, random_state=91)
    train_sents = data_list
    test_sents = []
    print("Training the model ...")

    # Transform the trees into IOB-annotated sentences [(word, pos, chunk), ...]
    chunked_sents = [tree2conlltags(sent) for sent in train_sents]

    # Transform the triplets into pairs to make them compatible with the tagger interface [((word, pos), chunk), ...]
    def triplets2tagged_pairs(iob_sent):
        return [((word, pos), chunk) for word, pos, chunk in iob_sent]

    chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

    self.feature_detector = self.features
    self.tagger = ClassifierBasedTagger(
        train=chunked_sents,
        feature_detector=self.features)
    token_merger_model = self.tagger
    if len(test_sents) > 0:
        print("evaluating...")
        print(token_merger_model.evaluate(test_sents))
    return token_merger_model
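train_merger expects a file of blank-line-separated sentences in CoNLL IOB format, which nltk.chunk.util.conllstr2tree converts into chunk trees; a small illustrative fragment (the data content is assumed, not taken from the source):

from nltk.chunk.util import conllstr2tree

conll_sentence = """\
he PRP B-NP
accepted VBD O
the DT B-NP
position NN I-NP
"""
tree = conllstr2tree(conll_sentence, chunk_types=('NP',), root_label='S')
print(tree)   # (S (NP he/PRP) accepted/VBD (NP the/DT position/NN))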
Example 4: measure_pattern_time_v2
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def measure_pattern_time_v2(iteration_number, size, pattern):
    # Relies on: import execnet; the remote code runs under Python 2.7 with pattern.en and nltk installed.
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
    global pattern_search_result  # let the timed function modify the module-level result
    pattern_search_result = search("%s", text_tree)
    #print ("clip.pattern len(result)="+str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')
def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
    average = sum(runtimes) / len(runtimes)
    # return ''.join(['timeit: #runs=', str(%s), ' ; average=', str(average), ' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
""" % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
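A hypothetical driver for the benchmark above; it assumes execnet is installed, a Python 2.7 interpreter with pattern.en and the Brown corpus is available on the PATH, and the pattern string ("JJ NN") is only an illustration of a pattern.search query:

import execnet  # used inside measure_pattern_time_v2 to spawn the Python 2.7 worker

runtimes, average, best, n_matches = measure_pattern_time_v2(
    iteration_number=10,   # number of timed search runs
    size=10000,            # how many Brown corpus words to parse
    pattern="JJ NN",       # pattern.search query run against the parse tree
)
print("avg=%.4fs  min=%.4fs  matches=%d" % (average, best, n_matches))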
Example 5: write_pattern_v2
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def write_pattern_v2(iteration_number, size, pattern):
    # Relies on: import execnet; the remote code runs under Python 2.7 (note the print >> syntax).
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
    # Replace characters that are unsafe in a file name with '_'.
    for ch in [' ', '?', '+', '*', '.', '[', ']', '~', '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch, '_')
    return string
from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
analyzer_name = 'pattern'  # assumed label for the output file name (undefined in the original snippet)
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
    print>>thefile, item
thefile.close()
channel.send([filename, size, len(pattern_search_result)])
""" % (size, pattern))
    channel.send([])
    return channel.receive()
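A similarly hypothetical call for write_pattern_v2, which dumps the matches to a file under /tmp and returns its path (same environment assumptions as in Example 4):

filename, size, n_matches = write_pattern_v2(iteration_number=1, size=10000, pattern="JJ NN")
print("wrote %d matches for the first %d Brown words to %s" % (n_matches, size, filename))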
Example 6: pyrata2conll
# Required module: import nltk [as alias]
# Or: from nltk import chunk [as alias]
def pyrata2conll(dictList, **kwargs):
    """
    See 3.1 "Reading IOB Format and the CoNLL 2000 Corpus", http://www.nltk.org/book/ch07.html
    The returned string can be used with
    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    """
    if 'raw' in kwargs.keys():
        rawFeatureName = kwargs['raw']
    if 'pos' in kwargs.keys():
        posFeatureName = kwargs['pos']
    if 'chunk' in kwargs.keys():
        chunkFeatureName = kwargs['chunk']
    text = ''
    for e in dictList:
        # One CoNLL line per token: "word pos chunk"
        text += ' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName]]) + '\n'
    return text
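As the docstring notes, the returned CoNLL string can be fed back into nltk.chunk; a small sketch (the token dicts and their 'raw'/'pos'/'chunk' feature names are illustrative assumptions):

import nltk

tokens = [
    {'raw': 'the', 'pos': 'DT',  'chunk': 'B-NP'},
    {'raw': 'cat', 'pos': 'NN',  'chunk': 'I-NP'},
    {'raw': 'sat', 'pos': 'VBD', 'chunk': 'O'},
]
text = pyrata2conll(tokens, raw='raw', pos='pos', chunk='chunk')
tree = nltk.chunk.conllstr2tree(text, chunk_types=['NP'])
print(tree)   # (S (NP the/DT cat/NN) sat/VBD)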