This article collects typical usage examples of the span_tokenize method of Python's nltk.tokenize.WordPunctTokenizer. If you are wondering what exactly WordPunctTokenizer.span_tokenize does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also read further about the class that provides the method, nltk.tokenize.WordPunctTokenizer.
The following presents 4 code examples of WordPunctTokenizer.span_tokenize, sorted by popularity by default.
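As a quick orientation before the examples: span_tokenize yields (start, end) character offsets into the original string rather than the token strings themselves, so slicing the text with a span recovers the token. A minimal sketch (the sample sentence is my own, not taken from the examples below):

from nltk.tokenize import WordPunctTokenizer

text = "Good muffins cost $3.88 in New York."  # sample text, not from the examples
tokenizer = WordPunctTokenizer()

# span_tokenize returns (start, end) offsets; slicing the text recovers each token.
for start, end in tokenizer.span_tokenize(text):
    print((start, end), repr(text[start:end]))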
Example 1: tokens
# Required module: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
def tokens(self):
"""Tokenize the text.
"""
tokenizer = WordPunctTokenizer()
# Get token character spans.
spans = list(tokenizer.span_tokenize(self.text))
# Materialize the token stream.
tokens = [self.text[c1:c2] for c1, c2 in spans]
tags = pos_tag(tokens)
return [
Token(
token=token.lower(),
char1=c1,
char2=c2,
pos=pos,
)
for (c1, c2), token, (_, pos) in
zip(spans, tokens, tags)
]
Example 2: tokenize
# Required module: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
def tokenize(text):
"""Tokenize a raw text.
Args:
text (str)
Returns: list of {token, char1, char2, pos}
"""
tokenizer = WordPunctTokenizer()
# Get token character spans.
spans = list(tokenizer.span_tokenize(text))
# Materialize the token stream.
tokens = [text[c1:c2] for c1, c2 in spans]
# Tag parts-of-speech.
tags = pos_tag(tokens)
return [
dict(
token=token.lower(),
char1=c1,
char2=c2,
pos=pos,
)
for (c1, c2), token, (_, pos) in
zip(spans, tokens, tags)
]
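Examples 1 and 2 materialize the token strings by slicing the text with the spans and then POS-tag them (running them also requires the tagger model data, e.g. nltk.download('averaged_perceptron_tagger')). Slicing works because tokenize and span_tokenize are driven by the same pattern and should agree token for token; a small self-check of that assumption (the sample text is mine, not from the source):

from nltk.tokenize import WordPunctTokenizer

text = "Dr. Smith's lab, est. 1998."  # sample text, not from the examples
tokenizer = WordPunctTokenizer()
spans = list(tokenizer.span_tokenize(text))

# Every span slices the original text back to the corresponding token.
assert [text[c1:c2] for c1, c2 in spans] == tokenizer.tokenize(text)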
Example 3: change_db2
# Required module: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
def change_db2(text, origin_dict, id):
    print(origin_dict)
tokens_ar = []
word_punct_tokenizer = WordPunctTokenizer()
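    # Collect the (start, end) character spans of every token in origin_dict.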
for token in word_punct_tokenizer.span_tokenize(origin_dict):
tokens_ar.append(token)
for line in text.split("\n"):
markup_error_line = line.split(';')
        print("MARKUP", markup_error_line)
convert_coord_2dbformat(markup_error_line, tokens_ar, id)
Example 4: convert
# Required module: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
def convert(sgm_path, apf_path, bio_path=None):
xml_parser = etree.XMLParser(recover=True)
try:
sgm_tree = etree.parse(sgm_path, xml_parser)
apf_tree = etree.parse(apf_path, xml_parser)
if not bio_path:
bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
output = open(bio_path, 'w')
    except Exception:
        print('Something went wrong while opening/parsing the XML files or opening the output file')
        return
init_offset = get_init_offset(sgm_path)
text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
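    # Tokenize the body text, keeping character spans so offsets can be mapped back into the document.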
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(text)
spans = list(tokenizer.span_tokenize(text))
pos = pos_tag(tokens)
ts = []
for i in range(len(tokens)):
t = token()
t.text = tokens[i]
t.pos = pos[i][1]
t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
t.bio = 'O'
ts.append(t)
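    # Walk the entity mentions in the apf annotations and BIO-tag the tokens whose spans fall inside each mention head.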
entits = apf_tree.xpath('/source_file/document/entity')
for enty in entits:
enty_type = enty.get('TYPE')
mentions = enty.xpath('entity_mention')
for m in mentions:
head = m.xpath('head')[0]
span = (int(head[0].get('START')), int(head[0].get('END')))
found = False
for t in ts:
if t.span[0] == span[0]:
t.bio = 'B-' + enty_type
found = True
if t.span[0] > span[0] and t.span[1] <= span[1]:
t.bio = 'I-' + enty_type
found = True
if not found:
                print('entity mention head span not found', span, apf_path)
for t in ts:
        # print(t.text, t.span)
output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
output.close()
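A note on the span arithmetic in Example 4: NLTK's spans are end-exclusive, while the example stores end-inclusive offsets (hence spans[i][1] - 1) so they can be compared against the START/END attributes of each mention head in the apf file. A minimal sketch of that conversion, with a hypothetical init_offset and sample text of my own:

from nltk.tokenize import WordPunctTokenizer

text = "Barack Obama visited Prague."  # sample text, not from the corpus
init_offset = 100  # hypothetical offset of `text` inside the full .sgm document
tokenizer = WordPunctTokenizer()

for start, end in tokenizer.span_tokenize(text):
    # `end` is exclusive, so the token's last character sits at end - 1;
    # Example 4 stores (start + init_offset, end - 1 + init_offset).
    print(repr(text[start:end]), (start + init_offset, end - 1 + init_offset))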