This article collects typical usage examples of the tokenizer.Tokenizer.get_tokenized_as_xml method in Python. If you have been wondering what Tokenizer.get_tokenized_as_xml does, how to call it, or where to find examples of its use, the curated code samples below should help. You can also explore further usage examples of its containing class, tokenizer.Tokenizer.
The section below presents 1 code example of the Tokenizer.get_tokenized_as_xml method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
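Before the curated example, here is a minimal orientation sketch of the call itself. It only assumes a local tokenizer module whose Tokenizer is constructed from the raw document text; the constructor signature and the file path shown here are assumptions, not taken from the example below.

# Orientation sketch only -- not one of the curated examples below (Python 2).
from tokenizer import Tokenizer

raw_text = open('doc.txt').read()              # illustrative path; any document text
tokenizer = Tokenizer(raw_text)                # assumed constructor signature
xml_string = tokenizer.get_tokenized_as_xml()  # the tokenized text serialized as XML
print xml_string.encode('utf-8')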
Example 1: Document
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import get_tokenized_as_xml [as alias]
#......... part of the code omitted here .........
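# The omitted portion above would contain the enclosing `class Document` definition,
# the def line of the method whose body begins below, the `Extent` helper class used
# as a default argument, and the imports this excerpt relies on. A plausible (assumed)
# set of imports would be:
#     from warnings import warn
#     from bs4 import BeautifulSoup as BS, CData, Tag
#     from tokenizer import Tokenizer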
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [(token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                yield extent
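    # Yield one candidate extent per ordered pair of distinct tags neighbouring each
    # trigger; the candidate's `token` is the (trigger, from_tag, to_tag) triple,
    # presumably as input to QSLINK/OLINK relation classification.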
    def qs_o_link_triples(self, indices_function, extent_class=Extent):
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [(token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                trigger = extent.tag
                tags = extent.prev_tags + extent.next_tags
                for from_tag in tags:
                    for to_tag in tags:
                        if to_tag['id'] != from_tag['id']:
                            alt_extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                                      qslink_tag_dict, begin, end, self.basename, self)
                            alt_extent.token = (trigger, from_tag, to_tag)
                            yield alt_extent
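    # Yield one candidate extent per neighbouring tag (plus an empty placeholder tag),
    # pairing the trigger with each possible target; the candidate's `token` is the
    # (trigger, trigger, to_tag) triple, presumably as input to MOVELINK classification.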
    def move_link_triples(self, indices_function, extent_class=Extent):
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [(token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                tags = extent.prev_tags + extent.next_tags + [{'id': '', 'start': '-1', 'end': '-1'}]
                for to_tag in tags:
                    alt_extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                              qslink_tag_dict, begin, end, self.basename, self)
                    alt_extent.token = (extent.tag, extent.tag, to_tag)
                    yield alt_extent
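    # Check that the document contains at least one tag and that every consuming tag's
    # `text` attribute matches the corresponding slice of the raw text, warning on
    # each mismatch.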
    def validate(self):
        is_valid = True
        tag_count = len(self.tags())
        if not (tag_count > 0):
            is_valid = False
            warning = '\n'.join([
                'No tag elements found',
                "\tFile : '{doc}'"
            ]).format(doc=self.name)
            warn(warning, RuntimeWarning)
        for tag in self.consuming_tags():
            start, end = int(tag['start']), int(tag['end'])
            extent = slice(start, end)
            text_attribute = tag['text'].encode('utf-8')
            text_slice = self.text()[extent].encode('utf-8').replace('\n', ' ')
            if text_attribute != text_slice:
                is_valid = False
                warning = '\n'.join([
                    'Misaligned extent tag',
                    "\tFile : '{doc}'",
                    '\tSpan : [{start}:{end}]',
                    "\tTag : '{id}'",
                    "\tText : '{text}'",
                    "\tSlice : '{slice}'"
                ]).format(
                    doc=self.name,
                    start=start,
                    end=end,
                    id=tag['id'],
                    text=text_attribute,
                    slice=text_slice
                )
                warn(warning, RuntimeWarning)
        return is_valid
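    # Assemble the output document: the raw text as CDATA, the TAGS element, and the
    # TOKENS element parsed out of Tokenizer.get_tokenized_as_xml() with BeautifulSoup.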
    def get_xml(self):
        xml = u'<?xml version="1.0" encoding="UTF-8" ?>\n'
        root = Tag(name=self.task)
        text = Tag(name='TEXT')
        text.append(CData(self.text()))
        tags = self.TAGS
        tokens = BS(
            self.tokenizer.get_tokenized_as_xml().encode('utf-8'),
            'xml'
        ).TOKENS
        elements = [u'\n', text, u'\n', tags, u'\n', tokens, u'\n']
        for element in elements:
            if element:  # skip missing pieces; appending None would crash
                root.append(element)
        xml += unicode(root)
        return xml
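    # Write the UTF-8 encoded XML to `file`, which may be either a path or an
    # already-open file object.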
    def save_xml(self, file):
        if isinstance(file, basestring):
            with open(file, 'wb') as file:
                file.write(self.get_xml().encode('utf-8'))
        else:
            file.write(self.get_xml().encode('utf-8'))
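In this example, the call to get_tokenized_as_xml appears inside get_xml, where its output is re-parsed with BeautifulSoup so that the TOKENS element can be appended to the document root. Below is a standalone sketch of just that merging step; the sample XML string is invented for illustration, since the real structure comes from the tokenizer's output.

# Isolating the merging step from get_xml above (Python 2; the 'xml' parser requires lxml).
from bs4 import BeautifulSoup as BS

tokenized_xml = u'<TOKENS><s><lex begin="0" end="5">Hello</lex></s></TOKENS>'  # invented sample
tokens = BS(tokenized_xml.encode('utf-8'), 'xml').TOKENS  # first <TOKENS> element, as in get_xml
print tokens

From there, save_xml() simply writes the UTF-8 encoded result of get_xml() to a path or an open file object.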