

Python Tokenizer.get_tokenized_as_xml method code examples

This article collects typical usage examples of the Python method tokenizer.Tokenizer.get_tokenized_as_xml. If you are wondering how to use the Tokenizer.get_tokenized_as_xml method, or looking for concrete examples of it in real code, the hand-picked examples below may help. You can also look further into other usage examples of the containing class, tokenizer.Tokenizer.


The section below shows 1 code example of the Tokenizer.get_tokenized_as_xml method; examples are sorted by popularity by default.

Example 1: Document

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import get_tokenized_as_xml [as alias]
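# Note: this excerpt also relies on names imported elsewhere in corpus.py,
# presumably Extent (a project class), warn (from warnings), and Tag, CData,
# BS (BeautifulSoup); the code is written for Python 2 (unicode, basestring).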

#......... part of the code omitted here .........
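        # The method below (its def line falls in the omitted portion above) walks
        # each tokenized sentence and yields an Extent for every candidate span
        # returned by indices_function.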
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [ (token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                yield extent
                
    def qs_o_link_triples(self, indices_function, extent_class=Extent):
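        """Yield one Extent per ordered (from_tag, to_tag) pair of tags surrounding
        each candidate span, with extent.token set to (trigger, from_tag, to_tag);
        these serve as candidate QSLINK/OLINK relation instances."""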
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [ (token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                trigger = extent.tag
                tags = extent.prev_tags + extent.next_tags
                for from_tag in tags:
                    for to_tag in tags:
                        if to_tag['id'] != from_tag['id']:
                            alt_extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                                      qslink_tag_dict, begin, end, self.basename, self)                            
                            alt_extent.token = (trigger, from_tag, to_tag)
                            yield alt_extent
        
    def move_link_triples(self, indices_function, extent_class=Extent):
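        """Yield one Extent per candidate target tag around each span (including an
        empty placeholder tag meaning "no target"), with extent.token set to
        (trigger, trigger, to_tag); these serve as candidate MOVELINK instances."""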
        tag_dict, movelink_tag_dict, olink_tag_dict, qslink_tag_dict = self.sort_tags_by_begin_offset()
        for s in self.tokenizer.tokenize_text().sentences:
            sent = s.as_pairs()  # [ (token, lexeme obj), (token, lexeme obj), ...]
            offsets = indices_function(sent, tag_dict)
            for begin, end in offsets:
                extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                      qslink_tag_dict, begin, end, self.basename, self)
                tags = extent.prev_tags + extent.next_tags + [{'id': '', 'start': '-1', 'end': '-1'}]
                for to_tag in tags:
                    alt_extent = extent_class(sent, tag_dict, movelink_tag_dict, olink_tag_dict,
                                              qslink_tag_dict, begin, end, self.basename, self)                    
                    alt_extent.token = (extent.tag, extent.tag, to_tag)
                    yield alt_extent
        
    def validate(self):
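        """Check that the document contains at least one tag and that every
        consuming tag's 'text' attribute matches the corresponding slice of the
        raw text; warn (RuntimeWarning) on each problem and return True only if
        no problems were found."""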
        is_valid = True
        tag_count = len(self.tags())
        if tag_count == 0:
            is_valid = False
            warning = '\n'.join([
                'No tag elements found',
                "\tFile : '{doc}'"
            ]).format(doc=self.name)
            warn(warning, RuntimeWarning)
        for tag in self.consuming_tags():
            start, end = int(tag['start']), int(tag['end'])
            extent = slice(start, end)
            text_attribute = tag['text'].encode('utf-8')
            text_slice = self.text()[extent].encode('utf-8').replace('\n', ' ')
            if text_attribute != text_slice:
                is_valid = False
                warning = '\n'.join([
                    'Misaligned extent tag',
                    "\tFile : '{doc}'",
                    '\tSpan  : [{start}:{end}]',
                    "\tTag   : '{id}'",
                    "\tText  : '{text}'",
                    "\tSlice : '{slice}'"
                ]).format(
                    doc=self.name,
                    start=start,
                    end=end,
                    id=tag['id'],
                    text=text_attribute,
                    slice=text_slice
                )
                warn(warning, RuntimeWarning)
        return is_valid
    
    def get_xml(self):
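        """Serialize the document as XML: a root element named after the task,
        containing the raw text as a <TEXT> CDATA section, the annotation tags,
        and the <TOKENS> element parsed from Tokenizer.get_tokenized_as_xml()."""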
        xml = u'<?xml version="1.0" encoding="UTF-8" ?>\n'
        root = Tag(name=self.task)
        text = Tag(name='TEXT')
        text.append(CData(self.text()))
        tags = self.TAGS
        tokens = (BS(
            self.tokenizer.get_tokenized_as_xml().encode('utf-8'),
            'xml'
        )).TOKENS
        elements = [u'\n', text, u'\n', tags, u'\n', tokens, u'\n']
        for element in elements:
            if element:  # skip missing elements (e.g. no TAGS) so append() does not crash
                root.append(element)
        xml += unicode(root)
        return xml
    
    def save_xml(self, file):
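        """Write the UTF-8 encoded XML to `file`, which may be either a path
        string or an already-open file-like object."""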
        if isinstance(file, basestring):
            with open(file, 'wb') as file:
                file.write(self.get_xml().encode('utf-8'))
        else:
            file.write(self.get_xml().encode('utf-8'))
Developer: aclevine, Project: ISO-space, Lines of code: 104, Source file: corpus.py
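A minimal usage sketch of the pattern shown in get_xml() above: parse the XML string returned by get_tokenized_as_xml() with BeautifulSoup and pull out its <TOKENS> element so it can be appended to a larger document tree. The Tokenizer constructor call below is an assumption made for illustration; the excerpt never shows how the tokenizer is built, so consult the ISO-space project for the real API.

# -*- coding: utf-8 -*-
# Hedged sketch only: Tokenizer(raw_text) is an assumed constructor, and the
# 'xml' parser requires lxml to be installed (as in get_xml() above).
from bs4 import BeautifulSoup
from tokenizer import Tokenizer

raw_text = u'The book is on the table in the kitchen.'
tokenizer = Tokenizer(raw_text)                  # assumed constructor signature
token_xml = tokenizer.get_tokenized_as_xml()     # XML string with a <TOKENS> root

# Same trick as get_xml(): parse the string and grab the <TOKENS> element.
tokens = BeautifulSoup(token_xml.encode('utf-8'), 'xml').TOKENS
print(tokens)

This mirrors the design choice in get_xml(): instead of re-serializing tokens by hand, the tokenizer's own XML output is parsed once and grafted into the document tree as a ready-made element.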


Note: the tokenizer.Tokenizer.get_tokenized_as_xml examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are taken from open-source projects contributed by their original authors, who retain copyright of the source code; for distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.