

Python Tokenizer.tokenize_text method code examples

This article collects typical usage examples of the Python method tokenizer.Tokenizer.tokenize_text. If you are unsure what Tokenizer.tokenize_text does or how to call it, the curated example below may help. You can also explore other usage examples of the containing class, tokenizer.Tokenizer.


One code example of the Tokenizer.tokenize_text method is shown below.
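Before the full example, here is a minimal sketch of the call pattern, inferred solely from how the Document class below uses the tokenizer: the constructor takes the raw text, and tokenize_text() is then called with no arguments. The sample sentence and the assumption that tokenize_text() works in place are illustrative, not confirmed by the project.

# Minimal sketch. Assumptions (taken only from the Document class below):
# Tokenizer receives the raw text in its constructor, and tokenize_text()
# tokenizes in place with no arguments. The sentence is a placeholder.
from tokenizer import Tokenizer

raw_text = "John moved the box into the kitchen."
tok = Tokenizer(raw_text)
tok.tokenize_text()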

Example 1: Document

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import tokenize_text [as alias]
# NOTE: the upstream project targets Python 2; the version below is lightly
# updated to run on Python 3 (next() instead of .next(), str instead of
# unicode, list comprehensions instead of lazy filter/map). The os and bs4
# imports are implied by the original source and added for completeness.
import os

from bs4 import BeautifulSoup as BS
from bs4.element import Tag


class Document(BS):
    """A class for working with MAE annotation XMLs."""
    def __init__(self, doc_file):
        super(Document, self).__init__(doc_file.read(), "xml")
        from tokenizer import Tokenizer
        self.root = next(self.children)
        self.task = self.root.name
        self.name = doc_file.name
        self.basename = os.path.basename(self.name)
        self.dirname = os.path.dirname(self.name)
        self.tokenizer = Tokenizer(self.text())
        self.tokenizer.tokenize_text()
    
    def __repr__(self):
        return "Document:{d}".format(d=os.path.basename(self.name))
    
    def text(self):
        return ''.join(t.decode_contents() for t in self('TEXT'))
    
    def tags(self, ttypes=None):
        """Return all annotation tags whose type is in ttypes (if ttypes is
        unspecified, all tags are returned)."""
        tags_node = self.find('TAGS')
        if not tags_node:
            return []
        tags = [c for c in tags_node.children if isinstance(c, Tag)]
        if ttypes:
            tags = [tag for tag in tags if tag.name in ttypes]
        return tags
    
    def query_extents(self, ttypes, start, end):
        """Return a list of extent tags whose types are in the list of ttypes
        and whose start and end attributes match the given start and end."""
        matches = lambda t: \
            (int(t['start']), int(t['end'])) == (int(start), int(end))
        return [t for t in self.tags(ttypes=ttypes) if matches(t)]
    
    def query_links(self, ttypes, trigger_id):
        """Return a list of link tags whose types are in the list of ttypes
        and whose trigger has the specified trigger id."""
        matches = lambda t: str(t['trigger']) == str(trigger_id)
        return [t for t in self.tags(ttypes=ttypes) if matches(t)]

    def query_links_by_attr(self, ttypes, attr_name, attr_value):
        """Return a list of link tags whose types are in the list of ttypes
        and whose named attribute has the specified value."""
        matches = lambda t: str(t[attr_name]) == str(attr_value)
        return [t for t in self.tags(ttypes=ttypes) if matches(t)]
    
    def query(self, tag_id):
        """Return the tag whose identifier matches the specified id, or None."""
        # A tag without an 'id' attribute yields None here, which can never
        # equal the str we compare against, so such tags never match.
        matches = lambda t: t.attrs.get('id') == str(tag_id)
        results = [t for t in self.tags() if matches(t)]
        return results[0] if results else None
        
    def add_attribute(self, attribute, value='', ttypes=None):
        """Add an attribute to a tag (and possibly specify its value)."""
        for tag in self.tags(ttypes):
            if attribute not in tag.attrs:
                tag[attribute] = value
    
    def rename_attribute(self, old_ttype, new_ttype, ttypes=None):
        """Rename the old_ttype attribute to new_ttype on all tags of the
        given ttypes."""
        for tag in self.tags(ttypes):
            if old_ttype in tag.attrs:
                tag.attrs[new_ttype] = tag.attrs.pop(old_ttype)
    
    def rename_tag(self, old_ttype, new_ttype):
        """Rename all tags of the given type."""
        for tag in self.tags([old_ttype]):
            tag.name = new_ttype
    
    def rename_task(self, new_task):
        """Rename the document task (the XML root tag type)."""
        self.task = new_task
    
    def consuming_tags(self):
        """Return extent annotation tags with non-negative starting offsets."""
        is_extent_tag = lambda t: 'start' in t.attrs
        is_consuming = lambda t: int(t['start']) >= 0
        return [t for t in self.tags() if is_extent_tag(t) and is_consuming(t)]

    def sort_tags_by_begin_offset(self):
        """Make dictionary of tag objects keyed on their 'start' field.
        
        Used for matching tags to tokens using offsets"""
        tag_dict = {}
        movelink_tag_dict = {}
        olink_tag_dict = {}
        qslink_tag_dict = {}
                    
        tags = self.tags()
        for t in tags:
            # load entity / event / signal tags
            if 'start' in t.attrs:
                tag_dict[int(t.attrs['start'])] = t.attrs  # {start offset: xml tokens, offsets, spatial data}
            # load movelink tags
#.........part of the code is omitted here.........
Developer: aclevine, Project: ISO-space, Lines of code: 103, Source file: corpus.py
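
For orientation, the sketch below shows how the Document wrapper might be used. The file path, tag type names ('PLACE', 'QSLINK'), and ids are hypothetical placeholders; an actual MAE/ISO-Space XML defines its own tag vocabulary, and only the methods shown in the example above are assumed to exist.

# Hypothetical usage of the Document class above. The path, tag types,
# and ids are placeholders for whatever the annotation XML contains.
with open('annotations/example.xml') as doc_file:
    doc = Document(doc_file)

print(doc.task)                            # name of the XML root tag
places = doc.tags(ttypes=['PLACE'])        # all tags of type PLACE
links = doc.query_links(['QSLINK'], 's1')  # QSLINKs triggered by id 's1'
tag = doc.query('pl0')                     # look up a single tag by id

Because Document subclasses BeautifulSoup, every BS query method (find, find_all, element access) remains available on the document itself, which is what keeps the helper methods in the example so short.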


Note: The tokenizer.Tokenizer.tokenize_text examples in this article were curated by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by various programmers. Copyright of the source code belongs to the original authors; for distribution and use, refer to the corresponding project's license. Do not reproduce without permission.