本文整理汇总了Python中tokenizer.Tokenizer.tokenize_text方法的典型用法代码示例。如果您正苦于以下问题:Python Tokenizer.tokenize_text方法的具体用法?Python Tokenizer.tokenize_text怎么用?Python Tokenizer.tokenize_text使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类tokenizer.Tokenizer
的用法示例。
在下文中一共展示了Tokenizer.tokenize_text方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Document
# 需要导入模块: from tokenizer import Tokenizer [as 别名]
# 或者: from tokenizer.Tokenizer import tokenize_text [as 别名]
class Document(BS):
"""A class for working with MAE annotation XMLs."""
def __init__(self, doc_file):
    """Parse a MAE annotation XML from *doc_file* (an open file object).

    Feeds the full file contents to BeautifulSoup's "xml" parser,
    records path metadata for the source file, and eagerly tokenizes
    the document's <TEXT> contents.
    """
    super(Document, self).__init__(doc_file.read(), "xml")
    from tokenizer import Tokenizer
    # .next() (not next(...)): this code targets Python 2 iterators.
    # The first child of the soup is taken as the XML root element.
    self.root = self.children.next()
    # The "task" is the root tag's type (name).
    self.task = self.root.name
    self.name = doc_file.name
    self.basename = os.path.basename(self.name)
    self.dirname = os.path.dirname(self.name)
    # Tokenize the concatenated <TEXT> contents once, up front.
    self.tokenizer = Tokenizer(self.text())
    self.tokenizer.tokenize_text()
def __repr__(self):
    """Short identifier built from the source file's basename."""
    return "Document:%s" % os.path.basename(self.name)
def text(self):
    """Concatenate the decoded contents of every <TEXT> element."""
    return u''.join(node.decode_contents() for node in self('TEXT'))
def tags(self, ttypes=None):
    """Return all annotation tags whose type is in ttypes (if ttypes is
    unspecified, all tags are returned)."""
    container = self.find('TAGS')
    if container:
        # Keep only real Tag children (skip NavigableStrings etc.).
        found = [child for child in container.children
                 if isinstance(child, Tag)]
    else:
        found = []
    if ttypes:
        found = [t for t in found if t.name in ttypes]
    return found
def query_extents(self, ttypes, start, end):
    """Return a list of extent tags whose types are in the list of ttypes
    and whose start and end attributes match the given start and end."""
    # Compare numerically so '5' and 5 are treated the same.
    wanted_start, wanted_end = int(start), int(end)
    return [t for t in self.tags(ttypes=ttypes)
            if int(t['start']) == wanted_start
            and int(t['end']) == wanted_end]
def query_links(self, ttypes, trigger_id):
    """Return a list of link tags whose types are in the list of ttypes
    and whose trigger has the specified trigger id."""
    wanted = unicode(trigger_id)
    return [t for t in self.tags(ttypes=ttypes)
            if unicode(t['trigger']) == wanted]
def query_links_by_attr(self, ttypes, attr_name, attr_value):
    """Return a list of link tags whose types are in the list of ttypes
    and whose attribute field has the specified value."""
    wanted = unicode(attr_value)
    return [t for t in self.tags(ttypes=ttypes)
            if unicode(t[attr_name]) == wanted]
def query(self, tag_id):
    """Return the first tag whose 'id' attribute equals *tag_id*.

    Comparison uses the unicode form of the identifier; tags without
    an 'id' attribute never match. Returns None when nothing matches.
    """
    wanted = unicode(tag_id)
    # Bug fix: the old code gated on `if any(results)`, which tests the
    # truthiness of the *Tag objects themselves*. A matching but empty
    # (childless) Tag is falsy in BeautifulSoup, so a real match could
    # be silently dropped. Returning the first match directly avoids
    # that and also stops scanning early.
    for tag in self.tags():
        # .get('id') is None for id-less tags; never equal to `wanted`.
        if tag.attrs.get('id') == wanted:
            return tag
    return None
def add_attribute(self, attribute, value=u'', ttypes=None):
    """Add an attribute to a tag (and possibly specify its value)."""
    for tag in self.tags(ttypes):
        # Only fill in the attribute where it is missing.
        if attribute not in tag.attrs:
            tag[attribute] = value
def rename_attribute(self, old_ttype, new_ttype, ttypes=None):
    """Change the name of attributes for all tags with the given ttypes.

    Every tag carrying an attribute named *old_ttype* has it moved to
    *new_ttype*, preserving its value.
    """
    for tag in self.tags(ttypes):
        # Bug fix: the old check `tag.attrs.get(old_ttype)` tested the
        # attribute's *value* for truthiness, so attributes holding a
        # falsy value (e.g. the u'' default written by add_attribute)
        # were never renamed. A containment test renames them too.
        if old_ttype in tag.attrs:
            tag.attrs[new_ttype] = tag.attrs.pop(old_ttype)
def rename_tag(self, old_ttype, new_ttype):
    """Rename a tag."""
    matching = self.tags([old_ttype])
    for match in matching:
        match.name = new_ttype
def rename_task(self, new_task):
    """Rename the document task (the XML root tag type)."""
    # NOTE(review): only the cached self.task string is updated; the
    # underlying root element (self.root.name) is left untouched --
    # confirm whether serialization reads self.root instead.
    self.task = new_task
def consuming_tags(self):
    """Return extent annotation tags with non-negative starting offsets.

    Extent tags are recognized by the presence of a 'start' attribute;
    a negative start marks a non-consuming tag and is filtered out.
    """
    # dict.has_key() is deprecated in Python 2 and removed in Python 3;
    # `in` is the equivalent, portable spelling.
    return [t for t in self.tags()
            if 'start' in t.attrs and int(t['start']) >= 0]
def sort_tags_by_begin_offset(self):
"""Make dictionary of tag objects keyed on their 'start' field.
Used for matching tags to tokens using offsets"""
tag_dict = {}
movelink_tag_dict = {}
olink_tag_dict = {}
qslink_tag_dict = {}
tags = self.tags()
for t in tags:
# load entity / event / signal tags
if 'start' in t.attrs:
tag_dict[int(t.attrs['start'])] = t.attrs # {start offset: xml tokens, offsets, spatial data}
# load movelink tags
#.........这里部分代码省略.........