This article collects typical usage examples of the Python method tokenizer.Tokenizer.getTokens. If you are wondering how Tokenizer.getTokens is used in practice, or are looking for concrete examples of it, the selected code samples below may help. You can also explore further usage examples of the containing class, tokenizer.Tokenizer.
Three code examples of the Tokenizer.getTokens method are shown below, sorted by popularity by default.
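The examples are drawn from a project-local tokenizer module rather than a published library, so its exact interface is not documented here. As a rough orientation, the sketch below shows the interface the examples appear to assume; the class name Tokenizer and the method names getTokens, getSentences, and removeNonAscii come from the examples themselves, while the implementations are illustrative guesses, not the project's actual code.

# Minimal sketch (assumptions): plausible stand-ins for the tokenizer module used below.
import re

class Tokenizer:
    @staticmethod
    def removeNonAscii(text):
        # Keep ASCII characters only (illustrative guess).
        return ''.join(ch for ch in text if ord(ch) < 128)

    @staticmethod
    def getSentences(text):
        # Naive sentence split on terminal punctuation (illustrative guess).
        return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

    @staticmethod
    def getTokens(text):
        # Plain whitespace tokenization (illustrative guess).
        return text.split()

print(Tokenizer.getTokens("Reporters ask who, what, when, where, why and how."))
# ['Reporters', 'ask', 'who,', 'what,', 'when,', 'where,', 'why', 'and', 'how.']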
Example 1: getOtherTaggedText
# Required import: from tokenizer import Tokenizer [as alias]
# Alternatively: from tokenizer.Tokenizer import getTokens [as alias]
def getOtherTaggedText(info):
    taggedtext = TextMarker.getTaggedText(info)
    # print taggedtext
    # print ''
    # Temporary token-level placeholders for the 5W1H begin/end markers.
    btags2 = ['B_WHAT', 'B_WHO', 'B_WHEN', 'B_WHERE', 'B_WHY', 'B_HOW']
    etags2 = ['E_WHAT', 'E_WHO', 'E_WHEN', 'E_WHERE', 'E_WHY', 'E_HOW']
    for i, tag in enumerate(btags2):
        taggedtext = taggedtext.replace(TextMarker.btags[i], tag)
    for i, tag in enumerate(etags2):
        taggedtext = taggedtext.replace(TextMarker.etags[i], tag)
    text = ""
    state = 0  # number of 5W1H spans currently open
    for token in Tokenizer.getTokens(taggedtext):
        if (reduce((lambda x, y: x or y), list(map((lambda x: x in token), btags2)))):
            state += len([item for item in list(map((lambda x: x in token), btags2)) if item])
        if (state == 0):
            # Token lies outside every 5W1H span: wrap it in the "other" tags.
            # print "%s\t%s" % (state, TextMarker.othertags[0] + token + TextMarker.othertags[1])
            text += TextMarker.othertags[0] + token + TextMarker.othertags[1]
        else:
            # print "%s\t%s" % (state, token)
            text += token + " "
        if (reduce((lambda x, y: x or y), list(map((lambda x: x in token), etags2)))):
            state -= len([item for item in list(map((lambda x: x in token), etags2)) if item])
    # Restore the original marker strings in place of the placeholders.
    for i, tag in enumerate(TextMarker.btags):
        text = text.replace(btags2[i], tag)
    for i, tag in enumerate(TextMarker.etags):
        text = text.replace(etags2[i], tag)
    return text
Example 2: __init__
# Required import: from tokenizer import Tokenizer [as alias]
# Alternatively: from tokenizer.Tokenizer import getTokens [as alias]
def __init__(self, _what, _who, _when, _where, _why, _how, _text):
    # Normalize each 5W1H field and the article text: strip non-ASCII characters
    # and insert a space between a period and an immediately following closing quote.
    self.what = Tokenizer.removeNonAscii(_what).replace(".\"", ". \"")
    self.who = Tokenizer.removeNonAscii(_who).replace(".\"", ". \"")
    self.when = Tokenizer.removeNonAscii(_when).replace(".\"", ". \"")
    self.where = Tokenizer.removeNonAscii(_where).replace(".\"", ". \"")
    self.why = Tokenizer.removeNonAscii(_why).replace(".\"", ". \"")
    self.how = Tokenizer.removeNonAscii(_how).replace(".\"", ". \"")
    self.text = Tokenizer.removeNonAscii(_text).replace(".\"", ". \"")
    # Split the text into sentences, then tokenize each sentence.
    self.sentences = Tokenizer.getSentences(self.text)
    self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
Example 3: getMarkedText
# Required import: from tokenizer import Tokenizer [as alias]
# Alternatively: from tokenizer.Tokenizer import getTokens [as alias]
def getMarkedText(info):
    omtext = TextMarker.getOtherTaggedText(info)
    # print omtext
    result = ""
    # Each match captures a span label (the part after "[b") and the span's text.
    searchObj = re.findall(r'\[b(.+?)\](.+?)\[e.+?\]', omtext)
    # print len(searchObj)
    for tup in searchObj:
        # print tup
        if (tup[0] == "other"):
            # Spans outside the 5W1H annotations are kept as a single [other] chunk.
            result += "[%s]%s[%s]" % (tup[0], tup[1], tup[0])
        else:
            # 5W1H spans are tokenized and tagged token by token:
            # the first token gets a "beg_" prefix, the rest get "in_".
            label = tup[0]
            tokens = Tokenizer.getTokens(tup[1])
            for i, token in enumerate(tokens):
                prefix = "beg" if (i == 0) else "in"
                result += "[%s_%s]%s[%s_%s]" % (prefix, label, token, prefix, label)
    return result