本文整理汇总了Python中nltk.tokenize.PunktSentenceTokenizer.span_tokenize方法的典型用法代码示例。如果您正苦于以下问题:Python PunktSentenceTokenizer.span_tokenize方法的具体用法?Python PunktSentenceTokenizer.span_tokenize怎么用?Python PunktSentenceTokenizer.span_tokenize使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.tokenize.PunktSentenceTokenizer 的用法示例。
在下文中一共展示了PunktSentenceTokenizer.span_tokenize方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from nltk.tokenize import PunktSentenceTokenizer [as 别名]
# 注意: span_tokenize 是 PunktSentenceTokenizer 的实例方法, 无法单独导入; 需先创建实例再调用 (示例还用到了同一模块中的 WhitespaceTokenizer)。
# SQL used by main(); the statement text is kept exactly as in the original.
_SELECT_PENDING_DOCUMENT = (
    "SELECT id, text, language FROM documents\n"
    "WHERE\n"
    "--guid='tw:122144569302323201'\n"
    "EXISTS ( SELECT 1 FROM instances WHERE item_id=documents.id"
    " AND begintoken IS NULL)\n"
    "LIMIT 1"
)
_SELECT_PENDING_INSTANCES = (
    "SELECT * FROM instances\n"
    "WHERE item_id = %s\n"
    "AND begintoken IS NULL"
)
_UPDATE_INSTANCE = (
    "UPDATE instances\n"
    "SET sid=%s, begintoken=%s, endtoken=%s\n"
    "WHERE id=%s"
)


def _align_instances(text, instances, sent_tok, word_tok):
    """Fill in sentence id, begin token and end token of each instance row.

    ``instances`` is a list of 9-item lists laid out as
    ``[id, entity_id, item_id, exact, offset, length, sid, begin, end]``.
    The last three items are updated in place; nothing is returned.

    ``sent_tok`` and ``word_tok`` must provide ``span_tokenize(str)`` yielding
    ``(start, end)`` character spans.  Token positions are counted from 0
    within each sentence.
    """
    for sid, (sent_start, sent_end) in enumerate(sent_tok.span_tokenize(text)):
        sentence = text[sent_start:sent_end]
        for pos, (tok_start, tok_end) in enumerate(
                word_tok.span_tokenize(sentence)):
            # Word spans are relative to the sentence, while the instance
            # offsets are compared against positions in the whole text, so
            # shift the span by the start of the sentence.
            tok_start += sent_start
            tok_end += sent_start
            for row in instances:
                (_inst_id, _entity_id, _item_id, _exact,
                 offset, length, cur_sid, begin, end) = row

                # First token whose span (both ends inclusive) contains the
                # instance offset becomes the begin token.
                if cur_sid is None and begin is None:
                    if tok_start <= offset <= tok_end:
                        row[-2] = begin = pos
                        row[-3] = cur_sid = sid

                # The end token is only searched for inside the sentence in
                # which the begin token was found.
                if cur_sid == sid and end is None and begin is not None:
                    inst_end = offset + length
                    if tok_start <= inst_end <= tok_end:
                        # An instance that stops exactly where this token
                        # starts really ended on the previous token.
                        row[-1] = pos - 1 if inst_end == tok_start else pos


def main():
    """Endlessly assign sentence/token positions to unprocessed instances.

    Each iteration picks one document that still has an ``instances`` row
    with ``begintoken IS NULL``, splits its text into sentences (Punkt) and
    whitespace-separated tokens, locates every such instance by its
    character ``offset``/``length`` and stores ``sid``, ``begintoken`` and
    ``endtoken`` back into the table.  When there is no such document the
    loop prints "sleep" and calls ``timer.sleep_minute(30)``.

    The function never returns.
    """
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    # The tokenizers are loop-invariant, so build them once.
    sent_tok = PunktSentenceTokenizer()
    word_tok = WhitespaceTokenizer()

    while True:
        curr.execute(_SELECT_PENDING_DOCUMENT)
        data = curr.fetchone()
        if data is None:
            print("sleep")
            timer.sleep_minute(30)
            continue

        doc_id, text, _lang = data
        # Single-argument print calls behave the same on Python 2 and 3.
        print("id %s" % (doc_id,))

        curr.execute(_SELECT_PENDING_INSTANCES, (doc_id,))
        # throw away `confidence` (the last selected column)
        instances = [list(row)[:-1] for row in curr]
        if not instances:
            continue
        # Forget any previously stored sid / begin / end values.
        for row in instances:
            row[-3:] = [None, None, None]

        _align_instances(text, instances, sent_tok, word_tok)

        for instance in instances:
            print(instance)
            (inst_id, _entity_id, _item_id, exact,
             _offset, _length, sid, begin, end) = instance
            if end is None:
                # No end token found: a text without spaces is taken to end
                # on its begin token, anything else is stored as -1.
                end = begin if " " not in exact else -1
            # NOTE(review): if no token matched, `begin` is still None here,
            # so begintoken stays NULL and the same document is selected
            # again on the next iteration -- confirm this is intended.
            curr.execute(_UPDATE_INSTANCE, (sid, begin, end, inst_id))
        # NOTE(review): no commit is visible in this function; presumably
        # get_connection() returns an autocommit connection -- verify.