本文整理汇总了Python中philologic.OHCOVector.Record.attrib["lemma"]方法的典型用法代码示例。如果您正苦于以下问题:Python Record.attrib["lemma"]方法的具体用法?Python Record.attrib["lemma"]怎么用?Python Record.attrib["lemma"]使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类philologic.OHCOVector.Record的用法示例。
在下文中一共展示了Record.attrib["lemma"]方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: tag_words
# Required import: from philologic.OHCOVector import Record [as alias]
# Note: attrib["lemma"] is not an importable name; it is a key set on a Record instance's attrib attribute, so importing Record is sufficient.
def _normalize_word(word):
    """Return the lowercased UTF-8 form of *word* that is sent to the tagger.

    Undecodable bytes are dropped ('ignore'), so both passes over the object
    file must use this same function for the word comparison to line up.
    """
    return word.decode('utf-8', 'ignore').lower().encode('utf-8')


def _run_treetagger(raw_path, ttout_path):
    """Feed every "word" line of *raw_path* to treetagger, writing to *ttout_path*.

    Relies on the module-level names ``tt_path``, ``param_file`` and
    ``maxlines`` (defined outside this block).
    """
    # Set up the treetagger process
    tt_args = [tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold", ".01", param_file]
    ttout_fh = open(ttout_path, "w")
    try:
        tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
        try:
            line_count = 0
            # read through the object file, pass the words to treetagger
            with open(raw_path, "r") as raw_fh:
                for line in raw_fh:
                    # Exactly four tab-separated fields are expected; any
                    # other count raises ValueError, as before.
                    rec_type, word, _rec_id, _attrib = line.split('\t')
                    if rec_type != "word":
                        continue
                    # close and re-open the treetagger process to prevent
                    # garbage output.
                    if line_count > maxlines:
                        tt_worker.stdin.close()
                        tt_worker.wait()
                        # Release the previous output handle before
                        # re-opening the same file in append mode.
                        ttout_fh.close()
                        ttout_fh = open(ttout_path, "a")
                        tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
                        line_count = 0
                    tt_worker.stdin.write(_normalize_word(word) + "\n")
                    line_count += 1
        finally:
            # finish tagging: always close the pipe and reap the child, even
            # if reading the object file failed part-way through.
            tt_worker.stdin.close()
            tt_worker.wait()
    finally:
        ttout_fh.close()


def _merge_tags(raw_path, ttout_path, tmp_path):
    """Write *raw_path* records to *tmp_path*, adding tagger results to words.

    Returns True on success. Returns False, after reporting on stderr, as
    soon as a tagger output line does not correspond to the current word.
    """
    with open(tmp_path, "w") as tmp_fh:
        with open(ttout_path, "r") as tag_fh:
            with open(raw_path, "r") as raw_fh:
                for line in raw_fh:
                    rec_type, word, rec_id, attrib = line.split('\t')
                    record = Record(rec_type, word, rec_id.split())
                    record.attrib = loads(attrib)
                    if rec_type == "word":
                        # One tagger output line is consumed per word record.
                        tag_line = tag_fh.readline()
                        next_word, tag = tag_line.split("\t")[0:2]
                        # Only the first tag is used; its probability is
                        # parsed (three fields are required) but not stored.
                        pos, lem, _prob = tag.split(" ")
                        if next_word != _normalize_word(word):
                            sys.stderr.write(" ".join(
                                ["TREETAGGER ERROR:", next_word, " != ", word, pos, lem]) + "\n")
                            return False
                        record.attrib["pos"] = pos
                        record.attrib["lemma"] = lem
                    tmp_fh.write(str(record) + "\n")
    return True


def tag_words(loader_obj, text):
    """Add "pos" and "lemma" attributes to every word record of text["raw"].

    The object file at ``text["raw"]`` holds one record per line as four
    tab-separated fields (type, word, id, serialized attributes). It is read
    twice: first to stream the normalized words through treetagger into
    ``text["raw"] + ".ttout"``, then to rewrite each record into
    ``text["raw"] + ".tmp"`` with the tagger's part of speech and lemma
    attached. On success the temporary file replaces the original and the
    tagger output is deleted.

    Args:
        loader_obj: not used by this function.
        text: mapping whose "raw" entry is the path of the object file.

    Returns:
        None. If the tagger output does not line up with the words of the
        object file, an error is printed to stderr and the function returns
        early, leaving the original file untouched and the ".tmp" and
        ".ttout" files in place.
    """
    raw_path = text["raw"]
    ttout_path = raw_path + ".ttout"
    tmp_path = raw_path + ".tmp"

    _run_treetagger(raw_path, ttout_path)

    # go back through the object file, and add the treetagger results to
    # each word
    if not _merge_tags(raw_path, ttout_path, tmp_path):
        return

    # Every handle is closed by now, so the rewritten data is fully flushed
    # before it replaces the original file.
    os.remove(raw_path)
    os.rename(tmp_path, raw_path)
    os.remove(ttout_path)