本文整理汇总了Python中philologic.OHCOVector.Record类的典型用法代码示例。如果您正苦于以下问题:Python Record类的具体用法?Python Record怎么用?Python Record使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Record类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: prev_next_obj
def prev_next_obj(loader_obj, text, depth=5):
    """Thread 'prev'/'next' sibling links through the parsed object records.

    Streams text['sortedtoms'] (tab-delimited: type, word, id, attrib), holds
    back the last record seen for each object type, and flushes it once its
    successor is known so both links can be filled in.  Finally rewrites
    text['sortedtoms'] via a shell pipeline, keeping only doc/div/para lines.

    Python 2 code: uses the `print >> fh` statement and `eval()`.

    :param loader_obj: loader instance; only its `sort_by_id` sort flags are used
    :param text: dict of per-text file paths ('raw', 'sortedtoms')
    :param depth: how many leading object types (doc..word) get prev/next links
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    record_dict = {}  # per-type buffer: last record of each type, awaiting its successor
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()  # philo id is a space-separated tuple of ints-as-strings
        record = Record(type, word, id)
        # NOTE(review): eval() executes text from the toms file; safe only
        # because the file is produced by this same loader, never user input.
        record.attrib = eval(attrib)
        if type in record_dict:
            # The buffered record's successor is the current one.
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in object_types:
                print >> output_file, record_dict[type]
            else:
                # Types beyond `depth` are emitted without navigation links.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            # First record of this type: no predecessor.
            record.attrib['prev'] = ''
            record_dict[type] = record
    # Flush the final buffered record of each type; none has a successor.
    object_types.reverse()
    for obj in object_types:
        record_dict[obj].attrib['next'] = ''
        print >> output_file, record_dict[obj]
    output_file.close()
    os.remove(text['sortedtoms'])
    # Re-sort and filter through the shell; loader_obj.sort_by_id supplies the
    # `sort` flags.  NOTE(review): paths are interpolated unquoted into a shell
    # command -- assumes loader-controlled, space-free paths.
    tomscommand = "cat %s | egrep \"^doc|^div|^para\" | sort %s > %s" % (temp_file,loader_obj.sort_by_id,text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
示例2: normalize_these_columns
def normalize_these_columns(loader_obj,text):
    """Propagate selected metadata columns from div1/div2 down to div3 records.

    Rewrites text['sortedtoms'] in place: remembers the most recent value of
    each column (from `columns`, a free variable presumably captured from the
    enclosing loader closure -- not visible here) at the div1/div2 level, and
    fills it into any div3 record that lacks it.

    Python 2 code: uses the `print >> fh` statement and `eval()`.
    """
    current_values = {}  # column name -> value inherited from the enclosing div
    tmp_file = open(text["sortedtoms"] + ".tmp","w")
    for column in columns:
        current_values[column] = ""
    for line in open(text["sortedtoms"]):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        # eval() on loader-generated data; see security note in prev_next_obj.
        record.attrib = eval(attrib)
        if type == "div1":
            # div1 resets every tracked column (blank when absent).
            for column in columns:
                if column in record.attrib:
                    current_values[column] = record.attrib[column]
                else:
                    current_values[column] = ""
        elif type == "div2":
            # div2 only overrides columns it actually defines.
            for column in columns:
                if column in record.attrib:
                    current_values[column] = record.attrib[column]
        elif type == "div3":
            # div3 inherits any column it does not define itself.
            for column in columns:
                if column not in record.attrib:
                    record.attrib[column] = current_values[column]
        print >> tmp_file, record
    tmp_file.close()
    os.remove(text["sortedtoms"])
    os.rename(text["sortedtoms"] + ".tmp",text["sortedtoms"])
示例3: inner_prev_next_obj
def inner_prev_next_obj(loader_obj, text):
    """Thread 'prev'/'next' sibling links through object records (closure variant).

    Same algorithm as prev_next_obj, but the set of linked object types comes
    from `types`, a free variable presumably captured from an enclosing
    closure (not visible here), and the final filter pattern is built from
    loader_obj.types instead of being hard-coded.

    Python 2 code: uses the `print >> fh` statement and `eval()`.
    """
    record_dict = {}  # per-type buffer: last record of each type, awaiting its successor
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['sortedtoms']):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        # eval() on loader-generated data; see security note in prev_next_obj.
        record.attrib = eval(attrib)
        if type in record_dict:
            record_dict[type].attrib['next'] = ' '.join(id)
            if type in types:
                print >> output_file, record_dict[type]
            else:
                # Unlinked types are emitted without navigation attributes.
                del record_dict[type].attrib['next']
                del record_dict[type].attrib['prev']
                print >> output_file, record_dict[type]
            record.attrib['prev'] = ' '.join(record_dict[type].id)
            record_dict[type] = record
        else:
            record.attrib['prev'] = ''
            record_dict[type] = record
    # Flush the last buffered record of each type; KeyError means that type
    # never occurred in this text, which is fine.
    types.reverse()
    for obj in types:
        try:
            record_dict[obj].attrib['next'] = ''
            print >> output_file, record_dict[obj]
        except KeyError:
            pass
    output_file.close()
    os.remove(text['sortedtoms'])
    # Keep only the configured object types, re-sorted by philo id.
    type_pattern = "|".join("^%s" % t for t in loader_obj.types)
    tomscommand = "cat %s | egrep \"%s\" | sort %s > %s" % (temp_file,type_pattern,loader_obj.sort_by_id,text["sortedtoms"])
    os.system(tomscommand)
    os.remove(temp_file)
示例4: load_record
def load_record(line):
    """Parse one tab-delimited toms line into a Record.

    The line holds four fields (type, word, id, attributes); the id field is a
    space-separated philo id and the attributes are deserialized with loads().
    The navigation links are initialized empty for later threading.
    """
    kind, token, object_id, serialized = line.split('\t')
    parsed = Record(kind, token, object_id.split())
    parsed.attrib = loads(serialized)
    for link in ("prev", "next"):
        parsed.attrib[link] = ""
    return parsed
示例5: load_record
def load_record(line):
    """Deserialize a tab-delimited object line into a Record.

    Splits the line into its four fields, space-splits the id field into the
    philo id tuple, decodes the attribute payload with loads(), and blanks the
    'prev'/'next' links so a later pass can fill them in.
    """
    fields = line.split('\t')
    rec_type, token, rec_id, serialized = fields
    rec = Record(rec_type, token, rec_id.split())
    rec.attrib = loads(serialized)
    rec.attrib["prev"] = ""
    rec.attrib["next"] = ""
    return rec
示例6: fix_pages
def fix_pages(loader_obj,text,depth=4):
    """Unfinished, do not use"""
    # NOTE(review): abandoned stub -- it reads and parses every record but
    # never writes anything back or closes temp_file; left byte-identical.
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word'][:depth]
    current_page = 0;
    temp_file = open(text["sortedtoms"] + ".tmp","w")
    for line in open(text["sortedtoms"]):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
示例7: tag_words
def tag_words(loader_obj, text):
    """Run TreeTagger over the word tokens and merge POS/lemma tags back in.

    Two passes over text['raw']: first, pipe every lowercased word token into
    a TreeTagger subprocess (restarting it every `maxlines` lines to avoid
    garbage output); second, re-read the object file in step with the tagger
    output and attach 'pos'/'lemma' attributes to each word record.

    Python 2 code: `print >>`, bytes/str `.decode`.  Free variables
    (`tt_path`, `param_file`, `maxlines`, `Popen`, `PIPE`, `loads`) are
    presumably supplied by the enclosing module/closure -- not visible here.
    """
    # Set up the treetagger process
    tt_args = [tt_path, "-token", "-lemma", "-prob", '-no-unknown', "-threshold", ".01", param_file]
    ttout_fh = open(text["raw"] + ".ttout", "w")
    tt_worker = Popen(tt_args, stdin=PIPE, stdout=ttout_fh)
    raw_fh = open(text["raw"], "r")
    line_count = 0
    # read through the object file, pass the words to treetagger
    for line in raw_fh:
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        if type == "word":
            # Lowercase in unicode space, ignoring undecodable bytes.
            word = word.decode('utf-8', 'ignore').lower().encode('utf-8')
            # close and re-open the treetagger process to prevent garbage
            # output.
            if line_count > maxlines:
                tt_worker.stdin.close()
                tt_worker.wait()
                new_ttout_fh = open(text["raw"] + ".ttout", "a")
                tt_worker = Popen(tt_args, stdin=PIPE, stdout=new_ttout_fh)
                line_count = 0
            print >> tt_worker.stdin, word
            line_count += 1
    # finish tagging
    tt_worker.stdin.close()
    tt_worker.wait()
    # go back through the object file, and add the treetagger results to
    # each word
    tmp_fh = open(text["raw"] + ".tmp", "w")
    tag_fh = open(text["raw"] + ".ttout", "r")
    for line in open(text["raw"], "r"):
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = loads(attrib)
        if type == "word":
            # Consume one tagger line per word; format is "token\tpos lemma prob".
            tag_l = tag_fh.readline()
            next_word, tag = tag_l.split("\t")[0:2]
            pos, lem, prob = tag.split(" ")
            # The tagger output must stay in lockstep with the word stream;
            # any mismatch aborts the whole pass (leaving .tmp/.ttout behind).
            if next_word != word.decode('utf-8', 'ignore').lower().encode('utf-8'):
                print >> sys.stderr, "TREETAGGER ERROR:", next_word, " != ", word, pos, lem
                return
            else:
                record.attrib["pos"] = pos
                record.attrib["lemma"] = lem
                print >> tmp_fh, record
        else:
            # Non-word records pass through unchanged.
            print >> tmp_fh, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
    os.remove(text["raw"] + ".ttout")
示例8: normalize_unicode_raw_words
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in text['raw'], rewriting the file in place.

    Python 2 code: lowercases via UTF-8 decode/encode round-trip and writes
    with the `print >> fh` statement; attributes are parsed with `eval()` on
    loader-generated data.
    """
    tmp_file = open(text["raw"] + ".tmp","w")
    for line in open(text["raw"]):
        # NOTE(review): `id` shadows a builtin -- kept as-is.
        rec_type, word, id, attrib = line.split('\t')
        id = id.split()
        if rec_type == "word":
            # Lowercase in unicode space, then back to UTF-8 bytes.
            word = word.decode("utf-8").lower().encode("utf-8")
        record = Record(rec_type, word, id)
        record.attrib = eval(attrib)
        print >> tmp_file, record
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp",text["raw"])
示例9: normalize_unicode_raw_words
def normalize_unicode_raw_words(loader_obj, text):
    """Lowercase every word token in the raw object file, rewriting it in place.

    Each line carries (type, word, id, attrib) separated by tabs; word tokens
    are lowercased via a UTF-8 decode/encode round-trip, every record is
    re-serialized to a scratch file, and the scratch file then replaces the
    original.
    """
    destination = open(text["raw"] + ".tmp", "w")
    with open(text["raw"]) as source:
        for raw_line in source:
            kind, token, object_id, serialized = raw_line.split('\t')
            if kind == "word":
                # Lowercase in unicode space, then back to UTF-8 bytes.
                token = token.decode("utf-8").lower().encode("utf-8")
            entry = Record(kind, token, object_id.split())
            entry.attrib = loads(serialized)
            print(entry, file=destination)
    destination.close()
    os.remove(text["raw"])
    os.rename(text["raw"] + ".tmp", text["raw"])
示例10: normalize_unicode_raw_words
def normalize_unicode_raw_words(loader_obj, text):
    """Rewrite text['raw'] in place with all word tokens lowercased.

    Streams the tab-delimited object file into a scratch file, lowercasing the
    token of every 'word' record and deserializing the attribute payload with
    loads(), then atomically swaps the scratch file over the original.
    """
    raw_path = text["raw"]
    out_handle = open(raw_path + ".tmp", "w")
    with open(raw_path) as in_handle:
        for current_line in in_handle:
            kind, token, object_id, serialized = current_line.split('\t')
            if kind == "word":
                token = token.lower()
            entry = Record(kind, token, object_id.split())
            entry.attrib = loads(serialized)
            print(entry, file=out_handle)
    out_handle.close()
    os.remove(raw_path)
    os.rename(raw_path + ".tmp", raw_path)
示例11: inner_make_object_ancestors
def inner_make_object_ancestors(loader_obj, text):
    """Attach '<type>_ancestor' philo ids to every word record.

    For each object type in `types` (a free variable presumably captured from
    an enclosing closure, as is `type_depth` -- not visible here), truncates
    the word's philo id to that type's depth and zero-pads it to 7 fields.

    Python 2 code: uses the `print >> fh` statement and `eval()`.
    """
    temp_file = text['words'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['words']):
        # NOTE(review): `type` and `id` shadow builtins, and the inner loop
        # below rebinds `type` again -- kept byte-identical.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for type in types:
            zeros_to_add = ['0' for i in range(7 - type_depth[type])]
            # NOTE(review): `id` keeps its truncated value across iterations,
            # so later (deeper) types slice an already-padded id -- behavior
            # depends on the ordering of `types`; preserved as-is.
            philo_id = id[:type_depth[type]] + zeros_to_add
            record.attrib[type + '_ancestor'] = ' '.join(philo_id)
        print >> output_file, record
    output_file.close()
    os.remove(text['words'])
    os.rename(temp_file, text['words'])
示例12: inner_make_object_ancestors
def inner_make_object_ancestors(loader_obj, text):
    """Stamp each record with the philo id of its ancestor at every object type.

    For every type in philo_types, the record's id is truncated to that type's
    depth (philo_type_depth) and right-padded with '0' up to 7 fields; the
    result is stored as '<type>_ancestor'.  Note that philo_id is rebound on
    each iteration, so deeper types slice the already-padded id -- the outcome
    depends on the ordering of philo_types, exactly as in the original.
    """
    words_path = text['words']
    scratch_path = words_path + '.tmp'
    scratch = open(scratch_path, 'w')
    with open(words_path) as words_file:
        for raw_line in words_file:
            philo_type, word, philo_id, attrib = raw_line.split('\t')
            philo_id = philo_id.split()
            record = Record(philo_type, word, philo_id)
            record.attrib = loads(attrib)
            for philo_type in philo_types:
                depth = philo_type_depth[philo_type]
                philo_id = philo_id[:depth] + ['0'] * (7 - depth)
                record.attrib[philo_type + '_ancestor'] = ' '.join(philo_id)
            print(record, file=scratch)
    scratch.close()
    os.remove(words_path)
    os.rename(scratch_path, words_path)
示例13: smash_these_unicode_columns
def smash_these_unicode_columns(loader_obj, text):
    """Add accent-stripped, lowercased '<column>_norm' variants of metadata columns.

    For every column in `columns` present on a record, the value is decoded
    from UTF-8, lowercased, NFKD-decomposed with its combining marks removed,
    re-encoded to UTF-8, and stored under '<column>_norm'.  The sorted toms
    file is rewritten in place via a scratch file.
    """
    toms_path = text["sortedtoms"]
    scratch = open(toms_path + ".tmp", "w")
    for raw_line in open(toms_path):
        rec_type, token, object_id, serialized = raw_line.split('\t')
        rec = Record(rec_type, token, object_id.split())
        rec.attrib = loads(serialized)
        for column in columns:
            if column in rec.attrib:
                folded = rec.attrib[column].decode("utf-8").lower()
                # Drop combining marks left over from NFKD decomposition.
                stripped = ''.join(
                    ch for ch in unicodedata.normalize("NFKD", folded)
                    if not unicodedata.combining(ch)
                )
                rec.attrib[column + "_norm"] = stripped.encode("utf-8")
        print(rec, file=scratch)
    scratch.close()
    os.remove(toms_path)
    os.rename(toms_path + ".tmp", toms_path)
示例14: make_word_counts
def make_word_counts(loader_obj, text, depth=5):
    """Attach a 'word_count' attribute to each container object record.

    Keeps one running counter per tracked depth; every word increments all of
    them, and when an object of the type at depth d appears, that record takes
    the words accumulated since the previous object of the same type and the
    counter resets.  Rewrites text['raw'] in place.

    Python 2 code: uses the `print >> fh` statement and `eval()`.
    """
    object_types = ['doc', 'div1', 'div2', 'div3', 'para', 'sent', 'word']
    counts = [0 for i in range(depth)]  # one word tally per tracked object depth
    temp_file = text['raw'] + '.tmp'
    output_file = open(temp_file, 'w')
    for line in open(text['raw']):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split('\t')
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d,count in enumerate(counts):
            if type == 'word':
                # A word feeds every depth's counter.
                counts[d] += 1
            elif type == object_types[d]:
                # This record takes and resets its depth's tally.
                record.attrib['word_count'] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text['raw'])
    os.rename(temp_file, text['raw'])
示例15: make_word_counts
def make_word_counts(loader_obj, text, depth=4):
    """Attach a 'word_count' attribute to each container object record.

    Variant of make_word_counts with a default depth of 4: one running word
    tally per tracked depth, credited to (and reset by) each new object of
    the type at that depth.  Rewrites text['raw'] in place.

    Python 2 code: uses the `print >> fh` statement and `eval()`.
    """
    object_types = ["doc", "div1", "div2", "div3", "para", "sent", "word"]
    counts = [0 for i in range(depth)]  # one word tally per tracked object depth
    temp_file = text["raw"] + ".tmp"
    output_file = open(temp_file, "w")
    for line in open(text["raw"]):
        # NOTE(review): `type` and `id` shadow builtins -- kept as-is.
        type, word, id, attrib = line.split("\t")
        id = id.split()
        record = Record(type, word, id)
        record.attrib = eval(attrib)
        for d, count in enumerate(counts):
            if type == "word":
                # A word feeds every depth's counter.
                counts[d] += 1
            elif type == object_types[d]:
                # This record takes and resets its depth's tally.
                record.attrib["word_count"] = counts[d]
                counts[d] = 0
        print >> output_file, record
    output_file.close()
    os.remove(text["raw"])
    os.rename(temp_file, text["raw"])