This page collects typical usage examples of the Python method tokenizer.Tokenizer.process_review. If you are wondering what Tokenizer.process_review does, how to call it, or where to find working samples, the curated examples below may help. You can also explore the containing class, tokenizer.Tokenizer, for more context.
The following shows 2 code examples of Tokenizer.process_review, sorted by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Python code examples.
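Before the full examples, here is a minimal sketch of the method itself. The Tokenizer implementation is not shown on this page, so this assumes only that process_review takes a raw review string and returns a list of normalized tokens; the input text and output shown are made up for illustration:

from tokenizer import Tokenizer

tokenizer = Tokenizer()
# Hypothetical call; the exact normalization (lowercasing, punctuation
# stripping, stemming, ...) depends on the actual Tokenizer implementation
words = tokenizer.process_review("Great pizza, friendly staff!")
print(words)  # e.g. ['great', 'pizza', 'friendly', 'staff']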
Example 1: main
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import process_review [as alias]
import argparse
import json
import sys

from tokenizer import Tokenizer

# Note: phrase_query, one_word_query, free_text_query and rank are
# defined elsewhere in the source project.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-index', required=True, help='Path to inverted index file')
    parser.add_argument('-business', required=False, help='Path to yelp business data json file',
                        default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
    opts = parser.parse_args()

    # Pre-processing: load the whole inverted index into memory
    print("loading index file...")
    with open(opts.index, 'r') as f_index:
        wordsmap = json.load(f_index)
    print("done")

    # Map each business's line number to its id, review count and star rating
    b_map = {}
    print("loading business file...")
    with open(opts.business, 'r') as f_b:
        for line_num, line in enumerate(f_b):
            b_json = json.loads(line)
            b_map[str(line_num)] = {"business_id": b_json['business_id'],
                                    "review_count": int(b_json['review_count']),
                                    "stars": float(b_json['stars'])}
    print("done")

    tokenizer = Tokenizer()
    # TODO: validate malformed query input
    # Known issue: Ctrl-D (EOF) exit handling
    for line in sys.stdin:
        line = line.strip('\n')
        if len(line) == 0:
            continue
        elif line[0] == '"':
            # Quoted input: run an exact phrase query
            line = line.strip('"')
            words = tokenizer.process_review(line)
            result = phrase_query(words, wordsmap)
        elif len(line.split()) == 1:
            # Single term: run a one-word query
            words = tokenizer.process_review(line)
            result = one_word_query(words[0], wordsmap)
        else:
            # Otherwise: run a free-text query
            words = tokenizer.process_review(line)
            result = free_text_query(words, wordsmap)
        rank_res = rank(words, result, b_map, wordsmap)
        print(rank_res)
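The helpers phrase_query, one_word_query, free_text_query and rank used above come from elsewhere in the same project and are not reproduced on this page. As a hedged illustration only, assuming the index shape built in Example 2 below (word -> business line -> review line -> positions), the simplest of them could reduce to a dictionary lookup:

def one_word_query(word, wordsmap):
    # Hypothetical sketch, not the project's actual implementation:
    # return every posting for a single term, or an empty dict if
    # the term never appears in the index.
    return wordsmap.get(word, {})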
Example 2: main
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import process_review [as alias]
import argparse
import json

from tokenizer import Tokenizer

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-review_file', required=True, help='Path to review data')
    parser.add_argument('-business_file', required=True, help='Path to business data')
    parser.add_argument('-output', required=True, help='Path to output index file')
    opts = parser.parse_args()

    # Map each business_id to the line it occupies in the business file
    b_map = {}
    with open(opts.business_file, 'r') as f_business:
        for line_num, line in enumerate(f_business):
            b_obj = json.loads(line)
            b_map[b_obj['business_id']] = line_num

    # Build a positional inverted index:
    # wordsmap[word][business_line][review_line] -> list of token positions
    tokenizer = Tokenizer()
    wordsmap = {}
    line_num = 0
    with open(opts.review_file, 'r') as f_reviews:
        for line in f_reviews:
            r = json.loads(line)
            words = tokenizer.process_review(r['text'])
            b_id = b_map[r['business_id']]
            for w_idx, w in enumerate(words):
                if w == "":
                    continue
                if w in wordsmap:
                    if b_id in wordsmap[w]:
                        # Existing postings for this word/business pair
                        postings = wordsmap[w][b_id]
                        if line_num in postings:
                            postings[line_num].append(w_idx)
                        else:
                            postings[line_num] = [w_idx]
                    else:
                        wordsmap[w][b_id] = {line_num: [w_idx]}
                else:
                    wordsmap[w] = {b_id: {line_num: [w_idx]}}
            line_num += 1
            if line_num % 1000 == 0:
                print(line_num)

    with open(opts.output, 'w') as f_out:
        json.dump(wordsmap, f_out)
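For reference, a single entry in the dumped index would look roughly like the following (the values are hypothetical). Note that json.dump converts the integer business and review line numbers into string keys, which is presumably why Example 1 keys its b_map with str(line_num):

# Hypothetical entry: the token "pizza" occurs in the business on line 42
# of the business file, inside the review on line 1057 of the review file,
# at token positions 0 and 12.
{"pizza": {"42": {"1057": [0, 12]}}}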