本文整理汇总了Python中etl.ETLUtils.load_json_file方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.load_json_file方法的具体用法?Python ETLUtils.load_json_file怎么用?Python ETLUtils.load_json_file使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类etl.ETLUtils的用法示例。
在下文中一共展示了ETLUtils.load_json_file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main_evaluate
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main_evaluate():
    """Run a top-N evaluation over pre-split records and report recall.

    Loads the full and test record sets, feeds stored predictions into a
    TopNEvaluator, prints the recall and returns it.
    """
    evaluator_n = my_i
    all_records = ETLUtils.load_json_file(RECORDS_FILE)
    test_records = ETLUtils.load_json_file(RECORDS_FILE + '_test')

    evaluator = TopNEvaluator(all_records, test_records, DATASET, 10, evaluator_n)
    evaluator.find_important_records()
    evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    # Predictions were written to disk by a previous step.
    predictions = rmse_calculator.read_targets_from_txt(
        GENERATED_FOLDER + 'predictions_' + DATASET + '.txt')
    evaluator.evaluate(predictions)

    print('recall', evaluator.recall)
    return evaluator.recall
示例2: main
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    """Train a review classifier on labeled data and tag the full review set.

    Reads the classified training records/reviews for one dataset, fits a
    ReviewsClassifier, then labels the shuffled Yelp review records and
    writes the tagged output to JSON.
    """
    # dataset = 'hotel'
    dataset = 'restaurant'
    base_folder = '/Users/fpena/UCC/Thesis/datasets/context/'

    training_records_file = (
        base_folder + 'classified_' + dataset + '_reviews.json')
    training_reviews_file = (
        base_folder + 'classified_' + dataset + '_reviews.pkl')

    training_records = ETLUtils.load_json_file(training_records_file)
    with open(training_reviews_file, 'rb') as pickle_file:
        training_reviews = pickle.load(pickle_file)

    reviews_classifier = ReviewsClassifier()
    reviews_classifier.train(training_records, training_reviews)

    input_records_file = (
        base_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json')
    input_reviews_file = (
        base_folder + 'reviews_' + dataset + '_shuffled.pkl')
    output_records_file = (
        base_folder + 'yelp_training_set_review_' + dataset +
        's_shuffled_tagged.json')

    with open(input_reviews_file, 'rb') as pickle_file:
        input_reviews = pickle.load(pickle_file)
    input_records = ETLUtils.load_json_file(input_records_file)

    output_records = reviews_classifier.label_json_reviews(
        input_records, input_reviews)
    ETLUtils.save_json_file(output_records_file, output_records)
示例3: load
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def load(self):
    """Load processed records and cache the user-item map if it is missing."""
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    self.original_records = \
        ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('num_records: %d' % len(self.original_records))

    # Build and persist the user-item map only on first run.
    if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
        all_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        user_item_map = create_user_item_map(all_records)
        with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
            pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
示例4: classify_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def classify_reviews(self):
    """Train a specific/generic review classifier and label self.records.

    Training data comes from the classified reviews file for the current
    item type; records are optionally capped at MAX_SENTENCES sentences.
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.MAX_SENTENCES is None:
        file_name_suffix = ''
    else:
        file_name_suffix = '_sentences'
    training_records_file = (
        Constants.DATASET_FOLDER + 'classified_' + Constants.ITEM_TYPE +
        '_reviews' + file_name_suffix + '.json')
    training_records = ETLUtils.load_json_file(training_records_file)

    max_sentences = Constants.MAX_SENTENCES
    if max_sentences is not None:
        # Keep only the leading sentences of each review.
        training_records = [
            training_record for training_record in training_records
            if training_record['sentence_index'] < max_sentences
        ]

    # Map sentence_type onto the binary label the classifier expects.
    for training_record in training_records:
        if training_record['sentence_type'] == 'specific':
            training_record['specific'] = 'yes'
        else:
            training_record['specific'] = 'no'
    print('num training records', len(training_records))

    self.lemmatize_reviews(training_records)

    reviews_classifier = ReviewsClassifier(self.classifier, self.resampler)
    reviews_classifier.train(training_records)
    reviews_classifier.label_json_reviews(self.records)
示例5: main
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    """Parse CLI flags and build either a full or a single-fold topic model.

    Flags:
        -c/--cycle: index of the running cycle.
        -f/--fold: index of the cross validation fold.
        -t/--numtopics: number of topics for the topic model.

    When neither cycle nor fold is given, a topic model is built over the
    processed records; otherwise a single cycle/fold model is created.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')
    args = parser.parse_args()
    # nargs=1 yields one-element lists; unwrap, defaulting to None.
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # Floor division: slicing with a float index (num_records / 2)
            # raises TypeError on Python 3.
            records = records[:num_records // 2]
        print('num_reviews', len(records))
        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
示例6: load
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def load(self):
    """Load records and pickled reviews, pair them up, and cache the
    user-item map if it does not exist yet."""
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    with open(Constants.REVIEWS_FILE, 'rb') as reviews_file:
        self.original_reviews = pickle.load(reviews_file)
    print('num_records: %d' % len(self.original_records))

    # Records and reviews are parallel lists; copy id/rating onto each
    # review object so downstream code can use the reviews alone.
    for original_record, original_review in zip(
            self.original_records, self.original_reviews):
        original_review.id = original_record[Constants.REVIEW_ID_FIELD]
        original_review.rating = original_record[Constants.RATING_FIELD]

    if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
        all_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        user_item_map = create_user_item_map(all_records)
        with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
            pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
示例7: main
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    """Cluster pre-pickled restaurant reviews into specific/generic groups.

    Loads the classified records and their pickled Review objects, clusters
    the reviews, and splits the records by the resulting cluster labels.
    """
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    # Split once instead of calling split_list_by_labels twice with the
    # same arguments (the original recomputed the split per group).
    split_records = split_list_by_labels(my_records, cluster_labels)
    specific_records = split_records[0]
    generic_records = split_records[1]
示例8: classify_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def classify_reviews(self):
    """Train a specific/generic review classifier and label self.records.

    Training records come from the classified records file; when the
    document level is not 'review', records are filtered by sentence index.
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    print(Constants.CLASSIFIED_RECORDS_FILE)
    training_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    document_level = Constants.DOCUMENT_LEVEL
    if document_level != 'review':
        # 'sentence' means every sentence qualifies; an int caps the index.
        sentence_cap = (
            float("inf") if document_level == 'sentence' else document_level)
        training_records = [
            training_record for training_record in training_records
            if training_record['sentence_index'] < sentence_cap
        ]
        # Map sentence_type onto the binary label the classifier expects.
        for training_record in training_records:
            if training_record['sentence_type'] == 'specific':
                training_record['specific'] = 'yes'
            else:
                training_record['specific'] = 'no'
        print('num training records', len(training_records))

    training_records = self.lemmatize_reviews(training_records)

    reviews_classifier = ReviewsClassifier(self.classifier, self.resampler)
    reviews_classifier.train(training_records)
    reviews_classifier.label_json_reviews(self.records)
示例9: dataset_bucket_analysis_by_field
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def dataset_bucket_analysis_by_field(field):
    """Summarize how many reviews each distinct value of ``field`` has.

    Switches the dataset to the fourcity hotel bucket, counts records per
    field value, prints the three most frequent values, and prints the
    per-field review summary produced by ReviewsDatasetAnalyzer.

    :param field: name of the record key to bucket by (e.g. a user id field)
    """
    from collections import Counter

    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    # Counter replaces the manual if-not-in/increment loop.
    user_frequency_map = Counter(record[field] for record in records)

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(
        user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    # Temporarily widen pandas' display so the full summary prints.
    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
示例10: update_labeled_reviews_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def update_labeled_reviews_records():
    """Keep only classifier records whose label both annotators agreed on,
    and update their 'specific' label from the agreed annotation."""
    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())

    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids - agreed_review_ids

    print('number of records before: %d' % len(classifier_records))
    print(reviews_label_map)
    print(non_agreed_review_ids)

    # 's' (specific) / 'g' (generic) annotations map onto the yes/no label.
    review_type_map = {'s': 'yes', 'g': 'no'}

    # Drop every record that did not get an agreed-on label.
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Overwrite each remaining record's label with the agreed one.
    for classifier_record in classifier_records:
        agreed_label = reviews_label_map[
            classifier_record[Constants.REVIEW_ID_FIELD]]
        classifier_record[Constants.SPECIFIC] = review_type_map[agreed_label]
    print('number of records after: %d' % len(classifier_records))
示例11: analyze_context_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def analyze_context_records():
    """Print the text of every classified record tagged as contextual."""
    context_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    context_records = ETLUtils.filter_records(
        context_records, 'context_type', ['context'])
    print('num records: %d' % len(context_records))
    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
示例12: main
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    """Load contextual processed records, transform them, and export."""
    processed_records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    transformer = ContextTransformer(processed_records)
    transformer.load_data()
    transformer.transform_records()
    transformer.export_records()
示例13: generate_report_fourcity_filtered
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def generate_report_fourcity_filtered():
    """Generate the Fourcity TripAdvisor dataset-analysis notebook report."""
    file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    # Derive the notebook cell from file_path instead of repeating the path
    # literal, so the two copies cannot drift apart.
    load_reviews_code = (
        "file_path = '" + file_path + "'\n" +
        "reviews = ETLUtils.load_json_file(file_path)\n")
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
示例14: generate_report_ruihai
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def generate_report_ruihai():
    """Generate the Ruihai TripAdvisor dataset-analysis notebook report."""
    file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    # Derive the notebook cell from file_path instead of repeating the path
    # literal, so the two copies cannot drift apart.
    load_reviews_code = (
        "file_path = '" + file_path + "'\n" +
        "reviews = ETLUtils.load_json_file(file_path)\n")
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
示例15: get_categories
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def get_categories(file_path):
    """Return the field names of a business record after transposing the
    'categories' list into columns and dropping unwanted fields.

    :param file_path: path to the JSON file with the business records
    """
    business_records = ETLUtils.load_json_file(file_path)
    # Now we obtain the categories for all the businesses
    business_records = ETLUtils.add_transpose_list_column(
        'categories', business_records)
    BusinessETL.drop_unwanted_fields(business_records)
    return business_records[0].keys()