本文整理汇总了Python中etl.ETLUtils.save_json_file方法的典型用法代码示例。如果您正苦于以下问题：Python ETLUtils.save_json_file方法的具体用法？Python ETLUtils.save_json_file怎么用？Python ETLUtils.save_json_file使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类etl.ETLUtils的用法示例。
在下文中一共展示了ETLUtils.save_json_file方法的10个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: export_records_to_predict
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def export_records_to_predict(self, records_file):
    """
    Writes the records to predict to a JSON file and the items to predict
    to a companion pickle file.

    When ``self.records_to_predict`` is still ``None`` it is first populated
    with the result of ``self.get_records_to_predict()``.

    :param records_file: path of the JSON file to write. The pickle file is
    written to the same path with ``.pkl`` appended.
    """
    # Lazily build the records the first time they are needed.
    if self.records_to_predict is None:
        self.records_to_predict = self.get_records_to_predict()

    ETLUtils.save_json_file(records_file, self.records_to_predict)

    pickle_path = records_file + '.pkl'
    with open(pickle_path, 'wb') as write_file:
        pickle.dump(
            self.items_to_predict, write_file,
            protocol=pickle.HIGHEST_PROTOCOL)
示例2: export_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def export_records(self):
    """
    Saves the dictionary and the processed records to the locations defined
    in ``Constants``.

    The records are written twice: once before and once after calling
    ``self.drop_unnecessary_fields()``.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: get_records_to_predict_topn records' % timestamp)

    self.dictionary.save(Constants.DICTIONARY_FILE)

    # The "full" file is written before any field is dropped.
    full_records_file = Constants.FULL_PROCESSED_RECORDS_FILE
    ETLUtils.save_json_file(full_records_file, self.records)

    # Presumably this trims self.records in place -- confirm against the
    # implementation of drop_unnecessary_fields().
    self.drop_unnecessary_fields()
    ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
示例3: main
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def main():
    """
    Trains a ``ReviewsClassifier`` with the classified reviews of a dataset
    and uses it to label the shuffled reviews of the same dataset, saving the
    labeled records to a JSON file.

    All the input and output locations are hard-coded below.
    """
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'

    # Training data: JSON records plus the matching pickled reviews.
    classified_prefix = my_folder + 'classified_' + dataset
    my_training_records_file = classified_prefix + '_reviews.json'
    my_training_reviews_file = classified_prefix + '_reviews.pkl'

    # Data to label and the destination of the labeled records.
    shuffled_prefix = my_folder + 'yelp_training_set_review_' + dataset
    my_input_records_file = shuffled_prefix + 's_shuffled.json'
    my_input_reviews_file = my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file = shuffled_prefix + 's_shuffled_tagged.json'

    my_training_records = ETLUtils.load_json_file(my_training_records_file)
    # NOTE: pickle must only be used with trusted files.
    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)
    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records = classifier.label_json_reviews(
        my_input_records, my_input_reviews)
    ETLUtils.save_json_file(my_output_records_file, my_output_records)
示例4: test
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def test():
    """
    Calculates the divergence of the NMF topic model for every number of
    topics between 2 and 60 (inclusive), prints the values and exports them
    to a CSV file and a JSON file inside ``Constants.RESULTS_FOLDER``.
    """
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []
    # Previously: range(2, 31)
    for topic_count in range(2, 61):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})

        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        divergence = calculate_divergence(
            document_term_matrix,
            topic_model.document_topic_matrix,
            topic_model.topic_term_matrix)

        # The number of topics is read back from Constants rather than taken
        # from the loop variable.
        results.append({
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        })
        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    # Compact summary of all the collected values.
    for entry in results:
        print('%d %f' % (entry['num_topics'], entry['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +\
        '_topic_model_divergence'
    headers = sorted(results[0])
    ETLUtils.save_csv_file(prefix + '.csv', results, headers)
    ETLUtils.save_json_file(prefix + '.json', results)
示例5: export_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def export_records(self):
    """
    Exports a reduced copy of ``self.records`` to a JSON file.

    Every exported record contains only the user integer ID, the item
    integer ID, the rating and the context fields. The name of the file is
    obtained from ``Constants.generate_file_name``.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: exporting transformed records' % timestamp)

    desired_fields = (
        Constants.USER_INTEGER_ID_FIELD,
        Constants.ITEM_INTEGER_ID_FIELD,
        Constants.RATING_FIELD,
        Constants.CONTEXT_FIELD,
    )
    # A KeyError is raised if a record lacks any of the desired fields.
    records_to_export = [
        {field: record[field] for field in desired_fields}
        for record in self.records
    ]

    file_name = Constants.generate_file_name(
        'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)
    ETLUtils.save_json_file(file_name, records_to_export)
示例6: load_all_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def load_all_reviews():
    """
    Loads the reviews of every city XML file, saves them all together in a
    JSON file and saves the result of ``clean_reviews`` in a second JSON
    file.

    :return: the list with the reviews of all the cities, i.e. the list that
    was passed to ``clean_reviews``, not its return value.
    """
    city_files = (
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Chicago_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Dublin_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Hong kong_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/London_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/New York_review.xml',
        '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/Singapore_review.xml'
    )

    all_reviews = []
    for city_file in city_files:
        all_reviews += load_reviews(city_file)

    all_reviews_file = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/all_reviews.json'
    ETLUtils.save_json_file(all_reviews_file, all_reviews)

    cleaned_reviews_file = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    ETLUtils.save_json_file(cleaned_reviews_file, clean_reviews(all_reviews))

    return all_reviews
示例7: label_json_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def label_json_reviews(self, input_file, output_file, reviews=None):
    """
    Labels every record of a JSON file as ``"specific"`` or ``"generic"``
    according to ``self.predict`` and saves the labeled records.

    :param input_file: path of the JSON file with the records to label.
    :param output_file: path of the JSON file where the labeled records are
    saved. The label is stored in the ``predicted_class`` field.
    :param reviews: optional list of reviews, one per record. When it is
    ``None`` a ``Review`` is created from the ``text`` field of each record.
    :raise ValueError: if the number of reviews differs from the number of
    records.
    """
    records = ETLUtils.load_json_file(input_file)

    if reviews is None:
        reviews = [Review(record["text"]) for record in records]

    if len(records) != len(reviews):
        raise ValueError(
            "The size of the records and reviews arrays must be the same")

    # A truthy prediction is labeled as "specific", a falsy one as "generic".
    for record, predicted_class in zip(records, self.predict(reviews)):
        record["predicted_class"] =\
            "specific" if predicted_class else "generic"

    ETLUtils.save_json_file(output_file, records)
示例8: extract_fields
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
return reviews
def extract_fields(reviews):
    """
    Modifies the given list of reviews in place, copying two fields of each
    review under an additional name. For instance, a review which is in the
    form

    {'user_id': 'U1', 'business_id': 'I1', 'stars': 4.0}

    would become:

    {'user_id': 'U1', 'business_id': 'I1', 'stars': 4.0,
    'offering_id': 'I1', 'overall_rating': 4.0}

    The original fields are kept and nothing is returned.

    :param reviews: a list of reviews (dictionaries), each one containing the
    'business_id' and 'stars' fields.
    """
    # (new field, existing field) pairs, applied in this order.
    aliases = (
        ('offering_id', 'business_id'),
        ('overall_rating', 'stars'),
    )
    for review in reviews:
        for target_field, source_field in aliases:
            review[target_field] = review[source_field]
# Script section: pre-process the reviews, save them to disk and print the
# first two as a quick visual check.
filtered_reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
my_reviews = pre_process_reviews()
ETLUtils.save_json_file(filtered_reviews_file, my_reviews)
# print('Num reviews', len(my_reviews))
# Only the reviews at positions 0 and 1 are printed (2 and 3 are disabled).
for sample_position in (0, 1):
    print(my_reviews[sample_position])
示例9: transform_manually_labeled_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
def transform_manually_labeled_reviews():
    """
    Creates a JSON file with the first sentence of every manually labeled
    review, enriched with the IDs and rating of the full review it belongs
    to.

    Only the classified records whose ``sentence_index`` is not greater than
    0 are kept, and their ``sentence_type`` is copied into
    ``predicted_class``. Each kept record is matched with the first full
    record, scanning forward only, whose text starts with the text of the
    kept record, so both files are expected to be in the same order. An
    ``IndexError`` is raised if the scan runs past the last full record.
    """
    full_records = ETLUtils.load_json_file(
        Constants.DATASET_FOLDER +
        'yelp_training_set_review_restaurants_shuffled_tagged.json')
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    # Keep the first sentence of each review only.
    new_records = []
    for record in records:
        if not record['sentence_index'] > 0:
            record['predicted_class'] = record['sentence_type']
            new_records.append(record)

    # NOTE: a disabled debugging pass used to count here how many full
    # records start with the text of each first sentence.

    # The cursor never moves backwards: after a match the search for the
    # next sentence resumes at the following full record.
    cursor = 0
    for new_record in new_records:
        while not full_records[cursor]['text'].startswith(
                new_record['text']):
            cursor += 1
        source = full_records[cursor]
        new_record[Constants.USER_ID_FIELD] = source['user_id']
        new_record[Constants.ITEM_ID_FIELD] = source['business_id']
        new_record[Constants.REVIEW_ID_FIELD] = source['review_id']
        new_record[Constants.RATING_FIELD] = source['stars']
        cursor += 1
    print('index: %d' % cursor)

    # Visual check: print each first sentence next to the text of the first
    # full record that has the same review ID.
    # NOTE(review): reads the literal 'review_id' key, so it presumably
    # relies on Constants.REVIEW_ID_FIELD being 'review_id' -- confirm.
    separator = '******************\n******************\n******************\n******************'
    for new_record in new_records:
        twin = next(
            (candidate for candidate in full_records
             if new_record['review_id'] == candidate['review_id']),
            None)
        if twin is not None:
            print('%s ====' % new_record['text'])
            print(twin['text'])
            print(separator)

    # NOTE: a ReviewsPreprocessor pass (lemmatize sentences, build bag of
    # words, drop unnecessary fields) used to run here and is disabled.

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    print(new_records[0])
    ETLUtils.save_json_file(new_classified_records_file, new_records)
示例10: sort_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import save_json_file [as 别名]
if review['business_id'] in business_ids:
filtered_reviews.append(review)
return filtered_reviews
@staticmethod
def sort_records(records, field, reverse=False):
    """
    Returns a new list with the given records sorted by the value of a
    field. The given records are not modified.

    :param records: an iterable of records (dictionaries).
    :param field: the key whose value is used to compare the records.
    :param reverse: if True the records are sorted in descending order.
    :return: a new list with the sorted records.
    """
    ordered_records = list(records)
    ordered_records.sort(key=itemgetter(field), reverse=reverse)
    return ordered_records
# Script section: keep only the reviews that belong to 'Hotels' businesses,
# save them to a JSON file and report how long the whole process took.
start = time.time()

my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"

review_etl = ReviewETL()
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(
    my_reviews, my_business_ids)
ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))

# main()

end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)