This article collects typical usage examples of the etl.ETLUtils.split_train_test_copy method in Python. If you have been wondering what ETLUtils.split_train_test_copy does, how to call it, or where to find working examples of it, the curated code samples below should help. You can also explore further usage examples of the class it belongs to, etl.ETLUtils.
The following presents 2 code examples of the ETLUtils.split_train_test_copy method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
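The examples below only show the call sites of ETLUtils.split_train_test_copy, not its implementation. Judging from the arguments used (split=split, start=cv_start), the method appears to carve a test window out of a list of records, starting at the fraction start of the list and covering the remaining 1 - split of it, and to return copies of both partitions. The sketch below is a hypothetical re-implementation for illustration only; the name split_train_test_copy_sketch and the wrap-around/deep-copy details are assumptions, not the actual ETLUtils code.
import copy

def split_train_test_copy_sketch(records, split=0.8, start=0.0):
    # Hypothetical stand-in for ETLUtils.split_train_test_copy, inferred
    # from the call sites below. The test window begins at fraction `start`
    # of the list, spans (1 - split) of the records, wraps around the end,
    # and both partitions are returned as deep copies.
    num_records = len(records)
    test_size = int(round(num_records * (1 - split)))
    test_start = int(round(num_records * start))
    test_indices = [(test_start + k) % num_records for k in range(test_size)]
    test_index_set = set(test_indices)
    test = [copy.deepcopy(records[k]) for k in test_indices]
    train = [copy.deepcopy(records[k]) for k in range(num_records)
             if k not in test_index_set]
    return train, test

# e.g. split_train_test_copy_sketch(list(range(10)), split=0.8, start=0.2)
# -> ([0, 1, 4, 5, 6, 7, 8, 9], [2, 3])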
Example 1: perform_cross_validation
# Required module: from etl import ETLUtils [as alias]
# Or: from etl.ETLUtils import split_train_test_copy [as alias]
def perform_cross_validation(self):

    Constants.print_properties()
    utilities.plant_seeds()

    total_recall = 0.0
    total_specific_recall = 0.0
    total_generic_recall = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    split = 1 - (1/float(num_folds))

    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        self.records = copy.deepcopy(self.original_records)
        self.reviews = copy.deepcopy(self.original_reviews)

        for j in range(num_folds):
            fold_start = time.time()
            cv_start = float(j) / num_folds
            print('\nFold: %d/%d' % ((j+1), num_folds))

            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.train_reviews, self.test_reviews = \
                ETLUtils.split_train_test_copy(
                    self.reviews, split=split, start=cv_start)
            self.export()

            if Constants.USE_CONTEXT:
                lda_based_context = self.train_word_model()
                self.find_reviews_topics(lda_based_context)

            self.prepare()
            self.predict()
            self.evaluate()

            recall = self.top_n_evaluator.recall
            specific_recall = self.top_n_evaluator.specific_recall
            generic_recall = self.top_n_evaluator.generic_recall
            total_recall += recall
            total_specific_recall += specific_recall
            total_generic_recall += generic_recall

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j+1), fold_time))

    average_recall = total_recall / total_iterations
    average_specific_recall = total_specific_recall / total_iterations
    average_generic_recall = total_generic_recall / total_iterations
    average_cycle_time = total_cycle_time / total_iterations
    print('average recall: %f' % average_recall)
    print('average specific recall: %f' % average_specific_recall)
    print('average generic recall: %f' % average_generic_recall)
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = Constants.get_properties_copy()
    results['recall'] = average_recall
    results['specific_recall'] = average_specific_recall
    results['generic_recall'] = average_generic_recall
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    if not os.path.exists(Constants.CSV_RESULTS_FILE):
        with open(Constants.CSV_RESULTS_FILE, 'wb') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writeheader()
            w.writerow(results)
    else:
        with open(Constants.CSV_RESULTS_FILE, 'a') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writerow(results)
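A note on the fold arithmetic shared by both examples: with num_folds folds, split = 1 - 1/num_folds keeps (num_folds - 1)/num_folds of the records for training, and cv_start = j/num_folds rotates the test window by one fold per iteration. The standalone snippet below only prints those boundaries; it relies on nothing beyond the arithmetic already visible above.
num_folds = 5
split = 1 - (1/float(num_folds))        # 0.8 -> 80% of the records train per fold
for j in range(num_folds):
    cv_start = float(j) / num_folds     # 0.0, 0.2, 0.4, 0.6, 0.8
    print('fold %d: %.0f%% train, test window starts at %.0f%% of the data'
          % (j + 1, split * 100, cv_start * 100))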
Example 2: perform_cross_validation
# Required module: from etl import ETLUtils [as alias]
# Or: from etl.ETLUtils import split_train_test_copy [as alias]
def perform_cross_validation(self):

    print(Constants._properties)
    self.plant_seeds()

    total_metric = 0.0
    total_specific_metric = 0.0
    total_generic_metric = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    split = 1 - (1/float(num_folds))

    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        self.records = copy.deepcopy(self.original_records)

        for j in range(num_folds):
            fold_start = time.time()
            cv_start = float(j) / num_folds
            print('\nFold: %d/%d' % ((j+1), num_folds))

            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.get_records_to_predict()

            if Constants.USE_CONTEXT:
                lda_based_context = self.train_topic_model(i, j)
                self.find_reviews_topics(lda_based_context)

            self.predict()
            metric, specific_metric, generic_metric = self.evaluate()
            total_metric += metric
            total_specific_metric += specific_metric
            total_generic_metric += generic_metric

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j+1), fold_time))

    metric_name = Constants.EVALUATION_METRIC
    metric_average = total_metric / total_iterations
    average_specific_metric = total_specific_metric / total_iterations
    average_generic_metric = total_generic_metric / total_iterations
    average_cycle_time = total_cycle_time / total_iterations
    print('average %s: %f' % (metric_name, metric_average))
    print(
        'average specific %s: %f' % (metric_name, average_specific_metric))
    print('average generic %s: %f' % (metric_name, average_generic_metric))
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = copy.deepcopy(Constants._properties)
    results[Constants.EVALUATION_METRIC] = metric_average
    results['specific_' + metric_name] = average_specific_metric
    results['generic_' + metric_name] = average_generic_metric
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    write_results_to_csv(results)
    write_results_to_json(results)
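Example 2 hands the results off to write_results_to_csv and write_results_to_json, neither of which is shown on this page. Below is a minimal sketch of what the CSV side might look like, assuming it mirrors the create-or-append logic spelled out inline at the end of Example 1; the helper name and default path are hypothetical.
import csv
import os

def write_results_to_csv_sketch(results, csv_file='results.csv'):
    # Hypothetical helper: writes the header only when the file does not
    # exist yet, then appends one row per call. On Python 3, pass
    # newline='' to open() to avoid blank lines on Windows.
    write_header = not os.path.exists(csv_file)
    with open(csv_file, 'a') as f:
        writer = csv.DictWriter(f, sorted(results.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(results)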