本文整理汇总了Python中etl.ETLUtils.split_train_test方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.split_train_test方法的具体用法?Python ETLUtils.split_train_test怎么用?Python ETLUtils.split_train_test使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类etl.ETLUtils的用法示例。
在下文中一共展示了ETLUtils.split_train_test方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calculate_top_n_precision
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):
    """Cross-validate *recommender* and return its average top-N precision.

    The data is split into ``num_folds`` folds; for each fold the
    recommender is trained on the training partition and the precision of
    its top-``n`` recommendations (ratings of at least ``min_score`` count
    as relevant) is accumulated over every user the recommender knows.

    :param reviews: list of review records to cross-validate on
    :param recommender: object exposing ``load(train)`` and ``user_ids``
    :param n: size of the recommendation list to evaluate
    :param min_score: minimum rating for an item to count as relevant
    :param num_folds: number of cross-validation folds
    :return: dict with keys ``'Top N'`` and ``'Execution time'``
    :raises ValueError: if no user in any fold produced a precision value
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    # 'range' instead of 'xrange' keeps this consistent with the other
    # evaluation functions in this module and Python 3 compatible.
    for i in range(0, num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(
            reviews, split=split, shuffle_data=False, start=start)
        recommender.load(train)
        user_ids = recommender.user_ids

        for user_id in user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)

            # Users for which no precision can be computed return None and
            # are excluded from the average.
            if precision is not None:
                total_precision += precision
                num_cycles += 1

    # BUG FIX: previously this divided unconditionally and raised an
    # opaque ZeroDivisionError when every user was skipped.
    if num_cycles == 0:
        raise ValueError('No precision could be calculated for any user')

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_precision,
        'Execution time': execution_time
    }

    return result
示例2: create_single_topic_model
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """Train the topic model for one (cycle, fold) of the cross-validation.

    Loads the processed records, removes the nested-validation slice when
    that strategy is configured, replays the shuffles of all cycles up to
    and including ``cycle_index`` so the fold split matches the evaluation
    loop, and finally builds the topic model on the fold's training
    partition.

    :param cycle_index: index of the cross-validation cycle
    :param fold_index: index of the fold within the cycle
    :param check_exists: forwarded to ``create_topic_model``; allows
        skipping work when the model already exists
    :return: whatever ``create_topic_model`` returns
    :raises ValueError: if the separate-topic-model property is set or the
        cross-validation strategy is unknown
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        # BUG FIX: 'cv_start' was previously passed positionally, which
        # bound it to the 'shuffle_data' parameter of split_train_test
        # instead of 'start' (the signature has shuffle_data between
        # split and start — see the keyword calls elsewhere in this
        # module). Keywords make the intent explicit and correct.
        records, _ = ETLUtils.split_train_test(
            records, split=split, start=cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    # One shuffle per cycle (including the current one) reproduces the
    # permutation state the evaluation loop has at cycle 'cycle_index'.
    for i in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)

    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
示例3: split_data
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def split_data(self):
    """Partition the corpus votes into train (80%), validation (10%) and
    test (10%) sets and gather per-user / per-item training statistics.

    NOTE: Beware that there could be users/items that don't appear in the
    training set in the test set.
    """
    # 80% for training; the remaining 20% is halved into validation/test.
    self.train_votes, holdout_votes = ETLUtils.split_train_test(
        self.corpus.vote_list, split=0.8)
    self.valid_votes, self.test_votes = ETLUtils.split_train_test(
        holdout_votes, split=0.5)

    # Count the training votes per user and per item, and index the
    # training votes by both keys.
    for vote in self.train_votes:
        user, item = vote.user, vote.item
        self.n_training_per_user[user] = \
            self.n_training_per_user.get(user, 0) + 1
        self.n_training_per_item[item] = \
            self.n_training_per_item.get(item, 0) + 1
        self.train_votes_per_user[user].append(vote)
        self.train_votes_per_item[item].append(vote)
示例4: create_single_topic_model
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_single_topic_model(cycle_index, fold_index):
    """Build the topic model for a single cross-validation fold.

    Replays the shuffles of every cycle up to ``cycle_index`` so that the
    fold split is identical to the one used by the evaluation loop, then
    trains a topic model on the training partition of the fold.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    plant_seeds()

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    train_fraction = 1 - (1 / float(num_folds))

    # One shuffle per cycle (including the current one) reproduces the
    # permutation state of cycle 'cycle_index'.
    for _ in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    fold_start = float(fold_index) / num_folds
    train_records, test_records = ETLUtils.split_train_test(
        records, split=train_fraction, start=fold_start)

    create_topic_model(train_records, cycle_index, fold_index)
示例5: create_topic_models
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_topic_models():
    """Create the topic models for every (cycle, fold) combination.

    For each cycle the records are (optionally) shuffled once, the
    training partition of every fold is collected, and the per-fold topic
    model creation is dispatched in parallel.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    plant_seeds()

    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        # Collect the training partition of every fold of this cycle.
        train_records_list = []
        for j in range(num_folds):
            cv_start = float(j) / num_folds
            train_records, test_records = \
                ETLUtils.split_train_test(
                    records, split=split, start=cv_start)
            train_records_list.append(train_records)

        # CONSISTENCY FIX: reuse the local 'num_folds' instead of
        # re-reading Constants.CROSS_VALIDATION_NUM_FOLDS; 'list(...)'
        # keeps the args materialized under Python 3, where zip is lazy.
        args = list(zip(
            train_records_list,
            [i] * num_folds,
            range(num_folds)
        ))
        parallel_context_top_n(args)
示例6: perform_cross_validation
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def perform_cross_validation(
        records, recommender, num_folds, cache_reviews=None, reviews_type=None):
    """Cross-validate *recommender* and return its average error metrics.

    For every fold the recommender is trained on the training partition
    and evaluated on the test partition, accumulating the mean absolute
    error, root mean square error and rating coverage.

    :param records: list of rating records to cross-validate on
    :param recommender: object exposing ``load``, ``clear``, ``reviews``
    :param num_folds: number of cross-validation folds
    :param cache_reviews: optional pre-built review objects split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to restrict the test set
        to one review cluster (requires ``cache_reviews``)
    :return: dict with keys 'MAE', 'RMSE', 'Coverage', 'Execution time'
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_mean_absolute_error = 0.
    total_mean_square_error = 0.
    total_coverage = 0.
    num_cycles = 0

    for i in range(0, num_folds):
        print('Num cycles: %d' % i)
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        # NOTE(review): nesting below reconstructed from a whitespace-
        # mangled source; it mirrors calculate_recall_in_top_n — confirm.
        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = \
                    reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews

        recommender.load(train_records)

        # Optionally restrict the test set to one review cluster.
        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        _, errors, num_unknown_ratings = \
            predict_rating_list(recommender, test_records)
        recommender.clear()
        mean_absolute_error = MeanAbsoluteError.compute_list(errors)
        root_mean_square_error = RootMeanSquareError.compute_list(errors)
        num_samples = len(test_records)
        # BUG FIX: this used to be float((num_samples -
        # num_unknown_ratings) / num_samples): under Python 2 the inner
        # int/int division truncated before the float conversion, so
        # coverage was always 0.0 (or 1.0). Convert first, then divide.
        coverage = float(num_samples - num_unknown_ratings) / num_samples
        # print('Total length:', len(test))
        # print('Unknown ratings:', num_unknown_ratings)
        # print('Coverage:', coverage)

        if mean_absolute_error is not None:
            total_mean_absolute_error += mean_absolute_error
            total_mean_square_error += root_mean_square_error
            total_coverage += coverage
            num_cycles += 1
        else:
            print('Mean absolute error is None!!!')

    final_mean_absolute_error = total_mean_absolute_error / num_cycles
    final_root_squared_error = total_mean_square_error / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final mean absolute error: %f' % final_mean_absolute_error)
    print('Final root mean square error: %f' % final_root_squared_error)
    print('Final coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'MAE': final_mean_absolute_error,
        'RMSE': final_root_squared_error,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result
示例7: calculate_recall_in_top_n
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def calculate_recall_in_top_n(
        records, recommender, n, num_folds, split=None, min_score=5.0,
        cache_reviews=None, reviews_type=None):
    """Cross-validate *recommender* and return its average top-N recall.

    For every fold the recommender is trained on the training partition;
    each test review rated at least ``min_score`` counts as a relevant
    item, and recall is the fraction of those for which the item appears
    in the user's top-``n`` recommendations. Coverage is the fraction of
    relevant reviews for which a prediction could be made at all.

    :param records: list of review records to cross-validate on
    :param recommender: object exposing ``load``, ``clear``,
        ``has_context`` and ``reviews``
    :param n: size of the recommendation list to evaluate
    :param num_folds: number of cross-validation folds
    :param split: training fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum rating for a review to count as relevant
    :param cache_reviews: optional pre-built review objects split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to restrict the test set
        to one review cluster (requires ``cache_reviews``)
    :return: dict with keys 'Top N', 'Coverage', 'Execution time'
    :raises ValueError: if no fold produced any predictions
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1 / float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    # 'range' instead of 'xrange' for consistency with the rest of the
    # module and Python 3 compatibility.
    for i in range(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, shuffle_data=False, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, shuffle_data=False, start=start)
            if reviews_type is not None:
                cluster_labels = \
                    reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews

        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        # Optionally restrict the test set to one review cluster.
        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = [
            review for review in test_records
            if review['overall_rating'] >= min_score]
        if len(positive_reviews) == 0:
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            # None means no prediction could be made for this pair.
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1
            # print('num predictions: %d/%d %s' % (num_predictions, len(positive_reviews), time.strftime("%Y/%d/%m-%H:%M:%S")))

        if num_predictions == 0:
            continue

        # NOTE(review): the 'continue' paths above skip this clear(),
        # leaving the recommender loaded into the next fold — confirm
        # whether load() resets state or clear() should run there too.
        recommender.clear()
        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))
        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    # BUG FIX: previously divided unconditionally, raising an opaque
    # ZeroDivisionError when every fold was skipped.
    if num_cycles == 0:
        raise ValueError('No fold produced any predictions')

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result