本文整理汇总了Python中etl.ETLUtils.split_train_test方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.split_train_test方法的具体用法?Python ETLUtils.split_train_test怎么用?Python ETLUtils.split_train_test使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类etl.ETLUtils的用法示例。
在下文中一共展示了ETLUtils.split_train_test方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calculate_top_n_precision
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):
    """Cross-validate *recommender* and return its average top-N precision.

    The data is split into ``num_folds`` folds; for each fold the
    recommender is trained on the training partition and the precision of
    its top-``n`` recommendations (ratings of at least ``min_score`` count
    as relevant) is accumulated over every user the recommender knows.

    :param reviews: list of review records to cross-validate on
    :param recommender: object exposing ``load(train)`` and ``user_ids``
    :param n: size of the recommendation list to evaluate
    :param min_score: minimum rating for an item to count as relevant
    :param num_folds: number of cross-validation folds
    :return: dict with keys ``'Top N'`` and ``'Execution time'``
    :raises ValueError: if no user in any fold produced a precision value
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    # 'range' instead of 'xrange' keeps this consistent with the other
    # evaluation functions in this module and Python 3 compatible.
    for i in range(0, num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(
            reviews, split=split, shuffle_data=False, start=start)
        recommender.load(train)
        user_ids = recommender.user_ids

        for user_id in user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)

            # Users for which no precision can be computed return None and
            # are excluded from the average.
            if precision is not None:
                total_precision += precision
                num_cycles += 1

    # BUG FIX: previously this divided unconditionally and raised an
    # opaque ZeroDivisionError when every user was skipped.
    if num_cycles == 0:
        raise ValueError('No precision could be calculated for any user')

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_precision,
        'Execution time': execution_time
    }

    return result
示例2: create_single_topic_model
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """Train the topic model for one (cycle, fold) of the cross-validation.

    Loads the processed records, removes the nested-validation slice when
    that strategy is configured, replays the shuffles of all cycles up to
    and including ``cycle_index`` so the fold split matches the evaluation
    loop, and finally builds the topic model on the fold's training
    partition.

    :param cycle_index: index of the cross-validation cycle
    :param fold_index: index of the fold within the cycle
    :param check_exists: forwarded to ``create_topic_model``; allows
        skipping work when the model already exists
    :return: whatever ``create_topic_model`` returns
    :raises ValueError: if the separate-topic-model property is set or the
        cross-validation strategy is unknown
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        # BUG FIX: 'cv_start' was previously passed positionally, which
        # bound it to the 'shuffle_data' parameter of split_train_test
        # instead of 'start' (the signature has shuffle_data between
        # split and start — see the keyword calls elsewhere in this
        # module). Keywords make the intent explicit and correct.
        records, _ = ETLUtils.split_train_test(
            records, split=split, start=cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    # One shuffle per cycle (including the current one) reproduces the
    # permutation state the evaluation loop has at cycle 'cycle_index'.
    for i in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)

    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
示例3: split_data
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def split_data(self):
    """Partition the corpus votes into train (80%), validation (10%) and
    test (10%) sets and gather per-user / per-item training statistics.

    NOTE: Beware that there could be users/items that don't appear in the
    training set in the test set.
    """
    # 80% for training; the remaining 20% is halved into validation/test.
    self.train_votes, holdout_votes = ETLUtils.split_train_test(
        self.corpus.vote_list, split=0.8)
    self.valid_votes, self.test_votes = ETLUtils.split_train_test(
        holdout_votes, split=0.5)

    # Count the training votes per user and per item, and index the
    # training votes by both keys.
    for vote in self.train_votes:
        user, item = vote.user, vote.item
        self.n_training_per_user[user] = \
            self.n_training_per_user.get(user, 0) + 1
        self.n_training_per_item[item] = \
            self.n_training_per_item.get(item, 0) + 1
        self.train_votes_per_user[user].append(vote)
        self.train_votes_per_item[item].append(vote)
示例4: create_single_topic_model
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_single_topic_model(cycle_index, fold_index):
    """Build the topic model for a single cross-validation fold.

    Replays the shuffles of every cycle up to ``cycle_index`` so that the
    fold split is identical to the one used by the evaluation loop, then
    trains a topic model on the training partition of the fold.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    plant_seeds()

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    train_fraction = 1 - (1 / float(num_folds))

    # One shuffle per cycle (including the current one) reproduces the
    # permutation state of cycle 'cycle_index'.
    for _ in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    fold_start = float(fold_index) / num_folds
    train_records, test_records = ETLUtils.split_train_test(
        records, split=train_fraction, start=fold_start)

    create_topic_model(train_records, cycle_index, fold_index)
示例5: create_topic_models
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def create_topic_models():
    """Create the topic models for every (cycle, fold) combination.

    For each cycle the records are (optionally) shuffled once, the
    training partition of every fold is collected, and the per-fold topic
    model creation is dispatched in parallel.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    plant_seeds()

    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        # Collect the training partition of every fold of this cycle.
        train_records_list = []
        for j in range(num_folds):
            cv_start = float(j) / num_folds
            train_records, test_records = \
                ETLUtils.split_train_test(
                    records, split=split, start=cv_start)
            train_records_list.append(train_records)

        # CONSISTENCY FIX: reuse the local 'num_folds' instead of
        # re-reading Constants.CROSS_VALIDATION_NUM_FOLDS; 'list(...)'
        # keeps the args materialized under Python 3, where zip is lazy.
        args = list(zip(
            train_records_list,
            [i] * num_folds,
            range(num_folds)
        ))
        parallel_context_top_n(args)
示例6: perform_cross_validation
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def perform_cross_validation(
        records, recommender, num_folds, cache_reviews=None, reviews_type=None):
    """Cross-validate *recommender* and return its average error metrics.

    For every fold the recommender is trained on the training partition
    and evaluated on the test partition, accumulating the mean absolute
    error, root mean square error and rating coverage.

    :param records: list of rating records to cross-validate on
    :param recommender: object exposing ``load``, ``clear``, ``reviews``
    :param num_folds: number of cross-validation folds
    :param cache_reviews: optional pre-built review objects split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to restrict the test set
        to one review cluster (requires ``cache_reviews``)
    :return: dict with keys 'MAE', 'RMSE', 'Coverage', 'Execution time'
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_mean_absolute_error = 0.
    total_mean_square_error = 0.
    total_coverage = 0.
    num_cycles = 0

    for i in range(0, num_folds):
        print('Num cycles: %d' % i)
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        # NOTE(review): nesting below reconstructed from a whitespace-
        # mangled source; it mirrors calculate_recall_in_top_n — confirm.
        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = \
                    reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews

        recommender.load(train_records)

        # Optionally restrict the test set to one review cluster.
        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        _, errors, num_unknown_ratings = \
            predict_rating_list(recommender, test_records)
        recommender.clear()
        mean_absolute_error = MeanAbsoluteError.compute_list(errors)
        root_mean_square_error = RootMeanSquareError.compute_list(errors)
        num_samples = len(test_records)
        # BUG FIX: this used to be float((num_samples -
        # num_unknown_ratings) / num_samples): under Python 2 the inner
        # int/int division truncated before the float conversion, so
        # coverage was always 0.0 (or 1.0). Convert first, then divide.
        coverage = float(num_samples - num_unknown_ratings) / num_samples
        # print('Total length:', len(test))
        # print('Unknown ratings:', num_unknown_ratings)
        # print('Coverage:', coverage)

        if mean_absolute_error is not None:
            total_mean_absolute_error += mean_absolute_error
            total_mean_square_error += root_mean_square_error
            total_coverage += coverage
            num_cycles += 1
        else:
            print('Mean absolute error is None!!!')

    final_mean_absolute_error = total_mean_absolute_error / num_cycles
    final_root_squared_error = total_mean_square_error / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final mean absolute error: %f' % final_mean_absolute_error)
    print('Final root mean square error: %f' % final_root_squared_error)
    print('Final coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'MAE': final_mean_absolute_error,
        'RMSE': final_root_squared_error,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result
示例7: calculate_recall_in_top_n
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test [as 别名]
def calculate_recall_in_top_n(
        records, recommender, n, num_folds, split=None, min_score=5.0,
        cache_reviews=None, reviews_type=None):
    """Cross-validate *recommender* and return its average top-N recall.

    For every fold the recommender is trained on the training partition;
    each test review rated at least ``min_score`` counts as a relevant
    item, and recall is the fraction of those for which the item appears
    in the user's top-``n`` recommendations. Coverage is the fraction of
    relevant reviews for which a prediction could be made at all.

    :param records: list of review records to cross-validate on
    :param recommender: object exposing ``load``, ``clear``,
        ``has_context`` and ``reviews``
    :param n: size of the recommendation list to evaluate
    :param num_folds: number of cross-validation folds
    :param split: training fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum rating for a review to count as relevant
    :param cache_reviews: optional pre-built review objects split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to restrict the test set
        to one review cluster (requires ``cache_reviews``)
    :return: dict with keys 'Top N', 'Coverage', 'Execution time'
    :raises ValueError: if no fold produced any predictions
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1 / float(num_folds))
    # split = 0.984
    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    # 'range' instead of 'xrange' for consistency with the rest of the
    # module and Python 3 compatibility.
    for i in range(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, shuffle_data=False, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, shuffle_data=False, start=start)
            if reviews_type is not None:
                cluster_labels = \
                    reviews_clusterer.cluster_reviews(test_reviews)
            recommender.reviews = train_reviews

        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        # Optionally restrict the test set to one review cluster.
        if cluster_labels is not None:
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = [
            review for review in test_records
            if review['overall_rating'] >= min_score]
        if len(positive_reviews) == 0:
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            # None means no prediction could be made for this pair.
            if hit is None:
                continue
            if hit:
                num_hits += 1
            num_predictions += 1
            # print('num predictions: %d/%d %s' % (num_predictions, len(positive_reviews), time.strftime("%Y/%d/%m-%H:%M:%S")))

        if num_predictions == 0:
            continue

        # NOTE(review): the 'continue' paths above skip this clear(),
        # leaving the recommender loaded into the next fold — confirm
        # whether load() resets state or clear() should run there too.
        recommender.clear()
        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))
        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    # BUG FIX: previously divided unconditionally, raising an opaque
    # ZeroDivisionError when every fold was skipped.
    if num_cycles == 0:
        raise ValueError('No fold produced any predictions')

    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }

    return result