当前位置: 首页>>代码示例>>Python>>正文


Python ETLUtils.split_train_test_copy方法代码示例

本文整理汇总了Python中etl.ETLUtils.split_train_test_copy方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.split_train_test_copy方法的具体用法?Python ETLUtils.split_train_test_copy怎么用?Python ETLUtils.split_train_test_copy使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类etl.ETLUtils的用法示例。


在下文中一共展示了ETLUtils.split_train_test_copy方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: perform_cross_validation

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test_copy [as 别名]
    def perform_cross_validation(self):
        """Run repeated k-fold cross-validation and record the average recall.

        For each of ``Constants.NUM_CYCLES`` cycles, ``self.records`` and
        ``self.reviews`` are restored from deep copies of their
        ``original_*`` counterparts. Each cycle then runs
        ``Constants.CROSS_VALIDATION_NUM_FOLDS`` folds; every fold splits the
        records and reviews into train/test parts, exports, optionally trains
        the word model (when ``Constants.USE_CONTEXT`` is set), predicts and
        evaluates.

        The recall, specific recall and generic recall read from
        ``self.top_n_evaluator`` are averaged over all folds of all cycles,
        printed, and appended as one row to ``Constants.CSV_RESULTS_FILE``
        together with a copy of the run properties. A header row is written
        only when that file does not exist yet.
        """
        # Local import: only needed to choose the right CSV open mode below.
        import sys

        Constants.print_properties()

        utilities.plant_seeds()

        total_recall = 0.0
        total_specific_recall = 0.0
        total_generic_recall = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Fraction of the data used for training in each fold.
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            # NOTE(review): shuffle() runs before the working copies are
            # restored from the originals; presumably it reorders the
            # original_* lists -- confirm against its definition.
            if Constants.SHUFFLE_DATA:
                self.shuffle()
            self.records = copy.deepcopy(self.original_records)
            self.reviews = copy.deepcopy(self.original_reviews)

            for j in range(num_folds):

                fold_start = time.time()
                # Relative offset at which this fold's split starts.
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                # Records and reviews are split with identical parameters.
                self.train_records, self.test_records = \
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.train_reviews, self.test_reviews = \
                    ETLUtils.split_train_test_copy(
                        self.reviews, split=split, start=cv_start)
                self.export()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_word_model()
                    self.find_reviews_topics(lda_based_context)
                self.prepare()
                self.predict()
                self.evaluate()
                recall = self.top_n_evaluator.recall
                specific_recall = self.top_n_evaluator.specific_recall
                generic_recall = self.top_n_evaluator.generic_recall
                total_recall += recall
                total_specific_recall += specific_recall
                total_generic_recall += generic_recall

                fold_end = time.time()
                fold_time = fold_end - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        average_recall = total_recall / total_iterations
        average_specific_recall = total_specific_recall / total_iterations
        average_generic_recall = total_generic_recall / total_iterations
        # Despite the name, this is the mean time of a single fold: the sum
        # of all fold times divided by the total number of folds run.
        average_cycle_time = total_cycle_time / total_iterations
        print('average recall: %f' % average_recall)
        print('average specific recall: %f' % average_specific_recall)
        print('average generic recall: %f' % average_generic_recall)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        results = Constants.get_properties_copy()
        results['recall'] = average_recall
        results['specific_recall'] = average_specific_recall
        results['generic_recall'] = average_generic_recall
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        # Decide on the header before opening, because append mode creates
        # the file if it is missing.
        write_header = not os.path.exists(Constants.CSV_RESULTS_FILE)

        # The csv module needs a binary file on Python 2 but a text file
        # opened with newline='' on Python 3. Opening with 'wb' on Python 3
        # makes the writer fail with a TypeError, so pick the mode per
        # interpreter and always append.
        if sys.version_info[0] < 3:
            csv_file = open(Constants.CSV_RESULTS_FILE, 'ab')
        else:
            csv_file = open(Constants.CSV_RESULTS_FILE, 'a', newline='')

        with csv_file as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            if write_header:
                w.writeheader()
            w.writerow(results)
开发者ID:melqkiades,项目名称:yelp,代码行数:87,代码来源:word_context_top_n_runner.py

示例2: perform_cross_validation

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import split_train_test_copy [as 别名]
    def perform_cross_validation(self):
        """Evaluate the recommender with repeated k-fold cross-validation.

        Runs ``Constants.NUM_CYCLES`` cycles of
        ``Constants.CROSS_VALIDATION_NUM_FOLDS`` folds each. Every fold
        splits ``self.records`` into train and test sets, optionally trains
        the topic model (when ``Constants.USE_CONTEXT`` is set), predicts and
        evaluates.

        The overall, specific and generic metric values returned by
        ``self.evaluate()`` are averaged over every fold of every cycle,
        printed, and passed along with a copy of the run properties to
        ``write_results_to_csv`` and ``write_results_to_json``.
        """
        print(Constants._properties)

        self.plant_seeds()

        # Running sums over every fold of every cycle.
        metric_sum = 0.0
        specific_sum = 0.0
        generic_sum = 0.0
        elapsed_sum = 0.0

        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Share of the records that goes to the training set in each fold.
        split = 1 - (1 / float(num_folds))

        self.load()

        for cycle in range(num_cycles):

            print('\n\nCycle: %d/%d' % (cycle + 1, num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Start each cycle from a fresh copy of the loaded records.
            self.records = copy.deepcopy(self.original_records)

            for fold in range(num_folds):

                started_at = time.time()
                cv_start = float(fold) / num_folds
                print('\nFold: %d/%d' % (fold + 1, num_folds))

                self.create_tmp_file_names()
                train_part, test_part = ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
                self.train_records = train_part
                self.test_records = test_part
                self.get_records_to_predict()
                if Constants.USE_CONTEXT:
                    self.find_reviews_topics(
                        self.train_topic_model(cycle, fold))
                self.predict()
                fold_scores = self.evaluate()
                metric, specific_metric, generic_metric = fold_scores
                metric_sum += metric
                specific_sum += specific_metric
                generic_sum += generic_metric

                fold_time = time.time() - started_at
                elapsed_sum += fold_time
                self.clear()
                print(
                    "Total fold %d time = %f seconds" % (fold + 1, fold_time))

        metric_name = Constants.EVALUATION_METRIC
        metric_average = metric_sum / total_iterations
        specific_average = specific_sum / total_iterations
        generic_average = generic_sum / total_iterations
        # Mean duration of one fold (total time over total folds run).
        time_average = elapsed_sum / total_iterations

        print('average %s: %f' % (metric_name, metric_average))
        print('average specific %s: %f' % (metric_name, specific_average))
        print('average generic %s: %f' % (metric_name, generic_average))
        print('average cycle time: %f' % time_average)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Attach the averaged figures to a copy of the run configuration.
        results = copy.deepcopy(Constants._properties)
        results[Constants.EVALUATION_METRIC] = metric_average
        results['specific_' + metric_name] = specific_average
        results['generic_' + metric_name] = generic_average
        results['cycle_time'] = time_average
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        write_results_to_csv(results)
        write_results_to_json(results)
开发者ID:antoine-tran,项目名称:yelp,代码行数:74,代码来源:context_top_n_runner.py


注:本文中的etl.ETLUtils.split_train_test_copy方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。