当前位置: 首页>>代码示例>>Python>>正文


Python DataFrame.split方法代码示例

本文整理汇总了Python中odps.df.DataFrame.split方法的典型用法代码示例。如果您正苦于以下问题:Python DataFrame.split方法的具体用法?Python DataFrame.split怎么用?Python DataFrame.split使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类odps.df.DataFrame的用法示例。


在下文中一共展示了DataFrame.split方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: TestSparseClassifiers

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
class TestSparseClassifiers(MLTestBase):
    """Classifier tests that run against the key-value iris table.

    ``setUp`` builds ``self.df`` from ``IRIS_KV_TABLE`` with ``category`` as
    the label field and ``content`` marked as a key-value column.
    """

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        iris_table = self.odps.get_table(IRIS_KV_TABLE)
        labelled = DataFrame(iris_table).label_field('category')
        self.df = labelled.key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        """Train, persist and score a logistic regression model (dry run off)."""
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)

        parts = self.df.split(0.6)
        train_part = parts[0]
        test_part = parts[1]

        classifier = LogisticRegression(epsilon=0.001)
        classifier = classifier.set_max_iter(50)
        model = classifier.train(train_part)
        model.persist(MODEL_NAME)

        predicted = model.predict(test_part)
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)

        # The three arrays returned by roc_curve must all have the same length.
        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr)
        assert len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        """Attach parameter-check cases to Xgboost train/predict (dry run on)."""
        options.runner.dry_run = True

        parts = self.df.split(0.6)

        expected_train_params = {
            'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
            'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',', 'kvDelimiter': ':',
            'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0', 'enableSparse': 'true',
            'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'binary:logistic',
            'featureColNames': 'content', 'max_depth': '6', 'gamma': '0', 'booster': 'gbtree',
        }
        booster = Xgboost()
        trained = booster.train(parts[0])
        model = trained._add_case(self.gen_check_params_case(expected_train_params))
        model.persist(MODEL_NAME)

        expected_predict_params = {
            'itemDelimiter': ',', 'modelName': MODEL_NAME, 'appendColNames': 'content,category',
            'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
            'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':', 'featureColNames': 'content',
        }
        scored = model.predict(parts[1])
        predicted = scored._add_case(self.gen_check_params_case(expected_predict_params))
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:50,代码来源:test_sparse_classifiers.py

示例2: test_direct_method

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
 def test_direct_method(self):
     """Train logistic regression on a 0.6 split and fetch predictions as pandas."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     source_table = self.odps.get_table(IONOSPHERE_TABLE)
     labelled = DataFrame(source_table).roles(label='class')
     train_set, test_set = labelled.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     fitted = classifier.train(train_set)
     scored = fitted.predict(test_set)
     scored.to_pandas()
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:10,代码来源:test_dataframe.py

示例3: test_df_consecutive

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
 def test_df_consecutive(self):
     """Filter, label and preview a DataFrame before splitting and training on it."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     raw = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     # Keep only rows whose a04 column is non-zero, then mark the label column.
     nonzero = raw[raw['a04'] != 0]
     labelled = nonzero.roles(label='class')
     # The preview result is discarded on purpose; only the call itself matters.
     labelled.head(10)
     train_set, test_set = labelled.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     fitted = classifier.train(train_set)
     scored = fitted.predict(test_set)
     scored.to_pandas()
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:13,代码来源:test_dataframe.py

示例4: test_df_combined

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
 def test_df_combined(self):
     """Mix DataFrame transforms with ML steps and run a log-loss expression."""
     self.create_ionosphere(IONOSPHERE_TABLE)
     raw = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
     nonzero = raw[raw['a04'] != 0]
     # Project four columns, doubling a05 while keeping its name.
     doubled_a05 = nonzero.a05.map(lambda v: v * 2).rename('a05')
     projected = nonzero['a01', doubled_a05, 'a06', 'class']
     labelled = projected.roles(label='class')
     cached = labelled[labelled.a05 != 0].cache()
     rescaled_a06 = ((cached.a06 + 1) / 2).rename('a06')
     features = cached[cached.a05, rescaled_a06, 'class']

     train_set, test_set = features.split(0.6)
     classifier = LogisticRegression(epsilon=0.01)
     fitted = classifier.train(train_set)
     predicted = fitted.predict(test_set)

     # Negative mean of y*log(p) + (1-y)*log(1-p), built one term at a time.
     positive_term = (
         predicted['class'] * predicted.prediction_score.log().rename('t')
     ).rename('t1')
     negative_term = (
         (1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')
     ).rename('t2')
     summed = (positive_term + negative_term).rename('t3').sum()
     mean_loss = - 1.0 * summed / predicted.prediction_score.count()
     mean_loss.rename('t4').execute()
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:17,代码来源:test_dataframe.py

示例5: test_mock_gbdt

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
    def test_mock_gbdt(self):
        """Attach parameter-check cases to GBDT train/predict nodes on a 0.6 split."""
        table = self.odps.get_table(IONOSPHERE_TABLE)
        labelled = DataFrame(table).roles(label='class')
        parts = labelled.split(0.6)

        # Columns a01 .. a34 joined with commas; used by both expected-parameter dicts.
        feature_cols = ','.join('a%02d' % i for i in range(1, 35))

        expected_train_params = {
            'tau': '0.6', 'modelName': MODEL_NAME, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'maxLeafCount': '32',
            'shrinkage': '0.05', 'featureSplitValueMaxSize': '500', 'featureRatio': '0.6', 'testRatio': '0.0',
            'newtonStep': '0', 'randSeed': '0', 'sampleRatio': '0.6', 'p': '1', 'treeCount': '500', 'metricType': '2',
            'labelColName': 'class', 'featureColNames': feature_cols,
            'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11',
        }
        learner = GBDT(min_leaf_sample_count=10)
        trained = learner.train(parts[0])
        model = trained._add_case(self.gen_check_params_case(expected_train_params))
        model.persist(MODEL_NAME)

        expected_predict_params = {
            'modelName': MODEL_NAME, 'appendColNames': feature_cols + ',class',
            'outputTableName': GBDT_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2',
        }
        scored = model.predict(parts[1])
        predicted = scored._add_case(self.gen_check_params_case(expected_predict_params))
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(GBDT_OUT_TABLE)
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:20,代码来源:test_regression.py

示例6: test_mock_xgboost

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
    def test_mock_xgboost(self):
        """Attach parameter-check cases to Xgboost train/predict nodes on a 0.6 split."""
        table = self.odps.get_table(IONOSPHERE_TABLE)
        labelled = DataFrame(table).roles(label='class')
        parts = labelled.split(0.6)

        # Columns a01 .. a34 joined with commas; used by both expected-parameter dicts.
        feature_cols = ','.join('a%02d' % i for i in range(1, 35))

        expected_train_params = {
            'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
            'eval_metric': 'error', 'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
            'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'reg:linear',
            'featureColNames': feature_cols,
            'max_depth': '6', 'gamma': '0', 'booster': 'gbtree',
        }
        booster = Xgboost()
        trained = booster.train(parts[0])
        model = trained._add_case(self.gen_check_params_case(expected_train_params))
        model.persist(MODEL_NAME)

        expected_predict_params = {
            'modelName': MODEL_NAME, 'appendColNames': feature_cols + ',class',
            'outputTableName': XGBOOST_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2',
        }
        scored = model.predict(parts[1])
        predicted = scored._add_case(self.gen_check_params_case(expected_predict_params))
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_OUT_TABLE)
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:20,代码来源:test_regression.py

示例7: test_linear

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
    def test_linear(self):
        """Train, persist and evaluate a linear regression model (dry run off).

        The ionosphere table is split with fraction 0.6; the first part trains
        the model and the second part is used for feature importance and
        prediction. Each computed metric is written to the log.
        """
        options.runner.dry_run = False
        self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
        self.delete_offline_model(MODEL_NAME)

        df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
        splited = df.split(0.6)

        algo = LinearRegression()
        model = algo.train(splited[0])
        model.persist(MODEL_NAME)

        # logging applies ``msg % args``; without a ``%s`` placeholder the
        # record fails to format and the value is never written to the log.
        logging.info('Importance: %s', regression_importance(splited[1], model))

        predicted = model.predict(splited[1])
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LINEAR_REGRESSION_OUT_TABLE)

        logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
        logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
        logging.info('HIST: %s', residual_histogram(predicted, 'class'))
        # This value comes from pearson(), so label it as such rather than 'MSE'.
        logging.info('PEARSON: %s', pearson(predicted, col1='class'))
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:24,代码来源:test_regression.py

示例8: Test

# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]

#.........这里部分代码省略.........
                sepal_length="KVConfig(kv=-, item=;)",
            ),
        )
        # erase_key_value
        df12 = df10.erase_key_value("sepal_width")
        self.assertEqual(
            _df_key_value(df12),
            dict(category="", petal_length="", petal_width="", sepal_width="", sepal_length="KVConfig(kv=:, item=,)"),
        )

    def test_seq_field_operations(self):
        """Check role, continuity and key-value metadata operations on one column."""
        column = self.df.sepal_length

        # roles
        weighted = column.role("weight")
        self.assertEqual(_df_roles(weighted), {"sepal_length": "WEIGHT"})

        # discrete
        as_discrete = column.discrete()
        self.assertEqual(_df_continuity(as_discrete), {"sepal_length": "DISCRETE"})

        # continuous
        as_continuous = column.continuous()
        self.assertEqual(_df_continuity(as_continuous), {"sepal_length": "CONTINUOUS"})

        # key_value: first with default delimiters, then with explicit ones
        kv_default = column.key_value()
        self.assertEqual(
            _df_key_value(kv_default), {"sepal_length": "KVConfig(kv=:, item=,)"}
        )
        kv_custom = kv_default.key_value(kv="-", item=";")
        self.assertEqual(
            _df_key_value(kv_custom), {"sepal_length": "KVConfig(kv=-, item=;)"}
        )

        # erase_key_value
        kv_erased = kv_custom.erase_key_value()
        self.assertEqual(_df_key_value(kv_erased), {"sepal_length": ""})

    def test_coll_df_operations(self):
        """Check the nodes bound by split, append_id and the summary adapter."""
        from odps.ml.nodes import transform_nodes as tnodes

        # split: two outputs with identical roles, bound to a "Split" node
        parts = self.df.split(0.75)
        self.assertEqual(len(parts), 2)
        first_part, second_part = parts[0], parts[1]
        self.assertEqual(_df_roles(first_part), _df_roles(second_part))
        node = adapter_from_df(first_part)._bind_node
        self.assertEqual(node.code_name, "Split")
        self.assertEqual(node.parameters["fraction"], 0.75)

        # append_id: the new append_id column has an empty role
        with_id = self.df.append_id()
        expected_roles = {
            "category": "FEATURE",
            "petal_length": "FEATURE",
            "petal_width": "FEATURE",
            "sepal_width": "FEATURE",
            "sepal_length": "FEATURE",
            "append_id": "",
        }
        self.assertEqual(_df_roles(with_id), expected_roles)
        node = adapter_from_df(with_id)._bind_node
        self.assertEqual(node.code_name, "AppendID")
        self.assertEqual(node.parameters["IDColName"], "append_id")

        # summary adapter: must be bound to a SummaryNode
        summary_adapter = self.df._create_summary_adapter()
        self.assertIsInstance(summary_adapter._bind_node, tnodes.SummaryNode)

    def test_dtypes(self):
        rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
        old_dtypes_repr = rstrip_lines(
            textwrap.dedent(
                """
        odps.Schema {
开发者ID:aliyun,项目名称:aliyun-odps-python-sdk,代码行数:70,代码来源:test_mixin.py


注:本文中的odps.df.DataFrame.split方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。