本文整理汇总了Python中odps.df.DataFrame.split方法的典型用法代码示例。如果您正苦于以下问题:Python DataFrame.split方法的具体用法?Python DataFrame.split怎么用?Python DataFrame.split使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类odps.df.DataFrame
的用法示例。
在下文中一共展示了DataFrame.split方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: TestSparseClassifiers
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
class TestSparseClassifiers(MLTestBase):
    """Classifier round-trips over sparse (key-value encoded) iris data.

    The source table is loaded as a DataFrame with ``category`` as the label
    field and ``content`` treated as a key-value column.
    """

    def setUp(self):
        super(TestSparseClassifiers, self).setUp()
        self.create_iris_kv(IRIS_KV_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_KV_TABLE)).label_field('category').key_value('content')

    def tearDown(self):
        super(TestSparseClassifiers, self).tearDown()

    @ci_skip_case
    def test_logistic_regression(self):
        # Real (non-mocked) run: clear leftovers from previous executions first.
        options.runner.dry_run = False
        self.delete_table(LR_TEST_TABLE)
        self.delete_offline_model(MODEL_NAME)

        train_part, test_part = self.df.split(0.6)
        classifier = LogisticRegression(epsilon=0.001).set_max_iter(50)
        model = classifier.train(train_part)
        model.persist(MODEL_NAME)

        predicted = model.predict(test_part)
        # persist is an operational node which will trigger execution of the flow
        predicted.persist(LR_TEST_TABLE)

        fpr, tpr, thresh = roc_curve(predicted, "category")
        assert len(fpr) == len(tpr) and len(thresh) == len(fpr)

    def test_mock_xgboost(self):
        # Dry run: only the generated node parameters are checked, nothing executes.
        options.runner.dry_run = True

        train_part, test_part = self.df.split(0.6)
        booster = Xgboost()
        model = booster.train(train_part)._add_case(self.gen_check_params_case(
            {'labelColName': 'category', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
             'eval_metric': 'error', 'eta': '0.3', 'itemDelimiter': ',', 'kvDelimiter': ':',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0', 'enableSparse': 'true',
             'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'binary:logistic',
             'featureColNames': 'content', 'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
        model.persist(MODEL_NAME)

        predicted = model.predict(test_part)._add_case(self.gen_check_params_case(
            {'itemDelimiter': ',', 'modelName': MODEL_NAME, 'appendColNames': 'content,category',
             'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2', 'enableSparse': 'true',
             'outputTableName': XGBOOST_TEST_TABLE, 'kvDelimiter': ':', 'featureColNames': 'content'}))
        # persist operational node which will trigger execution of the flow
        predicted.persist(XGBOOST_TEST_TABLE)
示例2: test_direct_method
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_direct_method(self):
    """Train and predict driven directly off DataFrame role annotations."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_part, test_part = source.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    fitted = algo.train(train_part)
    # to_pandas() forces execution of the whole predict flow
    fitted.predict(test_part).to_pandas()
示例3: test_df_consecutive
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_df_consecutive(self):
    """Chained DataFrame ops (filter, roles, head) feed cleanly into split/train."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    filtered = source[source['a04'] != 0]
    labeled = filtered.roles(label='class')
    labeled.head(10)
    train_part, test_part = labeled.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    fitted = algo.train(train_part)
    # to_pandas() forces execution of the whole predict flow
    fitted.predict(test_part).to_pandas()
示例4: test_df_combined
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_df_combined(self):
    """Complex DataFrame pipeline (filter, map, cache, projections) into
    train/predict, finishing with a hand-built log-loss expression."""
    self.create_ionosphere(IONOSPHERE_TABLE)
    source = DataFrame(self.odps.get_table(IONOSPHERE_TABLE))
    source = source[source['a04'] != 0]
    source = source['a01', source.a05.map(lambda v: v * 2).rename('a05'), 'a06', 'class']
    source = source.roles(label='class')
    source = source[source.a05 != 0].cache()
    source = source[source.a05, ((source.a06 + 1) / 2).rename('a06'), 'class']

    train_part, test_part = source.split(0.6)
    algo = LogisticRegression(epsilon=0.01)
    fitted = algo.train(train_part)
    predicted = fitted.predict(test_part)

    # Log loss: -(y*log(p) + (1-y)*log(1-p)) averaged over all rows.
    pos_term = (predicted['class'] * predicted.prediction_score.log().rename('t')).rename('t1')
    neg_term = ((1 - predicted['class']) * (1 - predicted.prediction_score).log().rename('t0')).rename('t2')
    loss = (- 1.0 * (pos_term + neg_term).rename('t3').sum() / predicted.prediction_score.count()).rename('t4')
    loss.execute()
示例5: test_mock_gbdt
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_mock_gbdt(self):
    """Dry-run GBDT: assert the exact parameters the train/predict nodes emit."""
    labeled = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_part, test_part = labeled.split(0.6)
    gbdt = GBDT(min_leaf_sample_count=10)
    model = gbdt.train(train_part)._add_case(self.gen_check_params_case({
        'tau': '0.6', 'modelName': MODEL_NAME, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'maxLeafCount': '32',
        'shrinkage': '0.05', 'featureSplitValueMaxSize': '500', 'featureRatio': '0.6', 'testRatio': '0.0',
        'newtonStep': '0', 'randSeed': '0', 'sampleRatio': '0.6', 'p': '1', 'treeCount': '500', 'metricType': '2',
        'labelColName': 'class', 'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'minLeafSampleCount': '10', 'lossType': '3', 'maxDepth': '11'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(test_part)._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME, 'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': GBDT_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(GBDT_OUT_TABLE)
示例6: test_mock_xgboost
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_mock_xgboost(self):
    """Dry-run Xgboost: assert the exact parameters the train/predict nodes emit."""
    labeled = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    train_part, test_part = labeled.split(0.6)
    booster = Xgboost()
    model = booster.train(train_part)._add_case(self.gen_check_params_case({
        'labelColName': 'class', 'modelName': MODEL_NAME, 'colsample_bytree': '1', 'silent': '1',
        'eval_metric': 'error', 'eta': '0.3', 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_1', 'max_delta_step': '0',
        'base_score': '0.5', 'seed': '0', 'min_child_weight': '1', 'objective': 'reg:linear',
        'featureColNames': ','.join('a%02d' % i for i in range(1, 35)),
        'max_depth': '6', 'gamma': '0', 'booster': 'gbtree'}))
    model.persist(MODEL_NAME)
    predicted = model.predict(test_part)._add_case(self.gen_check_params_case({
        'modelName': MODEL_NAME, 'appendColNames': ','.join('a%02d' % i for i in range(1, 35)) + ',class',
        'outputTableName': XGBOOST_OUT_TABLE, 'inputTableName': TEMP_TABLE_PREFIX + '0_split_2_2'}))
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(XGBOOST_OUT_TABLE)
示例7: test_linear
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
def test_linear(self):
    """End-to-end linear regression: train, persist, predict and log metrics.

    Runs against a live ODPS service (dry_run disabled), so stale artifacts
    from previous runs are removed first.
    """
    options.runner.dry_run = False
    self.delete_table(LINEAR_REGRESSION_OUT_TABLE)
    self.delete_offline_model(MODEL_NAME)

    df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE)).roles(label='class')
    splited = df.split(0.6)
    algo = LinearRegression()
    model = algo.train(splited[0])
    model.persist(MODEL_NAME)

    # BUG FIX: logging.info('Importance: ', value) passed the value as a
    # %-format argument with no placeholder in the message, so logging raised
    # an internal formatting error and the value was never rendered.  Use lazy
    # '%s' formatting instead (also fixes the copy-pasted 'MSE:' label on the
    # pearson line below, which actually logs a correlation).
    logging.info('Importance: %s', regression_importance(splited[1], model))

    predicted = model.predict(splited[1])
    # persist is an operational node which will trigger execution of the flow
    predicted.persist(LINEAR_REGRESSION_OUT_TABLE)

    logging.info('MSE: %s', mean_squared_error(predicted, 'class'))
    logging.info('MAE: %s', mean_absolute_error(predicted, 'class'))
    logging.info('HIST: %s', residual_histogram(predicted, 'class'))
    logging.info('Pearson: %s', pearson(predicted, col1='class'))
示例8: Test
# 需要导入模块: from odps.df import DataFrame [as 别名]
# 或者: from odps.df.DataFrame import split [as 别名]
#.........这里部分代码省略.........
sepal_length="KVConfig(kv=-, item=;)",
),
)
# erase_key_value
df12 = df10.erase_key_value("sepal_width")
self.assertEqual(
_df_key_value(df12),
dict(category="", petal_length="", petal_width="", sepal_width="", sepal_length="KVConfig(kv=:, item=,)"),
)
def test_seq_field_operations(self):
    """Role / continuity / key-value mutators on a single sequence field.

    Each mutator returns a new sequence; the helpers (_df_roles etc.) expose
    the resulting per-field metadata for assertion.
    """
    field = self.df.sepal_length

    # role assignment
    as_weight = field.role("weight")
    self.assertEqual(_df_roles(as_weight), dict(sepal_length="WEIGHT"))

    # continuity: discrete vs continuous
    as_discrete = field.discrete()
    self.assertEqual(_df_continuity(as_discrete), dict(sepal_length="DISCRETE"))
    as_continuous = field.continuous()
    self.assertEqual(_df_continuity(as_continuous), dict(sepal_length="CONTINUOUS"))

    # key-value config: defaults, then custom delimiters
    kv_default = field.key_value()
    self.assertEqual(_df_key_value(kv_default), dict(sepal_length="KVConfig(kv=:, item=,)"))
    kv_custom = kv_default.key_value(kv="-", item=";")
    self.assertEqual(_df_key_value(kv_custom), dict(sepal_length="KVConfig(kv=-, item=;)"))

    # erase_key_value clears the config
    kv_erased = kv_custom.erase_key_value()
    self.assertEqual(_df_key_value(kv_erased), dict(sepal_length=""))
def test_coll_df_operations(self):
    """Collection-level ops (split, append_id, summary) wire the expected ML nodes."""
    from odps.ml.nodes import transform_nodes as tnodes

    # split produces two halves sharing the same role metadata,
    # backed by a "Split" node carrying the requested fraction
    parts = self.df.split(0.75)
    self.assertEqual(len(parts), 2)
    self.assertEqual(_df_roles(parts[0]), _df_roles(parts[1]))
    split_node = adapter_from_df(parts[0])._bind_node
    self.assertEqual(split_node.code_name, "Split")
    self.assertEqual(split_node.parameters["fraction"], 0.75)

    # append_id adds an id column with an empty role, via an "AppendID" node
    with_id = self.df.append_id()
    expected_roles = dict(
        category="FEATURE",
        petal_length="FEATURE",
        petal_width="FEATURE",
        sepal_width="FEATURE",
        sepal_length="FEATURE",
        append_id="",
    )
    self.assertEqual(_df_roles(with_id), expected_roles)
    append_id_node = adapter_from_df(with_id)._bind_node
    self.assertEqual(append_id_node.code_name, "AppendID")
    self.assertEqual(append_id_node.parameters["IDColName"], "append_id")

    # the summary adapter binds to a SummaryNode
    summary_adapter = self.df._create_summary_adapter()
    self.assertIsInstance(summary_adapter._bind_node, tnodes.SummaryNode)
def test_dtypes(self):
rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
old_dtypes_repr = rstrip_lines(
textwrap.dedent(
"""
odps.Schema {