本文整理汇总了Python中pyspark.mllib.tree.GradientBoostedTrees.trainRegressor方法的典型用法代码示例。如果您正苦于以下问题:Python GradientBoostedTrees.trainRegressor方法的具体用法?Python GradientBoostedTrees.trainRegressor怎么用?Python GradientBoostedTrees.trainRegressor使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在的类pyspark.mllib.tree.GradientBoostedTrees的用法示例。
在下文中一共展示了GradientBoostedTrees.trainRegressor方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: cross_validation_gb
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    """Return the mean held-out MSE of a 3-fold cross-validated GBT regressor.

    Each of the three datasets is held out once while a model is trained on
    the union of the other two.

    Args:
        Data_1, Data_2, Data_3: the three folds. Each must support ``union``,
            ``map`` and ``count`` and hold elements exposing ``.features``
            and ``.label`` (presumably RDDs of LabeledPoint -- confirm
            against the caller).
        loss_type: forwarded as ``loss`` to ``trainRegressor``.
        num_iter: forwarded as ``numIterations``.
        maxDepth: forwarded as ``maxDepth``.

    Returns:
        The arithmetic mean of the three per-fold mean squared errors.
    """

    def _fold_mse(train_a, train_b, held_out):
        # Train on two folds, then score the model on the remaining one.
        model = GradientBoostedTrees.trainRegressor(
            train_a.union(train_b), categoricalFeaturesInfo={},
            loss=loss_type,
            numIterations=num_iter, maxDepth=maxDepth)
        predictions = model.predict(held_out.map(lambda x: x.features))
        labels_and_predictions = held_out.map(lambda lp: lp.label).zip(predictions)
        # Index the (label, prediction) pair instead of unpacking it in the
        # lambda signature: tuple parameters are a SyntaxError on Python 3.
        squared_error_sum = labels_and_predictions.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum()
        return squared_error_sum / float(held_out.count())

    testMSE_1 = _fold_mse(Data_1, Data_2, Data_3)
    testMSE_2 = _fold_mse(Data_2, Data_3, Data_1)
    testMSE_3 = _fold_mse(Data_3, Data_1, Data_2)
    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
示例2: seg_model_gb
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    """Train a GBT regressor on ``train_data`` and predict for ``test_data``.

    Args:
        train_data: DataFrame with a ``stars`` label column; every column
            other than ``stars`` and the identifier columns is a feature.
        test_data: DataFrame with a ``review_id`` column; every column other
            than the identifier columns is a feature.
        loss_type: forwarded as ``loss`` to ``trainRegressor``.
        num_iter: forwarded as ``numIterations``.
        maxDepth: forwarded as ``maxDepth``.

    Returns:
        The result of zipping the ``review_id`` values with the model's
        predictions, i.e. (review_id, prediction) pairs.
    """
    # --- training set -----------------------------------------------------
    excluded_for_training = set(
        ['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    training_columns = [
        name for name in train_data.columns if name not in excluded_for_training]
    # Nulls are replaced by 0 before the feature columns are assembled.
    training_frame = VectorAssembler(
        inputCols=training_columns, outputCol="features"
    ).transform(train_data.fillna(0))
    labeled_points = training_frame.select("features", "stars").map(
        lambda row: LabeledPoint(row.stars, row.features))

    # NOTE(review): the training set is collected to the driver and
    # re-distributed over 5 partitions before training -- kept as in the
    # original; confirm the data always fits in driver memory.
    training_rdd = sc.parallelize(labeled_points.collect(), 5)
    gbt = GradientBoostedTrees.trainRegressor(
        training_rdd, categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)

    # --- prediction set ---------------------------------------------------
    # Only identifier columns are excluded here ('stars' is not in this set).
    excluded_for_scoring = set(
        ['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    scoring_columns = [
        name for name in test_data.columns if name not in excluded_for_scoring]
    scoring_frame = VectorAssembler(
        inputCols=scoring_columns, outputCol="features"
    ).transform(test_data.fillna(0))
    keyed_features = scoring_frame.select("features", "review_id")

    scores = gbt.predict(keyed_features.map(lambda row: row.features))
    return keyed_features.map(lambda row: row.review_id).zip(scores)
示例3: test_regression
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def test_regression(self):
    """Check that several MLlib regressors get the sign right on toy data.

    Four points are used whose label sign equals the sign of feature 1.
    Every trained model must predict <= 0 for the negative-label points and
    > 0 for the positive-label ones. Finally the SGD trainers are run with
    explicit ``initialWeights``; a ``ValueError`` there fails the test.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(samples)
    vectors = [sample.features.tolist() for sample in samples]

    def assert_signs(model):
        # Even positions hold negative-label points, odd positions positive ones.
        for position, vector in enumerate(vectors):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    # Linear models.
    assert_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    assert_signs(LassoWithSGD.train(rdd, iterations=10))
    assert_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    # Tree-based models.
    categorical_info = {0: 2}  # feature 0 has 2 categories
    assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, maxBins=4))
    assert_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numTrees=10, maxBins=4, seed=1))
    assert_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numIterations=4))

    # Training with explicit initial weights must not raise ValueError.
    try:
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
示例4: testRegression
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def testRegression(trainingData, testData, model_path):
    """Train a 3-iteration, depth-4 GBT regressor, report its test MSE, save it.

    Args:
        trainingData: dataset handed to ``trainRegressor``.
        testData: dataset whose elements expose ``.features`` and ``.label``.
        model_path: destination passed to ``model.save`` together with the
            ``sc`` name taken from the enclosing scope.
    """
    def _squared_error(pair):
        # pair is (label, prediction)
        delta = pair[0] - pair[1]
        return delta * delta

    # An empty categoricalFeaturesInfo means every feature is continuous.
    gbt = GradientBoostedTrees.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        numIterations=3,
        maxDepth=4,
    )

    scores = gbt.predict(testData.map(lambda point: point.features))
    label_score_pairs = testData.map(lambda point: point.label).zip(scores)
    mse = label_score_pairs.map(_squared_error).sum() / float(testData.count())

    print("Test Mean Squared Error = " + str(mse))
    print("Learned regression GBT model:")
    print(gbt.toDebugString())
    gbt.save(sc, model_path)
示例5: validation_gb
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    """Train a GBT regressor and return its mean squared error on ``testData``.

    Args:
        trainingData: dataset handed to ``trainRegressor``.
        testData: dataset whose elements expose ``.features`` and ``.label``.
        loss_type: forwarded as ``loss`` to ``trainRegressor``.
        num_iter: forwarded as ``numIterations``.
        maxDepth: forwarded as ``maxDepth``.

    Returns:
        Sum of squared (label - prediction) differences divided by the number
        of test elements.
    """
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        trainingData, categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)
    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the (label, prediction) pair rather than unpacking it in the
    # lambda signature: tuple parameters are a SyntaxError on Python 3.
    squared_error_sum = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum()
    testMSE = squared_error_sum / float(testData.count())
    return testMSE
示例6: test_regression
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def test_regression(self):
    """Check that several MLlib regressors get the sign right on toy data.

    Four points are used whose label sign equals the sign of feature 1.
    Every model, trained with mostly default settings, must predict <= 0 for
    the negative-label points and > 0 for the positive-label ones.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(samples)
    vectors = [sample.features.tolist() for sample in samples]

    def assert_signs(model):
        # Even positions hold negative-label points, odd positions positive ones.
        for position, vector in enumerate(vectors):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    # Linear models.
    assert_signs(LinearRegressionWithSGD.train(rdd))
    assert_signs(LassoWithSGD.train(rdd))
    assert_signs(RidgeRegressionWithSGD.train(rdd))

    # Tree-based models.
    categorical_info = {0: 2}  # feature 0 has 2 categories
    assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info))
    assert_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numTrees=100))
    assert_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info))
示例7: testRegression
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def testRegression(trainingData, testData):
    """Train a 30-iteration, depth-4 GBT regressor and print its test MSE.

    Args:
        trainingData: dataset handed to ``trainRegressor``.
        testData: dataset whose elements expose ``.features`` and ``.label``.

    Prints the test mean squared error followed by the model's debug string;
    returns nothing.
    """
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the (label, prediction) pair rather than unpacking it in the
    # lambda signature: tuple parameters are a SyntaxError on Python 3.
    squared_error_sum = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum()
    testMSE = squared_error_sum / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
示例8: SparkContext
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Read the LibSVM-formatted sample file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    # Hold out 30% of the records for testing; train on the other 70%.
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Fit the boosted-trees regressor.
    #  - an empty categoricalFeaturesInfo treats every feature as continuous
    #  - 3 iterations keeps the example quick; use more in practice
    model = GradientBoostedTrees.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        numIterations=3,
    )

    # Score the held-out records and compute the mean squared error.
    predictions = model.predict(testData.map(lambda point: point.features))
    labelsAndPredictions = testData.map(lambda point: point.label).zip(predictions)
    squaredErrors = labelsAndPredictions.map(
        lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))
    testMSE = squaredErrors.sum() / float(testData.count())

    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Persist the model, then read it back from the same location.
    modelPath = "target/tmp/myGradientBoostingRegressionModel"
    model.save(sc, modelPath)
    sameModel = GradientBoostedTreesModel.load(sc, modelPath)
    # $example off$
示例9: ShuffleSplit
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
# Pair every target value with its feature record. zip() is materialised with
# list() first: on Python 3 it is a lazy iterator, and np.array() would wrap
# the iterator object itself instead of building an array of pairs.
all_data = np.array(list(zip(yy, xx)))
# NOTE(review): the split is sized len(all_data) - 1, so the last sample is
# never drawn -- kept as in the original; confirm this is intended.
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)
for train_indexes, test_indexes in sss:
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]
    # Each row is (target value, feature record).
    lparr = [LabeledPoint(medianvalue, tuple(record))
             for medianvalue, record in sample_data]
    test_lp_arr = [LabeledPoint(medianvalue, tuple(record))
                   for medianvalue, record in test_data]
    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()
    regression_model = GradientBoostedTrees.trainRegressor(
        training_data, categoricalFeaturesInfo={}, numIterations=10, maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    # Single-argument print(...) calls produce the same output on Python 2 and 3.
    print(regression_model)
    print(regression_model.toDebugString())
    print("===============================")
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()
    print(mean_absolute_error(actual_data, predicted_data))
    # Only the first shuffle split is evaluated.
    break
示例10: SparkConf
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle,math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)
input = sys.argv[1]


# Load and parse the data
def parsePoint(line):
    """Build a LabeledPoint from a record: ``line[1]`` is the label, ``line[0]`` the features."""
    return LabeledPoint(float(line[1]), line[0])


def _has_features(point):
    """Return True when the point carries at least one feature value.

    The original predicate also evaluated ``len(point.label)``. parsePoint
    passes the label as a float, so that call raised TypeError whenever the
    features were empty; only the features are inspected here.
    """
    return len(point.features) != 0


train = sc.pickleFile(input + '/bow_train/part-00000')
test = sc.pickleFile(input + '/bow_test/part-00000')
parsedtrain = train.map(parsePoint).filter(_has_features)
parsedtest = test.map(parsePoint).filter(_has_features).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain, categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair rather than unpacking it in the lambda
# signature: tuple parameters are a SyntaxError on Python 3.
val_err = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE = math.sqrt(val_err)
print("Root Mean Squared Error Test= " + str(RMSE))
示例11:
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
# Classification and regression take the same keyword arguments; only the
# training entry point differs.
if model_type == "classification":
    train_gbt = GradientBoostedTrees.trainClassifier
else:
    # regression
    train_gbt = GradientBoostedTrees.trainRegressor

model = train_gbt(
    lp,
    categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df, predictors),
    loss=loss_param,
    numIterations=numIterations_param,
    learningRate=learningRate_param,
    maxDepth=maxDepth_param,
    maxBins=maxBins_param)

# Hyper-parameters listed in the build report, in the order they are shown.
gbt_settings = [
    ("loss", loss_param),
    ("numIterations", numIterations_param),
    ("learningRate", learningRate_param),
    ("maxDepth", maxDepth_param),
    ("maxBins", maxBins_param),
]
build_report = mbr.report(
    lp.count(), lp.getNumPartitions(),
    predictors, datamodel, target, model_type,
    settings=[("Algorithm", "Gradient Boosted Trees", gbt_settings)])
print(build_report)

model.save(sc, modelpath)

model_metadata = {
    "target": target,
    "predictors": predictors,
    "datamodel": datamodel,
    "model_type": model_type,
}

print(model.toDebugString())
示例12: trainTestSaveALLModel
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def trainTestSaveALLModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
# Train, evaluate and save one GradientBoostedTrees regressor per predictor
# column, while accumulating the per-predictor test predictions in one
# DataFrame keyed by "player_id".
#
# Parameters:
#   rddDir: prefix that the model output names are appended to by plain
#       string concatenation (so it must already end with a path separator).
#   encodedFeaturesParq: path of the parquet file with the encoded features;
#       if the path contains "batting" the hitter predictors are used,
#       otherwise the pitcher predictors.
#   featuresNumValsFile: path of a JSON file; its first record is converted
#       to a dict and handed to getCatFeatures().
#
# NOTE: this block uses Python 2-only syntax (print statements and
# tuple-parameter lambdas such as `lambda (v, p): ...`).
# NOTE: the body shown here is not the complete function; the listing it was
# taken from elides the remainder.
predictors = []
modelType = ""
if "batting" in encodedFeaturesParq:
modelType = 'batting'
predictors = hitterPredictors
else:
modelType = 'pitching'
predictors = pitcherPredictors
# not_features is defined outside this function and is extended in place on
# every call (presumably the list of columns toLabeledPoint must not treat as
# features -- confirm against its definition).
not_features.extend(predictors)
# Load and parse the data file.
features = sqlContext.read.parquet(encodedFeaturesParq).cache()
print features.take(3)
print "# features=", features.count()
numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
(catFeatures, featureLookup) = getCatFeatures(features, numVals)
# Accumulators filled in by the loop below.
all_fd_points_df = None
fd_points_testData = None
predictions = None
for predictor in predictors:
#global predictField
#predictField = predictor
#data = features.map(toLabeledPoint).coalesce(50)
#data = toLabeledPoint(features, predictor).coalesce(50)
#print "len data=", data.count()
print "catFeatures=", catFeatures
# Split the data into training and test sets (30% held out for testing)
# The seed is fixed and the split is recomputed for every predictor.
(f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
#trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
#testData = f_testData.map(toLabeledPoint).coalesce(50)
testData = toLabeledPoint(f_testData, predictor).coalesce(50)
testData.cache()
print "testData count=", testData.count()
# Row key "<player_id>_<game_id>" for every test row; zipped with the
# predictions further down.
playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
print "playerIds=", playerIds
print "playerIds=", playerIds.take(2)
print "len playerIds=", playerIds.count()
# Train a GradientBoostedTrees model.
# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
# (b) Use more iterations in practice.
# Here catFeatures (from getCatFeatures) is passed as categoricalFeaturesInfo
# and a single boosting iteration is run.
model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo=catFeatures, maxDepth=5, numIterations=1, maxBins=300)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features)).cache()
print "# predictions=", predictions.count()
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Built only once, on the first predictor: (row key, actual fd_points) pairs.
if fd_points_testData is None:
fd_points_testData = f_testData.map(lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points)).toDF(['player_id', 'actual_fd_points']).coalesce(50)
# Mean squared / mean absolute error over the (label, prediction) pairs.
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
testMAE = labelsAndPredictions.map(lambda (v, p): abs(v - p)).sum() / float(testData.count())
print predictor + ' Test Mean Squared Error = ' + str(testMSE)
print predictor + ' Test Mean Absolute Error = ' + str(testMAE)
# First predictor: start the accumulated frame with (player_id, <predictor>).
# NOTE(review): printSchema() presumably prints the schema itself and returns
# None, so the print statements that include it would show "None" after the
# label -- confirm.
if all_fd_points_df is None:
#all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
print "FIRST: # predictions=", predictions.count()
print " # playerIds=", playerIds.count()
all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
print "# all_fd_points_df", all_fd_points_df.count()
print "first all_fd_points_df", all_fd_points_df.take(5)
print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
else:
# Later predictors: build (player_id, <predictor>) for this predictor and join
# it onto the accumulated frame.
print "ELSE: # predictions=", predictions.count()
print " # playerIds=", playerIds.count()
curr_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('curr_fd_points_df')
print "all_fd_points_df", all_fd_points_df.printSchema()
print "PRE all_fd_points_df", all_fd_points_df.take(5)
print "curr_fd_points_df", curr_fd_points_df.printSchema()
print "few curr_fd_points_df", curr_fd_points_df.take(5)
print "# curr_fd_points_df", curr_fd_points_df.count()
print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
print "first curr", curr_fd_points_df.take(5)
#all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
# Join on player_id via the two aliases, drop the right-hand key column, then
# re-alias and cache the result for the next iteration.
all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, col("all_fd_points_df.player_id") == col("curr_fd_points_df.player_id")).drop(curr_fd_points_df.player_id).alias('all_fd_points_df').cache()
print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
#print "all debugstring", all_fd_points_df.rdd.toDebugString()
#print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
print "first few all_fd_points_df=", all_fd_points_df.take(3)
print "count few all_fd_points_df=", all_fd_points_df.count()
print "converted:"
print populateDebugString(model, featureLookup)
# Save and load model
# The output name ends in "_model.RandomForest" although the model saved here
# is a GradientBoostedTrees model.
modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
if modelType == "batting":
modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
# Remove any previous output at that path; an OSError from rmtree is
# deliberately ignored.
try:
shutil.rmtree(modelFilename)
except OSError:
pass
model.save(sc, modelFilename)
#sameModel = GradientBoostedTreesModel.load(sc, "myModelPath")
print "DONE. all_fd_points_df", all_fd_points_df.printSchema()
print "# of all_fd_points=", all_fd_points_df.count()
#.........这里部分代码省略.........
示例13: trainTestSaveFDPointsModel
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as 别名]
def trainTestSaveFDPointsModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    """Train, evaluate and save a GBT regressor for the ``fd_points`` column.

    Args:
        rddDir: prefix that output names are appended to by plain string
            concatenation (so it must already end with a path separator).
        encodedFeaturesParq: path of the parquet file with the encoded
            features; a path containing "batting" selects the batting model
            type, anything else the pitching model type.
        featuresNumValsFile: path of a JSON file; its first record is
            converted to a dict and handed to ``getCatFeatures``.

    Side effects:
        * appends ``'fd_points'`` to the ``not_features`` list defined
          outside this function;
        * prints progress information and the test MSE / MAE;
        * saves the model under
          ``<rddDir><modelType>_fd_points_model.RandomForest`` and the
          (row key, actual fd_points) test frame under
          ``<rddDir><modelType>_fd_points_testData.csv``, deleting any
          previous output at those paths first.
    """
    modelType = 'batting' if "batting" in encodedFeaturesParq else 'pitching'
    predictor = 'fd_points'
    # append(), not extend(): extending a list with a string adds every single
    # character ('f', 'd', '_', ...) instead of the column name itself.
    not_features.append(predictor)

    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    # The "label %s" prints below emit exactly what the former Python 2
    # `print "label", value` statements did, and also run on Python 3.
    print(features.take(3))
    print("# features= %s" % (features.count(),))
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    print("catFeatures= %s" % (catFeatures,))

    # Split the data into training and test sets (30% held out for testing)
    (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
    trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
    testData = toLabeledPoint(f_testData, predictor).coalesce(50)
    testData.cache()
    # Counted once and reused for both error metrics below.
    numTest = testData.count()
    print("testData count= %s" % (numTest,))

    # Row key "<player_id>_<game_id>" for every test row. `.rdd.map` is used
    # instead of DataFrame.map, which newer Spark releases no longer provide.
    playerIds = f_testData.rdd.map(
        lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
    print("playerIds= %s" % (playerIds,))
    print("playerIds= %s" % (playerIds.take(2),))
    print("len playerIds= %s" % (playerIds.count(),))

    # Train a GradientBoostedTrees model; catFeatures tells the trainer which
    # features are categorical.
    model = GradientBoostedTrees.trainRegressor(
        trainingData, categoricalFeaturesInfo=catFeatures,
        maxDepth=6, numIterations=32, maxBins=300)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features)).cache()
    print("# predictions= %s" % (predictions.count(),))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # A missing fd_points value is written out as 0.0.
    fd_points_testData = f_testData.rdd.map(
        lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points or 0.0)
    ).toDF(['player_id', 'actual_fd_points']).coalesce(50)
    # Index the (label, prediction) pair rather than unpacking it in the
    # lambda signature: tuple parameters are a SyntaxError on Python 3.
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(numTest)
    testMAE = labelsAndPredictions.map(
        lambda vp: abs(vp[0] - vp[1])).sum() / float(numTest)
    print(predictor + ' Test Mean Squared Error = ' + str(testMSE))
    print(predictor + ' Test Mean Absolute Error = ' + str(testMAE))

    print("converted:")
    print(populateDebugString(model, featureLookup))

    # Save the model. modelType is 'batting' or 'pitching', so this yields the
    # same names as before. The ".RandomForest" suffix is kept unchanged even
    # though the model is a GBT, so existing readers of that path keep working.
    modelFilename = rddDir + modelType + "_" + predictor + "_model.RandomForest"
    try:
        shutil.rmtree(modelFilename)
    except OSError:
        # Best effort: an OSError while removing old output is ignored.
        pass
    model.save(sc, modelFilename)

    fd_points_testData_filename = rddDir + modelType + '_' + 'fd_points_testData.csv'
    try:
        shutil.rmtree(fd_points_testData_filename)
    except OSError:
        # Best effort, as above.
        pass
    fd_points_testData.write.format('com.databricks.spark.csv') \
        .option('header', 'true').save(fd_points_testData_filename)