

Python GradientBoostedTrees.trainRegressor Method Code Examples

This article collects typical usage examples of the Python method pyspark.mllib.tree.GradientBoostedTrees.trainRegressor. If you are wondering what GradientBoostedTrees.trainRegressor does, how to use it, or where to find working examples, the curated code samples below should help. You can also explore further usage examples of the enclosing class, pyspark.mllib.tree.GradientBoostedTrees.


The sections below present 13 code examples of GradientBoostedTrees.trainRegressor, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
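Before diving into the examples, here is a minimal sketch of the call itself. The names sc and trainingData are assumptions for illustration; the keyword arguments and their defaults follow the pyspark.mllib API:

# Minimal usage sketch (assumed names: sc, trainingData)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

trainingData = sc.parallelize([
    LabeledPoint(1.0, [0.0, 1.0]),
    LabeledPoint(0.0, [1.0, 0.0]),
])

# An empty categoricalFeaturesInfo dict means all features are treated as continuous.
model = GradientBoostedTrees.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},
    loss="leastSquaresError",   # or "leastAbsoluteError"
    numIterations=100,
    learningRate=0.1,
    maxDepth=3)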

Example 1: cross_validation_gb

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(Data_1.union(Data_2), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    # Python 3 removed tuple-unpacking lambdas, so index the (label, prediction) pair
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(Data_2.union(Data_3), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(Data_3.union(Data_1), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_2.count())

    return (testMSE_1+testMSE_2+testMSE_3)/3
Developer: USF-ML2, Project: Rectastic-, Lines: 35, Source: GB_models.py
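A hypothetical call to cross_validation_gb, assuming data is an RDD of LabeledPoint; the three folds come from randomSplit (equal weights are normalized automatically) and the hyperparameter values are placeholders:

fold_1, fold_2, fold_3 = data.randomSplit([1.0, 1.0, 1.0], seed=42)
avg_mse = cross_validation_gb(fold_1, fold_2, fold_3,
                              loss_type="leastSquaresError",
                              num_iter=50, maxDepth=4)
print("3-fold average test MSE = " + str(avg_mse))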

Example 2: seg_model_gb

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(),5), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")

    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda row: row.review_id).zip(predictions)
    return labelsAndPredictions
Developer: USF-ML2, Project: Rectastic-, Lines: 35, Source: GB_models.py
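One design note on this example: sc.parallelize(data_train.collect(), 5) pulls the entire training set onto the driver only to redistribute it, which can exhaust driver memory on large data. A sketch of the distributed-friendly equivalent, assuming the same data_train RDD:

# Repartition on the cluster instead of round-tripping through the driver
model_train = GradientBoostedTrees.trainRegressor(data_train.repartition(5),
                                                  categoricalFeaturesInfo={},
                                                  loss=loss_type,
                                                  numIterations=num_iter, maxDepth=maxDepth)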

Example 3: test_regression

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            # `array` here is numpy.array, imported at module level in the original test file
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Developer: 1ambda, Project: spark, Lines: 61, Source: tests.py

Example 4: testRegression

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def testRegression(trainingData, testData, model_path):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3, maxDepth=4)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    print("Test Mean Squared Error = " + str(testMSE))
    print("Learned regression GBT model:")
    print(model.toDebugString())
    model.save(sc, model_path)
Developer: feng1008, Project: spark, Lines: 13, Source: ctr_mllib_gbdt.py
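The model persisted at model_path can be reloaded later with the matching loader class (the same pairing appears in Example 8 below); a short sketch assuming the same sc, testData, and model_path:

from pyspark.mllib.tree import GradientBoostedTreesModel

sameModel = GradientBoostedTreesModel.load(sc, model_path)
predictions = sameModel.predict(testData.map(lambda x: x.features))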

Example 5: validation_gb

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    return testMSE
Developer: USF-ML2, Project: Rectastic-, Lines: 14, Source: GB_models.py
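For reference, trainRegressor accepts "leastSquaresError" (the default) and "leastAbsoluteError" as loss values in pyspark.mllib. A hypothetical sweep over both with validation_gb, reporting RMSE instead of MSE:

import math

for loss in ("leastSquaresError", "leastAbsoluteError"):
    mse = validation_gb(trainingData, testData, loss, num_iter=50, maxDepth=4)
    print(loss + ": test RMSE = " + str(math.sqrt(mse)))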

Example 6: test_regression

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Developer: greatyan, Project: spark, Lines: 54, Source: tests.py

Example 7: testRegression

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
Developer: Amir-Github, Project: spark, Lines: 15, Source: gradient_boosted_trees.py

Example 8: SparkContext

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
Developer: lhfei, Project: spark-in-action, Lines: 33, Source: gradient_boosting_regression_example.py

Example 9: ShuffleSplit

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
# Assumed context for this excerpt: numpy as np, scikit-learn's ShuffleSplit and
# mean_absolute_error, and yy (targets) / xx (feature rows) defined earlier.
all_data = np.array(list(zip(yy, xx)))  # zip() is lazy in Python 3; materialize it first
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)  # pre-0.18 scikit-learn API

for train_indexes, test_indexes in sss:
    lparr = []
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for medianvalue, record in sample_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        lparr.append(lp)

    for medianvalue, record in test_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    regression_model = GradientBoostedTrees.trainRegressor(training_data, categoricalFeaturesInfo={}, numIterations=10, maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    print(regression_model)
    print(regression_model.toDebugString())
    print("===============================")
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

    print(mean_absolute_error(actual_data, predicted_data))
    break
Developer: alexsisu, Project: spark-ml-training, Lines: 32, Source: boston_gradient_boosted_trees_regression.py
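The ShuffleSplit constructor above follows the scikit-learn pre-0.18 cross_validation API, where the first argument is the number of samples. Under scikit-learn 0.18 and later, the equivalent split would look like this sketch (same assumed all_data array):

from sklearn.model_selection import ShuffleSplit

sss = ShuffleSplit(n_splits=1, test_size=0.20, random_state=1234)
for train_indexes, test_indexes in sss.split(all_data):
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]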

Example 10: SparkConf

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle, math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

input_dir = sys.argv[1]  # renamed from `input`, which shadows the Python builtin

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input_dir + '/bow_train/part-00000')
test = sc.pickleFile(input_dir + '/bow_test/part-00000')
# len() only applies to the feature vector; the label is a float, so len(lp.label) would raise
parsedtrain = train.map(parsePoint).filter(lambda lp: len(lp.features) != 0)
parsedtest = test.map(parsePoint).filter(lambda lp: len(lp.features) != 0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain, categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE = math.sqrt(val_err)

print("Root Mean Squared Error Test= " + str(RMSE))

Developer: gurpreetbajwa, Project: Sentiment-Analysis, Lines: 30, Source: gradient_boost.py

Example 11:

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
if model_type == "classification":
    model = GradientBoostedTrees.trainClassifier(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)
else:
    # regression
    model = GradientBoostedTrees.trainRegressor(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)

build_report = mbr.report(lp.count(),lp.getNumPartitions(),
    predictors,datamodel,target,model_type,
    settings=[("Algorithm","Gradient Boosted Trees",[("loss",loss_param),("numIterations",numIterations_param),("learningRate",learningRate_param),("maxDepth",maxDepth_param),("maxBins",maxBins_param)])])

print(build_report)

model.save(sc, modelpath)

model_metadata = { "target":target, "predictors":predictors, "datamodel": datamodel, "model_type":model_type }

print(model.toDebugString())
Developer: IBMPredictiveAnalytics, Project: Gradient_Boosted_Trees_with_MLlib, Lines: 33, Source: gb_tree.py
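A note on the branch above: in pyspark.mllib, trainClassifier and trainRegressor take different loss values ("logLoss" for classification; "leastSquaresError" or "leastAbsoluteError" for regression), so a loss_param chosen for one branch will not validate on the other. A hypothetical guard that could precede the dispatch:

# Hypothetical sanity check before dispatching on model_type
valid_losses = {
    "classification": {"logLoss"},
    "regression": {"leastSquaresError", "leastAbsoluteError"},
}
key = model_type if model_type == "classification" else "regression"
if loss_param not in valid_losses[key]:
    raise ValueError("loss %r is not valid for %s" % (loss_param, key))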

Example 12: trainTestSaveALLModel

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def trainTestSaveALLModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    predictors = []
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
        predictors = hitterPredictors
    else:
        modelType = 'pitching'
        predictors = pitcherPredictors
    not_features.extend(predictors)
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print(features.take(3))
    print("# features=", features.count())
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None
    for predictor in predictors:
        #global predictField
        #predictField = predictor
        #data = features.map(toLabeledPoint).coalesce(50)
        #data = toLabeledPoint(features, predictor).coalesce(50)
        #print "len data=", data.count()

        print "catFeatures=", catFeatures

        # Split the data into training and test sets (30% held out for testing)
        (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
        #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
        trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
        #testData = f_testData.map(toLabeledPoint).coalesce(50)
        testData = toLabeledPoint(f_testData, predictor).coalesce(50)
        testData.cache()
        print "testData count=", testData.count()
        playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
        print "playerIds=", playerIds
        print "playerIds=", playerIds.take(2)
        print "len playerIds=", playerIds.count()

        # Train a GradientBoostedTrees model.
        #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
        #         (b) Use more iterations in practice.
        model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo=catFeatures, maxDepth=5, numIterations=1, maxBins=300)

        # Evaluate model on test instances and compute test error
        predictions = model.predict(testData.map(lambda x: x.features)).cache()
        print "# predictions=", predictions.count()
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        if fd_points_testData is None:
            fd_points_testData = f_testData.map(lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points)).toDF(['player_id', 'actual_fd_points']).coalesce(50)

        testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
        testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
        print(predictor + ' Test Mean Squared Error = ' + str(testMSE))
        print(predictor + ' Test Mean Absolute Error = ' + str(testMAE))

        if all_fd_points_df is None:
            #all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
            print "FIRST: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
            print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            print "# all_fd_points_df", all_fd_points_df.count()
            print "first all_fd_points_df", all_fd_points_df.take(5)
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        else:
            print "ELSE: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            curr_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('curr_fd_points_df')
            print "all_fd_points_df", all_fd_points_df.printSchema()
            print "PRE all_fd_points_df", all_fd_points_df.take(5)
            print "curr_fd_points_df", curr_fd_points_df.printSchema()
            print "few curr_fd_points_df", curr_fd_points_df.take(5)
            print "# curr_fd_points_df", curr_fd_points_df.count()
            print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
            print "first curr", curr_fd_points_df.take(5)
            #all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
            all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, col("all_fd_points_df.player_id") == col("curr_fd_points_df.player_id")).drop(curr_fd_points_df.player_id).alias('all_fd_points_df').cache()
            print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            #print "all debugstring", all_fd_points_df.rdd.toDebugString()
            #print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        print "first few all_fd_points_df=", all_fd_points_df.take(3)
        print "count few all_fd_points_df=", all_fd_points_df.count()
        print "converted:"
        print populateDebugString(model, featureLookup)

        # Save and load model
        modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
        if modelType == "batting":
            modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
        try:
            shutil.rmtree(modelFilename)
        except OSError:
            pass
        model.save(sc, modelFilename)
        #sameModel = GradientBoostedTreesModel.load(sc, "myModelPath")
    print "DONE. all_fd_points_df", all_fd_points_df.printSchema()
    print "# of all_fd_points=", all_fd_points_df.count()
#......... part of the code omitted here .........
Developer: xiaokekehaha, Project: mlb_stats_spark, Lines: 103, Source: TrainModel.py

Example 13: trainTestSaveFDPointsModel

# Required import: from pyspark.mllib.tree import GradientBoostedTrees [as alias]
# Or: from pyspark.mllib.tree.GradientBoostedTrees import trainRegressor [as alias]
def trainTestSaveFDPointsModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
    else:
        modelType = 'pitching'
    predictor = 'fd_points'
    not_features.append(predictor)  # append, not extend: extend() would add each character of the string
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print(features.take(3))
    print("# features=", features.count())
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None

    print "catFeatures=", catFeatures

    # Split the data into training and test sets (30% held out for testing)
    (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
    #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
    trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
    #testData = f_testData.map(toLabeledPoint).coalesce(50)
    testData = toLabeledPoint(f_testData, predictor).coalesce(50)
    testData.cache()
    print "testData count=", testData.count()
    playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
    print "playerIds=", playerIds
    print "playerIds=", playerIds.take(2)
    print "len playerIds=", playerIds.count()

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo=catFeatures, maxDepth=6, numIterations=32, maxBins=300)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features)).cache()
    print "# predictions=", predictions.count()
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    fd_points_testData = f_testData.map(lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points or 0.0)).toDF(['player_id', 'actual_fd_points']).coalesce(50)

    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
    testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
    print(predictor + ' Test Mean Squared Error = ' + str(testMSE))
    print(predictor + ' Test Mean Absolute Error = ' + str(testMAE))

#    print " # playerIds=", playerIds.count()
#    all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
#    print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
#    print "# all_fd_points_df", all_fd_points_df.count()
#    print "first all_fd_points_df", all_fd_points_df.take(5)
#    print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    print "converted:"
    print populateDebugString(model, featureLookup)

    # Save and load model
    modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
    if modelType == "batting":
        modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
    try:
        shutil.rmtree(modelFilename)
    except OSError:
        pass
    model.save(sc, modelFilename)

    fd_points_testData_filename = rddDir + modelType + '_' + 'fd_points_testData.csv'
    try:
        shutil.rmtree(fd_points_testData_filename)
    except OSError:
        pass
    fd_points_testData.write.format('com.databricks.spark.csv').option('header', 'true').save(fd_points_testData_filename)
Developer: xiaokekehaha, Project: mlb_stats_spark, Lines: 76, Source: TrainModel.py


Note: The pyspark.mllib.tree.GradientBoostedTrees.trainRegressor examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not reproduce this article without permission.