

Python RandomForest.trainRegressor method code examples

This article collects typical usage examples of the Python method pyspark.mllib.tree.RandomForest.trainRegressor. If you are unsure how RandomForest.trainRegressor is used in practice, or are looking for concrete examples of calling it, the curated code samples below may help. You can also explore further usage examples of the containing class, pyspark.mllib.tree.RandomForest.


The sections below present 15 code examples of the RandomForest.trainRegressor method, sorted by popularity by default.
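Before the collected examples, here is a minimal usage sketch of trainRegressor for orientation. It is not taken from any of the projects below; the SparkContext `sc` and the tiny hand-built RDD `train_lp` are placeholder names, and the parameter values simply mirror the MLlib defaults that most of the examples use.

# Minimal sketch (not one of the collected examples): assumes a running
# SparkContext named `sc`; the data and parameter values are illustrative.
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.regression import LabeledPoint

# A tiny regression dataset: (label, feature vector) pairs.
train_lp = sc.parallelize([
    LabeledPoint(1.0, [0.0, 1.0]),
    LabeledPoint(2.0, [1.0, 1.0]),
    LabeledPoint(3.0, [2.0, 1.0]),
])

# categoricalFeaturesInfo={} declares every feature continuous;
# impurity must be 'variance' for regression forests.
model = RandomForest.trainRegressor(train_lp, categoricalFeaturesInfo={},
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)

# Predict on the feature vectors and pair each prediction with its true label.
predictions = model.predict(train_lp.map(lambda lp: lp.features))
labels_and_preds = train_lp.map(lambda lp: lp.label).zip(predictions)
print(labels_and_preds.collect())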

Example 1: main

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    '''sbaronia - get training and testing labeled points'''
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp,categoricalFeaturesInfo={},numTrees=5,featureSubsetStrategy="auto", impurity='variance', maxDepth=4, maxBins=32)
    
    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda (v, p): (v-p)**2).reduce(lambda x, y: x + y)/float(test_lp.count()))

    print("RMSE = " + str(rmse))
Developer: gitofsid, Project: MyBigDataCode, Lines of code: 27, Source: randomforest.py

Example 2: test_regression

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Developer: 1ambda, Project: spark, Lines of code: 61, Source: tests.py

Example 3: trainRandomForestModel

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    return model
Developer: theseusyang, Project: GEQE, Lines of code: 11, Source: createROC.py

Example 4: getRandomForestRMSE

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def getRandomForestRMSE(trees_array):
	valRMSE_list = []
	for trees in trees_array:
		model = RandomForest.trainRegressor(train_featureScoreTimeRDD, categoricalFeaturesInfo={},
                                    numTrees=trees, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)
		predictions = model.predict(val_featureScoreTimeRDD.map(lambda lp: lp.features))
		labelsAndPreds = val_featureScoreTimeRDD.map(lambda lp: lp.label).zip(predictions)
		valMSE = labelsAndPreds.map(lambda (v, p): (v - p)*(v-p)).sum() / float(val_featureScoreTimeRDD.count())
		valRMSE=valMSE**0.5
		valRMSE_list.append((trees, valRMSE))
	return valRMSE_list
Developer: shaileshr, Project: SentimentAnalysis, Lines of code: 14, Source: Qn8.py

Example 5: test_regression

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Developer: greatyan, Project: spark, Lines of code: 54, Source: tests.py

Example 6: testRegression

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def testRegression(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
        .sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
Developer: 1ambda, Project: spark, Lines of code: 19, Source: random_forest_example.py

Example 7: train_amount_model

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
    def train_amount_model(self, model, data, i):
        rdd_data = self.sc.parallelize(data)
        self.logger.info('Start to train the amount model')
        if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            input_num = self.feature_num
            layers = [input_num, input_num / 3 * 2, input_num / 3, 1]

            neural_network = NeuralNetworkSpark(layers=layers, bias=0)
            model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                         iteration=15, model=model)
        elif self.amount_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                                featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                                maxBins=32)

        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                  initialWeights=model.weights if model is not None else None)

        else:
            self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
            raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
        return model
Developer: WarnWang, Project: Dissertation, Lines of code: 25, Source: composition_prediction_system.py

Example 8: run

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]

#......... part of this code has been omitted here .........
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, as well as prepare application data
    # i.)  In both the region, and in the time window
    # ii.) In the region, but outside the time window
    # iii.) Out of region, data to apply model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat,lon: fspLib.inROI(lat,lon,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat,lon,date: fspLib.inEOI(lat,lon,date,bc_lTargetPolygons),returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat,lon,dt: fspLib.outEOI(lat,lon,dt,bc_lTargetPolygons),returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 =  sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print nL1T1, "events in region in time,", nL1T0, "events in region out of time"
    diff = t2-t1
    print "Time to partition by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data
    t1 = time.time()
    groupedIn  = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10.*nSignal/nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1-scaleFactor,scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2-t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]), categoricalFeaturesInfo={}, numTrees=2000, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath+"/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2-t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print "Number of points to score:", nApply
    diff = t2-t1
    print "Time aggregate and label points: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd+1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2-t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
Developer: theseusyang, Project: GEQE, Lines of code: 104, Source: findSimilarEvent.py

Example 9: dict

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
# Dictionary mapping each beat to an index. Useful when converting to LabeledPoint. Otherwise converts to numeric.
beatsDict = dict(beatList.zipWithIndex().map(lambda x: (x[0],x[1])).collect())

# Data points as LabeledPoints
# (crime count, [beat, week])
predArrayLP = joinedData.map(lambda x: LabeledPoint(x[0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]]))

# Split into training and testing set. 70-30 split.
(train, test) = predArrayLP.randomSplit([0.7, 0.3])

# Feature categories : 
featuresCat = {0: len(beatsDict), 1: 53}
maxBins = max(len(beatsDict),len(weekDict))

model = RandomForest.trainRegressor(train, categoricalFeaturesInfo=featuresCat,
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=maxBins)


# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
#rschoolCountBeats = schoolCount.map(lambda x: x[0])
predOutput = predictions.collect()
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(test.count())
print('Test Mean Squared Error = ' + str(testMSE))

### Write output to file ###
with open("predictions.txt", 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(predOutput)
Developer: sanjuw, Project: Spark_ChicagoCrimeDataAnalysis, Lines of code: 33, Source: crime3.py

Example 10: StandardScaler

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
features=rdd.map(lambda t: (t[0],t[1],t[2],t[5],t[6],t[9],t[10],t[11],t[12],t[15],t[16]))
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)                              

#select value we want to predict
#lab = rdd.map(lambda row: row[8])#time
lab = rdd.map(lambda row: row[7])#fare
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))

#split into training and testing datasets
trainingData, testingData = transformedData.randomSplit([0.9,0.1],seed=1234)

#do the training and get predictions
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},impurity='variance',numTrees=25, seed=42, maxDepth=8)
predictions = model.predict(testingData.map(lambda x: x.features))
valuesAndPreds = testingData.map(lambda lp: lp.label).zip(predictions)
results = valuesAndPreds.toDF().toPandas()
results.columns = ['truth', 'pred']
results = results[results['truth'] > 0]
truth = np.array(results["truth"].tolist())
pred = np.array(results["pred"].tolist())
diff_fare = 100*(truth - pred)/truth

print 'mean = ' + str(diff_fare.mean())

#R-squared
metrics = RegressionMetrics(valuesAndPreds)
print("R-squared = %s" % metrics.r2)
Developer: jgran, Project: TaxiPredict, Lines of code: 32, Source: random_forest.py

Example 11: SparkConf

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
    sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local")
    sc=SparkContext(conf=sparkConf)
    sqlContext = SQLContext(sc)

    (visual_training_image_array , visual_training_outcome_array ) = loadVisualTrainingDataToArray()
    #We have to turn it into a list of observations
    visual_training_data = []
    for i in range(0,len(visual_training_outcome_array) ):
        visual_training_data.append((visual_training_outcome_array[i],visual_training_image_array[i]))
    visual_training_rdd = sc.parallelize(visual_training_data)
    visual_data_flattened = visual_training_rdd.map(lambda x : ( x[0] , averageBrightness4By4(x[1])) )
    visual_data_labeled_points = visual_data_flattened.map(lambda x : varsToLabeledPoint(x))
    toprint=visual_data_labeled_points.take(1)
    print(str(toprint))
    visual_model = RandomForest.trainRegressor(visual_data_labeled_points, categoricalFeaturesInfo={},
                                    numTrees=1000, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=100)
    #visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True)

    visual_training_vectors = visual_data_flattened.map(lambda x : featuresToVectors(x[1]))
    toprint = visual_training_vectors.take(1)
    print(str(toprint))
    visual_in_sample_predictions = visual_model.predict(visual_training_vectors)
    visual_in_sample_labels_and_predictions = visual_data_labeled_points.map(lambda lp: lp.label).zip(visual_in_sample_predictions)
    visual_in_sample_labels_and_predictions.foreach(printline)
    squaresdf = visual_in_sample_labels_and_predictions.map(lambda p : (p[0] , p[0]*p[0] , p[0] - p[1] , (p[0] - p[1])*(p[0] - p[1]) , 1 ) )
    squares = squaresdf.reduce(lambda a , b : (a[0]+b[0] , a[1]+b[1] , a[2]+b[2] , a[3]+b[3] , a[4]+b[4] ) )
    tss = float(squares[1]) - float(squares[0]*squares[0])/float(squares[4])
    rss = float(squares[3]) - float(squares[2]*squares[2])/float(squares[4])
    r2 = 1-rss/tss
    print("Training set:")
Developer: tavisbarr, Project: BeijingTomorrow, Lines of code: 33, Source: pollutionrflearner.py

Example 12: LabeledPoint

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
                            .join( avgTemperature ) \
                            .map( lambda row: [ item for sublist in row for item in sublist ] ) \
                            .map( lambda row: LabeledPoint( row[ 2 ][ 1 ], [ row[ 2 ][ 0 ], row[ 1 ], row[ 3 ] ] ) ) \
                            .cache( );
 
 crimeCounts.unpersist( );
 
 # Split the crime counts into training and test datasets
 ( training, test ) = joinedData.randomSplit( ( 0.7, 0.3 ) );
 
 # Categorical features dictionary
 featuresInfo = { 0: len( beatsDict ), 1: 53 };
 
 # Train a Random Forest model to predict crimes
 model = RandomForest.trainRegressor( training, categoricalFeaturesInfo = featuresInfo,
                                      numTrees = 5, featureSubsetStrategy = "auto",
                                      impurity = 'variance', maxDepth = 10, maxBins = len( beatsDict ) );
 
 # Measure the model performance on test dataset
 predictions = model.predict( test.map( lambda x: x.features ) ) \
                    .cache( );
 
 meanCrimes = test.map( lambda x: x.label ).mean( );
 labelsAndPredictions = test.map( lambda x:  x.label ).zip( predictions );
 testMSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( ) / float( test.count( ) );
 testSSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( );
 testSST = labelsAndPredictions.map( lambda ( v, p ): ( v - meanCrimes ) * ( v - meanCrimes ) ).sum( );
 
 Rsq = 1 - testSSE / testSST;
 
 #### Predicting crimes for next week ####
Developer: apurvaa7, Project: Big-Data, Lines of code: 33, Source: Problem7.py

Example 13: rf

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def rf(userID, n):
    
    ### CREATING GAME PROFILE DF ####
    game_profiles = get_game_profiles()
    df = pd.DataFrame(game_profiles)
    df_clean = preprocess(df)

    # Full df for games only, no playtimes (for prediction later)
    df_games = df_clean.drop('genres', 1)
    #df_games = df_games.drop('name', 1) 
    df_games = df_games.drop('appID', 1)
    df_games = df_games.drop('cat', 1)
    df_games = df_games.drop('tags', 1)
    df_games = df_games.drop('type', 1)


    games = get_games('/media/sf_AdvancedML/Final/gameData.txt')
    missing = set()

    ### CROSS VALIDATING ###    
    all_accur, avg_accur = cross_validate(df_clean, games, 10)
    print "Accuracies, Average Accuracy"
    print all_accur, avg_accur

    ### TRAIN ON INCOMING USER ###
    ownedGames = build_user_dataset.get_ownedGames(userID) #json object
    with open('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt', 'w') as outFile:
        if len(ownedGames) == 0:
            print "This user's library is empty or unreachable."
            return
        json.dump({'user': userID, 'ownedGames':ownedGames}, outFile)

    # initialize empty frame with appropriate columns
    df = pd.DataFrame(columns = list(df_clean.columns.values)+['playtime'])

    # Randomly select user's library
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt')
    user = random.choice(gamesOwned.values())
    gamesList = gamesOwned[gamesOwned.keys()[0]].keys()

    # Connect playtime to game df for games owned
    if len(user.values()) > 0:
        #print user.values()[0]
        for k, v in user.values()[0].iteritems():
            if k in games:
                row = df_clean.loc[df_clean['name'] == k]
                row['playtime'] = np.log(v)
                df = df.append(row)
            else:
                missing.add(k)

    df = df.drop('genres', 1)
    df = df.drop('name', 1)
    df = df.drop('appID', 1)
    df = df.drop('cat', 1)
    df = df.drop('tags', 1)
    df = df.drop('type', 1)

    # Pass User DF to Spark
    df.to_csv('/media/sf_AdvancedML/Final/RF.csv')

    data = sc.textFile('/media/sf_AdvancedML/Final/RF.csv')
    header = data.first()
    data = data.filter(lambda x: x != header)
    data = data.map(lambda line: convertUni(line))
    data = data.map(lambda line: line.split(','))

    # RDD of (label, features) pairs
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))

    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo = {},
                                        numTrees = 3, featureSubsetStrategy = "auto",
                                        impurity = 'variance', maxDepth = 4)

    ### PREDICT ###
    # for every game in Steam library #
    df_games.to_csv('/media/sf_AdvancedML/Final/RF_games_names.csv')
    df_games.drop('name', 1).to_csv('/media/sf_AdvancedML/Final/RF_games.csv')

    data_games = sc.textFile('/media/sf_AdvancedML/Final/RF_games.csv')
    header = data_games.first()
    data_games = data_games.filter(lambda x: x != header)
    data_games = data_games.map(lambda line: convertUni(line))
    data_games = data_games.map(lambda line: line.split(','))

    data_test = sc.textFile('/media/sf_AdvancedML/Final/RF_games_names.csv')
    header2 = data_test.first()
    data_test = data_test.filter(lambda x: x != header2)
    data_test = data_test.map(lambda line: convertUni(line))
    data_test = data_test.map(lambda line: line.split(','))
    
    predictions = model.predict(data_games)
    idPredictions = data_test.map(lambda x: x[6]).zip(predictions)

    # Filter predictions for games owned or trailers/apps
    idPredictions = idPredictions.filter(lambda x: x[0] not in gamesList)

    # Export predictions to pandas df
    predDF = idPredictions.toDF()
    predDF = predDF.toPandas()  # Name, Prediction
#......... part of this code has been omitted here .........
Developer: USF-ML2, Project: Steamed_Up, Lines of code: 103, Source: game_contentBased_RF.py

Example 14: cross_validate

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def cross_validate(df_clean, games, n):
    """
    :param k n: number of users for CV
    :return: list of accuracies for each of n users, avg acc
    """
    missing = set()

    ### COLLECTING LIBRARIES ###
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData.txt')
    print "Done collecting ownedGames."

    ### VALIDATING ###
    all_accur = {'model1': [], 'model2': [], 'model3': [], 'model4': []}

    for i in range(n):
        # initialize empty frame with appropriate columns
        df = pd.DataFrame(columns = list(df_clean.columns.values)+['playtime'])

        # Randomly select user's library
        user = random.choice(gamesOwned.values())

        # Connect playtime to game df for games owned
        if len(user.values()) > 0:
            #print user.values()[0]
            for k, v in user.values()[0].iteritems():
                if k in games:
                    row = df_clean.loc[df_clean['name'] == k]
                    row['playtime'] = np.log(v)
                    df = df.append(row)
                else:
                    missing.add(k)

        df = df.drop('genres', 1)
        df = df.drop('name', 1)
        df = df.drop('appID', 1)
        df = df.drop('cat', 1)
        df = df.drop('tags', 1)
        df = df.drop('type', 1)

        # Pass User DF to Spark
        df.to_csv('/media/sf_AdvancedML/Final/RF_train.csv')

        data = sc.textFile('/media/sf_AdvancedML/Final/RF_train.csv')
        header = data.first()
        data = data.filter(lambda x: x != header)
        data = data.map(lambda line: convertUni(line))
        data = data.map(lambda line: line.split(','))

        # RDD of (label, features) pairs
        data = data.map(lambda line: LabeledPoint(line[-1], line[:len(line)]))

        # Split into training, test
        (trainingData, testData) = data.randomSplit([0.8, 0.2])

        try:
            # Training model
            model1 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 70, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model2 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 100, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model3 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 120, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 4)
            model4 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo = {},
                                                numTrees = 100, featureSubsetStrategy = "auto",
                                                impurity = 'variance', maxDepth = 6)

            models = [model1, model2, model3, model4]
            modelNames = ['model1', 'model2', 'model3', 'model4']
            for i in range(len(models)):
                m = models[i]
                name = modelNames[i]
                # Evaluate on test data, compute error
                predictions = m.predict(testData.map(lambda x: x.features))
                labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
                testMSE = labelsAndPredictions.map(lambda (v, p) : (v-p)*(v-p)).sum() /\
                          float(testData.count())

                all_accur[name] += [testMSE]

        except:
            pass

    avgDict = {}
    for k,v in all_accur.iteritems():
        avgDict[k] = np.mean(v)
    return all_accur, avgDict
Developer: USF-ML2, Project: Steamed_Up, Lines of code: 91, Source: game_contentBased_RF.py

Example 15: toAge

# Module to import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
filtered_car_data = car_data.map(
    lambda d: [toInteger(d["prc"]), toAge(d["fr"]), toFuel(d["fl"]), toInteger(d["ma"]), d["pk"], d["po"], d["ei"]]
)
filtered_car_data.first()

labeled_car_data = filtered_car_data.map(lambda row: LabeledPoint(row[0], row[1:]))
labeled_car_data.first()
labeled_car_data.collect()


"""
	(3) Run the Random Forest.

"""
model = RandomForest.trainRegressor(
    labeled_car_data, numTrees=750, categoricalFeaturesInfo={}, impurity="variance", maxDepth=5, maxBins=32
)

predictions = model.predict(labeled_car_data.map(lambda x: x.features))
labelsAndPredictions = labeled_car_data.map(lambda lp: [lp.label, lp.features]).zip(predictions)
labelsAndPredictions.first()

model_error = labelsAndPredictions.map(lambda row: (row[1] - row[0][0], row))


"""
	(4) Get the extremes!

		Best & Worst deal.

Developer: willemhendriks, Project: buyausedcar, Lines of code: 31, Source: car_demo_v11.py


Note: the pyspark.mllib.tree.RandomForest.trainRegressor examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Refer to each project's license before distributing or reusing the code; do not reproduce without permission.