This article collects typical usage examples of the Python method pyspark.mllib.tree.RandomForest.trainRegressor. If you have been wondering what RandomForest.trainRegressor does, how to use it, or where to find example code for it, the curated examples below should help. You can also read further about the containing class, pyspark.mllib.tree.RandomForest.
Below are 15 code examples of RandomForest.trainRegressor, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
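Before the individual examples, here is a minimal, self-contained sketch of the basic trainRegressor workflow (train on an RDD of LabeledPoints, predict on a feature RDD, score with MSE). It assumes a local Spark installation with pyspark.mllib available; the toy dataset, app name, and parameter values are illustrative only and are not taken from any example below.

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

sc = SparkContext(appName="trainRegressorSketch")  # hypothetical app name

# Toy regression data: the label is exactly twice the single feature.
data = sc.parallelize([LabeledPoint(2.0 * x, [x]) for x in range(100)])
train, test = data.randomSplit([0.8, 0.2], seed=42)

model = RandomForest.trainRegressor(
    train,
    categoricalFeaturesInfo={},   # empty dict: all features are continuous
    numTrees=10,
    featureSubsetStrategy="auto",
    impurity="variance",          # the only impurity supported for regression
    maxDepth=4,
    maxBins=32,
)

# predict() accepts an RDD of feature vectors; zip it back with the labels.
predictions = model.predict(test.map(lambda lp: lp.features))
labelsAndPreds = test.map(lambda lp: lp.label).zip(predictions)
mse = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
print("Test MSE = %s" % mse)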
Example 1: main
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
import sys
import math

from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import RandomForest


def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]
    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()
    '''sbaronia - get training and testing labeled points'''
    # to_labeledpoint is a helper defined elsewhere in the original script
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()
    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp, categoricalFeaturesInfo={}, numTrees=5,
                                           featureSubsetStrategy="auto", impurity='variance',
                                           maxDepth=4, maxBins=32)
    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2)
                     .reduce(lambda x, y: x + y) / float(test_lp.count()))
    print("RMSE = " + str(rmse))
Example 2: test_regression
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    from numpy import array  # used for the initialWeights checks below
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)
    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)
    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
Example 3: trainRandomForestModel
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={}, numTrees=2000,
                                        featureSubsetStrategy="auto", impurity="variance",
                                        maxDepth=4, maxBins=32)
    return model
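A model returned by this helper can be persisted and reloaded later without retraining; a short sketch, assuming an active SparkContext `sc` and a writable storage location (the path below is hypothetical):

from pyspark.mllib.tree import RandomForestModel

model = trainRandomForestModel(data)
model.save(sc, "hdfs:///models/rf_regressor")  # hypothetical path

# Reload in a later job without retraining.
sameModel = RandomForestModel.load(sc, "hdfs:///models/rf_regressor")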
Example 4: getRandomForestRMSE
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def getRandomForestRMSE(trees_array):
    valRMSE_list = []
    for trees in trees_array:
        model = RandomForest.trainRegressor(train_featureScoreTimeRDD, categoricalFeaturesInfo={},
                                            numTrees=trees, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32)
        predictions = model.predict(val_featureScoreTimeRDD.map(lambda lp: lp.features))
        labelsAndPreds = val_featureScoreTimeRDD.map(lambda lp: lp.label).zip(predictions)
        valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(val_featureScoreTimeRDD.count())
        valRMSE = valMSE ** 0.5
        valRMSE_list.append((trees, valRMSE))
    return valRMSE_list
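This function reads the training and validation RDDs (train_featureScoreTimeRDD, val_featureScoreTimeRDD) from the enclosing scope, so they must already exist in the driver. A hypothetical sweep over forest sizes might look like:

# Hypothetical invocation: compare validation RMSE across forest sizes.
for numTrees, rmse in getRandomForestRMSE([10, 30, 100, 300]):
    print("numTrees=%d -> validation RMSE=%.4f" % (numTrees, rmse))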
Example 5: test_regression
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)
    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)
    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
Example 6: testRegression
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def testRegression(trainingData, testData):
    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
        .sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
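A sketch of driving this helper end to end with a LibSVM-format dataset, in the style of the standard Spark examples (the file path is illustrative, and an active SparkContext `sc` is assumed):

from pyspark.mllib.util import MLUtils

# Load and split a LibSVM-format dataset (path is illustrative).
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
testRegression(trainingData, testData)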
Example 7: train_amount_model
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def train_amount_model(self, model, data, i):
    rdd_data = self.sc.parallelize(data)
    self.logger.info('Start to train the amount model')
    if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
        input_num = self.feature_num
        layers = [input_num, input_num // 3 * 2, input_num // 3, 1]  # integer layer sizes
        neural_network = NeuralNetworkSpark(layers=layers, bias=0)
        model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                     iteration=15, model=model)
    elif self.amount_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                            featureSubsetStrategy="auto", impurity='variance',
                                            maxDepth=20, maxBins=32)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                              initialWeights=model.weights if model is not None else None)
    else:
        self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
        raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
    return model
Example 8: run
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
# ......... some code omitted here .........
    plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    # Split data into training and apply samples
    # training data is 2 parts, as well as prepare application data
    # i.) In both the region, and in the time window
    # ii.) In the region, but outside the time window
    # iii.) Out of region, data to apply model to
    t1 = time.time()
    sqlContext.registerFunction("inRegionOfInterest", lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("inEventOfInterest", lambda lat, lon, date: fspLib.inEOI(lat, lon, date, bc_lTargetPolygons), returnType=BooleanType())
    sqlContext.registerFunction("outOfEventOfInterest", lambda lat, lon, dt: fspLib.outEOI(lat, lon, dt, bc_lTargetPolygons), returnType=BooleanType())
    df1 = sqlContext.sql("SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)").cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql("SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    #df1_outTime = sqlContext.sql("SELECT * from df1 WHERE outOfEventOfInterest(df1.lat,df1.lon,df1.dt)").cache()
    dfn1 = sqlContext.sql("SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)")
    df1_inTime.registerTempTable("df1_inTime")
    #df1_outTime.registerTempTable("df1_outTime")
    #nL1T1 = df1_inTime.count()
    #nL1T0 = df1_outTime.count()
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    t2 = time.time()
    #print(nL1T1, "events in region in time,", nL1T0, "events in region out of time")
    diff = t2 - t1
    print("Time to partition by time", diff)
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    # Create training vectors from in region data
    t1 = time.time()
    groupedIn = df1_inTime.map(lambda x: (x.key, [LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    #groupedOut = df1_outTime.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    scaleFactor = 10. * nSignal / nBack
    (mlApply, groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(lambda x: aggregatedComparison.removeStopWords(x, lStop))
        mlTrain.cache()
    nTotTrain = mlTrain.count()
    t2 = time.time()
    print(nTotTrain, "entries for training")
    diff = t2 - t1
    print("Time to get data ready for model by time", diff)
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]), categoricalFeaturesInfo={},
                                             numTrees=2000, featureSubsetStrategy="auto",
                                             impurity="variance", maxDepth=4, maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print("Time to train model", diff)
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    # Apply Model to out of region data
    t1 = time.time()
    predictions_Tree = model_Tree.predict(mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    #print("Number of points to score:", nApply)
    diff = t2 - t1
    print("Time aggregate and label points: ", diff)
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    # Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm, vecAndPredictions, sNum, revLookup, writeFileOutput, exempDict)
    t2 = time.time()
    diff = t2 - t1
    print("Time to create json objects for output: ", diff)
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)
    diff = time.time() - t0
    print("<----------BOOM GOES THE DYNOMITE!---------->")
    print("< total number of tweets:,", nGoodTweets)
    print("< total process Time:", diff)
    print("< total idf vector length:", nVecLen)
    print("<------------------------------------------->")
    return resultSet
Example 9: dict
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
# Dictionary mapping each beat to an index. Useful when converting to LabeledPoint. Otherwise converts to numeric.
beatsDict = dict(beatList.zipWithIndex().map(lambda x: (x[0],x[1])).collect())
# Data points as LabeledPoints
# (crime count, [beat, week])
predArrayLP = joinedData.map(lambda x: LabeledPoint(x[0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]]))
# Split into training and testing set. 70-30 split.
(train, test) = predArrayLP.randomSplit([0.7, 0.3])
# Feature categories :
featuresCat = {0: len(beatsDict), 1: 53}
maxBins = max(len(beatsDict),len(weekDict))
model = RandomForest.trainRegressor(train, categoricalFeaturesInfo=featuresCat,
numTrees=10, featureSubsetStrategy="auto",
impurity='variance', maxDepth=5, maxBins=maxBins)
# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
#rschoolCountBeats = schoolCount.map(lambda x: x[0])
predOutput = predictions.collect()
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(test.count())
print('Test Mean Squared Error = ' + str(testMSE))
### Write output to file ###
with open("predictions.txt", 'wb') as f:
writer = csv.writer(f)
writer.writerows(predOutput)
Example 10: StandardScaler
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
features=rdd.map(lambda t: (t[0],t[1],t[2],t[5],t[6],t[9],t[10],t[11],t[12],t[15],t[16]))
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
#select value we want to predict
#lab = rdd.map(lambda row: row[8])#time
lab = rdd.map(lambda row: row[7])#fare
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))
#split into training and testing datasets
trainingData, testingData = transformedData.randomSplit([0.9,0.1],seed=1234)
#do the training and get predictions
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},impurity='variance',numTrees=25, seed=42, maxDepth=8)
predictions = model.predict(testingData.map(lambda x: x.features))
valuesAndPreds = testingData.map(lambda lp: lp.label).zip(predictions)
results = valuesAndPreds.toDF().toPandas()
results.columns = ['truth', 'pred']
results = results[results['truth'] > 0]
truth = np.array(results["truth"].tolist())
pred = np.array(results["pred"].tolist())
diff_fare = 100*(truth - pred)/truth
print('mean = ' + str(diff_fare.mean()))
#R-squared
metrics = RegressionMetrics(valuesAndPreds)
print("R-squared = %s" % metrics.r2)
Example 11: SparkConf
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local")
sc=SparkContext(conf=sparkConf)
sqlContext = SQLContext(sc)
(visual_training_image_array , visual_training_outcome_array ) = loadVisualTrainingDataToArray()
#We have to turn it into a list of observations
visual_training_data = []
for i in range(0, len(visual_training_outcome_array)):
    visual_training_data.append((visual_training_outcome_array[i], visual_training_image_array[i]))
visual_training_rdd = sc.parallelize(visual_training_data)
visual_data_flattened = visual_training_rdd.map(lambda x : ( x[0] , averageBrightness4By4(x[1])) )
visual_data_labeled_points = visual_data_flattened.map(lambda x : varsToLabeledPoint(x))
toprint=visual_data_labeled_points.take(1)
print(str(toprint))
visual_model = RandomForest.trainRegressor(visual_data_labeled_points, categoricalFeaturesInfo={},
numTrees=1000, featureSubsetStrategy="auto",
impurity='variance', maxDepth=5, maxBins=100)
#visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True)
visual_training_vectors = visual_data_flattened.map(lambda x : featuresToVectors(x[1]))
toprint = visual_training_vectors.take(1)
print(str(toprint))
visual_in_sample_predictions = visual_model.predict(visual_training_vectors)
visual_in_sample_labels_and_predictions = visual_data_labeled_points.map(lambda lp: lp.label).zip(visual_in_sample_predictions)
visual_in_sample_labels_and_predictions.foreach(printline)
squaresdf = visual_in_sample_labels_and_predictions.map(lambda p : (p[0] , p[0]*p[0] , p[0] - p[1] , (p[0] - p[1])*(p[0] - p[1]) , 1 ) )
squares = squaresdf.reduce(lambda a , b : (a[0]+b[0] , a[1]+b[1] , a[2]+b[2] , a[3]+b[3] , a[4]+b[4] ) )
tss = float(squares[1]) - float(squares[0]*squares[0])/float(squares[4])
rss = float(squares[3]) - float(squares[2]*squares[2])/float(squares[4])
r2 = 1-rss/tss
print("Training set:")
Example 12: LabeledPoint
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
.join( avgTemperature ) \
    .map( lambda row: [ item for sublist in row for item in sublist ] ) \
    .map( lambda row: LabeledPoint( row[ 2 ][ 1 ], [ row[ 2 ][ 0 ], row[ 1 ], row[ 3 ] ] ) ) \
    .cache( );
crimeCounts.unpersist( );
# Split the crime counts into training and test datasets
( training, test ) = joinedData.randomSplit( ( 0.7, 0.3 ) );
# Categorical features dictionary
featuresInfo = { 0: len( beatsDict ), 1: 53 };
# Train a Random Forest model to predict crimes
model = RandomForest.trainRegressor( training, categoricalFeaturesInfo = featuresInfo,
numTrees = 5, featureSubsetStrategy = "auto",
impurity = 'variance', maxDepth = 10, maxBins = len( beatsDict ) );
# Measure the model performance on test dataset
predictions = model.predict( test.map( lambda x: x.features ) ) \
.cache( );
meanCrimes = test.map( lambda x: x.label ).mean( );
labelsAndPredictions = test.map( lambda x: x.label ).zip( predictions );
testMSE = labelsAndPredictions.map( lambda vp: ( vp[0] - vp[1] ) * ( vp[0] - vp[1] ) ).sum( ) / float( test.count( ) );
testSSE = labelsAndPredictions.map( lambda vp: ( vp[0] - vp[1] ) * ( vp[0] - vp[1] ) ).sum( );
testSST = labelsAndPredictions.map( lambda vp: ( vp[0] - meanCrimes ) * ( vp[0] - meanCrimes ) ).sum( );
Rsq = 1 - testSSE / testSST;
#### Predicting crimes for next week ####
Example 13: rf
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def rf(userID, n):
    ### CREATING GAME PROFILE DF ####
    game_profiles = get_game_profiles()
    df = pd.DataFrame(game_profiles)
    df_clean = preprocess(df)
    # Full df for games only, no playtimes (for prediction later)
    df_games = df_clean.drop('genres', 1)
    #df_games = df_games.drop('name', 1)
    df_games = df_games.drop('appID', 1)
    df_games = df_games.drop('cat', 1)
    df_games = df_games.drop('tags', 1)
    df_games = df_games.drop('type', 1)
    games = get_games('/media/sf_AdvancedML/Final/gameData.txt')
    missing = set()
    ### CROSS VALIDATING ###
    all_accur, avg_accur = cross_validate(df_clean, games, 10)
    print("Accuracies, Average Accuracy")
    print(all_accur, avg_accur)
    ### TRAIN ON INCOMING USER ###
    ownedGames = build_user_dataset.get_ownedGames(userID)  # json object
    with open('/media/sf_AdvancedML/Final/userData' + str(userID) + '.txt', 'w') as outFile:
        if len(ownedGames) == 0:
            print("This user's library is empty or unreachable.")
            return
        json.dump({'user': userID, 'ownedGames': ownedGames}, outFile)
    # initialize empty frame with appropriate columns
    df = pd.DataFrame(columns=list(df_clean.columns.values) + ['playtime'])
    # Randomly select user's library
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData' + str(userID) + '.txt')
    user = random.choice(list(gamesOwned.values()))
    gamesList = list(gamesOwned[list(gamesOwned.keys())[0]].keys())
    # Connect playtime to game df for games owned
    if len(user.values()) > 0:
        #print(list(user.values())[0])
        for k, v in list(user.values())[0].items():
            if k in games:
                row = df_clean.loc[df_clean['name'] == k]
                row['playtime'] = np.log(v)
                df = df.append(row)
            else:
                missing.add(k)
    df = df.drop('genres', 1)
    df = df.drop('name', 1)
    df = df.drop('appID', 1)
    df = df.drop('cat', 1)
    df = df.drop('tags', 1)
    df = df.drop('type', 1)
    # Pass User DF to Spark
    df.to_csv('/media/sf_AdvancedML/Final/RF.csv')
    data = sc.textFile('/media/sf_AdvancedML/Final/RF.csv')
    header = data.first()
    data = data.filter(lambda x: x != header)
    data = data.map(lambda line: convertUni(line))
    data = data.map(lambda line: line.split(','))
    # RDD of (label, features) pairs
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4)
    ### PREDICT ###
    # for every game in Steam library #
    df_games.to_csv('/media/sf_AdvancedML/Final/RF_games_names.csv')
    df_games.drop('name', 1).to_csv('/media/sf_AdvancedML/Final/RF_games.csv')
    data_games = sc.textFile('/media/sf_AdvancedML/Final/RF_games.csv')
    header = data_games.first()
    data_games = data_games.filter(lambda x: x != header)
    data_games = data_games.map(lambda line: convertUni(line))
    data_games = data_games.map(lambda line: line.split(','))
    data_test = sc.textFile('/media/sf_AdvancedML/Final/RF_games_names.csv')
    header2 = data_test.first()
    data_test = data_test.filter(lambda x: x != header2)
    data_test = data_test.map(lambda line: convertUni(line))
    data_test = data_test.map(lambda line: line.split(','))
    predictions = model.predict(data_games)
    idPredictions = data_test.map(lambda x: x[6]).zip(predictions)
    # Filter predictions for games owned or trailers/apps
    idPredictions = idPredictions.filter(lambda x: x[0] not in gamesList)
    # Export predictions to pandas df
    predDF = idPredictions.toDF()
    predDF = predDF.toPandas()  # Name, Prediction
    # ......... some code omitted here .........
Example 14: cross_validate
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
def cross_validate(df_clean, games, n):
    """
    :param n: number of users for CV
    :return: list of accuracies for each of n users, avg acc
    """
    missing = set()
    ### COLLECTING LIBRARIES ###
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData.txt')
    print("Done collecting ownedGames.")
    ### VALIDATING ###
    all_accur = {'model1': [], 'model2': [], 'model3': [], 'model4': []}
    for i in range(n):
        # initialize empty frame with appropriate columns
        df = pd.DataFrame(columns=list(df_clean.columns.values) + ['playtime'])
        # Randomly select user's library
        user = random.choice(list(gamesOwned.values()))
        # Connect playtime to game df for games owned
        if len(user.values()) > 0:
            #print(list(user.values())[0])
            for k, v in list(user.values())[0].items():
                if k in games:
                    row = df_clean.loc[df_clean['name'] == k]
                    row['playtime'] = np.log(v)
                    df = df.append(row)
                else:
                    missing.add(k)
        df = df.drop('genres', 1)
        df = df.drop('name', 1)
        df = df.drop('appID', 1)
        df = df.drop('cat', 1)
        df = df.drop('tags', 1)
        df = df.drop('type', 1)
        # Pass User DF to Spark
        df.to_csv('/media/sf_AdvancedML/Final/RF_train.csv')
        data = sc.textFile('/media/sf_AdvancedML/Final/RF_train.csv')
        header = data.first()
        data = data.filter(lambda x: x != header)
        data = data.map(lambda line: convertUni(line))
        data = data.map(lambda line: line.split(','))
        # RDD of (label, features) pairs; the label is the last column
        data = data.map(lambda line: LabeledPoint(line[-1], line[:-1]))
        # Split into training, test
        (trainingData, testData) = data.randomSplit([0.8, 0.2])
        try:
            # Train four candidate models
            model1 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                 numTrees=70, featureSubsetStrategy="auto",
                                                 impurity='variance', maxDepth=4)
            model2 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                 numTrees=100, featureSubsetStrategy="auto",
                                                 impurity='variance', maxDepth=4)
            model3 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                 numTrees=120, featureSubsetStrategy="auto",
                                                 impurity='variance', maxDepth=4)
            model4 = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                 numTrees=100, featureSubsetStrategy="auto",
                                                 impurity='variance', maxDepth=6)
            models = [model1, model2, model3, model4]
            modelNames = ['model1', 'model2', 'model3', 'model4']
            for j in range(len(models)):
                m = models[j]
                name = modelNames[j]
                # Evaluate on test data, compute error
                predictions = m.predict(testData.map(lambda x: x.features))
                labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
                testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / \
                    float(testData.count())
                all_accur[name] += [testMSE]
        except Exception:
            pass
    avgDict = {}
    for k, v in all_accur.items():
        avgDict[k] = np.mean(v)
    return all_accur, avgDict
Example 15: toAge
# Required import: from pyspark.mllib.tree import RandomForest [as alias]
# Or: from pyspark.mllib.tree.RandomForest import trainRegressor [as alias]
filtered_car_data = car_data.map(
    lambda d: [toInteger(d["prc"]), toAge(d["fr"]), toFuel(d["fl"]), toInteger(d["ma"]), d["pk"], d["po"], d["ei"]]
)
filtered_car_data.first()
labeled_car_data = filtered_car_data.map(lambda row: LabeledPoint(row[0], row[1:]))
labeled_car_data.first()
labeled_car_data.collect()
"""
(3) Run the Random Forest.
"""
model = RandomForest.trainRegressor(
    labeled_car_data, numTrees=750, categoricalFeaturesInfo={}, impurity="variance", maxDepth=5, maxBins=32
)
predictions = model.predict(labeled_car_data.map(lambda x: x.features))
labelsAndPredictions = labeled_car_data.map(lambda lp: [lp.label, lp.features]).zip(predictions)
labelsAndPredictions.first()
model_error = labelsAndPredictions.map(lambda row: (row[1] - row[0][0], row))
"""
(4) Get the extremes!
Best & Worst deal.