本文整理汇总了Python中pyspark.mllib.tree.RandomForest类的典型用法代码示例。如果您正苦于以下问题：Python RandomForest类的具体用法？Python RandomForest怎么用？Python RandomForest使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了RandomForest类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: testOnce
def testOnce():
    """Train a random forest on one random split and print its quality metrics.

    Reads the module-level ``data``, ``test_size``, ``num_trees``, ``strat``
    and ``max_depth`` values and calls the module-level ``probTest`` and
    ``save`` helpers. Label 0 is reported as "Galaxy" and label 1 as "Star".
    Nothing is returned; the metrics are printed.
    """
    def ratio(numerator, denominator):
        # An empty class (or an empty test split) must not abort the report.
        if not denominator:
            return float('nan')
        return numerator / float(denominator)

    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1 - test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=2,
        categoricalFeaturesInfo={},
        numTrees=num_trees,
        featureSubsetStrategy=strat,
        impurity='gini',
        maxDepth=max_depth,
        maxBins=32)

    # test the random forest
    labels = testData.map(lambda lp: lp.label)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = labels.zip(predictions)

    # A single Spark job yields the whole confusion matrix, keyed by
    # (label, prediction), instead of one filter/count job per cell.
    confusion = labelsAndPredictions.countByValue()
    total = sum(confusion.values())
    errors = sum(n for (v, p), n in confusion.items() if v != p)
    testErr = ratio(errors, total)
    Mg = float(confusion.get((0, 1), 0))  # label 0 predicted as 1
    Ng = float(confusion.get((0, 0), 0))  # label 0 predicted as 0
    Ms = float(confusion.get((1, 0), 0))  # label 1 predicted as 0
    Ns = float(confusion.get((1, 1), 0))  # label 1 predicted as 1

    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    # float() keeps this a true division even when both operands are ints
    # under Python 2.
    # NOTE(review): assumes probsAndScores[1] holds per-sample vote totals - confirm.
    probs = probsAndScores[1].map(lambda x: x / float(num_trees))
    labelsAndProbs = labels.zip(probs)
    save(labelsAndProbs, 'answers')

    print('Galaxy Purity = ' + str(ratio(Ng, Ng + Ms)))
    print('Galaxy Completeness = ' + str(ratio(Ng, Ng + Mg)))
    print('Star Purity = ' + str(ratio(Ns, Ns + Mg)))
    print('Star Completeness = ' + str(ratio(Ns, Ns + Ms)))
    print('Accuracy = ' + str(1 - testErr))
    print('Threshold method accuracy = ' + str(threshold_accuracy))
示例2: rfTest
def rfTest(sqlContext, dataset_rdd):
    """Train and score a 3-tree binary random forest on a stratified split.

    Rows whose element 1 is above 0.5 are treated as positive and rows below
    0.5 as negative; each group is sampled at a 0.8 fraction for training and
    the remainder is used for testing. Element 1 is the label and elements
    2 onwards are the features.

    :param sqlContext: unused; kept for interface compatibility
    :param dataset_rdd: RDD of indexable rows
    :return: tuple (trainset_nums, testset_nums, trainset_positive,
        testset_positive, positive, hitPositive, precision, recallPositive,
        accuracy, F_Value, model)
    """
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)

    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negative = dataset_negative.sample(False, 0.8)
    test_negative = dataset_negative.subtract(train_negative)

    trainset_rdd = train_positive.union(train_negative)
    testset_rdd = test_positive.union(test_negative)

    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()

    model = RandomForest.trainClassifier(trainset, 2, {}, 3)
    predictions = model.predict(testset.map(lambda x: x.features))
    predict = testset.map(lambda lp: lp.label).zip(predictions)

    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()

    # Every ratio falls back to 0.0 when its denominator is zero; the
    # original raised ZeroDivisionError, e.g. when no positive was predicted.
    recallPositive = hitPositive / float(testset_positive) if testset_positive else 0.0
    precision = hitPositive / float(positive) if positive else 0.0
    # testset_nums already holds testset.count(); no need for another job.
    accuracy = hitALL / float(testset_nums) if testset_nums else 0.0
    # Harmonic mean written as 2pr/(p+r) so p == 0 or r == 0 is well defined.
    pr_sum = precision + recallPositive
    F_Value = 2 * precision * recallPositive / pr_sum if pr_sum else 0.0

    return (trainset_nums, testset_nums, trainset_positive, testset_positive,
            positive, hitPositive, precision, recallPositive, accuracy,
            F_Value, model)
示例3: main
def main():
    """Load the CSV data, train a random forest classifier and test it.

    Uses the module-level ``load_data``, ``reformatData`` and
    ``testRandomForest`` helpers. The Spark context is stopped on exit,
    including when an error is raised.
    """
    def drop_last_two(point):
        # Remove the final two entries of one data point.
        size = np.size(point)
        return np.delete(point, [size - 2, size - 1])

    sc = SparkContext(appName="MyApp")
    try:
        sc.setLogLevel('ERROR')

        # Parse data
        train_labels, train_data = load_data('train.csv')
        dummy_labels, test_data = load_data('test.csv', use_labels=False)

        # Truncate the last 2 features of the data.
        # The original loops only rebound the loop variable, so the truncated
        # arrays were thrown away; the results are now kept.
        # NOTE(review): train_data/test_data become lists of arrays - confirm
        # reformatData accepts that.
        train_data = [drop_last_two(point) for point in train_data]
        test_data = [drop_last_two(point) for point in test_data]

        # Map each data point's label to its features
        train_set = reformatData(train_data, train_labels)
        test_set = reformatData(test_data, dummy_labels)

        # Parallelize the data
        parallelized_train_set = sc.parallelize(train_set)
        parallelized_test_set = sc.parallelize(test_set)

        # Split the data (only the 1% part is used for training)
        trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

        # Train the models
        randomForestModel = RandomForest.trainClassifier(
            trainSet,
            numClasses=4,
            impurity='gini',
            categoricalFeaturesInfo={},
            numTrees=750,
            seed=42,
            maxDepth=30,
            maxBins=32)

        # Test the model
        testRandomForest(randomForestModel, parallelized_test_set)
    finally:
        sc.stop()
示例4: generateRandomForest
def generateRandomForest():
    """Train a random forest classifier and save it to ``RF_PATH``.

    Does nothing except print a notice when ``RF_PATH`` already exists.
    Otherwise it parses ``F_PATH`` with ``parseLine``, trains on a 90/10
    split, prints the test error and the model, passes the (label,
    prediction) pairs to ``modelStatistics`` and saves the model.
    Relies on the module-level ``sc``, ``classes``, ``F_PATH`` and ``RF_PATH``.
    """
    if os.path.exists(RF_PATH):
        print("RF_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)
    # Plain int seed: the former ``1L`` literal is a syntax error on Python 3.
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=len(classes),
        categoricalFeaturesInfo={},
        numTrees=4,
        featureSubsetStrategy="auto",
        impurity='gini',
        maxDepth=4,
        maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of unpacking it in the lambda signature, which
    # Python 3 no longer allows.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = mismatches / float(testData.count())
    # Single string argument so Python 2 does not print a tuple.
    print('Test Error ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    modelStatistics(labelsAndPredictions)

    # Save model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
示例5: main
def main():
    """Train a random forest regressor and print its RMSE on the test file.

    ``sys.argv[1]`` is the training input path and ``sys.argv[2]`` the test
    input path; every line is converted with the module-level
    ``to_labeledpoint``.
    """
    import re

    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    # Compare numerically: as strings, '1.10.0' sorts before '1.5.1'.
    version = tuple(int(part) for part in re.findall(r'\d+', sc.version)[:3])
    assert version >= (1, 5, 1)

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    # get training and testing labeled points
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    # run RandomForest regression on our training data with
    # default options except numTrees = 5
    rf_model = RandomForest.trainRegressor(
        train_lp,
        categoricalFeaturesInfo={},
        numTrees=5,
        featureSubsetStrategy="auto",
        impurity='variance',
        maxDepth=4,
        maxBins=32)

    # run predictions on testing data and calculate RMSE value
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of unpacking it in the lambda signature, which
    # Python 3 no longer allows.
    squared_error = labelsAndPredictions \
        .map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y)
    rmse = math.sqrt(squared_error / float(test_lp.count()))
    print("RMSE = " + str(rmse))
示例6: Random_Forest
def Random_Forest(filename, sc):
    """Train a 3-tree random forest classifier on a LibSVM file and print results.

    :param filename: path of the LibSVM data file; when empty or None the
        bundled sample path is used instead
    :param sc: SparkContext used to load the file
    """
    # The original overwrote ``filename`` unconditionally, so the argument was
    # ignored. The hard-coded path is now only a fallback.
    if not filename:
        filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=2,
        categoricalFeaturesInfo={},
        numTrees=3,
        featureSubsetStrategy="auto",
        impurity='gini',
        maxDepth=4,
        maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of unpacking it in the lambda signature, which
    # Python 3 no longer allows.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = mismatches / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
示例7: test_regression
def test_regression(self):
    """Check that each regression trainer recovers the sign of four toy points.

    Points 0 and 2 carry label -1.0 and must be predicted <= 0; points 1 and 3
    carry label 1.0 and must be predicted > 0. Finally, the SGD trainers must
    accept explicit initial weights without raising ValueError.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    features = [point.features.tolist() for point in points]

    def verify(model):
        # Even positions hold the negative samples, odd positions the positive.
        for position, vector in enumerate(features):
            predicted = model.predict(vector)
            if position % 2:
                self.assertTrue(predicted > 0)
            else:
                self.assertTrue(predicted <= 0)

    verify(LinearRegressionWithSGD.train(rdd, iterations=10))
    verify(LassoWithSGD.train(rdd, iterations=10))
    verify(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categorical = {0: 2}  # feature 0 has 2 categories
    verify(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical, maxBins=4))
    verify(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical, numTrees=10, maxBins=4, seed=1))
    verify(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical, numIterations=4))

    try:
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
示例8: trainRandomForestModel
def trainRandomForestModel(data):
    """
    Fit a 2000-tree random forest regressor on the given data
    :param data: RDD[LabeledPoint]
    :return: the trained random forest regression model
    """
    from pyspark.mllib.tree import RandomForest

    settings = dict(
        categoricalFeaturesInfo={},
        numTrees=2000,
        featureSubsetStrategy="auto",
        impurity="variance",
        maxDepth=4,
        maxBins=32,
    )
    return RandomForest.trainRegressor(data, **settings)
示例9: train_model
def train_model(cls, trianData, cateFeaInfo=None, trees=3, impurity="gini",
                depth=4):
    """
    Train a binary random forest classifier.

    :param trianData: training data passed to RandomForest.trainClassifier
        (the misspelled name is kept so existing keyword callers still work)
    :param cateFeaInfo: categoricalFeaturesInfo mapping; defaults to an empty dict
    :param trees: number of trees
    :param impurity: impurity criterion
    :param depth: maximum tree depth
    :return: the trained model
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if cateFeaInfo is None:
        cateFeaInfo = {}
    # The original body read the undefined name ``trainData`` and raised
    # NameError; use the actual parameter.
    model = RandomForest.trainClassifier(
        trianData,
        numClasses=2,
        categoricalFeaturesInfo=cateFeaInfo,
        numTrees=trees,
        featureSubsetStrategy="auto",
        impurity=impurity,
        maxDepth=depth,
        maxBins=32)
    return model
示例10: evaluate
def evaluate(self, trainingData, testData=None, metric=None):
    """Train a 10-tree binary random forest and print its test error.

    :param trainingData: data passed to RandomForest.trainClassifier
    :param testData: data to score; when None nothing is done, because the
        cross-validation branch is not implemented
    :param metric: unused
    :return: None
    """
    if testData is None:
        # cross validation: not implemented
        return

    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=2,
        categoricalFeaturesInfo={},
        numTrees=10,
        featureSubsetStrategy="auto",
        impurity='gini',
        maxDepth=4,
        maxBins=32)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of unpacking it in the lambda signature, which
    # Python 3 no longer allows.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = mismatches / float(testData.count())
    print('Test Error = ' + str(testErr))
示例11: trainModel
def trainModel(trainingData):
    """Train a 3-tree binary random forest classifier, logging start and end.

    :param trainingData: data passed to RandomForest.trainClassifier
    :return: the trained model
    """
    # print() with a single string behaves the same on Python 2 and 3; the
    # original print statements are a syntax error on Python 3.
    print("\nTrainning Random Forest model started!")
    Utils.logTime()
    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=2,
        categoricalFeaturesInfo={},
        numTrees=3,
        featureSubsetStrategy="auto",
        impurity='gini',
        maxDepth=5,
        maxBins=32)
    print('\nTraining Random Forest model finished')
    Utils.logTime()
    return model
示例12: getRandomForestRMSE
def getRandomForestRMSE(trees_array):
    """Return the validation RMSE for each requested forest size.

    Trains on the module-level ``train_featureScoreTimeRDD`` and scores on the
    module-level ``val_featureScoreTimeRDD``.

    :param trees_array: iterable of numTrees values to try
    :return: list of (trees, validation RMSE) tuples in input order
    """
    tree_counts = list(trees_array)
    if not tree_counts:
        return []

    # These do not depend on the forest size, so build them once instead of
    # once per iteration (the count alone is a Spark job).
    val_features = val_featureScoreTimeRDD.map(lambda lp: lp.features)
    val_labels = val_featureScoreTimeRDD.map(lambda lp: lp.label)
    val_count = float(val_featureScoreTimeRDD.count())

    valRMSE_list = []
    for trees in tree_counts:
        model = RandomForest.trainRegressor(
            train_featureScoreTimeRDD,
            categoricalFeaturesInfo={},
            numTrees=trees,
            featureSubsetStrategy="auto",
            impurity='variance',
            maxDepth=4,
            maxBins=32)
        predictions = model.predict(val_features)
        labelsAndPreds = val_labels.zip(predictions)
        # Index the pair instead of unpacking it in the lambda signature,
        # which Python 3 no longer allows.
        valMSE = labelsAndPreds.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / val_count
        valRMSE_list.append((trees, valMSE ** 0.5))
    return valRMSE_list
示例13: test_regression
def test_regression(self):
    """Check that each regression trainer recovers the sign of four toy points.

    Points 0 and 2 carry label -1.0 and must be predicted <= 0; points 1 and 3
    carry label 1.0 and must be predicted > 0. All trainers run with their
    default settings except the random forest, which uses 100 trees.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    features = [point.features.tolist() for point in points]

    def verify(model):
        # Even positions hold the negative samples, odd positions the positive.
        for position, vector in enumerate(features):
            predicted = model.predict(vector)
            if position % 2:
                self.assertTrue(predicted > 0)
            else:
                self.assertTrue(predicted <= 0)

    verify(LinearRegressionWithSGD.train(rdd))
    verify(LassoWithSGD.train(rdd))
    verify(RidgeRegressionWithSGD.train(rdd))

    categorical = {0: 2}  # feature 0 has 2 categories
    verify(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical))
    verify(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical, numTrees=100))
    verify(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical))
示例14: trainOptimalModel
def trainOptimalModel(trainingData, testData):
    """Grid-search random forest hyper-parameters and return the best model.

    Every combination of tree count, feature subset strategy, impurity,
    maximum depth and maximum bins is trained on ``trainingData`` and scored
    with ``Evaluation.evaluate(model, testData)``; the model with the lowest
    score is kept. If training or evaluation raises, the failing parameters
    are logged and the best model found so far is returned.

    :param trainingData: data passed to RandomForest.trainClassifier
    :param testData: data passed to Evaluation.evaluate
    :return: the best model found, or None if none was trained successfully
    """
    from itertools import product

    print("\nTraining optimal Random Forest model started!")
    Utils.logTime()

    numTreesVals = [3, 5, 8]
    featureSubsetStrategyVals = ['auto', 'all', 'sqrt', 'log2', 'onethird']
    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalNumTrees = None
    optimalFeatureSubsetStrategy = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    # Pre-bind so the error handler can always format them.
    curNumTree = curFeatureSubsetStrategy = curImpurity = None
    curMaxDepth = curMaxBins = None

    try:
        # product() iterates in the same order as the original nested loops.
        for (curNumTree, curFeatureSubsetStrategy, curImpurity,
             curMaxDepth, curMaxBins) in product(
                numTreesVals, featureSubsetStrategyVals, impurityVals,
                maxDepthVals, maxBinsVals):
            model = RandomForest.trainClassifier(
                trainingData,
                numClasses=2,
                categoricalFeaturesInfo={},
                numTrees=curNumTree,
                featureSubsetStrategy=curFeatureSubsetStrategy,
                impurity=curImpurity,
                maxDepth=curMaxDepth,
                maxBins=curMaxBins)
            testErr = Evaluation.evaluate(model, testData)
            # ``is None`` instead of ``not minError``: a perfect 0.0 error is
            # falsy and was overwritten by every later, worse model. It also
            # avoids comparing a number with None, which fails on Python 3.
            if minError is None or testErr < minError:
                minError = testErr
                optimalNumTrees = curNumTree
                optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
                optimalImpurity = curImpurity
                optimalMaxDepth = curMaxDepth
                optimalBinsVal = curMaxBins
                optimalModel = model
    except Exception:
        # Report the combination that was being trained, not the best so far,
        # and str() every value so a None cannot break the concatenation.
        msg = "\nException during model training with below parameters:"
        msg += "\tnum trees: " + str(curNumTree)
        msg += "\tfeature subset strategy: " + str(curFeatureSubsetStrategy)
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        # NOTE(review): the original called the undefined ``Utls``; assumed to
        # be a typo for ``Utils`` - confirm Utils.logMessage exists.
        Utils.logMessage(msg)

    # NOTE(review): source indentation was lost; this call is assumed to run
    # after the search on both the success and the failure path.
    logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy,
               optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
示例15: test_classification
def test_classification(self):
    """Check that each classifier separates four toy points by label.

    Points 0 and 2 carry label 0.0 and must be predicted <= 0; points 1 and 3
    carry label 1.0 and must be predicted > 0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(points)
    features = [point.features.tolist() for point in points]

    def verify(model):
        # Even positions hold the label-0 samples, odd positions the label-1.
        for position, vector in enumerate(features):
            predicted = model.predict(vector)
            if position % 2:
                self.assertTrue(predicted > 0)
            else:
                self.assertTrue(predicted <= 0)

    verify(LogisticRegressionWithSGD.train(rdd))
    verify(SVMWithSGD.train(rdd))
    verify(NaiveBayes.train(rdd))

    categorical = {0: 3}  # feature 0 has 3 categories
    verify(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categorical))
    verify(RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categorical, numTrees=100))
    verify(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categorical))