本文整理汇总了Python中pyspark.mllib.tree.DecisionTree.trainRegressor方法的典型用法代码示例。如果您正苦于以下问题:Python DecisionTree.trainRegressor方法的具体用法?Python DecisionTree.trainRegressor怎么用?Python DecisionTree.trainRegressor使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.tree.DecisionTree的用法示例。
在下文中一共展示了DecisionTree.trainRegressor方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: evaluate_dt
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def evaluate_dt(train,test,maxDepth,maxBins):
    """Train a regression tree on ``train`` and return its RMSLE on ``test``.

    Args:
        train: RDD passed straight to ``DecisionTree.trainRegressor``.
        test: RDD whose elements expose ``.features`` and ``.label``.
        maxDepth: forwarded as the tree's ``maxDepth``.
        maxBins: forwarded as the tree's ``maxBins``.

    Returns:
        Square root of the mean of ``squared_log_error(label, prediction)``
        over ``test``.
    """
    model = DecisionTree.trainRegressor(
        train, {}, impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    # Index into each (label, prediction) pair: tuple-parameter lambdas such as
    # ``lambda (t, p): ...`` are a syntax error on Python 3 (PEP 3113).
    squared_log_errors = actual.zip(preds).map(
        lambda tp: squared_log_error(tp[0], tp[1]))
    return np.sqrt(squared_log_errors.mean())
示例2: test_regression
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def test_regression(self):
    """Train every MLlib regressor on a four-point toy set and check prediction signs.

    Points 0 and 2 carry label -1.0 and points 1 and 3 carry label 1.0, so each
    model must predict ``<= 0`` for the former and ``> 0`` for the latter.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Same four assertions, in the same order, for every trained model.
        for index, point in enumerate(features):
            prediction = model.predict(point)
            if index % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    # Linear models: each one is trained, then checked, before the next is trained.
    check_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    check_signs(LassoWithSGD.train(rdd, iterations=10))
    check_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    # Tree models share one categorical-feature description.
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    # Passing explicit initial weights must not raise ValueError.
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
示例3: test_regression
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def test_regression(self):
    """Train every MLlib regressor with default settings and check prediction signs.

    Points 0 and 2 carry label -1.0 and points 1 and 3 carry label 1.0, so each
    model must predict ``<= 0`` for the former and ``> 0`` for the latter.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Same four assertions, in the same order, for every trained model.
        for index, point in enumerate(features):
            prediction = model.predict(point)
            if index % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    # Linear models: each one is trained, then checked, before the next is trained.
    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    # Tree models share one categorical-feature description.
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
示例4: regression
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def regression(sc, sample):
    """Fit a regression tree to ``sample`` and print its prediction for ``[8.2]``.

    Args:
        sc: object whose ``parallelize`` turns ``sample`` into an RDD
            (presumably a SparkContext).
        sample: iterable of indexable records; ``x[1]`` is used as the label
            and ``x[0]`` as the features of each ``LabeledPoint``.
    """
    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x: LabeledPoint(x[1], x[0]))
    testdata = [8.2]
    #####
    # Alternative kept from the original, disabled:
    # linear_model = LinearRegressionWithSGD.train(traindata,iterations=10)
    # prediction = linear_model.predict(testdata)
    # print(prediction)
    #####
    decision_model = DecisionTree.trainRegressor(traindata, {})
    prediction = decision_model.predict(testdata)
    # Parenthesised single-argument print gives the same output on Python 2 and 3.
    print(prediction)
示例5: test_regression
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def test_regression(self):
    """Train MLlib regressors on points built by ``self.scipy_matrix`` and check signs.

    Points 0 and 2 carry label -1.0 and points 1 and 3 carry label 1.0, so each
    model must predict ``<= 0`` for the former and ``> 0`` for the latter.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree

    # presumably scipy_matrix(2, {1: v}) builds a 2-element sparse vector with
    # value v at index 1 -- TODO confirm against the helper.
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def check_signs(model):
        # Same four assertions, in the same order, for every trained model.
        for index, point in enumerate(features):
            prediction = model.predict(point)
            if index % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    # Each model is trained, then checked, before the next one is trained.
    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
示例6: xrange
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# MAGIC %md DecisionTree performs best when it is told which features are categorical. We construct a map categoricalFeaturesInfo to pass this information to DecisionTree.
# MAGIC If DecisionTree is not given this info, then it will treat all features as continuous.

# COMMAND ----------

# Construct a map for categorical features:
#   categoricalFeaturesInfo[column index] = number of categories
categoricalFeaturesInfo = {}
# range() instead of xrange(): xrange does not exist on Python 3.
for j in range(numFeatures):
    col = featureCols[j]
    if col in categoryIndexes:
        categoricalFeaturesInfo[j] = len(categoryIndexes[col])

# COMMAND ----------

initialModel = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo)
# Bare expression: only displays the model when run as a notebook cell.
initialModel

# COMMAND ----------

# We can print the full model, but it can be hard to parse when the tree is large.
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(initialModel.toDebugString())

# COMMAND ----------

# MAGIC %md We now compute the error of the DecisionTreeModel on the training dataset. We use Root Mean Squared Error (RMSE) as our error metric.
# MAGIC
# MAGIC Denote (y_i, x_i) as the (label, feature vector) for instance i, and write model.predict(x_i) as our model's predicted label for instance i. RMSE is defined as:
# MAGIC
# MAGIC %[ RMSE(dataset) = \left[ \mathbf{avg}_{(y_i, x_i) \in dataset} \left( y_i - model.predict(x_i) \right)^2 \right]^{1/2} ]%
示例7: help
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
help(DecisionTree.trainRegressor)

# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("Linear Model predictions: " + str(true_vs_predicted.take(5)))

# In[10]:
# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
# Take the labels from the same RDD the predictions were computed on, so the
# two sides of zip() are guaranteed to line up element for element.
actual = data_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))

# ## Performance Metrics

# In[11]:
# set up performance metrics functions
def squared_error(actual, pred):
开发者ID:CRG-NLP,项目名称:Machine-Learning-with-Spark,代码行数:33,代码来源:Machine+Learning+with+Spark,+Chapter+6.py
示例8: squared_error
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# get 90% train and 10% test data
# Pairs are indexed instead of unpacked in the lambda signature: tuple-parameter
# lambdas such as ``lambda (k, v): ...`` are a syntax error on Python 3 (PEP 3113).
data_with_idx = data_dt.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)

# Drop the index again, keeping only the labeled points.
train_data = train.map(lambda idx_p: idx_p[1])
test_data = test.map(lambda idx_p: idx_p[1])
train_size = train_data.count()
test_size = test_data.count()
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d " % num_data)
print("Train + Test size : %d" % (train_size + test_size))

# make decision tree model
dt_model = DecisionTree.trainRegressor(train_data,{})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))
def squared_error(actual, pred):
    """Return the squared difference ``(pred - actual) ** 2``."""
    return (pred - actual)**2
def squared_log_error(pred, actual):
    """Return the squared difference of ``log(1 + pred)`` and ``log(1 + actual)``.

    The result is symmetric in its two arguments.
    """
    # log1p(x) computes log(1 + x) without losing precision when x is close to 0.
    return (np.log1p(pred) - np.log1p(actual))**2
示例9: LabeledPoint
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
#ArrDelay is our response
#ArrDelay becomes the 8tth column now, and total columns in the data = 12
label = clean_line_split[0]
nonLable = clean_line_split[1:]
return LabeledPoint (label, nonLable)
parsedData = raw_data.map(parsePoint)

#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()

#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Two fixes on this statement:
#  * the pair is indexed, because tuple-parameter lambdas are a syntax error on
#    Python 3 (PEP 3113);
#  * the denominator counts ``test`` -- the original referenced ``testData``,
#    which this script never defines.
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(test.count())

# Formatted explicitly so the line reads the same on Python 2 and 3; the
# original ``print (a), (b)`` form only worked as a Python 2 print statement.
print('%s %s' % ('Time consumed = ', datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

#save the model
model.save(sc, "DTR-Narrow-2008")
示例10: enumerate
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# In[22]:
# Formatted explicitly so the output is "<index> <feature>" on Python 2 and 3
# alike (``print i,x`` is a Python 2-only statement).
for i, x in enumerate(features):
    print("%s %s" % (i, x))

# In[23]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity="variance", maxDepth=6, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# The (label, prediction) pair is indexed: tuple-parameter lambdas are a syntax
# error on Python 3 (PEP 3113).
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# In[24]:
#
plt.xlabel("response")
plt.ylabel("prediction")
示例11: float
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
summary = Statistics.colStats(testvecData)
variance = summary.variance()[0]
# compute the pseudo R-square
test_Rsqr1 = 1 - testMSE1/float(variance)

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# use variance as impurity for regression
# maxDepth is the maximum number of levels for each tree
model2 = DecisionTree.trainRegressor(trainparsedData
                                     , categoricalFeaturesInfo={}
                                     , impurity='variance'
                                     , maxDepth=8
                                     , maxBins=32)

# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model2.predict(trainparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use map operation to compute MSE
# The (label, prediction) pair is indexed: tuple-parameter lambdas are a syntax
# error on Python 3 (PEP 3113).
trainMSE2 = trainlabelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(trainparsedData.count())

# use the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]
示例12: LabeledPoint
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# Build the decision-tree dataset: one LabeledPoint per record.
data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
# Inspect the first point. The bare expressions below have no effect in a
# script; they only display a value in an interactive session.
first_point_dt = data_dt.first()
first_point_dt.label
first_point_dt.features
len(first_point_dt.features)
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree
# Linear model trained on `data`; pair every label with the model's prediction.
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
true_vs_predicted.take(5)
# Regression tree trained on `data_dt`; the empty dict is the categorical-features map.
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data_dt.map(lambda p: p.label)
# Zip labels and predictions into (label, prediction) pairs.
true_vs_predicted_dt = actual.zip(preds)
true_vs_predicted_dt.take(5)
dt_model.depth()
dt_model.numNodes()
def squared_error(actual, pred):
    """Return the squared difference ``(pred - actual) ** 2``."""
    return (pred - actual) ** 2
def abs_error(actual, pred):
    """Return the absolute difference ``|pred - actual|``."""
    return np.abs(pred - actual)
def squared_log_error(actual, pred):
    """Return the squared difference of ``log(1 + pred)`` and ``log(1 + actual)``.

    The result is symmetric in its two arguments.
    """
    # log1p(x) computes log(1 + x) without losing precision when x is close to 0.
    return (np.log1p(pred) - np.log1p(actual)) ** 2
# Mean of squared_error over the (label, prediction) pairs in true_vs_predicted.
# As a bare expression the value is only displayed in an interactive session.
true_vs_predicted.map(lambda t: squared_error(t[0], t[1])).mean()
示例13: SparkContext
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold
# Kfold
if __name__ == "__main__":
    sc = SparkContext('local', appName="Prediction")
    import fileinput

    # Each tab-separated row yields two labeled points over the same features
    # (columns 3 onward): data_y1 carries a binary label (1.0 when column 2 is
    # non-zero), data_y2 carries column 2 itself as an integer label.
    data_y1, data_y2 = [], []
    for line in fileinput.input("data/feature_extracted_class3.txt"):
        fields = line.split("\t")  # split once per row instead of four times
        target = int(fields[2])
        features = [float(value) for value in fields[3:]]
        data_y1.append(LabeledPoint(float(1 if target != 0 else 0), features))
        data_y2.append(LabeledPoint(target, features))

    # NOTE: despite its name, ``mse`` collects absolute errors, so the second
    # number printed at the end is a mean absolute error.
    total, right, mse = 0, 0, []
    # range() instead of xrange(): xrange does not exist on Python 3.
    for t in range(10):
        # NOTE(review): the sample count 32*40 is hard-coded; presumably it
        # matches the number of rows in the input file -- confirm.
        kf = KFold(32*40, n_folds=10)
        for train, test in kf:
            data_train_y1 = [data_y1[i] for i in train]
            data_train_y2 = [data_y2[i] for i in train]
            clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100)
            clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2), categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100)
            for i in test:
                data_test_y1, data_test_y2 = data_y1[i], data_y2[i]
                r1 = clf1.predict(data_test_y1.features)
                r2 = clf2.predict(data_test_y2.features)
                if r1 == data_test_y1.label:
                    right += 1
                # NOTE(review): the source's indentation was lost; the error is
                # recorded here for every test point, not only for correctly
                # classified ones -- confirm against the original project.
                mse.append(abs(r2-data_test_y2.label))
                total += 1
    # Formatted explicitly so the output is "<accuracy> <mean error>" on
    # Python 2 and 3 alike.
    print("%s %s" % (float(right)/total, sum(mse)/len(mse)))
示例14: run_decision_tree
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def run_decision_tree(userid):
    """Train decision-tree models on the heart-disease CSVs and post a prediction.

    Reads the training and test CSV files, moves the diagnosis column (13) to
    the front, drops rows with missing values, writes the cleaned data back to
    disk, trains a classification tree and a regression tree on it, prints
    their test error, and finally posts the regression tree's prediction for
    the input data looked up from ``userid``.

    Args:
        userid: sequence supporting slicing; ``userid[-20:-2]`` is the key
            handed to ``get_input_data`` and ``post_prediction``.
    """
    conf = SparkConf().setMaster("local[1]").setAppName("heart-disease-prediction-descision-tree")
    sc = SparkContext(conf=conf)
    print("Running Spark Version %s" % (sc.version))
    # https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
    # NOTE(review): ``path`` has no trailing separator, so every ``path + name``
    # below resolves to ".../hdp_proj<name>" -- confirm whether a "/" is missing.
    path = "/home/raju/Documents/hdp_proj"
    heartdf_tr = pd.read_csv(path+"processed.cleveland.data.csv",header=None)
    heartdf_test = pd.read_csv(path+"testdata.csv",header=None)
    print("Original training Dataset (Rows:Colums): ")
    print(heartdf_tr.shape)
    # The original read the garbled attribute ``shaperead_csvread_csvread_csv``,
    # which raises AttributeError; ``shape`` mirrors the training-set line above.
    print(heartdf_test.shape)
    print("Categories of Diagnosis of heart disease (angiographic disease status) that we are predicting")
    print("-- Value 0: < 50% diameter narrowing")
    print("-- Value 1: > 50% diameter narrowing ")

    # ``.loc`` replaces ``.ix`` (removed in pandas 1.0). With header=None the
    # column labels are integers, so ``0:12`` is a label slice including 12.
    print(heartdf_tr.loc[:, 13].unique())  # Column containing the Diagnosis of heart disease
    print(heartdf_test.loc[:, 13].unique())  # Column containing the Diagnosis of heart disease

    # Put the label column first. ``reindex`` replaces the ``join_axes`` argument
    # of ``pd.concat`` (removed in pandas 1.0).
    newheartdf = pd.concat(
        [heartdf_tr.loc[:, 13], heartdf_tr.loc[:, 0:12]], axis=1).reindex(heartdf_tr.index)
    newheartdf_test = pd.concat(
        [heartdf_test.loc[:, 13], heartdf_test.loc[:, 0:12]], axis=1).reindex(heartdf_test.index)
    newheartdf.replace('?', np.nan, inplace=True)  # Replace ? values
    newheartdf_test.replace('?', np.nan, inplace=True)  # Replace ? values

    print("After dropping rows with anyone empty value (Rows:Columns): ")
    ndf2 = newheartdf.dropna()
    ndf_test = newheartdf_test.dropna()
    ndf2.to_csv(path+"new-heart-disease-cleaveland.txt",sep=",",index=False,header=None,na_rep=np.nan)
    ndf_test.to_csv(path+"new-heart-disease-cleaveland-test.txt",sep=",",index=False,header=None,na_rep=np.nan)
    print(ndf2.shape)
    print(ndf_test.shape)
    print(ndf2.loc[:5, :])
    print(ndf_test.loc[:5, :])

    print("Create a Labeled point which is a local vector, associated with a label/response")
    points = sc.textFile(path+'new-heart-disease-cleaveland.txt')
    points_test = sc.textFile(path+'new-heart-disease-cleaveland-test.txt')
    print("###############################Something")
    parsed_data = points.map(parsePoint)
    parsed_data_test = points_test.map(parsePoint)
    # Despite the message text, these two lines print the first five parsed rows.
    print('After parsing, number of training lines: %s' % parsed_data.take(5))  # parsed_data.count()
    print('After parsing, number of test data lines: %s' % parsed_data_test.take(5))  # parsed_data.count()

    #####Perform Classification using a Decision Tree#####
    # Weights [1, 0] keep the whole RDD in the first split: nothing is held out
    # here, the test set comes from the separate test file.
    (trainingData, trainingData1) = parsed_data.randomSplit([1,0])
    (testData , testData1) = parsed_data_test.randomSplit([1,0])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    print("+++++++++++++++++++++++++++++++++ Perform Classification using a Decision Tree +++++++++++++++++++++++++++++++++")
    model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=32)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # (label, prediction) pairs are indexed: tuple-parameter lambdas are a
    # syntax error on Python 3 (PEP 3113).
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('=================== Learned classification tree model ====================')
    print(model.toDebugString())

    print("+++++++++++++++++++++++++++++++++ Perform Regression using a Decision Tree +++++++++++++++++++++++++++++++++")
    model1 = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=4, maxBins=32)
    ####### Evaluate model on test instances and compute test error########
    predictions = model1.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('================== Learned regression tree model ====================')
    print(model1.toDebugString())

    # Predict for the requesting user with the regression tree and post the result.
    print(userid)
    input_data = get_input_data(userid[-20:-2])
    #features = vector.dense(result)
    prediction_value = model1.predict(input_data)
    print(prediction_value)
    post_prediction(userid[-20:-2],prediction_value)
示例15: learn
# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def learn(examples,depth,bin):
    """Train a variance-impurity regression tree and store it in the global ``model``.

    Args:
        examples: training data handed to ``DecisionTree.trainRegressor``.
        depth: forwarded as ``maxDepth``.
        bin: forwarded as ``maxBins``.
    """
    global model
    # Gather the keyword options first, then train in a single call.
    tree_options = dict(categoricalFeaturesInfo={}, impurity='variance',
                        maxDepth=depth, maxBins=bin)
    model = DecisionTree.trainRegressor(examples, **tree_options)