当前位置: 首页>>代码示例>>Python>>正文


Python DecisionTree.trainRegressor方法代码示例

本文整理汇总了Python中pyspark.mllib.tree.DecisionTree.trainRegressor方法的典型用法代码示例。如果您正苦于以下问题:Python DecisionTree.trainRegressor方法的具体用法?Python DecisionTree.trainRegressor怎么用?Python DecisionTree.trainRegressor使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark.mllib.tree.DecisionTree的用法示例。


在下文中一共展示了DecisionTree.trainRegressor方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: evaluate_dt

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def evaluate_dt(train,test,maxDepth,maxBins):
    """Train a decision-tree regressor on ``train`` and score it on ``test``.

    Args:
        train: training data passed straight to ``DecisionTree.trainRegressor``.
        test: evaluation data; each element must expose ``.features`` and
            ``.label`` (presumably an RDD of ``LabeledPoint`` -- confirm
            against the caller).
        maxDepth: forwarded as the tree's ``maxDepth``.
        maxBins: forwarded as the tree's ``maxBins``.

    Returns:
        The square root of the mean of ``squared_log_error(label, prediction)``
        over ``test``.
    """
    # An empty categoricalFeaturesInfo map is passed, so no feature is
    # declared categorical.
    model = DecisionTree.trainRegressor(train,{},impurity = 'variance',maxDepth = maxDepth,maxBins = maxBins)
    preds = model.predict(test.map(lambda p:p.features))
    actual = test.map(lambda p:p.label)
    tp = actual.zip(preds)
    # Index into the (label, prediction) pair instead of unpacking it in the
    # lambda signature: tuple parameters are a SyntaxError on Python 3.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return rmsle
开发者ID:zhbzz2007,项目名称:SparkProject,代码行数:9,代码来源:regression.py

示例2: test_regression

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
    def test_regression(self):
        """Check that each MLlib regressor recovers the sign of a tiny dataset.

        Four points are used whose label (-1.0 or 1.0) has the same sign as
        their second feature.  Every model is trained on them and must
        predict a non-positive value for the -1.0 points and a positive value
        for the 1.0 points.  Finally the three SGD-based trainers are run
        again with explicit ``initialWeights``; a ``ValueError`` there fails
        the test.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        points = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(points)
        features = [pt.features.tolist() for pt in points]

        def check_signs(model):
            # Even positions hold the -1.0 points, odd positions the 1.0 points.
            for position, vector in enumerate(features):
                if position % 2 == 0:
                    self.assertTrue(model.predict(vector) <= 0)
                else:
                    self.assertTrue(model.predict(vector) > 0)

        linear = LinearRegressionWithSGD.train(rdd, iterations=10)
        check_signs(linear)

        lasso = LassoWithSGD.train(rdd, iterations=10)
        check_signs(lasso)

        ridge = RidgeRegressionWithSGD.train(rdd, iterations=10)
        check_signs(ridge)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        tree = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_signs(tree)

        forest = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        check_signs(forest)

        boosted = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_signs(boosted)

        # Training with caller-supplied initial weights must not raise ValueError.
        try:
            for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
                trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
开发者ID:1ambda,项目名称:spark,代码行数:61,代码来源:tests.py

示例3: test_regression

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
    def test_regression(self):
        """Check that each MLlib regressor recovers the sign of a tiny dataset.

        Four points are used whose label (-1.0 or 1.0) has the same sign as
        their second feature.  Every model is trained on them with mostly
        default settings and must predict a non-positive value for the -1.0
        points and a positive value for the 1.0 points.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        points = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(points)
        features = [pt.features.tolist() for pt in points]

        def check_signs(model):
            # Even positions hold the -1.0 points, odd positions the 1.0 points.
            for position, vector in enumerate(features):
                if position % 2 == 0:
                    self.assertTrue(model.predict(vector) <= 0)
                else:
                    self.assertTrue(model.predict(vector) > 0)

        linear = LinearRegressionWithSGD.train(rdd)
        check_signs(linear)

        lasso = LassoWithSGD.train(rdd)
        check_signs(lasso)

        ridge = RidgeRegressionWithSGD.train(rdd)
        check_signs(ridge)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        tree = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        check_signs(tree)

        forest = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        check_signs(forest)

        boosted = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        check_signs(boosted)
开发者ID:greatyan,项目名称:spark,代码行数:54,代码来源:tests.py

示例4: regression

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def regression(sc, sample):
    """Fit a decision-tree regressor on ``sample`` and print its prediction for 8.2.

    Args:
        sc: object providing ``parallelize`` (presumably a SparkContext --
            confirm against the caller).
        sample: collection of indexable records; for each record ``x``,
            ``x[1]`` becomes the label and ``x[0]`` the features of a
            ``LabeledPoint``.

    Returns:
        None.  The prediction for the fixed test vector ``[8.2]`` is printed.
    """
    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x:LabeledPoint(x[1],x[0]))
    testdata = [8.2]

    # Linear-model alternative left disabled by the original author:
    #    linear_model = LinearRegressionWithSGD.train(traindata,iterations=10)
    #    prediction = linear_model.predict(testdata)
    #    print(prediction)

    # The empty dict declares no categorical features.
    decision_model = DecisionTree.trainRegressor(traindata,{})
    prediction = decision_model.predict(testdata)
    # print() with a single argument behaves the same on Python 2 and 3.
    print(prediction)
开发者ID:fndjjx,项目名称:practice,代码行数:17,代码来源:regression.py

示例5: test_regression

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
    def test_regression(self):
        """Check regressors on feature vectors built with ``self.scipy_matrix``.

        Four points are used whose label (-1.0 or 1.0) has the same sign as
        the value stored at index 1 of their feature vector.  Every model is
        trained on them and must predict a non-positive value for the -1.0
        points and a positive value for the 1.0 points.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        points = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(points)
        features = [pt.features for pt in points]

        def check_signs(model):
            # Even positions hold the -1.0 points, odd positions the 1.0 points.
            for position, vector in enumerate(features):
                if position % 2 == 0:
                    self.assertTrue(model.predict(vector) <= 0)
                else:
                    self.assertTrue(model.predict(vector) > 0)

        linear = LinearRegressionWithSGD.train(rdd)
        check_signs(linear)

        lasso = LassoWithSGD.train(rdd)
        check_signs(lasso)

        ridge = RidgeRegressionWithSGD.train(rdd)
        check_signs(ridge)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        tree = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        check_signs(tree)
开发者ID:drewrobb,项目名称:spark,代码行数:39,代码来源:test_linalg.py

示例6: xrange

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# MAGIC %md DecisionTree performs best when it is told which features are categorical.  We construct a map categoricalFeaturesInfo to pass this information to DecisionTree.
# MAGIC If DecisionTree is not given this info, then it will treat all features as continuous.

# COMMAND ----------

# Construct a map for categorical features:
#   categoricalFeaturesInfo[column index] = number of categories
categoricalFeaturesInfo = {}
# range() replaces the Python-2-only xrange(); the loop body is unchanged.
for j in range(numFeatures):
  col = featureCols[j]
  if col in categoryIndexes:
    categoricalFeaturesInfo[j] = len(categoryIndexes[col])

# COMMAND ----------

initialModel = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo)
# Bare expression: left as the last statement of the cell, as in the original.
initialModel

# COMMAND ----------

# We can print the full model, but it can be hard to parse when the tree is large.
# print() with a single argument behaves the same on Python 2 and 3.
print(initialModel.toDebugString())

# COMMAND ----------

# MAGIC %md We now compute the error of the DecisionTreeModel on the training dataset.  We use Root Mean Squared Error (RMSE) as our error metric.
# MAGIC 
# MAGIC Denote (y_i, x_i) as the (label, feature vector) for instance i, and write model.predict(x_i) as our model's predicted label for instance i.  RMSE is defined as:
# MAGIC 
# MAGIC %[ RMSE(dataset) = \left[ \mathbf{avg}_{(y_i, x_i) \in dataset} \left( y_i - model.predict(x_i) \right)^2 \right]^{1/2} ]%
开发者ID:Inscrutive,项目名称:spark,代码行数:32,代码来源:MLWorkflow.py

示例7: help

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
help(DecisionTree.trainRegressor)


# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:

# Linear model: 10 SGD iterations, step size 0.1, no intercept term.
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
# print() with a single argument behaves the same on Python 2 and 3.
print("Linear Model predictions: " + str(true_vs_predicted.take(5)))


# In[10]:

# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
# NOTE(review): labels are read from `data` while predictions come from
# `data_dt`; zip() pairs them element by element, so this assumes both RDDs
# hold the same records in the same order -- confirm.
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))


# ## Perfomance Metrics

# In[11]:

# set up performance metrics functions 

def squared_error(actual, pred):
开发者ID:CRG-NLP,项目名称:Machine-Learning-with-Spark,代码行数:33,代码来源:Machine+Learning+with+Spark,+Chapter+6.py

示例8: squared_error

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# get 90% train and 10% test data
# Key every point by its index so the sampled test rows can be removed from
# the training set with subtractByKey().  The lambdas index into the pair
# instead of unpacking it in the signature, which is a SyntaxError on
# Python 3.
data_with_idx = data_dt.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda pair: pair[1])
test_data = test.map(lambda pair: pair[1])
train_size = train_data.count()
test_size = test_data.count()
# print() with a single argument behaves the same on Python 2 and 3.
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d " % num_data)
print("Train + Test size : %d" % (train_size + test_size))

# make decision tree model 
dt_model = DecisionTree.trainRegressor(train_data,{})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))

def squared_error(actual, pred):
    """Return the square of the difference between ``pred`` and ``actual``."""
    residual = pred - actual
    return residual ** 2

def squared_log_error(pred, actual):
    """Return the squared difference of ``log(pred + 1)`` and ``log(actual + 1)``."""
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    return (log_pred - log_actual) ** 2
开发者ID:rrballenger17,项目名称:cscie63,代码行数:31,代码来源:three_horsepower.py

示例9: LabeledPoint

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
	
	#ArrDelay is our response
	#ArrDelay becomes the 8tth column now, and total columns in the data = 12
	label = clean_line_split[0]
	nonLable = clean_line_split[1:]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map(parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# The held-out split is bound to `test` above; the original divided by
# `testData.count()`, a name this script never assigns.  The lambda indexes
# into the (label, prediction) pair because tuple parameters are a
# SyntaxError on Python 3.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

# Build one string so the elapsed time is printed on Python 3 as well; the
# original `print (...), (...)` form only emits the label there.
print('Time consumed = ' + str(datetime.now() - startTime))

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

#save model
model.save(sc, "DTR-Narrow-2008")
开发者ID:bsangee,项目名称:spark_vs_r,代码行数:34,代码来源:decision_tree_regression.py

示例10: enumerate

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]

# In[22]:

# List every feature together with its position.  A single formatted string
# keeps the output identical on Python 2 and Python 3.
for i, x in enumerate(features):
    print("%s %s" % (i, x))


# In[23]:


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity="variance", maxDepth=6, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index into the (label, prediction) pair: tuple parameters in a lambda are a
# SyntaxError on Python 3.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())


# In[24]:

# Axis labels for the plot that follows.
plt.xlabel("response")
plt.ylabel("prediction")
开发者ID:wangwf,项目名称:Codes,代码行数:33,代码来源:spark-mongo.py

示例11: float

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
summary = Statistics.colStats(testvecData)
variance = summary.variance()[0]
# compute the pseudo R-square
test_Rsqr1 = 1 - testMSE1/float(variance)





# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# use variance as impurity for regression
# maxDepth is the maximum number of levels for each tree
model2 = DecisionTree.trainRegressor(trainparsedData
                                     , categoricalFeaturesInfo={}
                                     , impurity='variance'
                                     , maxDepth=8
                                     , maxBins=32)


# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model2.predict(trainparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use map operation to compute MSE
# (the lambda indexes into the (label, prediction) pair because tuple
# parameters are a SyntaxError on Python 3)
trainMSE2 = trainlabelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(trainparsedData.count())

# use the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]
开发者ID:chhavi21,项目名称:Bike-Share_Demand_Public,代码行数:33,代码来源:Spark_EC2.py

示例12: LabeledPoint

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
# Build the decision-tree dataset: one LabeledPoint per record, with the label
# from extract_label() and the features from extract_features_dt().
data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()

# Bare expressions: they only show a value when run interactively.
first_point_dt.label
first_point_dt.features
len(first_point_dt.features)

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

# Linear model on `data`: 10 SGD iterations, step size 0.1, no intercept.
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
# Pair every label with the linear model's prediction for that point.
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
true_vs_predicted.take(5)

# Decision tree on `data_dt`; the empty dict declares no categorical features.
dt_model = DecisionTree.trainRegressor(data_dt, {})
# Predict on the whole feature RDD, then zip labels with predictions.
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
true_vs_predicted_dt.take(5)
dt_model.depth()
dt_model.numNodes()

def squared_error(actual, pred):
    """Return the square of the difference between ``pred`` and ``actual``."""
    residual = pred - actual
    return residual ** 2
def abs_error(actual, pred):
    """Return the absolute difference between ``pred`` and ``actual``."""
    residual = pred - actual
    return np.abs(residual)
def squared_log_error(actual, pred):
    """Return the squared difference of ``log(pred + 1)`` and ``log(actual + 1)``."""
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    return (log_pred - log_actual) ** 2

# Mean of squared_error over the (label, prediction) pairs in true_vs_predicted.
true_vs_predicted.map(lambda t: squared_error(t[0], t[1])).mean()
开发者ID:jiangzhenxing,项目名称:spark-learning-python,代码行数:32,代码来源:bike-sharing.py

示例13: SparkContext

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold

# Kfold
if __name__ == "__main__":
    # Repeated 10-fold evaluation of a decision-tree classifier (is column 2
    # non-zero?) and a decision-tree regressor (value of column 2) on the
    # tab-separated feature file.
    sc = SparkContext('local',appName="Prediction")
    import fileinput
    data_y1, data_y2 = [], []
    for line in fileinput.input("data/feature_extracted_class3.txt"):
        # Split each line once; column 2 is the target and columns 3+ are the
        # features.
        fields = line.split("\t")
        data_y1.append(LabeledPoint(float(1 if int(fields[2])!=0 else 0), [float(i) for i in fields[3:]]))
        data_y2.append(LabeledPoint(int(fields[2]), [float(i) for i in fields[3:]]))
    # NOTE: despite its name, `mse` collects absolute errors |prediction - label|.
    total, right, mse = 0, 0, []
    # range() replaces the Python-2-only xrange().
    for t in range(10):
        kf = KFold(32*40, n_folds=10)
        for train, test in kf:
            data_train_y1 = [data_y1[i] for i in train]
            data_train_y2 = [data_y2[i] for i in train]
            clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100)
            clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2), categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100)
            for i in test:
                data_test_y1, data_test_y2 = data_y1[i], data_y2[i]
                r1 = clf1.predict(data_test_y1.features)
                r2 = clf2.predict(data_test_y2.features)
                if r1 == data_test_y1.label:
                    right += 1
                mse.append(abs(r2-data_test_y2.label))
                total += 1
    # Classification accuracy followed by the mean absolute regression error,
    # formatted into one string so the output is the same on Python 2 and 3.
    print("%s %s" % (float(right)/total, sum(mse)/len(mse)))
开发者ID:qiangsiwei,项目名称:competition_CCF,代码行数:32,代码来源:prediction_kfold.py

示例14: run_decision_tree

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def run_decision_tree(userid):
    """Train decision-tree models on the heart-disease data and post a prediction.

    Steps, in order:
      1. Read the training and test CSV files with pandas, move column 13
         (the diagnosis) in front of columns 0-12, replace '?' with NaN and
         drop incomplete rows.
      2. Write the cleaned frames to text files and load those files through
         ``sc.textFile`` / ``parsePoint``.
      3. Train a decision-tree classifier and print its test error and tree.
      4. Train a decision-tree regressor and print its test MSE and tree.
      5. Fetch the input for ``userid[-20:-2]`` with ``get_input_data``,
         predict with the regressor and hand the value to ``post_prediction``.

    Args:
        userid: sliceable value; the slice ``userid[-20:-2]`` is used as the
            key for ``get_input_data`` and ``post_prediction``.

    Returns:
        None.  Results are printed and posted.
    """
    conf = SparkConf().setMaster("local[1]").setAppName("heart-disease-prediction-descision-tree")
    sc   = SparkContext(conf=conf)

    print("Running Spark Version %s" % (sc.version))


    # https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
    # NOTE(review): `path` has no trailing separator, so `path + name` below
    # yields ".../hdp_projprocessed.cleveland.data.csv" -- confirm that this
    # is the intended location.
    path = "/home/raju/Documents/hdp_proj"
    heartdf_tr = pd.read_csv(path+"processed.cleveland.data.csv",header=None)
    heartdf_test = pd.read_csv(path+"testdata.csv",header=None)
    print("Original training Dataset (Rows:Colums): ")
    print(heartdf_tr.shape)
    # The original read `heartdf_test.shaperead_csvread_csvread_csv`, an
    # attribute that does not exist on a DataFrame.
    print(heartdf_test.shape)

    print("Categories of Diagnosis of heart disease (angiographic disease status) that we are predicting")
    print("-- Value 0: < 50% diameter narrowing")
    print("-- Value 1: > 50% diameter narrowing ")
    # The files are read with header=None, so columns carry the integer labels
    # 0..13 and label-based .loc selects the same data as the removed .ix.
    print(heartdf_tr.loc[:,13].unique()) #Column containing the Diagnosis of heart disease
    print(heartdf_test.loc[:,13].unique()) #Column containing the Diagnosis of heart disease

    # Both pieces of each concat come from the same frame and share its index,
    # so the default join gives the same rows the removed `join_axes` asked for.
    newheartdf = pd.concat([heartdf_tr.loc[:,13], heartdf_tr.loc[:,0:12]],axis=1)
    newheartdf_test = pd.concat([heartdf_test.loc[:,13], heartdf_test.loc[:,0:12]],axis=1)
    newheartdf.replace('?', np.nan, inplace=True) # Replace ? values
    newheartdf_test.replace('?', np.nan, inplace=True) # Replace ? values

    print("After dropping rows with anyone empty value (Rows:Columns): ")
    ndf2 = newheartdf.dropna()
    ndf_test = newheartdf_test.dropna()

    ndf2.to_csv(path+"new-heart-disease-cleaveland.txt",sep=",",index=False,header=None,na_rep=np.nan)
    ndf_test.to_csv(path+"new-heart-disease-cleaveland-test.txt",sep=",",index=False,header=None,na_rep=np.nan)

    print(ndf2.shape)
    print(ndf_test.shape)
    print(ndf2.loc[:5,:])
    print(ndf_test.loc[:5,:])

    print("Create a Labeled point which is a local vector, associated with a label/response")

    points = sc.textFile(path+'new-heart-disease-cleaveland.txt')
    points_test = sc.textFile(path+'new-heart-disease-cleaveland-test.txt')

    print("###############################Something")
    parsed_data = points.map(parsePoint)
    parsed_data_test = points_test.map(parsePoint)

    print('After parsing, number of training lines: %s' %parsed_data.take(5))  #parsed_data.count()
    print('After parsing, number of test data lines: %s' %parsed_data_test.take(5))  #parsed_data.count()


    #####Perform Classification using a Decision Tree#####
    # randomSplit is called with weights [1, 0]: all of the weight goes to the
    # first split, and only that first split is used below.  Training and test
    # data come from two separate files.
    (trainingData, trainingData1) = parsed_data.randomSplit([1,0])
    (testData , testData1) = parsed_data_test.randomSplit([1,0])
    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    print("+++++++++++++++++++++++++++++++++ Perform Classification using a Decision Tree +++++++++++++++++++++++++++++++++")
    model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Lambdas index into the (label, prediction) pair: tuple parameters are a
    # SyntaxError on Python 3.
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('=================== Learned classification tree model ====================')
    print(model.toDebugString())


    print("+++++++++++++++++++++++++++++++++ Perform Regression using a Decision Tree +++++++++++++++++++++++++++++++++")
    model1 = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=4, maxBins=32)

    ####### Evaluate model on test instances and compute test error########
    predictions = model1.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('================== Learned regression tree model ====================')
    print(model1.toDebugString())
    print(userid)
    input_data = get_input_data(userid[-20:-2])
    #features = vector.dense(result)
    prediction_value = model1.predict(input_data)
    print(prediction_value)
    post_prediction(userid[-20:-2],prediction_value)
开发者ID:RajuC,项目名称:hdp_proj,代码行数:86,代码来源:sparkpred.py

示例15: learn

# 需要导入模块: from pyspark.mllib.tree import DecisionTree [as 别名]
# 或者: from pyspark.mllib.tree.DecisionTree import trainRegressor [as 别名]
def learn(examples,depth,bin):
    """Train a decision-tree regressor and store it in the module-level ``model``.

    Args:
        examples: training data handed to ``DecisionTree.trainRegressor``.
        depth: forwarded as ``maxDepth``.
        bin: forwarded as ``maxBins``.
    """
    global model
    # Gather the keyword arguments first, then make a single training call.
    tree_options = dict(
        categoricalFeaturesInfo={},
        impurity='variance',
        maxDepth=depth,
        maxBins=bin,
    )
    model = DecisionTree.trainRegressor(examples, **tree_options)
开发者ID:weituo12321,项目名称:MusicRecommendation,代码行数:6,代码来源:model.py


注:本文中的pyspark.mllib.tree.DecisionTree.trainRegressor方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。