本文整理汇总了Python中pyspark.mllib.tree.GradientBoostedTrees.trainClassifier方法的典型用法代码示例。如果您正苦于以下问题:Python GradientBoostedTrees.trainClassifier方法的具体用法?Python GradientBoostedTrees.trainClassifier怎么用?Python GradientBoostedTrees.trainClassifier使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.tree.GradientBoostedTrees
的用法示例。
在下文中一共展示了GradientBoostedTrees.trainClassifier方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run_GBDT
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def run_GBDT(input_file, output_file, iterations):
    """Train a GBDT classifier on one-hot-encoded data and save its debug string.

    input_file: tab-separated text file readable by the SparkContext `sc`.
    output_file: output directory for the model's toDebugString() text.
    iterations: kept for interface compatibility.
        NOTE(review): currently unused -- numIterations is hard-coded to 2
        below; confirm whether it should be passed through.

    Relies on module-level names defined elsewhere in this file:
    sc, parsePoint, maaro, parseOHEPoint, GradientBoostedTrees.
    """
    # Normalise the separator so downstream parsing can assume CSV.
    dataRDD = sc.textFile(input_file).map(lambda x: x.replace('\t', ','))
    # Parse each record into feature tokens and cache for the two passes below.
    dataRDDParsed = dataRDD.map(parsePoint).cache()
    # Keep the 26 most frequent features. Fix: index-based key function
    # replaces the Python-2-only tuple-parameter form `lambda (k, v): -v`,
    # which is a SyntaxError on Python 3 (PEP 3113).
    featSet = (dataRDDParsed.flatMap(lambda x: x)
               .map(maaro)
               .reduceByKey(lambda a, b: a + b)
               .takeOrdered(26, lambda kv: -kv[1]))
    # Build the one-hot-encoding dictionary: feature -> column index.
    OHEdict = {}
    for i, x in enumerate(featSet):
        OHEdict[x[0]] = i
    # Encode every point with the OHE dictionary (39 raw feature slots).
    OHETrainData = dataRDD.map(lambda point: parseOHEPoint(point, OHEdict, 39))
    model = (GradientBoostedTrees.trainClassifier(OHETrainData, loss='logLoss', numIterations=2,
             categoricalFeaturesInfo={}, learningRate=0.1, maxDepth=7, maxBins=2))
    # Persist the human-readable model description as a single output file.
    sc.parallelize([model.toDebugString()]).coalesce(1).saveAsTextFile(output_file)
示例2: train_model
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def train_model(cls, trianData, cateFeaInfo=None, iterTimes=3):
    """Train a gradient-boosted-trees classifier.

    trianData: RDD of LabeledPoint training data (parameter name kept
        for backward compatibility; it is a typo for "trainData").
    cateFeaInfo: categorical-feature map {feature index: #categories};
        defaults to {} (all features continuous). Fix: the original used
        a mutable default argument ({}), which is shared across calls --
        replaced by the None-sentinel idiom with identical behaviour.
    iterTimes: number of boosting iterations.

    Returns the trained GradientBoostedTreesModel.
    """
    if cateFeaInfo is None:
        cateFeaInfo = {}
    model = GradientBoostedTrees.trainClassifier(
        trianData, categoricalFeaturesInfo=cateFeaInfo,
        numIterations=iterTimes)
    return model
示例3: main
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def main():
    """End-to-end review-rating pipeline.

    Cleans Amazon-style reviews, trains a word2vec model on the review text,
    represents each review as the average of its word vectors, trains a GBT
    classifier on pre-2014 reviews, and writes the RMSE on 2014 reviews.

    NOTE(review): relies on module-level names defined elsewhere in this
    file (sc, inputs, output, nltk, np, math, clean_reviewf, myf,
    generate_word2vec_model, get_lp) and uses Python-2-only
    tuple-parameter lambdas throughout -- this snippet is Python 2.
    """
    text = sc.textFile(inputs)
    # nltk corpora location must be valid on every worker node.
    nltk_data_path = "[change yo your own nltk_data location]" # maybe changed to the sfu server path
    nltk.data.path.append(nltk_data_path)
    cleaned_review = text.map(clean_reviewf).cache()
    # Raw text only (for word2vec training) vs. (rating, text, time) triples.
    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(lambda review: (review['overall'], review['reviewText'], review['reviewTime'])).cache()
    # Temporal split: train on pre-2014 reviews, test on 2014 reviews.
    training_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year < 2014)
    testing_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year == 2014)
    # zipWithIndex assigns each review a stable id used as the join key below.
    training_data = training_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()
    testing_data = testing_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()
    training_rating = training_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    training_review_text = training_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    # Explode each review into (word, review_index) pairs for the join
    # against the word2vec vocabulary.
    training_review_text_flat = training_review_text.flatMapValues(myf)
    training_review_text_flat = training_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))
    testing_rating = testing_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    testing_review_text = testing_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    testing_review_text_flat = testing_review_text.flatMapValues(myf)
    testing_review_text_flat = testing_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))
    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # Convert the Java-backed vector map into a plain (word, list) RDD.
    # this step seems redundant but necessary
    mvdct = []
    for k,v in mv.items():
        vec = [f for f in v]
        mvdct.append((k,vec))
    dct_rdd = sc.parallelize(mvdct)
    # For each review: sum the vectors of its words and count them,
    # then divide to get the average word vector.
    training_feature_vecs = dct_rdd.join(training_review_text_flat)
    training_vecs = training_feature_vecs.map(lambda (w,(feature_vec, review_index)): (review_index, (feature_vec, 1)))
    training_reduce_vecs = training_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    training_avg_vecs = training_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec)/float(ct)))
    # Re-attach the rating and build LabeledPoints via get_lp.
    training_rating_avgf = training_rating.join(training_avg_vecs)
    training_lps = training_rating_avgf.map(get_lp)
    # Same averaging pipeline for the test split.
    testing_feature_vecs = dct_rdd.join(testing_review_text_flat)
    testing_vecs = testing_feature_vecs.map(lambda (w,(feature_vec, review_index)): (review_index, (feature_vec, 1)))
    testing_reduce_vecs = testing_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    testing_avg_vecs = testing_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec)/float(ct)))
    testing_rating_avgf = testing_rating.join(testing_avg_vecs)
    testing_lps = testing_rating_avgf.map(get_lp)
    gbt_model = GradientBoostedTrees.trainClassifier(training_lps,
                                                     categoricalFeaturesInfo={}, numIterations=20)
    # Score the test set and report RMSE of predicted vs. true ratings.
    predictions = gbt_model.predict(testing_lps.map(lambda x: x.features))
    labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(predictions)
    MSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /float(testing_lps.count())
    RMSE = math.sqrt(MSE)
    result = str(RMSE)
    outdata = sc.parallelize([result])
    outdata.saveAsTextFile(output)
示例4: testClassification
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def testClassification(trainingData, testData):
    """Fit a GBT classifier on trainingData and report its error on testData.

    trainingData/testData: RDDs of LabeledPoint.
    Prints the misclassification rate and the learned ensemble.
    """
    # Empty categoricalFeaturesInfo: every feature is treated as continuous.
    gbt = GradientBoostedTrees.trainClassifier(
        trainingData, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4)
    # Pair each true label with the model's prediction for that point.
    predicted = gbt.predict(testData.map(lambda x: x.features))
    paired = testData.map(lambda lp: lp.label).zip(predicted)
    # Fraction of test points whose prediction disagrees with the label.
    mistakes = paired.filter(lambda v_p: v_p[0] != v_p[1]).count()
    testErr = mistakes / float(testData.count())
    print("Test Error = " + str(testErr))
    print("Learned classification ensemble model:")
    print(gbt.toDebugString())
示例5: test_classification
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def test_classification(self):
    """Smoke-test six MLlib classifiers on a tiny separable dataset.

    Trains LogisticRegression(SGD), SVM(SGD), NaiveBayes, DecisionTree,
    RandomForest and GradientBoostedTrees on four hand-made LabeledPoints
    and asserts each model scores the two label-1 points above 0 and the
    two label-0 points at or below 0.

    NOTE(review): depends on self.sc (a SparkContext fixture) and on
    LabeledPoint being imported at module level -- neither is visible in
    this fragment.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    # Feature 0 separates class 0; features 1-2 separate class 1.
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)
    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)
    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
示例6: Gradient_BoostedTrees
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def Gradient_BoostedTrees(filename, sc):
    """Train and evaluate a GBT classifier on a libsvm-format data file.

    filename: path to the libsvm data file. Fix: the original ignored this
        argument and always loaded a hard-coded sample path; the parameter
        is now honoured.
    sc: active SparkContext.
    Prints the test error and the learned model.
    """
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)
    # Evaluate model on test instances and compute test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Fix: index-based lambda replaces the Python-2-only `lambda (v, p)`
    # tuple parameter (SyntaxError on Python 3, PEP 3113).
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
示例7: crossValidator
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def crossValidator(IterNums, dataset_rdd, rate):
    """Stratified 5-fold cross validation of a GBT classifier.

    IterNums: boosting iterations per fold.
    dataset_rdd: RDD of records where e[1] is the binary label (>0.5 means
        positive) and e[2:] are the features.
        NOTE(review): e[0] is presumably a record id -- confirm with caller.
    rate: learning rate for GradientBoostedTrees.

    Returns a list with one tuple per fold:
    (precision, recall, accuracy, F_Value, hitPositive, positive,
     testset_positive, testset_count).
    """
    # Split positives and negatives separately so each fold keeps the
    # class ratio (stratified folds).
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negotive = dataset_rdd.filter(lambda e: e[1] < 0.5)
    dataset_positive_list = dataset_positive.randomSplit([1, 1, 1, 1, 1])
    dataset_negotive_list = dataset_negotive.randomSplit([1, 1, 1, 1, 1])
    result = []
    for i in range(5):
        testset_positive = dataset_positive_list[i].count()
        # Fold i is the test set; everything else is training data.
        testset_rdd = dataset_positive_list[i].union(dataset_negotive_list[i])
        testset_count = testset_rdd.count()
        trainset_rdd = dataset_rdd.subtract(testset_rdd)
        trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        model = GradientBoostedTrees.trainClassifier(
            trainset, {}, numIterations=IterNums, learningRate=rate)
        predictions = model.predict(testset.map(lambda x: x.features))
        predict = testset.map(lambda lp: lp.label).zip(predictions)
        hitALL = predict.filter(lambda e: e[0] == e[1]).count()
        hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
        positive = predict.filter(lambda e: e[1] > 0.5).count()
        # NOTE(review): these divisions raise ZeroDivisionError when a fold
        # produces no positive predictions or no positive labels; original
        # behaviour is preserved deliberately.
        recall = hitPositive / float(testset_positive)
        precision = hitPositive / float(positive)
        accuracy = hitALL / float(testset_count)
        F_Value = 2 / (1 / precision + 1 / recall)
        result.append((precision, recall, accuracy, F_Value, hitPositive,
                       positive, testset_positive, testset_count))
    # Fix: the per-fold metrics were accumulated but never returned,
    # making the whole computation unobservable to callers.
    return result
示例8: cal_mllib_accuracy
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
# In[8]:
# Split the training set and test set (70% / 30%).
# NOTE(review): `data` must be an RDD of LabeledPoint defined earlier in
# the notebook -- it is not visible in this fragment.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# In[9]:
# Training model: fit both ensembles on the same split so their
# accuracies can be compared later (see cal_mllib_accuracy below).
RF_model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=5, maxBins=32)
GB_model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=3)
# In[10]:
#Predication
def cal_mllib_accuracy(list):
    """Compute the accuracy of each fitted MLlib model on `testData`.

    list: iterable of trained models exposing .predict(). The parameter
        name is kept for backward compatibility even though it shadows
        the builtin `list`.
    Returns a list of (model_index, accuracy) pairs (the original computed
    each accuracy and discarded it; returning the values is a
    backward-compatible addition).

    NOTE(review): relies on the module-level `testData` RDD.
    """
    results = []
    for i, clf in enumerate(list):
        # prediction with the features
        predictions = clf.predict(testData.map(lambda x: x.features))
        # append with labels first then features
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        # Fix: wrap the denominator in float() -- under Python 2 the
        # original `count()/testData.count()` was integer division and
        # truncated every accuracy to 0 or 1. Also: index-based lambda
        # replaces the Python-2-only `lambda (v, p)` tuple parameter.
        accuracy = labelsAndPredictions.filter(
            lambda vp: vp[0] == vp[1]).count() / float(testData.count())
        results.append((i, accuracy))
    return results
示例9: features
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
# * **maxBins** – maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories
#
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
# * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context.
# * `load(sc,path)` : The counterpart to save - load classifier from file.
# * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPont`) or an RDD of datapoints.
# * `toDebugString()` : print the classifier in a human readable format.
# In[32]:
from time import time
# Train one GBT per depth in the list and record train/test error rates
# keyed as errors[depth]['train' | 'test'].
# NOTE(review): `trainingData` and `testData` come from earlier notebook
# cells not visible here; the `lambda (v,p)` form and bare `print` make
# this Python-2-only code.
errors={}
for depth in [10]:
    start=time()
    model=GradientBoostedTrees.trainClassifier(trainingData, {},maxDepth=depth, numIterations=30)##FILLIN to generate 10 trees ##)
    #print model.toDebugString()
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys(): # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted) ### FILLIN ###
        # Misclassification rate on this dataset.
        Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth]#,int(time()-start),'seconds'
#print errors
# In[33]:
# In[33]:
示例10: LabeledPoint
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
# Read the file into an RDD
# If doing this on a real cluster, you need the file to be available on all nodes, ideally in HDFS.
path='/HIGGS/HIGGS.csv'
inputRDD=sc.textFile(path)
# Transform the text RDD into an RDD of LabeledPoints: column 0 is the
# label, the remaining columns are features.
# NOTE(review): bare `strip` implies `from string import strip` (Python 2)
# somewhere earlier in the notebook -- not visible in this fragment.
Data=inputRDD.map(lambda line: [float(strip(x)) for x in line.split(',')]) .map(lambda x: LabeledPoint(x[0], x[1:]))
# Work on a 10% sample to keep the run tractable; fixed seed for
# reproducibility.
Data1=Data.sample(False,0.1, seed=255).cache()
(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed = 255)
trainingData.cache()
testData.cache()
# Train a single depth-10 GBT and record train/test error rates.
errors={}
depth = 10
model=GradientBoostedTrees.trainClassifier(trainingData, {}, numIterations=30, learningRate=0.3, maxDepth=depth)
errors[depth]={}
dataSets={'train':trainingData,'test':testData}
for name in dataSets.keys(): # Calculate errors on train and test sets
    data=dataSets[name]
    Predicted=model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted)
    # Misclassification rate (Python-2-only tuple-parameter lambda).
    Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
    errors[depth][name]=Err
print depth,errors[depth]
示例11: parsePoint2
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def parsePoint2(line):
    """Parse one comma-separated line into a LabeledPoint.

    The first field is the label; every remaining field is a feature.
    """
    numbers = [float(token) for token in line.split(',')]
    label, feats = numbers[0], numbers[1:]
    return LabeledPoint(label, feats)
# train data load
train_data_new = sc.textFile('/home/hduser/dataset.txt')
parsedData = train_data_new.map(parsePoint)
# test data load
test_data_new = sc.textFile('/home/hduser/testfile.txt')
test_final = test_data_new.map(parsePoint2)
# Split train and test: hold out 20% of the training data for validation.
X_train, X_test = parsedData.randomSplit([0.8, 0.2])
# train the classifier
model = GradientBoostedTrees.trainClassifier(X_train, categoricalFeaturesInfo={}, numIterations=10)
# Validation predictions on the held-out 20% of training data.
predictions = model.predict(X_test.map(lambda x: x.features))
labelsAndPredictions1 = X_test.map(lambda p: p.label).zip(predictions)
# Predictions for the separate test file.
predictions1 = model.predict(test_final.map(lambda x: x.features))
y_final = test_final.map(lambda p: p.label).zip(predictions1)
# Fix: the error rate was divided by X_train.count() although the
# mismatches were counted on X_test -- normalise by the set the error was
# measured on. Also: index-based lambda replaces the Python-2-only
# `lambda (v, p)` tuple parameter.
er = labelsAndPredictions1.filter(lambda vp: vp[0] != vp[1]).count() / float(X_test.count())
acc = (1 - er) * 100
print('===============================================================')
print(model.toDebugString())
print('===============================================================')
for i in y_final.collect():
示例12: features
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
# * **maxDepth** – Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3)
# * **maxBins** – maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories
#
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
# * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context.
# * `load(sc,path)` : The counterpart to save - load classifier from file.
# * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPont`) or an RDD of datapoints.
# * `toDebugString()` : print the classifier in a human readable format.
errors={}
# Mark features 10..53 as binary categorical (2 categories each); the
# remaining features stay continuous.
catInfo = {}
for i in range(10,54):
    catInfo[i] = 2
depth = 13
model=GradientBoostedTrees.trainClassifier(trainingData,categoricalFeaturesInfo=catInfo,maxDepth=depth,numIterations=13,learningRate = 0.15)
#print model.toDebugString()
# Record train/test misclassification rates under errors[depth].
# NOTE(review): `trainingData`/`testData` come from earlier cells not
# visible here; `lambda (v,p)` and bare `print` make this Python-2-only.
errors[depth]={}
dataSets={'train':trainingData,'test':testData}
for name in dataSets.keys():
    data=dataSets[name]
    Predicted=model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted)
    Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
    errors[depth][name]=Err
print depth,errors[depth]
# coding: utf-8
# coding: utf-8
示例13: LabeledPoint
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
# Map CSV lines to LabeledPoints: the last column is the raw class code
# (2.0 becomes positive label 1.0, anything else 0.0); all preceding
# columns are features.
# NOTE(review): `inputRDD` is defined in an earlier cell not visible here.
Data=inputRDD.map(lambda line: [float(x) for x in line.split(',')]).map(lambda V:LabeledPoint(1.0, V[:-1]) if V[-1] == 2.0 else LabeledPoint(0.0, V[:-1])).cache()
# ### Reducing data size
# In[11]:
# 70/30 split with a fixed seed for reproducibility.
(trainingData,testData)=Data.randomSplit([0.7,0.3],seed=255)
trainingData.cache()
testData.cache()
# ### Gradient Boosted Trees
# In[13]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
# Train one GBT per depth and record train/test error rates
# (Python-2-only: tuple-parameter lambda and bare print).
errors={}
for depth in [14]:
    model=GradientBoostedTrees.trainClassifier(trainingData, {}, numIterations=15, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys(): # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth]
示例14: test_classification
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
def test_classification(self):
    """Smoke-test MLlib classifiers, including model save/load round-trips.

    Trains LR(SGD), SVM(SGD), NaiveBayes, DecisionTree, RandomForest and
    GradientBoostedTrees on four hand-made LabeledPoints, asserts each
    model separates the two classes, and for the tree models additionally
    saves to a temp dir, reloads, and checks the debug strings match.

    NOTE(review): depends on self.sc (SparkContext fixture) plus
    module-level imports not visible here: LabeledPoint, tempfile, os,
    and rmtree (shutil.rmtree).
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    # Feature 0 separates class 0; features 1-2 separate class 1.
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    # Scratch directory for the save/load round-trip checks below.
    temp_dir = tempfile.mkdtemp()
    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)
    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)
    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)
    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    # Round-trip: saved and reloaded tree must describe the same model.
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())
    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())
    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())
    # Best-effort cleanup: a failed rmtree must not fail the test.
    try:
        rmtree(temp_dir)
    except OSError:
        pass
示例15: parsePoint
# 需要导入模块: from pyspark.mllib.tree import GradientBoostedTrees [as 别名]
# 或者: from pyspark.mllib.tree.GradientBoostedTrees import trainClassifier [as 别名]
from pyspark.context import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
def parsePoint(line):
    """Build a LabeledPoint from one CSV line.

    The last column is the label; columns 1..9 are the features
    (column 0 is skipped).
    """
    parsed = [float(token.strip()) for token in line.split(',')]
    label = parsed[-1]
    return LabeledPoint(label, parsed[1:10])
# Load the heart-disease data.
# Fix: the original loaded the same file twice on consecutive identical
# lines; the redundant duplicate statement was removed.
data = sc.textFile("heart_disease.csv")
data = data.map(parsePoint)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={},
                                             numIterations=30, maxDepth=4)
# This works too!
train = sc.textFile("train.csv")
def parsePoint(line):
    """Parse a CSV line: the last column is the label, the first 65 are features.

    NOTE(review): intentionally rebinds the module-level parsePoint used
    above -- order of execution matters in this script.
    """
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:65])
train = train.map(parsePoint)
model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo={},
                                             numIterations=300, maxDepth=2, learningRate=0.1)
test = sc.textFile("test.csv")
test = test.map(parsePoint)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Fix: index-based lambda replaces the Python-2-only `lambda (v, p)`
# tuple parameter (SyntaxError on Python 3, PEP 3113).
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())