本文整理汇总了Scala中org.apache.spark.mllib.tree.DecisionTree类的典型用法代码示例。如果您正苦于以下问题:Scala DecisionTree类的具体用法?Scala DecisionTree怎么用?Scala DecisionTree使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了DecisionTree类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: DecisionTreeUtil
//设置package包名称以及导入依赖的类
package org.sparksamples.decisiontree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util
import scala.collection.Map
import scala.collection.mutable.ListBuffer
object DecisionTreeUtil {
def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
val recordsArray = Util.getRecords()
val records = recordsArray._1
val first = records.first()
val numData = recordsArray._2
println(numData.toString())
records.cache()
print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2))
var list = new ListBuffer[Map[String, Long]]()
for( i <- 2 to 9){
val m = Util.get_mapping(records, i)
list += m
}
val mappings = list.toList
var catLen = 0
mappings.foreach( m => (catLen +=m.size))
val numLen = records.first().slice(11, 15).size
val totalLen = catLen + numLen
val data = {
records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
}
val data_dt = {
records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
}
val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L)
val training = splits(0).cache()
val test = splits(1)
return (training, test)
}
def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint],
categoricalFeaturesInfo: scala.Predef.Map[Int, Int],
maxDepth :Int, maxBins: Int): Double = {
val impurity = "variance"
val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo,
impurity,maxDepth, maxBins )
val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features)))
val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())
return rmsle
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:61,代码来源:DecisionTreeUtil.scala
示例2: DecisionTreeTest
//设置package包名称以及导入依赖的类
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object DecisionTreeTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("DecisionTreeTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/DecisionTree.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Load and parse the data file.
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
// Split the data into training and test sets (30% held out for testing)
val splits = data.randomSplit(Array(0.7, 0.3))
val (training, test) = (splits(0), splits(1))
// Train a DecisionTree model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "entropy" // Also, we can use entrophy
val maxDepth = 14
val maxBins = 16384
val model = DecisionTree.trainClassifier(training, numClasses, categoricalFeaturesInfo,
impurity, maxDepth, maxBins)
val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
val prediction = model.predict(features)
(prediction, label)
}
val metrics = new BinaryClassificationMetrics(predictionAndLabels)
val auROC = metrics.areaUnderROC()
println("Area under ROC = " + auROC)
println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
}
}
示例3: DecisionTreeTest
//设置package包名称以及导入依赖的类
package org.apache.spark.examples.mllib
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Gini
object DecisionTreeTest {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KMeansClustering")
val sc = new SparkContext(sparkConf)
val data = sc.textFile("../data/mllib/sample_tree_data.csv")
val parsedData = data.map { line =>
val parts = line.split(',').map(_.toDouble)
//LabeledPoint????????,?????????????,????????????(label)
LabeledPoint(parts(0), Vectors.dense(parts.tail))
}
val maxDepth = 5//??????,???????,?????????
val model = DecisionTree.train(parsedData, Classification, Gini, maxDepth)
val labelAndPreds = parsedData.map { point =>
val prediction = model.predict(point.features)
(point.label, prediction)
}
val trainErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / parsedData.count
println("Training Error = " + trainErr)
}
}