This article collects typical usage examples of the Scala class org.apache.spark.ml.classification.DecisionTreeClassifier. If you have been wondering what the DecisionTreeClassifier class is for, how to use it in Scala, or what real-world usage looks like, the hand-picked examples below should help.
Six code examples of the DecisionTreeClassifier class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
Example 1: DecisionTreePipeline
// Declare the package and import the required classes
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object DecisionTreePipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val dt = new DecisionTreeClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
Developer ID: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 60, Source file: DecisionTreePipeline.scala
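A minimal caller sketch for this example (not part of the original source): the SparkSession setup, the CSV path, and the column names feature1/feature2 below are assumptions for illustration, and the label column is assumed to already be numeric because the evaluator above reads the raw "label" column.

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.stumbleuponclassifier.DecisionTreePipeline

object DecisionTreePipelineRunner {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("DecisionTreePipelineRunner")
      .getOrCreate()
    // Hypothetical input: a CSV file with a numeric "label" column and two numeric feature columns.
    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/train.csv")
    // Assemble the raw feature columns into the single vector column the pipeline expects.
    val assembler = new VectorAssembler()
      .setInputCols(Array("feature1", "feature2"))
      .setOutputCol("features")
    DecisionTreePipeline.decisionTreePipeline(assembler, df)
    spark.stop()
  }
}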
Example 2: TreeOrForestClassification
// Declare the package and import the required classes
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
abstract class TreeOrForestClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
getFeatureArity(ctx), ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
object DecisionTreeClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
new DecisionTreeClassifier()
.setMaxDepth(depth)
.setSeed(ctx.seed())
}
}
object TreeOrForestClassification {
def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
val numFeatures = ctx.params.numFeatures
val fourthFeatures = numFeatures / 4
Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical
Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical
Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous
}
}
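For intuition about getFeatureArity: the first quarter of the features are binary categoricals, the next quarter are 20-value categoricals, and the remainder are continuous (arity 0). A standalone sketch with an assumed numFeatures of 10:

object FeatureArityDemo extends App {
  // Assumed value; in the benchmark this comes from ctx.params.numFeatures.
  val numFeatures = 10
  val fourthFeatures = numFeatures / 4
  val arity: Array[Int] =
    Array.fill(fourthFeatures)(2) ++                  // low-arity (binary) categorical features
      Array.fill(fourthFeatures)(20) ++               // high-arity categorical features
      Array.fill(numFeatures - 2 * fourthFeatures)(0) // continuous features
  println(arity.mkString(", ")) // prints: 2, 2, 20, 20, 0, 0, 0, 0, 0, 0
}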
Example 3: DecisionTreePipeline
// Declare the package and import the required classes
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object DecisionTreePipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val dt = new DecisionTreeClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
Developer ID: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 60, Source file: DecisionTreePipeline.scala
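The commented-out lines above suggest the evaluation was originally meant to run on the 10% held-out split rather than on the full DataFrame. A minimal sketch of that variant, reusing the pipeline, training, and test values defined in the example (an inference from the commented-out code, not part of the published source):

// Fit only on the 90% training split and score the untouched 10% test split.
val heldOutModel = pipeline.fit(training)
val heldOut = heldOutModel.transform(test).select("prediction", "label")
val heldOutAccuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(heldOut)
println("Held-out accuracy = " + heldOutAccuracy)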
Example 4: DTreeClassificationJob
// Declare the package and import the required classes
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.sql.SparkSession
object DTreeClassificationJob extends MLMistJob{
def session: SparkSession = SparkSession
.builder()
.appName(context.appName)
.config(context.getConf)
.getOrCreate()
def train(datasetPath: String, savePath: String): Map[String, Any] = {
val data = session.read.format("libsvm").load(datasetPath)
val Array(training, _) = data.randomSplit(Array(0.7, 0.3))
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
.fit(data)
val featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
.fit(data)
val dt = new DecisionTreeClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
val labelConverter = new IndexToString()
.setInputCol("prediction")
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels)
val pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
val model = pipeline.fit(training)
model.write.overwrite().save(savePath)
Map.empty[String, Any]
}
def serve(modelPath: String, features: List[Array[Double]]): Map[String, Any] = {
import LocalPipelineModel._
val pipeline = PipelineLoader.load(modelPath)
val data = LocalData(
LocalDataColumn("features", features)
)
val result: LocalData = pipeline.transform(data)
Map("result" -> result.select("predictedLabel").toMapList)
}
}
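The serve method above scores through Mist's LocalPipelineModel helpers. The same saved pipeline can also be loaded and applied with Spark's standard API; a minimal sketch, where the model path and the libsvm scoring file are hypothetical:

import org.apache.spark.ml.PipelineModel
import org.apache.spark.sql.SparkSession

object ScoreSavedDTreeModel {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("ScoreSavedDTreeModel")
      .getOrCreate()
    // Hypothetical paths: point these at the directory train() saved to and at your scoring data.
    val model = PipelineModel.load("models/dtree")
    val newData = spark.read.format("libsvm").load("data/to_score.libsvm")
    // The loaded pipeline re-applies label/feature indexing, the decision tree, and the label converter.
    model.transform(newData)
      .select("features", "predictedLabel")
      .show(10, truncate = false)
    spark.stop()
  }
}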
Example 5: TreeOrForestClassification
// Declare the package and import the required classes
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
abstract class TreeOrForestClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "label", numClasses, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
getFeatureArity(ctx), ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
object DecisionTreeClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
new DecisionTreeClassifier()
.setMaxDepth(depth)
.setSeed(ctx.seed())
}
}
object TreeOrForestClassification {
def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
val numFeatures = ctx.params.numFeatures
val fourthFeatures = numFeatures / 4
Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical
Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical
Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous
}
}
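Compared with Example 2, the setMetadata call here also annotates the label column with the class count. TreeUtils is a helper internal to the spark-sql-perf benchmark; with Spark's public (developer) attribute API, a roughly equivalent label annotation could be sketched as follows — the helper name and its parameters are illustrative, not from the benchmark:

import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.sql.DataFrame

// Attach nominal metadata to the label column so tree algorithms can read the class count from the schema.
def withLabelMetadata(df: DataFrame, labelCol: String, numClasses: Int): DataFrame = {
  val meta = NominalAttribute.defaultAttr
    .withName(labelCol)
    .withNumValues(numClasses)
    .toMetadata()
  df.withColumn(labelCol, df(labelCol).as(labelCol, meta))
}

A call such as withLabelMetadata(data, "label", numClasses) before fitting would then play the role of the label half of TreeUtils.setMetadata.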
Example 6: TrainNewsClassWithDTDemo
// Declare the package and import the required classes
package applications.mining
import config.paramconf.ClassParams
import functions.Preprocessor
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession
object TrainNewsClassWithDTDemo {
def main(args: Array[String]): Unit = {
Logger.getLogger("org").setLevel(Level.WARN)
val spark = SparkSession
.builder
.master("local[2]")
.appName("train news with DT Demo")
.getOrCreate()
val args = Array("ckooc-ml/data/classnews/train") // hard-coded input path; shadows the command-line args parameter
val filePath = args(0)
import spark.implicits._
val data = spark.sparkContext.textFile(filePath).flatMap { line =>
val tokens: Array[String] = line.split("\u00ef")
if (tokens.length > 3) Some((tokens(0), tokens(1), tokens(2), tokens(3))) else None
}.toDF("label", "title", "time", "content")
data.persist()
val preprocessor = new Preprocessor
val pipeline = preprocessor.preprocess(data)
// Configure the decision tree (DT) classifier
val params = new ClassParams
val dtClassifier = new DecisionTreeClassifier()
.setMinInfoGain(params.minInfoGain)
.setMaxDepth(params.maxDepth) // note: Spark caps decision tree maxDepth at 30
.setLabelCol("indexedLabel")
.setFeaturesCol("features")
val indexModel = pipeline.getStages(1).asInstanceOf[StringIndexerModel]
// Convert predicted indices back to the original label strings
val labelConverter = new IndexToString()
.setLabels(indexModel.labels)
.setInputCol(dtClassifier.getPredictionCol)
.setOutputCol("predictedLabel")
val stages = pipeline.getStages ++ Array(dtClassifier, labelConverter)
pipeline.setStages(stages)
val model = pipeline.fit(data)
model.write.overwrite().save(params.DTModelPath)
data.unpersist()
spark.stop()
}
}
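The demo only trains and saves the model. A rough evaluation sketch for the saved pipeline follows; testData stands for a hypothetical held-out DataFrame with the same label/title/time/content schema, params is the ClassParams instance from the example, and the fitted pipeline is assumed to produce the indexedLabel column (as the classifier configuration above implies):

import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Reload the saved pipeline and score the held-out data.
val savedModel = PipelineModel.load(params.DTModelPath)
val predictions = savedModel.transform(testData)
// "indexedLabel" and "prediction" are produced by the indexing and classifier stages of the fitted pipeline.
val accuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(predictions)
println(s"DT accuracy = $accuracy")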