本文整理汇总了Scala中org.apache.spark.ml.classification.RandomForestClassifier类的典型用法代码示例。如果您正苦于以下问题:Scala RandomForestClassifier类的具体用法?Scala RandomForestClassifier怎么用?Scala RandomForestClassifier使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了RandomForestClassifier类的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: RandomForestClassification
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.RandomForestClassifier
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
/**
 * Benchmark entry that supplies a [[RandomForestClassifier]] as the estimator
 * for the shared `TreeOrForestClassification` behaviour it extends.
 */
object RandomForestClassification extends TreeOrForestClassification {

  /**
   * Creates the random forest estimator configured from the benchmark context.
   *
   * @param ctx benchmark context; `depth` and `maxIter` are read from
   *            `ctx.params`, and the seed from `ctx.seed()`
   * @return a classifier with max depth, tree count and seed applied
   */
  override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
    import ctx.params._
    // TODO: subsamplingRate, featureSubsetStrategy
    // TODO: cacheNodeIds, checkpoint?
    // NOTE(review): `depth` / `maxIter` presumably rely on the imported
    // OptionImplicits conversions to become plain Ints -- confirm.
    val forest = new RandomForestClassifier()
    // Each setter mutates the estimator in place, so the calls can be issued
    // one by one instead of being chained.
    forest.setMaxDepth(depth)
    forest.setNumTrees(maxIter)
    forest.setSeed(ctx.seed())
    forest
  }
}
示例2: RandomForestPipeline
//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
/**
 * Trains and evaluates a random forest classification pipeline
 * (label indexing -> feature assembly -> random forest).
 */
object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Fits the pipeline on a 90% training split of `dataFrame` and prints the
   * training time and the accuracy measured on the remaining 10% hold-out split.
   *
   * @param vectorAssembler assembler used as the feature stage; its output
   *                        column is used as the classifier's features column
   * @param dataFrame       input data; must contain a `label` column and the
   *                        input columns expected by `vectorAssembler`
   */
  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame): Unit = {
    // Fixed seed keeps the split reproducible between runs.
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    // The stage list is fixed, so an immutable array literal replaces the buffer.
    val stages = Array[PipelineStage](labelIndexer, vectorAssembler, rf)
    val pipeline = new Pipeline().setStages(stages)

    // Fit the Pipeline on the training split only, so that the hold-out rows
    // are genuinely unseen when the accuracy is computed.
    val startTime = System.nanoTime()
    val model = pipeline.fit(training)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    // Predictions live in the indexed label space produced by the StringIndexer,
    // so they must be compared with `indexedLabel`, not with the raw `label`.
    val holdout = model.transform(test).select("prediction", "indexedLabel")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:62,代码来源:RandomForestPipeline.scala
示例3: RandomForestPipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
/**
 * Random forest classification pipeline: indexes the label column, assembles
 * the feature vector and fits a [[RandomForestClassifier]].
 */
object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Splits `dataFrame` 90/10, fits the pipeline on the larger part and prints
   * the training time together with the accuracy on the smaller, held-out part.
   *
   * @param vectorAssembler feature assembler added as a pipeline stage; the
   *                        classifier reads features from its output column
   * @param dataFrame       input rows; a `label` column plus the assembler's
   *                        input columns are required
   */
  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame): Unit = {
    // Seeded split so repeated runs evaluate on the same rows.
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    // Three fixed stages: no mutable buffer is needed to collect them.
    val pipeline = new Pipeline()
      .setStages(Array[PipelineStage](labelIndexer, vectorAssembler, rf))

    // Fit the Pipeline; only the training split is used so the reported
    // figure below is a real test-set accuracy rather than a training one.
    val startTime = System.nanoTime()
    val model = pipeline.fit(training)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    // The classifier predicts StringIndexer indices, therefore the reference
    // column for evaluation is `indexedLabel` and not the raw `label`.
    val holdout = model.transform(test).select("prediction", "indexedLabel")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:62,代码来源:RandomForestPipeline.scala
示例4: Test
//设置package包名称以及导入依赖的类
package org.apache.spark.test
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.StringIndexer
/**
 * Small timing driver: loads the HIGGS CSV data set, then times the fitting of
 * a k-means model and of a random forest classifier on it.
 */
object Test {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not read)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    // Ensure the context is released even when loading or training fails.
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      val npart = 216

      /** Evaluates `a` once, prints the elapsed wall-clock seconds and returns its result. */
      def time[A](a: => A): A = {
        val now = System.nanoTime
        val result = a
        val sec = (System.nanoTime - now) * 1e-9
        println("Total time (secs): " + sec)
        result
      }

      val file = "hdfs://hadoop-master:8020/user/spark/datasets/higgs/HIGGS.csv"
      val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "false")
        .option("inferSchema", "true").load(file).repartition(npart)

      // Every column except the first one is assembled into the feature vector;
      // the first column is used as the label further below.
      import org.apache.spark.ml.feature.VectorAssembler
      val featureAssembler = new VectorAssembler().setInputCols(df.columns.drop(1)).setOutputCol("features")
      val processedDf = featureAssembler.transform(df).cache()
      // println (not print) so the count does not run into the next timing line;
      // the count also materialises the cached data before anything is timed.
      println("Num. elements: " + processedDf.count)

      //KMEANS
      // Trains a k-means model.
      import org.apache.spark.ml.clustering.KMeans
      val kmeans = new KMeans().setSeed(1L)
      val cmodel = time(kmeans.fit(processedDf.select("features")))

      //RANDOM FOREST
      import org.apache.spark.ml.classification.RandomForestClassifier
      val labelCol = df.columns.head
      val indexer = new StringIndexer().setInputCol(labelCol).setOutputCol("labelIndexed")
      val imodel = indexer.fit(processedDf)
      val indexedDF = imodel.transform(processedDf)
      val rf = new RandomForestClassifier().setFeaturesCol("features").setLabelCol("labelIndexed")
      val model = time(rf.fit(indexedDF))
    } finally {
      sc.stop()
    }
  }
}