本文整理汇总了Scala中org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator类的典型用法代码示例。如果您正苦于以下问题:Scala MulticlassClassificationEvaluator类的具体用法?Scala MulticlassClassificationEvaluator怎么用?Scala MulticlassClassificationEvaluator使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MulticlassClassificationEvaluator类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: DocumentClassificationLibSVM
//设置package包名称以及导入依赖的类
package org.apache.spark.examples.ml
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
object DocumentClassificationLibSVM {
def main(args: Array[String]): Unit = {
val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
val spark = SparkSession
.builder()
.appName("SparkRatingData").config(spConfig)
.getOrCreate()
val data = spark.read.format("libsvm").load("./output/20news-by-date-train-libsvm/part-combined")
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1L)
// Train a NaiveBayes model.
val model = new NaiveBayes()
.fit(trainingData)
val predictions = model.transform(testData)
predictions.show()
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test set accuracy = " + accuracy)
spark.stop()
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:37,代码来源:DocumentClassificationLibSVM.scala
示例2: DecisionTreePipeline
//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object DecisionTreePipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val dt = new DecisionTreeClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:60,代码来源:DecisionTreePipeline.scala
示例3: LogisticRegression
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors
object LogisticRegression extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
DataGenerator.generateContinuousFeatures(
ctx.sqlContext,
numExamples,
ctx.seed(),
numPartitions,
numFeatures)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
val rng = ctx.newGenerator()
val coefficients =
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLogisticRegressionModel(coefficients, intercept)
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
new ml.classification.LogisticRegression()
.setTol(tol)
.setMaxIter(maxIter)
.setRegParam(regParam)
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
示例4: TreeOrForestClassification
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
abstract class TreeOrForestClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
getFeatureArity(ctx), ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
object DecisionTreeClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
new DecisionTreeClassifier()
.setMaxDepth(depth)
.setSeed(ctx.seed())
}
}
object TreeOrForestClassification {
def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
val numFeatures = ctx.params.numFeatures
val fourthFeatures = numFeatures / 4
Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical
Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical
Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous
}
}
示例5: GBTClassification
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
object GBTClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
import ctx.params._
// We add +1 to the depth to make it more likely that many iterations of boosting are needed
// to model the true tree.
ModelBuilder.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
ctx.seed())
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
// TODO: subsamplingRate, featureSubsetStrategy
// TODO: cacheNodeIds, checkpoint?
new GBTClassifier()
.setMaxDepth(depth)
.setMaxIter(maxIter)
.setSeed(ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
示例6: NaiveBayesPipeline
//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object NaiveBayesPipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val nb = new NaiveBayes()
stages += vectorAssembler
stages += nb
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:52,代码来源:NaiveBayesPipeline.scala
示例7: RandomForestPipeline
//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object RandomForestPipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val rf = new RandomForestClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setNumTrees(20)
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += rf
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:62,代码来源:RandomForestPipeline.scala
示例8: DecisionTreePipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object DecisionTreePipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val dt = new DecisionTreeClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:60,代码来源:DecisionTreePipeline.scala
示例9: NaiveBayesPipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object NaiveBayesPipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val nb = new NaiveBayes()
stages += vectorAssembler
stages += nb
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:52,代码来源:NaiveBayesPipeline.scala
示例10: RandomForestPipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
object RandomForestPipeline {
@transient lazy val logger = Logger.getLogger(getClass.getName)
def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)
// Set up Pipeline
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
val rf = new RandomForestClassifier()
.setFeaturesCol(vectorAssembler.getOutputCol)
.setLabelCol("indexedLabel")
.setNumTrees(20)
.setMaxDepth(5)
.setMaxBins(32)
.setMinInstancesPerNode(1)
.setMinInfoGain(0.0)
.setCacheNodeIds(false)
.setCheckpointInterval(10)
stages += vectorAssembler
stages += rf
val pipeline = new Pipeline().setStages(stages.toArray)
// Fit the Pipeline
val startTime = System.nanoTime()
//val model = pipeline.fit(training)
val model = pipeline.fit(dataFrame)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")
//val holdout = model.transform(test).select("prediction","label")
val holdout = model.transform(dataFrame).select("prediction","label")
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val mAccuracy = evaluator.evaluate(holdout)
println("Test set accuracy = " + mAccuracy)
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:62,代码来源:RandomForestPipeline.scala
示例11: MLPTest
//设置package包名称以及导入依赖的类
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
object MLPTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("MLPTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/MLP.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Load the data stored in LIBSVM format as a DataFrame.
val data = spark.read.format("libsvm")
.load("hdfs://master:9000/sample_formatted.txt")
// Split the data into train and test
val splits = data.randomSplit(Array(0.7, 0.3), seed = 14L)
val train = splits(0)
val test = splits(1)
val layers = Array[Int](20, 20, 2)
// create the trainer and set its parameters
val trainer = new MultilayerPerceptronClassifier()
.setLayers(layers)
.setBlockSize(128)
.setSeed(14L)
.setMaxIter(100)
// train the model
val model = trainer.fit(train)
// compute accuracy on the test set
val result = model.transform(test)
val predictionAndLabels = result.select("prediction", "label")
val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy")
println("Sensitivity = " + predictionAndLabels.filter(x => x(0) == x(1) && x(0) == 1.0).count().toDouble / predictionAndLabels.filter(x => x(1) == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x(0) == x(1) && x(0) == 0.0).count().toDouble / predictionAndLabels.filter(x => x(1) == 0.0).count().toDouble)
println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels))
}
}
示例12: TreeOrForestClassification
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
abstract class TreeOrForestClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "label", numClasses, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
getFeatureArity(ctx), ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
object DecisionTreeClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
new DecisionTreeClassifier()
.setMaxDepth(depth)
.setSeed(ctx.seed())
}
}
object TreeOrForestClassification {
def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
val numFeatures = ctx.params.numFeatures
val fourthFeatures = numFeatures / 4
Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical
Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical
Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous
}
}
示例13: GBTClassification
//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
object GBTClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "label", numClasses, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
import ctx.params._
// We add +1 to the depth to make it more likely that many iterations of boosting are needed
// to model the true tree.
ModelBuilder.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
ctx.seed())
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
import ctx.params._
// TODO: subsamplingRate, featureSubsetStrategy
// TODO: cacheNodeIds, checkpoint?
new GBTClassifier()
.setMaxDepth(depth)
.setMaxIter(maxIter)
.setSeed(ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
示例14: NeuralNetworkSpec
//设置package包名称以及导入依赖的类
package io.spinor.sparkdemo.mllib
import io.spinor.sparkdemo.data.MNISTData
import io.spinor.sparkdemo.util.DemoUtil
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{FlatSpec, Matchers}
import org.slf4j.LoggerFactory
class NeuralNetworkSpec extends FlatSpec with DemoUtil with Matchers {
val logger = LoggerFactory.getLogger(classOf[NeuralNetworkSpec])
"Training on MNIST data" should " run" in {
val sparkConf = new SparkConf()
sparkConf.setAppName("NeuralNetworkDemo")
sparkConf.setMaster("local[2]")
val sparkContext = new SparkContext(sparkConf)
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
val sqlContext = sparkSession.sqlContext
import sqlContext.implicits._
val mNISTData = new MNISTData()
val trainingData = mNISTData.getTrainingData()
val trainingPoints = sparkContext.parallelize(trainingData.map(entry => LabeledPoint(entry._2, Vectors.dense(entry._1)))).toDF()
val classifier = new MultilayerPerceptronClassifier()
classifier
.setLayers(Array(784, 100))
.setBlockSize(125)
.setSeed(1234L)
.setMaxIter(10)
val model = classifier.fit(trainingPoints)
val testData = mNISTData.getTestData()
val testPoints = sparkContext.parallelize(testData.map(entry => {
LabeledPoint(entry._2, Vectors.dense(entry._1))})).toDF()
val result = model.transform(testPoints)
val predictionAndLabels = result.select("prediction", "label")
val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy")
logger.info("accuracy:" + evaluator.evaluate(predictionAndLabels))
}
}
示例15: MlpcHelpers
//设置package包名称以及导入依赖的类
package com.zobot.ai.spark.helpers
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.DataFrame
object MlpcHelpers {
case class NeuralNetworkLayers (
featureLayerSize: Int,
intermediateLayerSizes: List[Int],
classLayerSize: Int
)
def layersToArray(layers: NeuralNetworkLayers): Array[Int] = {
(layers.featureLayerSize :: layers.intermediateLayerSizes ::: List(layers.classLayerSize)).toArray
}
def createTrainer(layers: NeuralNetworkLayers, blockSize: Int, maxIterations: Int, seed: Option[Long]): MultilayerPerceptronClassifier = {
val mlpcClassifier = new MultilayerPerceptronClassifier()
.setLayers(layersToArray(layers))
.setBlockSize(blockSize)
.setMaxIter(maxIterations)
seed match {
case Some(n) => mlpcClassifier.setSeed(n)
case None => mlpcClassifier
}
}
def trainModel(trainer: MultilayerPerceptronClassifier, trainingData: DataFrame): MultilayerPerceptronClassificationModel = {
trainer.fit(trainingData)
}
def testModel(model: MultilayerPerceptronClassificationModel, testData: DataFrame): DataFrame = model.transform(testData)
def getModelAccuracy(testResults: DataFrame): Double = {
val predictionAndLabels = testResults.select("prediction", "label")
val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy")
evaluator.evaluate(predictionAndLabels)
}
}