本文整理汇总了Scala中org.apache.spark.ml.classification.NaiveBayes类的典型用法代码示例。如果您正苦于以下问题:Scala NaiveBayes类的具体用法?Scala NaiveBayes怎么用?Scala NaiveBayes使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了NaiveBayes类的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: movies
//设置package包名称以及导入依赖的类
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Binary text classification of movie reviews with a
 * Tokenizer -> HashingTF -> NaiveBayes pipeline.
 *
 * Training text is read from `file:///data/train/neg/` (label 0.0) and
 * `file:///data/train/pos/` (label 1.0), one record per line. Every file under
 * `file:///data/test/` is scored as a whole document, and the resulting
 * (file, prediction) pairs are written as tab-separated CSV with a header to
 * `/tmp/spark-prediction`.
 */
object movies {

  /** One training record: the raw text and its class label. */
  case class Sentence(sentence: String, label: Double)

  /**
   * Trains the pipeline on the labelled reviews and writes predictions for the test files.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("Movies Reviews")
      .config("spark.master", "local")
      .getOrCreate()

    try {
      // Labelled training records: every line of the negative / positive corpora.
      val neg = spark.sparkContext.textFile("file:///data/train/neg/").repartition(4)
        .map(w => Sentence(w, 0.0))
      val pos = spark.sparkContext.textFile("file:///data/train/pos/").repartition(4)
        .map(w => Sentence(w, 1.0))

      // Test documents as (file name without directory and extension, full content).
      val test = spark.sparkContext.wholeTextFiles("file:///data/test/").repartition(4)
        .map { case (file, sentence) => (file.split("/").last.split("\\.")(0), sentence) }

      val training = neg.union(pos)
      val trainingDF = spark.createDataFrame(training)
      // A DataFrame built from plain tuples gets the columns `_1`/`_2`; name them explicitly
      // so the tokenizer finds "sentence" and the final select finds "file".
      val testDF = spark.createDataFrame(test).toDF("file", "sentence")

      // Configure an ML pipeline with three stages: tokenizer, hashingTF and Naive Bayes.
      val tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
      val hashingTF = new HashingTF()
        .setInputCol(tokenizer.getOutputCol)
        .setOutputCol("features")
      val nb = new NaiveBayes()
      val pipeline = new Pipeline()
        .setStages(Array(tokenizer, hashingTF, nb))

      // Fit the pipeline to the training documents.
      val model = pipeline.fit(trainingDF)

      // Score the test documents and write a single tab-separated output file.
      model.transform(testDF).repartition(1)
        .select("file", "prediction")
        .write.format("csv")
        .option("header", "true")
        .option("delimiter", "\t")
        .save("/tmp/spark-prediction")
    } finally {
      // Release the session even when reading, training or writing fails.
      spark.stop()
    }
  }
}
示例2: DocumentClassificationLibSVM
//设置package包名称以及导入依赖的类
package org.apache.spark.examples.ml
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
/**
 * Trains a Naive Bayes classifier on a LibSVM-formatted data set and reports its accuracy
 * on a held-out 30% split.
 */
object DocumentClassificationLibSVM {

  /**
   * Loads the LibSVM file, splits it 70/30 with a fixed seed, fits NaiveBayes with default
   * parameters on the larger part and prints the predictions and accuracy for the smaller part.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("SparkApp")
    val session = SparkSession
      .builder()
      .appName("SparkRatingData")
      .config(sparkConf)
      .getOrCreate()

    val inputPath = "./output/20news-by-date-train-libsvm/part-combined"
    val corpus = session.read.format("libsvm").load(inputPath)
    val Array(trainSet, heldOutSet) = corpus.randomSplit(Array(0.7, 0.3), seed = 1L)

    // Fit the classifier on the training split, then score the held-out split.
    val classifier = new NaiveBayes()
    val fitted = classifier.fit(trainSet)
    val scored = fitted.transform(heldOutSet)
    scored.show()

    val accuracyEvaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val testAccuracy = accuracyEvaluator.evaluate(scored)
    println(s"Test set accuracy = $testAccuracy")

    session.stop()
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:37,代码来源:DocumentClassificationLibSVM.scala
示例3: NaiveBayesPipeline
//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
/** Naive Bayes training/evaluation pipeline: label indexing, feature assembly, classifier. */
object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Fits a StringIndexer -> VectorAssembler -> NaiveBayes pipeline on 90% of `dataFrame`
   * and prints the training time and the accuracy measured on the remaining 10%.
   *
   * @param vectorAssembler assembler producing the feature column consumed by NaiveBayes
   *                        (NaiveBayes is left on its default features column)
   * @param dataFrame       input rows; must contain a "label" column plus the assembler's inputs
   */
  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame): Unit = {
    // Fixed seed keeps the split reproducible between runs.
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    // Train against the indexed label so the indexer stage is actually used.
    val nb = new NaiveBayes()
      .setLabelCol(labelIndexer.getOutputCol)
    val stages: Array[PipelineStage] = Array(labelIndexer, vectorAssembler, nb)
    val pipeline = new Pipeline().setStages(stages)

    // Fit the Pipeline on the training split only, so the held-out rows stay unseen.
    val startTime = System.nanoTime()
    val model = pipeline.fit(training)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    // Select (prediction, true label) on the held-out split and compute the test accuracy.
    val holdout = model.transform(test).select("prediction", labelIndexer.getOutputCol)
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol(labelIndexer.getOutputCol)
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:52,代码来源:NaiveBayesPipeline.scala
示例4: NaiveBayesPipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon
import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame
import scala.collection.mutable
/** Naive Bayes training/evaluation pipeline: label indexing, feature assembly, classifier. */
object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Fits a StringIndexer -> VectorAssembler -> NaiveBayes pipeline on 90% of `dataFrame`
   * and prints the training time and the accuracy measured on the remaining 10%.
   *
   * @param vectorAssembler assembler producing the feature column consumed by NaiveBayes
   *                        (NaiveBayes is left on its default features column)
   * @param dataFrame       input rows; must contain a "label" column plus the assembler's inputs
   */
  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame): Unit = {
    // Fixed seed keeps the split reproducible between runs.
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    // Train against the indexed label so the indexer stage is actually used.
    val nb = new NaiveBayes()
      .setLabelCol(labelIndexer.getOutputCol)
    val stages: Array[PipelineStage] = Array(labelIndexer, vectorAssembler, nb)
    val pipeline = new Pipeline().setStages(stages)

    // Fit the Pipeline on the training split only, so the held-out rows stay unseen.
    val startTime = System.nanoTime()
    val model = pipeline.fit(training)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    // Select (prediction, true label) on the held-out split and compute the test accuracy.
    val holdout = model.transform(test).select("prediction", labelIndexer.getOutputCol)
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol(labelIndexer.getOutputCol)
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:52,代码来源:NaiveBayesPipeline.scala
示例5: NaiveBayesJob
//设置package包名称以及导入依赖的类
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.{Vector => LVector}
import org.apache.spark.sql.SparkSession
/**
 * Mist ML job with two entry points: `train` fits and saves a NaiveBayes pipeline,
 * `serve` loads a saved pipeline and scores feature rows locally.
 */
object NaiveBayesJob extends MLMistJob {

  /** Gets or creates a SparkSession using the app name and configuration of `context`. */
  def session: SparkSession = {
    val builder = SparkSession.builder()
    builder
      .appName(context.appName)
      .config(context.getConf)
      .getOrCreate()
  }

  /**
   * Fits a single-stage NaiveBayes pipeline on a small in-memory sample and saves it.
   *
   * @param savePath destination of the fitted pipeline model; existing content is overwritten
   * @return an empty map
   */
  def train(savePath: String): Map[String, Any] = {
    // Six (features, label) samples.
    val samples = Seq(
      (Vectors.dense(4.0, 0.2, 3.0, 4.0, 5.0), 1.0),
      (Vectors.dense(3.0, 0.3, 1.0, 4.1, 5.0), 1.0),
      (Vectors.dense(2.0, 0.5, 3.2, 4.0, 5.0), 1.0),
      (Vectors.dense(5.0, 0.7, 1.5, 4.0, 5.0), 1.0),
      (Vectors.dense(1.0, 0.1, 7.0, 4.0, 5.0), 0.0),
      (Vectors.dense(8.0, 0.3, 5.0, 1.0, 7.0), 0.0)
    )
    val trainingFrame = session.createDataFrame(samples).toDF("features", "label")

    val classifier = new NaiveBayes()
    val fitted = new Pipeline().setStages(Array(classifier)).fit(trainingFrame)
    fitted.write.overwrite().save(savePath)

    Map.empty[String, Any]
  }

  /**
   * Scores the given feature rows with a previously saved pipeline.
   *
   * @param modelPath location of the saved pipeline model
   * @param features  one inner list of feature values per row to score
   * @return a map whose "result" entry holds one map per row with the "probability",
   *         "rawPrediction" and "prediction" columns; the two vector columns are
   *         converted to plain arrays
   */
  def serve(modelPath: String, features: List[List[Double]]): Map[String, Any] = {
    import LocalPipelineModel._

    val loaded = PipelineLoader.load(modelPath)
    val input = LocalData(LocalDataColumn("features", features))
    val scored = loaded.transform(input)

    val rows = scored.select("probability", "rawPrediction", "prediction").toMapList
    val response = rows.map { row =>
      // Replace the vector-valued entries with arrays, leaving the other entries untouched.
      val probability = row("probability").asInstanceOf[LVector].toArray
      val rawPrediction = row("rawPrediction").asInstanceOf[LVector].toArray
      row + ("probability" -> probability) + ("rawPrediction" -> rawPrediction)
    }

    Map("result" -> response)
  }
}
示例6: WatherScript
//设置package包名称以及导入依赖的类
package naivebayes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
 * Trains a multinomial Naive Bayes model on a semicolon-separated numeric CSV file in which
 * the last column is the label and all preceding columns are features, then prints the
 * predictions and the accuracy on a 30% held-out split.
 *
 * Uses an explicit `main` instead of `extends App`: the Spark documentation advises against
 * `scala.App` because its delayed initialisation may not work correctly with Spark.
 */
object WatherScript {

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath =
    "/Users/mateusz/Workspace/mllib/spark-naive-bayes/src/main/resources/wather-nums.csv"

  /**
   * @param args optional: `args(0)` overrides the input CSV path
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    try {
      // For implicit conversions like converting RDDs to Datasets
      import spark.implicits._

      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      val watherRaw: RDD[String] = sc.textFile(inputPath)

      val data: Dataset[LabeledPoint] = watherRaw
        .filter(_.trim.nonEmpty) // a blank line would otherwise fail in toDouble
        .map(_.split(";"))
        .map { csv =>
          val label = csv.last.toDouble
          val point = csv.init.map(_.toDouble)
          LabeledPoint(label, Vectors.dense(point))
        }
        .toDS()

      // Fixed seed keeps the split reproducible between runs.
      val Array(training, test) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

      val model = new NaiveBayes()
        .setModelType("multinomial")
        .fit(training)

      val predictions = model.transform(test)
      predictions.show()

      val evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
      val accuracy = evaluator.evaluate(predictions)
      println("Test set accuracy = " + accuracy)
    } finally {
      // Stopping the session also stops the underlying SparkContext.
      spark.stop()
    }
  }
}