This article collects typical usage examples of the Scala class org.apache.spark.mllib.util.MLUtils. If you have been wondering what MLUtils is, what it is used for, and how to use it, the curated class examples below may help.
The following presents 15 code examples of the MLUtils class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
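Most of the examples below revolve around three MLUtils entry points: loading LIBSVM-formatted data, saving it, and k-fold splitting. A minimal orientation sketch, assuming an existing SparkContext sc and placeholder file paths:
import org.apache.spark.mllib.util.MLUtils
// Load a LIBSVM text file into an RDD[LabeledPoint].
val points = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")
// Split into 3 (train, test) fold pairs for cross-validation.
val folds = MLUtils.kFold(points, 3, seed = 42)
// Write an RDD[LabeledPoint] back out as LIBSVM text files.
MLUtils.saveAsLibSVMFile(points, "out/points")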
Example 1: HME_BD
// Set the package name and import the required classes
package org.apache.spark.mllib.feature
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.mllib.util.MLUtils
// Train on k-1 folds, predict on the held-out fold
class HME_BD(val data: RDD[LabeledPoint], val nTrees: Int, val k: Int, val maxDepth: Int = 10, val seed: Int) extends Serializable {
private val labels = data.map(_.label).distinct().collect()
private var modelNoise: Array[RDD[LabeledPoint]] = new Array[RDD[LabeledPoint]](k)
def runFilter(): RDD[LabeledPoint] = {
val cvdat = MLUtils.kFold(data, k, seed)
//RF Parameters
val numClasses = labels.length
val categoricalFeaturesInfo = Map[Int, Int]()
val featureSubsetStrategy = "all"
val impurity = "gini"
val maxBins = 32
    modelNoise = cvdat.map {
      case (train, test) =>
        val rfModel = RandomForest.trainClassifier(train, numClasses, categoricalFeaturesInfo,
          nTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
        // Key the predictions and the test examples by index so the two RDDs can be joined.
        val rfPred = rfModel.predict(test.map(_.features)).zipWithIndex.map { case (pred, idx) => (idx, pred) }
        test.zipWithIndex.map { case (example, idx) => (idx, example) }.join(rfPred).map {
          case (_, (example, rfVote)) =>
            // Mark examples whose label disagrees with the forest's vote as noise (label -1).
            if (rfVote != example.label) LabeledPoint(-1, example.features) else example
        }.filter(point => point.label != -1)
    }
    // Each per-fold RDD is already noise-filtered; concatenate the folds into the final result.
    modelNoise.reduce(_ union _)
}
}
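A minimal usage sketch for the filter above, assuming an existing SparkContext sc; the input path and parameter values are illustrative:
val data = MLUtils.loadLibSVMFile(sc, "data/noisy_train.txt")
val filter = new HME_BD(data, nTrees = 100, k = 5, maxDepth = 10, seed = 12345)
val cleaned = filter.runFilter() // RDD[LabeledPoint] with suspected noise removed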
Example 2: StandardScalarSample
// Set the package name and import the required classes
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
object StandardScalarSample {
def main(args: Array[String]) {
val conf = new SparkConf().setMaster("local").setAppName("StandardScalarSample")
val sc = new SparkContext(conf)
val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt")
val scaler1 = new StandardScaler().fit(data.map(x => x.features))
val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
// scaler3 is an identical model to scaler2, and will produce identical transformations
val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
// data1 will be unit variance.
val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
println(data1.first())
// Without converting the features into dense vectors, a transformation with zero mean
// will raise an exception on sparse vectors.
// data2 will be unit variance and zero mean.
val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
println(data2.first())
}
}
Author: PacktPublishing; Project: Machine-Learning-with-Spark-Second-Edition; Lines: 28; Source: StandardScalarSample.scala
Example 3: CLIParserDataGen
// Set the package name and import the required classes
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
import utils.Utils
class CLIParserDataGen(arguments: Seq[String]) extends org.rogach.scallop.ScallopConf(arguments) {
val numPoints = opt[Int](required = true, short = 'n', descr = "Number of data points to generate")
val numFeatures = opt[Int](required = true, short = 'm', descr = "Number of features to generate")
val partitions = opt[Int](required = false, default = Some(4), short = 'p', validate = (0 <),
descr = "Number of spark partitions to be used. Optional.")
val dir = opt[String](required = false, default = Some("../dataset"), short = 'd', descr = "Working directory where the dataset is stored. Default is \"../dataset\".")
val datasetType = opt[String](required = false, default = Some("Regression"), descr = "Type of dataset. Only \"Regression\" is supported at the moment.")
verify()
}
object GenerateData {
def main(args: Array[String]) {
//Spark conf
val conf = new SparkConf().setAppName("Distributed Machine Learning").setMaster("local[*]")
val sc = new SparkContext(conf)
//Parser arguments
val parser = new CLIParserDataGen(args)
val numPoints = parser.numPoints()
val numFeatures = parser.numFeatures()
val numPartitions = parser.partitions()
val workingDir = parser.dir()
val datasetType = parser.datasetType()
if (datasetType == "Regression") {
val data = Utils.generateLabeledPoints(sc, numPoints, numFeatures, 1, 1.0, numPartitions, System.nanoTime())
MLUtils.saveAsLibSVMFile(data, workingDir)
} else {
print("Error: dataset generation of type \"" + datasetType + "\" not supported.")
System.exit(1)
}
}
}
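A possible invocation of the generator above, using the short flags defined in CLIParserDataGen; all values are illustrative:
// 10000 points, 50 features, 8 partitions, written under ../dataset
GenerateData.main(Array("-n", "10000", "-m", "50", "-p", "8", "-d", "../dataset"))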
Example 4: PrepArgParser
// Set the package name and import the required classes
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
import utils.Utils
import scala.util.Try
import scalax.file.Path
class PrepArgParser(arguments: Seq[String]) extends org.rogach.scallop.ScallopConf(arguments) {
val dataset = opt[String](required = true, short = 'd',
descr = "absolute address of the libsvm dataset. This must be provided.")
val partitions = opt[Int](required = false, default = Some(4), short = 'p', validate = (0 <),
descr = "Number of spark partitions to be used. Optional.")
val dir = opt[String](required = false, default = Some("../results/"), short = 'w',
descr = "Working directory where results are stored. Default is \"../results/\".")
val method = opt[String](required = true, short = 'm',
descr = "Method can be either \"Regression\" or \"Classification\". This must be provided")
verify()
}
object PrepareData {
def main(args: Array[String]) {
//Spark conf
val conf = new SparkConf().setAppName("Distributed Machine Learning").setMaster("local[*]")
val sc = new SparkContext(conf)
//Turn off logs
val rootLogger = Logger.getRootLogger()
rootLogger.setLevel(Level.ERROR)
//Parse arguments
val parser = new PrepArgParser(args)
val dataset = parser.dataset()
var workingDir = parser.dir()
val numPartitions = parser.partitions()
val method = parser.method()
//Load data
val (train, test) = method match {
case "Classification" => Utils.loadAbsolutLibSVMBinaryClassification(dataset, numPartitions, sc)
case "Regression" => Utils.loadAbsolutLibSVMRegression(dataset, numPartitions, sc)
case _ => throw new IllegalArgumentException("The method " + method + " is not supported.")
}
// Append "/" to workingDir if necessary.
workingDir = if (workingDir.endsWith("/")) workingDir else workingDir + "/"
val trainPath: Path = Path.fromString(workingDir + "train")
Try(trainPath.deleteRecursively(continueOnFailure = false))
val testPath: Path = Path.fromString(workingDir + "test")
Try(testPath.deleteRecursively(continueOnFailure = false))
MLUtils.saveAsLibSVMFile(train, workingDir + "train")
MLUtils.saveAsLibSVMFile(test, workingDir + "test")
}
}
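A possible invocation, using the short flags defined in PrepArgParser; the paths are illustrative:
// -d input LIBSVM file, -m task type, -w output directory, -p partition count
PrepareData.main(Array("-d", "/data/a9a.libsvm", "-m", "Classification", "-w", "../results/", "-p", "8"))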
Example 5: PCAExample2
// Set the package name and import the required classes
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
object PCAExample2 {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.master("local[*]")
.config("spark.sql.warehouse.dir", "E:/Exp/")
.appName("PCAExample2")
.getOrCreate()
    // The MNIST data is loaded twice: as an RDD[LabeledPoint] for MLlib, and as a DataFrame for display.
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "data/mnist.bz2")
    val df = spark.read.format("libsvm").load("C:/Exp/mnist.bz2")
    df.show(20)
val featureSize = data.first().features.size
println("Feature Size: " + featureSize)
val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
val (training, test) = (splits(0), splits(1))
val pca = new PCA(featureSize/2).fit(data.map(_.features))
val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
val numIterations = 20
val stepSize = 0.0001
val model = LinearRegressionWithSGD.train(training, numIterations, stepSize)
val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations, stepSize)
val valuesAndPreds = test.map { point =>
val score = model.predict(point.features)
(score, point.label)
}
val valuesAndPreds_pca = test_pca.map { point =>
val score = model_pca.predict(point.features)
(score, point.label)
}
val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()
val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow(v - p, 2) }.mean()
println("Mean Squared Error = " + MSE)
println("PCA Mean Squared Error = " + MSE_pca)
println("Model coefficients:"+ model.toString())
println("Model with PCA coefficients:"+ model_pca.toString())
spark.stop()
}
}
Author: PacktPublishing; Project: Scala-and-Spark-for-Big-Data-Analytics; Lines: 61; Source: PCA_LinearRegression_Demo.scala
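If the reduced features need to be reused later, MLUtils can write them back out as well; a one-line sketch (the output path is a placeholder):
MLUtils.saveAsLibSVMFile(training_pca, "data/mnist_pca_train")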
Example 6: StandardScalarSample
// Set the package name and import the required classes
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
object StandardScalarSample {
def main(args: Array[String]) {
val conf = new SparkConf().setMaster("local").setAppName("StandardScalarSample")
val sc = new SparkContext(conf)
val data = MLUtils.loadLibSVMFile(sc,
org.sparksamples.Util.SPARK_HOME + "/data/mllib/sample_libsvm_data.txt")
val scaler1 = new StandardScaler().fit(data.map(x => x.features))
val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
// scaler3 is an identical model to scaler2, and will produce identical transformations
val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
// data1 will be unit variance.
val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
println(data1.first())
// Without converting the features into dense vectors, a transformation with zero mean
// will raise an exception on sparse vectors.
// data2 will be unit variance and zero mean.
val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
println(data2.first())
}
}
Author: PacktPublishing; Project: Machine-Learning-with-Spark-Second-Edition; Lines: 29; Source: StandardScalarSample.scala
Example 7: MLLibRandomForestModel
// Set the package name and import the required classes
package com.asto.dmp.articlecate.biz
import com.asto.dmp.articlecate.base.Props
import com.asto.dmp.articlecate.utils.FileUtils
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import com.asto.dmp.articlecate.biz.ClsFeaturesParser._
import scala.collection._
class MLLibRandomForestModel(val sc: SparkContext, val modelPath: String) extends scala.Serializable with Logging {
def genRandomForestModel(svmTrainDataPath: String) = {
val numClasses = ClsFeaturesParser.clsNameToCodeMap.size
val categoricalFeaturesInfo = immutable.Map[Int, Int]()
val numTrees = Props.get("model_numTrees").toInt
val featureSubsetStrategy = Props.get("model_featureSubsetStrategy") // e.g. "auto" lets the algorithm choose
val impurity = Props.get("model_impurity")
val maxDepth = Props.get("model_maxDepth").toInt
val maxBins = Props.get("model_maxBins").toInt
val trainingData = MLUtils.loadLibSVMFile(sc, svmTrainDataPath).cache()
val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
FileUtils.deleteFilesInHDFS(modelPath)
model.save(sc, modelPath)
testErrorRate(trainingData, model)
}
private def testErrorRate(trainingData: RDD[LabeledPoint], model: RandomForestModel) = {
if (Props.get("model_test").toBoolean) {
val testData = trainingData.sample(false, Props.get("model_sampleRate").toDouble)
val labelAndPreds = testData.map { point =>
val prediction = model.predict(point.features)
(point.label, prediction)
}
val testError = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
logInfo(s"????????????$testError")
} else {
logInfo(s"???????????")
}
}
def predictAndSave(lineAndVectors: Array[(String, org.apache.spark.mllib.linalg.Vector)], resultPath: String) = {
val model = RandomForestModel.load(sc, modelPath)
val result = lineAndVectors.map(lv => (s"${clsCodeToNameMap(model.predict(lv._2).toInt.toString)}\t${lv._1}")).mkString("\n")
FileUtils.saveFileToHDFS(resultPath, result)
}
}
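A minimal usage sketch of the class above, assuming an existing SparkContext sc; the paths are placeholders:
val trainer = new MLLibRandomForestModel(sc, "hdfs:///models/article_rf")
trainer.genRandomForestModel("hdfs:///data/train.libsvm") // trains, saves, and optionally tests the model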
Example 8: KMeansTest
// Set the package name and import the required classes
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object KMeansTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("KMeansTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/KMeans.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Load and parse the data
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
val parsedData = data.map(s => s.features).cache()
// Cluster the data into two classes using KMeans
val numClusters = 2
val numIterations = 20
val clusters = KMeans.train(parsedData, numClusters, numIterations)
    val predictionAndLabels = data.map(s => (clusters.predict(s.features), s.label))
    // Evaluate the clustering by comparing cluster assignments with the true labels
    // (this implicitly assumes that cluster id 1 lines up with label 1.0).
println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
}
}
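The usual internal quality measure for KMeans, the Within Set Sum of Squared Errors, is available directly on the trained model from the example above; a short sketch:
// WSSSE: the sum of squared distances from each point to its nearest cluster center.
val wssse = clusters.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + wssse)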
Example 9: DecisionTreeTest
// Set the package name and import the required classes
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object DecisionTreeTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("DecisionTreeTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/DecisionTree.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Load and parse the data file.
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
// Split the data into training and test sets (30% held out for testing)
val splits = data.randomSplit(Array(0.7, 0.3))
val (training, test) = (splits(0), splits(1))
// Train a DecisionTree model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "entropy" // Also, we can use entrophy
val maxDepth = 14
val maxBins = 16384
val model = DecisionTree.trainClassifier(training, numClasses, categoricalFeaturesInfo,
impurity, maxDepth, maxBins)
val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
val prediction = model.predict(features)
(prediction, label)
}
val metrics = new BinaryClassificationMetrics(predictionAndLabels)
val auROC = metrics.areaUnderROC()
println("Area under ROC = " + auROC)
println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
}
}
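Beyond ROC, the BinaryClassificationMetrics instance built above also exposes the area under the precision-recall curve, which is often more informative on imbalanced data; a one-line sketch:
println("Area under PR = " + metrics.areaUnderPR())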
Example 10: SVMTest
// Set the package name and import the required classes
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object SVMTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("SVMTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/SVM.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
// Split data into training (70%) and test (30%).
val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
val training = splits(0).cache()
val test = splits(1)
// Run training algorithm to build the model
val numIterations = 100
val model = SVMWithSGD.train(training, numIterations)
    // Note: setThreshold(-5000) makes predict() return 1.0 for nearly every point;
    // call model.clearThreshold() instead if raw scores are wanted for the ROC metric below.
    model.setThreshold(-5000)
// Compute raw scores on the test set.
val scoreAndLabels = test.map { point =>
val score = model.predict(point.features)
(score, point.label)
}
// Get evaluation metrics.
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
val auROC = metrics.areaUnderROC()
println("Area under ROC = " + auROC)
println("Sensitivity = " + scoreAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / scoreAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + scoreAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / scoreAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + scoreAndLabels.filter(x => x._1 == x._2).count().toDouble / scoreAndLabels.count().toDouble)
}
}
Example 11: LogisticRegressionTest
// Set the package name and import the required classes
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object LogisticRegressionTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("LogisticRegressionTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/LogisticRegression.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
val training = splits(0).cache()
val test = splits(1)
// Run training algorithm to build the model
val model = new LogisticRegressionWithLBFGS()
.setNumClasses(2)
.run(training)
// Compute class predictions on the test set.
val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
val prediction = model.predict(features)
(prediction, label)
}
// Get evaluation metrics.
val metrics = new BinaryClassificationMetrics(predictionAndLabels)
val auROC = metrics.areaUnderROC()
println("Area under ROC = " + auROC)
println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
}
}
Example 12: RandomForestTest
// Set the package name and import the required classes
package cn.edu.bjtu
import org.apache.spark.SparkConf
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object RandomForestTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("RandomForestTest")
.setMaster("spark://master:7077")
.setJars(Array("/home/hadoop/RandomForest.jar"))
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
// Load and parse the data file.
val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
// Split the data into training and test sets (30% held out for testing)
val splits = data.randomSplit(Array(0.7, 0.3))
val (training, test) = (splits(0), splits(1))
// Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 3 // Use more in practice.
val featureSubsetStrategy = "18" // Let the algorithm choose.
val impurity = "gini"
val maxDepth = 14
val maxBins = 16384
val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo,
numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
val prediction = model.predict(features)
(prediction, label)
}
val metrics = new BinaryClassificationMetrics(predictionAndLabels)
val auROC = metrics.areaUnderROC()
println("Area under ROC = " + auROC)
println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
}
}
Example 13: Url
// Set the package name and import the required classes
package dataset
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
object Url {
def saveUrl(sc: SparkContext): Unit = {
var whole: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "/Users/sara/url_svmlight/Day" + 0 + ".svm")
for (i <- 1 to 120) {
val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, ("/Users/sara/url_svmlight/Day" + i + ".svm"))
whole = whole.union(examples)
}
whole = whole.repartition(4) // repartition returns a new RDD; without reassignment the call is a no-op
whole.saveAsObjectFile("file:/Users/sara/galaxy/data/url")
}
def main(args: Array[String]) {
  val conf = new SparkConf().setAppName("url")
  val sc = new SparkContext(conf)
  saveUrl(sc)
  sc.stop()
}
}
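The saved object file can later be reloaded without reparsing the 121 LIBSVM text files; a sketch, again assuming a SparkContext sc:
// Reload the RDD[LabeledPoint] that saveUrl persisted.
val whole = sc.objectFile[LabeledPoint]("file:/Users/sara/galaxy/data/url")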
Example 14: LRAccuracyTest
// Set the package name and import the required classes
package MLlib
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.SparseLogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}
object LRAccuracyTest {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName(s"LRAccuracyTest with ${args.mkString(" ")}").setMaster("local")
val sc = new SparkContext(conf)
Logger.getRootLogger.setLevel(Level.WARN)
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
l => LabeledPoint(l.label, l.features.toSparse))
// Split data into training (60%) and test (40%).
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)
// Run training algorithm to build the model
val model = new SparseLogisticRegressionWithLBFGS()
.setNumClasses(5)
.run(training)
// Compute raw scores on the test set.
val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
val prediction = model.predict(features)
(prediction, label)
}
// Get evaluation metrics.
val metrics = new MulticlassMetrics(predictionAndLabels)
val precision = metrics.precision // deprecated since Spark 2.0; metrics.accuracy is the replacement
println("Precision = " + precision)
}
}
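The aggregate precision hides per-class behavior in a five-class model; MulticlassMetrics can also report precision and recall per label, a short sketch using the metrics instance above:
metrics.labels.foreach { l =>
  println(s"Class $l: precision = ${metrics.precision(l)}, recall = ${metrics.recall(l)}")
}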
Example 15: ModelTrainer
// Set the package name and import the required classes
package modelmanager
import java.io.File
import com.typesafe.config.Config
import org.apache.commons.io.FileUtils
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.streaming.StreamingContext
import scala.collection.JavaConversions._
object ModelTrainer {
val nClasses: Int = 2
val positiveLabel: Double = 1.0
val negativeLabel: Double = 0.0
val maxBins: Int = 100
def trainModels(ssc: StreamingContext, config: Config) = {
//Load configuration
val depth = config.getInt("models.trainingConfiguration.depth")
val impurity = config.getString("models.trainingConfiguration.impurity")
val strategy = config.getString("models.trainingConfiguration.strategy")
val seed = config.getInt("models.trainingConfiguration.seed")
val forestSize = config.getInt("models.trainingConfiguration.forestSize")
val dataPath = config.getString("models.trainingConfiguration.pathToTrainingData")
val modelsPath = config.getString("models.pathToModels")
val events = config.getStringList("models.models")
val categoricalInfo = Range(0, config.getInt("eventsCount")).map((_, 2)).toMap
val models = events.par.map(modelName => {
(modelName,
RandomForest.trainClassifier(
MLUtils.loadLibSVMFile(ssc.sparkContext, dataPath + modelName + ".libsvm"),
nClasses,
categoricalInfo,
forestSize,
strategy,
impurity,
depth,
maxBins,
seed))
})
if (config.getBoolean("models.saveModels"))
models.seq.foreach(x => {
FileUtils.deleteQuietly(new File(modelsPath + x._1))
x._2.save(ssc.sparkContext, modelsPath + x._1)
})
models
}
}
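The configuration keys read by trainModels imply a structure like the following; a hypothetical Scala sketch that builds such a Config with ConfigFactory.parseString, with all values illustrative:
import com.typesafe.config.{Config, ConfigFactory}
// Hypothetical configuration matching the keys ModelTrainer reads.
val config: Config = ConfigFactory.parseString("""
  eventsCount = 10
  models {
    pathToModels = "/models/"
    saveModels = true
    models = ["clickEvent", "purchaseEvent"] // one RandomForest is trained per listed event
    trainingConfiguration {
      depth = 10
      impurity = "gini"
      strategy = "auto" // featureSubsetStrategy passed to RandomForest.trainClassifier
      seed = 42
      forestSize = 100
      pathToTrainingData = "/data/train/"
    }
  }
""")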