

Scala MLUtils Class Code Examples

This article collects and summarizes typical usage examples of the org.apache.spark.mllib.util.MLUtils class in Scala. If you have been wondering what MLUtils is for, how it is used, or what working examples of it look like, the curated class examples below should help.


The sections below present 15 code examples of the MLUtils class, ordered roughly by popularity.
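
All of the examples revolve around a handful of MLUtils entry points, chiefly loadLibSVMFile, kFold and saveAsLibSVMFile. As a quick orientation, here is a minimal, self-contained sketch of the first two; the file path, fold count and seed are placeholder values, not taken from any particular example below:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object MLUtilsQuickTour {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("MLUtilsQuickTour"))

    // Parse a LIBSVM-format text file into an RDD[LabeledPoint] (placeholder path).
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    println(s"Loaded ${data.count()} labeled points with ${data.first().features.size} features")

    // Produce k (train, test) fold pairs for cross-validation, as Example 1 does.
    val folds = MLUtils.kFold(data, numFolds = 5, seed = 42)
    folds.zipWithIndex.foreach { case ((train, test), i) =>
      println(s"Fold $i: ${train.count()} train / ${test.count()} test examples")
    }

    sc.stop()
  }
}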

Example 1: HME_BD

//Set up the package name and import the required classes
package org.apache.spark.mllib.feature

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.mllib.util.MLUtils

//Train on k-1 folds, predict on the held-out fold

class HME_BD(val data: RDD[LabeledPoint], val nTrees: Int, val k: Int, val maxDepth: Int = 10, val seed: Int) extends Serializable {

  private val labels = data.map(_.label).distinct().collect()
  private var modelNoise: Array[RDD[LabeledPoint]] = new Array[RDD[LabeledPoint]](k)

  def runFilter(): RDD[LabeledPoint] = {

    val cvdat = MLUtils.kFold(data, k, seed)

    //RF Parameters
    val numClasses = labels.length
    val categoricalFeaturesInfo = Map[Int, Int]()
    val featureSubsetStrategy = "all"
    val impurity = "gini"
    val maxBins = 32

    modelNoise = cvdat.map {
      case (train, test) =>
        // Train a random forest on the k-1 training folds.
        val rfModel = RandomForest.trainClassifier(train, numClasses, categoricalFeaturesInfo,
          nTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)

        // Predict the held-out fold and key both predictions and examples by index so they can be joined.
        val rfPred = rfModel.predict(test.map(_.features)).zipWithIndex.map { case (pred, idx) => (idx, pred) }

        test.zipWithIndex.map { case (example, idx) => (idx, example) }.join(rfPred).map {
          case (_, (example, rfVote)) =>
            // Mark examples whose label disagrees with the forest's vote as noise (-1), then drop them.
            if (rfVote != example.label) LabeledPoint(-1, example.features) else example
        }.filter(point => point.label != -1)
    }

    // Concatenate the k filtered folds (noisy examples were already removed above).
    var filteredData = modelNoise(0)

    for (i <- 1 until k) {
      filteredData = filteredData.union(modelNoise(i))
    }

    filteredData
  }
} 
Author: djgarcia, Project: NoiseFramework, Lines of code: 62, Source file: HME_BD.scala
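
The class above only defines the filter itself. A minimal driver for it might look like the following sketch; the input path, tree count, fold count and seed are illustrative assumptions rather than values taken from the NoiseFramework project:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.feature.HME_BD

object HME_BD_Demo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("HME_BD_Demo"))

    // Any RDD[LabeledPoint] works here; a LIBSVM file is the simplest source (placeholder path).
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").cache()

    // nTrees, k and seed are illustrative values (maxDepth keeps its default of 10).
    val filter = new HME_BD(data, nTrees = 100, k = 5, seed = 12345)
    val cleaned = filter.runFilter()
    println(s"Kept ${cleaned.count()} of ${data.count()} examples after noise filtering")

    sc.stop()
  }
}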

Example 2: StandardScalarSample

//Set up the package name and import the required classes
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

object StandardScalarSample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local").setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt")

    val scaler1 = new StandardScaler().fit(data.map(x => x.features))
    val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
    // scaler3 is an identical model to scaler2, and will produce identical transformations
    val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)

    // data1 will be unit variance.
    val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
    println(data1.first())

    // Without converting the features into dense vectors, transformation with zero mean will raise
    // exception on sparse vector.
    // data2 will be unit variance and zero mean.
    val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
    println(data2.first())
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 28, Source file: StandardScalarSample.scala

Example 3: CLIParserDataGen

//Set up the package name and import the required classes
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
import utils.Utils

import scala.util.Try

import scalax.file.Path


class CLIParserDataGen(arguments: Seq[String]) extends org.rogach.scallop.ScallopConf(arguments) {
  val numPoints = opt[Int](required = true, short = 'n', descr = "Number of data points to generate")
  val numFeatures = opt[Int](required = true, short = 'm', descr = "Number of features to generate")
  val partitions = opt[Int](required = false, default = Some(4), short = 'p', validate = (0 <),
    descr = "Number of spark partitions to be used. Optional.")
  val dir = opt[String](required = true, default = Some("../dataset"), short = 'd', descr = "working directory where dataset is stored. Default is \"../results\". ")
  val datasetType = opt[String](required = false, default = Some("Regression"), descr = "Type of dataset. Can be \"Regression\" for the moment.")
  verify()
}


object GenerateData {
  def main(args: Array[String]) {
    //Spark conf
    val conf = new SparkConf().setAppName("Distributed Machine Learning").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //Parser arguments
    val parser = new CLIParserDataGen(args)
    val numPoints = parser.numPoints()
    val numFeatures = parser.numFeatures()
    val numPartitions = parser.partitions()
    val workingDir = parser.dir()
    val datasetType = parser.datasetType()

    if ( datasetType == "Regression" ) {
      val data = Utils.generateLabeledPoints(sc, numPoints, numFeatures, 1, 1.0, numPartitions, System.nanoTime())
      MLUtils.saveAsLibSVMFile(data, workingDir)
    } else {
      print("Error: dataset generation of type \"" + datasetType + "\" not supported.")
      System.exit(1)
    }
  }
} 
Author: mlbench, Project: mlbench, Lines of code: 46, Source file: GenerateData.scala

Example 4: PrepArgParser

//Set up the package name and import the required classes
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}
import utils.Utils

import scala.util.Try
import scalax.file.Path



class PrepArgParser(arguments: Seq[String]) extends org.rogach.scallop.ScallopConf(arguments) {
  val dataset = opt[String](required = true, short = 'd',
    descr = "absolute address of the libsvm dataset. This must be provided.")
  val partitions = opt[Int](required = false, default = Some(4), short = 'p', validate = (0 <),
    descr = "Number of spark partitions to be used. Optional.")
  val dir = opt[String](required = true, default = Some("../results/"), short = 'w', descr = "working directory where results " +
    "are stored. Default is \"../results\". ")
  val method = opt[String](required = true, short = 'm',
    descr = "Method can be either \"Regression\" or \"Classification\". This must be provided")
  verify()
}

object PrepareData {
  def main(args: Array[String]) {
    //Spark conf
    val conf = new SparkConf().setAppName("Distributed Machine Learning").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    //Turn off logs
    val rootLogger = Logger.getRootLogger()
    rootLogger.setLevel(Level.ERROR)
    //Parse arguments
    val parser = new PrepArgParser(args)
    val dataset = parser.dataset()
    var workingDir = parser.dir()
    val numPartitions = parser.partitions()
    val method = parser.method()

    //Load data
    val (train, test) = method match {
      case "Classification" => Utils.loadAbsolutLibSVMBinaryClassification(dataset, numPartitions, sc)
      case "Regression" => Utils.loadAbsolutLibSVMRegression(dataset, numPartitions, sc)
      case _ => throw new IllegalArgumentException("The method " + method + " is not supported.")
    }

    // append "/" to workingDir if necessary
    workingDir = workingDir + ( if (workingDir.takeRight(1) != "/") "/" else "" )
    val trainPath: Path = Path.fromString(workingDir + "train")
    Try(trainPath.deleteRecursively(continueOnFailure = false))
    val testPath: Path = Path.fromString(workingDir + "test")
    Try(testPath.deleteRecursively(continueOnFailure = false))
    MLUtils.saveAsLibSVMFile(train, workingDir + "train")
    MLUtils.saveAsLibSVMFile(test, workingDir + "test")
  }
} 
Author: mlbench, Project: mlbench, Lines of code: 56, Source file: PrepareData.scala
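
A note on the two data-preparation examples above: MLUtils.saveAsLibSVMFile writes a Hadoop-style output directory of part files rather than a single file, which is why PrepareData deletes the target paths before writing. A minimal round-trip sketch, with illustrative paths:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils

object LibSVMRoundTrip {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("LibSVMRoundTrip"))

    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

    // Writes a directory such as ../results/train/part-00000, not a single file.
    MLUtils.saveAsLibSVMFile(data, "../results/train")

    // The same directory path can be passed straight back to loadLibSVMFile.
    val reloaded = MLUtils.loadLibSVMFile(sc, "../results/train")
    println(s"Reloaded ${reloaded.count()} labeled points")

    sc.stop()
  }
}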

Example 5: PCAExample2

//Set up the package name and import the required classes
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LinearRegressionWithSGD

object PCAExample2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "data/mnist.bz2")
    val df = spark.read.format("libsvm").load("C:/Exp/mnist.bz2")
    df.show(20)
    
    val featureSize = data.first().features.size
    println("Feature Size: " + featureSize)

    val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val (training, test) = (splits(0), splits(1))


    val pca = new PCA(featureSize/2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 20
    val stepSize = 0.0001
    val model = LinearRegressionWithSGD.train(training, numIterations, stepSize)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations, stepSize)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow(v - p, 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)  
    
    println("Model coefficients:"+ model.toString())
    println("Model with PCA coefficients:"+ model_pca.toString())
    

    spark.stop()

  }
} 
Author: PacktPublishing, Project: Scala-and-Spark-for-Big-Data-Analytics, Lines of code: 61, Source file: PCA_LinearRegression_Demo.scala

Example 6: StandardScalarSample

//Set up the package name and import the required classes
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

object StandardScalarSample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local").setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val data = MLUtils.loadLibSVMFile(sc,
      org.sparksamples.Util.SPARK_HOME +  "/data/mllib/sample_libsvm_data.txt")

    val scaler1 = new StandardScaler().fit(data.map(x => x.features))
    val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
    // scaler3 is an identical model to scaler2, and will produce identical transformations
    val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)

    // data1 will be unit variance.
    val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
    println(data1.first())

    // Without converting the features into dense vectors, transformation with zero mean will raise
    // exception on sparse vector.
    // data2 will be unit variance and zero mean.
    val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
    println(data2.first())
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 29, Source file: StandardScalarSample.scala

Example 7: MLLibRandomForestModel

//Set up the package name and import the required classes
package com.asto.dmp.articlecate.biz

import com.asto.dmp.articlecate.base.Props
import com.asto.dmp.articlecate.utils.FileUtils
import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import com.asto.dmp.articlecate.biz.ClsFeaturesParser._
import scala.collection._

class MLLibRandomForestModel(val sc: SparkContext, val modelPath: String) extends scala.Serializable with Logging {

  def genRandomForestModel(svmTrainDataPath: String) = {
    val numClasses = ClsFeaturesParser.clsNameToCodeMap.size //Util.parseMapFrom(clsIndicesPath, nameToCode = true).size
    val categoricalFeaturesInfo = immutable.Map[Int, Int]()
    val numTrees = Props.get("model_numTrees").toInt
    val featureSubsetStrategy = Props.get("model_featureSubsetStrategy") // Let the algorithm choose.
    val impurity = Props.get("model_impurity")
    val maxDepth = Props.get("model_maxDepth").toInt
    val maxBins = Props.get("model_maxBins").toInt

    val trainingData = MLUtils.loadLibSVMFile(sc, svmTrainDataPath).cache()
    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    FileUtils.deleteFilesInHDFS(modelPath)
    model.save(sc, modelPath)

    testErrorRate(trainingData, model)
  }

  
  private def testErrorRate(trainingData: RDD[LabeledPoint], model: RandomForestModel) = {
    if (Props.get("model_test").toBoolean) {
      val testData = trainingData.sample(false, Props.get("model_sampleRate").toDouble)
      val labelAndPreds = testData.map { point =>
        val prediction = model.predict(point.features)
        (point.label, prediction)
      }
      val testError = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
      logInfo(s"????????????$testError")
    } else {
      logInfo(s"???????????")
    }
  }

  def predictAndSave(lineAndVectors: Array[(String, org.apache.spark.mllib.linalg.Vector)], resultPath: String) = {
    val model = RandomForestModel.load(sc, modelPath)
    val result = lineAndVectors.map(lv => (s"${clsCodeToNameMap(model.predict(lv._2).toInt.toString)}\t${lv._1}")).mkString("\n")
    FileUtils.saveFileToHDFS(resultPath, result)
  }
} 
Author: luciuschina, Project: ArticleCategories, Lines of code: 56, Source file: MLLibRandomForestModel.scala
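
A sketch of how this class might be driven; the HDFS paths are placeholders, and it assumes the project's Props configuration (model_numTrees, model_impurity, and so on) and the ClsFeaturesParser mappings are already in place:

import org.apache.spark.{SparkConf, SparkContext}
import com.asto.dmp.articlecate.biz.MLLibRandomForestModel

object RandomForestModelDriver {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RandomForestModelDriver"))

    // Train from a LIBSVM training set and persist the forest under modelPath (placeholder paths).
    val trainer = new MLLibRandomForestModel(sc, modelPath = "hdfs:///models/article-rf")
    trainer.genRandomForestModel("hdfs:///data/article-train.libsvm")

    sc.stop()
  }
}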

Example 8: KMeansTest

//Set up the package name and import the required classes
package cn.edu.bjtu

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object KMeansTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName("KMeansTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/KMeans.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    // Load and parse the data
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
    val parsedData = data.map(s => s.features).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)
    val predictionAndLabels = data.map(
      s => {
        (clusters.predict(s.features), s.label)
      })
    // Evaluate the clustering against the known labels (this assumes the cluster indices happen to align with the 0/1 class labels)
    println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
    println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
    println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
  }
} 
Author: XiaoyuGuo, Project: DataFusionClass, Lines of code: 40, Source file: KMeansTest.scala

Example 9: DecisionTreeTest

//Set up the package name and import the required classes
package cn.edu.bjtu


import org.apache.spark.SparkConf
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object DecisionTreeTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("DecisionTreeTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/DecisionTree.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))

    val (training, test) = (splits(0), splits(1))

    // Train a DecisionTree model.
    //  Empty categoricalFeaturesInfo indicates all features are continuous.
    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "entropy" // Also, we can use entrophy
    val maxDepth = 14
    val maxBins = 16384

    val model = DecisionTree.trainClassifier(training, numClasses, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC)
    println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
    println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
    println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
  }
} 
Author: XiaoyuGuo, Project: DataFusionClass, Lines of code: 55, Source file: DecisionTreeTest.scala

Example 10: SVMTest

//Set up the package name and import the required classes
package cn.edu.bjtu


import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object SVMTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName("SVMTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/SVM.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
    // Split data into training (70%) and test (30%).
    val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val numIterations = 100
    val model = SVMWithSGD.train(training, numIterations)

    // Lower the decision threshold to -5000, so predict() returns 0/1 labels relative to that cutoff.
    // (The stock Spark example calls model.clearThreshold() here to obtain raw scores instead.)
    model.setThreshold(-5000)

    // Compute predictions on the test set.
    val scoreAndLabels = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    // Get evaluation metrics.
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC)
    println("Sensitivity = " + scoreAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / scoreAndLabels.filter(x => x._2 == 1.0).count().toDouble)
    println("Specificity = " + scoreAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / scoreAndLabels.filter(x => x._2 == 0.0).count().toDouble)
    println("Accuracy = " + scoreAndLabels.filter(x => x._1 == x._2).count().toDouble / scoreAndLabels.count().toDouble)
  }
} 
Author: XiaoyuGuo, Project: DataFusionClass, Lines of code: 52, Source file: SVMTest.scala

Example 11: LogisticRegressionTest

//Set up the package name and import the required classes
package cn.edu.bjtu


import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

object LogisticRegressionTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName("LogisticRegressionTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/LogisticRegression.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")

    val splits = data.randomSplit(Array(0.7, 0.3), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new LogisticRegressionWithLBFGS()
      .setNumClasses(2)
      .run(training)

    // Compute predictions on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Get evaluation metrics.
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC)
    println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
    println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
    println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
  }
} 
Author: XiaoyuGuo, Project: DataFusionClass, Lines of code: 51, Source file: LogisticRegressionTest.scala

Example 12: RandomForestTest

//Set up the package name and import the required classes
package cn.edu.bjtu


import org.apache.spark.SparkConf
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession
object RandomForestTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName("RandomForestTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/RandomForest.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    // Load and parse the data file.
    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")

    // Split the data into training and test sets (30% held out for testing)
    val splits = data.randomSplit(Array(0.7, 0.3))

    val (training, test) = (splits(0), splits(1))

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3 // Use more in practice.
    val featureSubsetStrategy = "18" // Let the algorithm choose.
    val impurity = "gini"
    val maxDepth = 14
    val maxBins = 16384

    val model = RandomForest.trainClassifier(training, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }
    val metrics = new BinaryClassificationMetrics(predictionAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC)
    println("Sensitivity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 1.0).count().toDouble)
    println("Specificity = " + predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble / predictionAndLabels.filter(x => x._2 == 0.0).count().toDouble)
    println("Accuracy = " + predictionAndLabels.filter(x => x._1 == x._2).count().toDouble / predictionAndLabels.count().toDouble)
  }
} 
Author: XiaoyuGuo, Project: DataFusionClass, Lines of code: 57, Source file: RandomForestTest.scala
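
The five DataFusionClass examples above (KMeansTest, DecisionTreeTest, SVMTest, LogisticRegressionTest, RandomForestTest) repeat the same sensitivity/specificity/accuracy arithmetic. A small helper like the one below, which is not part of the original project, could factor that out; it expects (prediction, label) pairs with labels encoded as 0.0/1.0:

import org.apache.spark.rdd.RDD

object BinaryMetricsHelper {
  // predictionAndLabels: (prediction, label) pairs with labels encoded as 0.0 / 1.0.
  def report(predictionAndLabels: RDD[(Double, Double)]): Unit = {
    val total = predictionAndLabels.count().toDouble
    val positives = predictionAndLabels.filter(_._2 == 1.0).count().toDouble
    val negatives = predictionAndLabels.filter(_._2 == 0.0).count().toDouble
    val truePositives = predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 1.0).count().toDouble
    val trueNegatives = predictionAndLabels.filter(x => x._1 == x._2 && x._1 == 0.0).count().toDouble

    println("Sensitivity = " + truePositives / positives)
    println("Specificity = " + trueNegatives / negatives)
    println("Accuracy = " + (truePositives + trueNegatives) / total)
  }
}

Each test program could then end with BinaryMetricsHelper.report(predictionAndLabels) instead of the three inline println calls.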

Example 13: Url

//Set up the package name and import the required classes
package dataset

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import utils.MetaFile


object Url {

  def saveUrl(sc: SparkContext): Unit = {
    // Load day 0, then append the remaining daily LIBSVM files into a single RDD.
    var whole: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "/Users/sara/url_svmlight/Day" + 0 + ".svm")
    for (i <- 1 to 120) {
      val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "/Users/sara/url_svmlight/Day" + i + ".svm")
      whole = whole.union(examples)
    }

    // repartition returns a new RDD, so the result must be reassigned before saving.
    whole = whole.repartition(4)
    whole.saveAsObjectFile("file:/Users/sara/galaxy/data/url")
  }
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("url")
    val sc = new SparkContext(conf)

    //val objects = new Array[MetaFile](120)
    saveUrl(sc)
    sc.stop()
  }

} 
Author: HPCL, Project: GalacticSpark, Lines of code: 31, Source file: Url.scala

Example 14: LRAccuracyTest

//Set up the package name and import the required classes
package MLlib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel, SparseLogisticRegressionWithLBFGS}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkContext, SparkConf}


object LRAccuracyTest {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"LogisticRegressionTest with $args").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").map(
      l => LabeledPoint(l.label, l.features.toSparse))

    // Split data into training (60%) and test (40%).
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run training algorithm to build the model
    val model = new SparseLogisticRegressionWithLBFGS()
      .setNumClasses(5)
      .run(training)

    // Compute raw scores on the test set.
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Get evaluation metrics.
    val metrics = new MulticlassMetrics(predictionAndLabels)

    val precision = metrics.precision
    println("Precision = " + precision)


  }

} 
Author: intel-analytics, Project: SparseML, Lines of code: 47, Source file: LRAccuracyTest.scala

Example 15: ModelTrainer

//Set up the package name and import the required classes
package modelmanager

import java.io.File

import com.typesafe.config.Config
import org.apache.commons.io.FileUtils
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.streaming.StreamingContext

import scala.collection.JavaConversions._

object ModelTrainer {
  val nClasses: Int = 2
  val positiveLabel: Double = 1.0
  val negativeLabel: Double = 0.0
  val maxBins: Int = 100

  def trainModels(ssc: StreamingContext, config: Config) = {

    //Load configuration
    val depth = config.getInt("models.trainingConfiguration.depth")
    val impurity = config.getString("models.trainingConfiguration.impurity")
    val strategy = config.getString("models.trainingConfiguration.strategy")
    val seed = config.getInt("models.trainingConfiguration.seed")
    val forestSize = config.getInt("models.trainingConfiguration.forestSize")
    val dataPath = config.getString("models.trainingConfiguration.pathToTrainingData")
    val modelsPath = config.getString("models.pathToModels")
    val events = config.getStringList("models.models")
    val categoricalInfo = Range(0, config.getInt("eventsCount")).map((_, 2)).toMap

    val models = events.par.map(modelName => {
      (modelName,
        RandomForest.trainClassifier(
          MLUtils.loadLibSVMFile(ssc.sparkContext, dataPath + modelName + ".libsvm"),
          nClasses,
          categoricalInfo,
          forestSize,
          strategy,
          impurity,
          depth,
          maxBins,
          seed))
    })

    if (config.getBoolean("models.saveModels"))
      models.seq.foreach(x => {
        FileUtils.deleteQuietly(new File(modelsPath + x._1))
        x._2.save(ssc.sparkContext, modelsPath + x._1)
      })
    models
  }
} 
Author: jandion, Project: SparkOFP, Lines of code: 54, Source file: ModelTrainer.scala


Note: The org.apache.spark.mllib.util.MLUtils class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors, and copyright of the source code remains with those authors; please consult each project's license before redistributing or reusing the code.