

Scala VectorAssembler Class Code Examples

This article collects typical usage examples of the org.apache.spark.ml.feature.VectorAssembler class in Scala. If you have been wondering what the Scala VectorAssembler class is for, how to use it, or what working examples look like, the curated class examples below may help.


The following shows 11 code examples of the VectorAssembler class, sorted by popularity by default.
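Before diving into the full examples, here is a minimal, self-contained sketch of the core API: VectorAssembler simply concatenates a set of numeric (or vector) input columns into a single vector column, which downstream ML stages consume as features. The column names and toy data below are illustrative only and do not come from any of the examples that follow.

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession

object VectorAssemblerMinimal {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("VectorAssemblerMinimal").getOrCreate()

    // Toy data: two numeric feature columns plus a label (hypothetical values).
    val df = spark.createDataFrame(Seq(
      (1.0, 0.5, 1.0),
      (2.0, 1.5, 0.0)
    )).toDF("x1", "x2", "label")

    // Concatenate x1 and x2 into a single "features" vector column.
    val assembler = new VectorAssembler()
      .setInputCols(Array("x1", "x2"))
      .setOutputCol("features")

    assembler.transform(df).show(false)
    spark.stop()
  }
}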

Example 1: LRCV

// Set the package name and import the required classes
package com.ferhtaydn.rater

import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{ StringIndexerModel, VectorAssembler }
import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }

class LRCV(sc: SparkContext) {

  implicit val sqlContext = new SQLContext(sc)

  val lr = new LogisticRegression().setMaxIter(10).setFeaturesCol("scaledFeatures")

  val paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.1, 0.01))
    .build()

  val assembler = new VectorAssembler()
    .setInputCols(Array("gender", "age", "weight", "height", "indexedJob"))
    .setOutputCol("features")

  val pipeline = new Pipeline()
    .setStages(Array(assembler, standardScaler("features"), lr))

  val cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(10)

  def train(df: DataFrame): (StringIndexerModel, CrossValidatorModel, Matrix) = {

    // Index the "job" strings on the full data set so that no job categories are missing from the index.
    // Alternatively, a value could be assigned manually for each job, as was done for gender.
    val indexerModel = stringIndexer("job").fit(df)
    val indexed = indexerModel.transform(df)

    val splits = indexed.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1)

    val cvModel = cv.fit(training)

    val predictionAndLabels = cvModel
      .transform(test)
      .select("label", "prediction").map {
        case Row(label: Double, prediction: Double) ⇒
          (prediction, label)
      }

    printBinaryMetrics(predictionAndLabels)

    (indexerModel, cvModel, confusionMatrix(predictionAndLabels))

  }

} 
Author: ferhtaydn, Project: canceRater, Lines: 62, Source: LRCV.scala
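Note that stringIndexer, standardScaler, printBinaryMetrics and confusionMatrix are helper members defined elsewhere in the canceRater project and are not part of this excerpt. As a rough, hypothetical sketch (the real project may differ), the two feature helpers could simply wrap the corresponding spark.ml transformers inside the LRCV class so that the column names line up with the pipeline above:

import org.apache.spark.ml.feature.{ StandardScaler, StringIndexer }

// Hypothetical helpers, not taken from the original source.
def stringIndexer(inputCol: String): StringIndexer =
  new StringIndexer()
    .setInputCol(inputCol)
    .setOutputCol(s"indexed${inputCol.capitalize}") // "job" -> "indexedJob"

def standardScaler(inputCol: String): StandardScaler =
  new StandardScaler()
    .setInputCol(inputCol)
    .setOutputCol("scaledFeatures") // matches lr.setFeaturesCol("scaledFeatures")
    .setWithStd(true)
    .setWithMean(false)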

Example 2: LinearRegressionPipeline

// Set the package name and import the required classes
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}


object LinearRegressionPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame) = {
    val lr = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s"  Root mean squared error (RMSE): $RMSE")
  }

  def linearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")

    println(s"r2: ${trainingSummary.r2}")
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 61, Source: LinearRegressionPipeline.scala
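linearRegressionWithVectorFormat expects the caller to supply a VectorAssembler and a VectorIndexer wired to the columns of the bike-sharing DataFrame. A hedged sketch of such a call, assuming a DataFrame that already has numeric feature columns and a "label" column (the file path and column names below are illustrative, not taken from the book's loader code):

import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("BikeSharingLR").getOrCreate()

// Hypothetical load of the hourly bike-sharing data with a renamed label column.
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("./data/bike-sharing-hour.csv")       // illustrative path
  .withColumnRenamed("cnt", "label")         // illustrative label column

val assembler = new VectorAssembler()
  .setInputCols(Array("season", "yr", "mnth", "hr", "temp", "hum", "windspeed")) // illustrative
  .setOutputCol("rawFeatures")

// Index categorical features inside the assembled vector; the output must be named
// "features" because the LinearRegression stage above reads that column.
val indexer = new VectorIndexer()
  .setInputCol("rawFeatures")
  .setOutputCol("features")
  .setMaxCategories(24)

LinearRegressionPipeline.linearRegressionWithVectorFormat(assembler, indexer, df)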

Example 3: DecisionTreePipeline

// Set the package name and import the required classes
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 60, Source: DecisionTreePipeline.scala
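decisionTreePipeline only needs a VectorAssembler covering the feature columns plus the DataFrame itself. A hypothetical invocation, assuming a DataFrame whose non-label columns are all numeric (the path and schema below are illustrative, not the actual StumbleUpon loader code):

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("StumbleUponDT").getOrCreate()

// Hypothetical, already-cleaned StumbleUpon data with numeric features and a "label" column.
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("./data/stumbleupon_clean.csv")       // illustrative path

val assembler = new VectorAssembler()
  .setInputCols(df.columns.filter(_ != "label"))
  .setOutputCol("features")

DecisionTreePipeline.decisionTreePipeline(assembler, df)

The NaiveBayesPipeline and RandomForestPipeline objects in the following examples are driven the same way, passing the same assembler and DataFrame.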

Example 4: NaiveBayesPipeline

// Set the package name and import the required classes
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 52, Source: NaiveBayesPipeline.scala

Example 5: RandomForestPipeline

// Set the package name and import the required classes
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 62, Source: RandomForestPipeline.scala

Example 6: DecisionTreePipeline

// Set the package name and import the required classes
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object DecisionTreePipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def decisionTreePipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val dt = new DecisionTreeClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += dt
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 60, Source: DecisionTreePipeline.scala

Example 7: NaiveBayesPipeline

// Set the package name and import the required classes
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object NaiveBayesPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def naiveBayesPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val nb = new NaiveBayes()

    stages += vectorAssembler
    stages += nb
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)
  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 52, Source: NaiveBayesPipeline.scala

Example 8: RandomForestPipeline

// Set the package name and import the required classes
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
} 
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 62, Source: RandomForestPipeline.scala

Example 9: Pca

// Set the package name and import the required classes
package com.github.dongjinleekr.spark.example

import com.github.dongjinleekr.spark.dataset.Iris
import com.github.dongjinleekr.spark.dataset.Iris._
import org.apache.spark.ml.feature.{PCA, VectorAssembler}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession


object Pca {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("PCA Example")
      .getOrCreate()

    // Read the file
    val raw = spark.read
      .schema(Iris.schema)
      .option("header", true)
      .csv("hdfs:///datasets/iris/data.csv")

    // Preprocess:
    // 1. Combine the feature columns into a single vector.
    // 2. Convert the species string into an Int code.
    val assembler = new VectorAssembler()
      .setInputCols(Iris.schema.fields.map(_.name).slice(1, 5))
      .setOutputCol("features")

    def speciesToInt: (String => Int) = { s: String => Species.toInt(s) }

    val newSpecies = udf(speciesToInt).apply(col("species"))
    val df = assembler.transform(raw)
      .withColumn("species", newSpecies)
      .select("id", "features", "species")

    // PCA: project the features onto 2 principal components
    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(2)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
  }
} 
Author: dongjinleekr, Project: spark-dataset, Lines: 48, Source: Pca.scala

Example 10: BaseTransformerConverter

// Set the package name and import the required classes
package org.apache.spark.ml.mleap.converter.runtime

import com.truecar.mleap.runtime.transformer
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.classification.RandomForestClassificationModel
import org.apache.spark.ml.feature.{IndexToString, StandardScalerModel, StringIndexerModel, VectorAssembler}
import org.apache.spark.ml.mleap.classification.SVMModel
import org.apache.spark.ml.mleap.converter.runtime.classification.{RandomForestClassificationModelToMleap, SupportVectorMachineModelToMleap}
import org.apache.spark.ml.mleap.converter.runtime.feature.{IndexToStringToMleap, StandardScalerModelToMleap, StringIndexerModelToMleap, VectorAssemblerModelToMleap}
import org.apache.spark.ml.mleap.converter.runtime.regression.{LinearRegressionModelToMleap, RandomForestRegressionModelToMleap}
import org.apache.spark.ml.regression.{LinearRegressionModel, RandomForestRegressionModel}


trait BaseTransformerConverter extends SparkTransformerConverter {
  // regression
  implicit val mleapLinearRegressionModelToMleap: TransformerToMleap[LinearRegressionModel, transformer.LinearRegressionModel] =
    addConverter(LinearRegressionModelToMleap)
  implicit val mleapRandomForestRegressionModelToMleap: TransformerToMleap[RandomForestRegressionModel, transformer.RandomForestRegressionModel] =
    addConverter(RandomForestRegressionModelToMleap)

  // classification
  implicit val mleapRandomForestClassificationModelToMleap: TransformerToMleap[RandomForestClassificationModel, transformer.RandomForestClassificationModel] =
    addConverter(RandomForestClassificationModelToMleap)
  implicit val mleapSupportVectorMachineModelToMleap: TransformerToMleap[SVMModel, transformer.SupportVectorMachineModel] =
    addConverter(SupportVectorMachineModelToMleap)

  //feature
  implicit val mleapIndexToStringToMleap: TransformerToMleap[IndexToString, transformer.ReverseStringIndexerModel] =
    addConverter(IndexToStringToMleap)
  implicit val mleapStandardScalerModelToMleap: TransformerToMleap[StandardScalerModel, transformer.StandardScalerModel] =
    addConverter(StandardScalerModelToMleap)
  implicit val mleapStringIndexerModelToMleap: TransformerToMleap[StringIndexerModel, transformer.StringIndexerModel] =
    addConverter(StringIndexerModelToMleap)
  implicit val mleapVectorAssemblerToMleap: TransformerToMleap[VectorAssembler, transformer.VectorAssemblerModel] =
    addConverter(VectorAssemblerModelToMleap)

  // other
  implicit val mleapPipelineModelToMleap: TransformerToMleap[PipelineModel, transformer.PipelineModel] =
    addConverter(PipelineModelToMleap(this))
}
object BaseTransformerConverter extends BaseTransformerConverter 
Author: TrueCar, Project: mleap, Lines: 42, Source: BaseTransformerConverter.scala

Example 11: Test

// Set the package name and import the required classes
package org.apache.spark.test

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.StringIndexer

object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
      
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //KMEANS
    val npart = 216
    
    def time[A](a: => A) = {
      val now = System.nanoTime
      val result = a
      val sec = (System.nanoTime - now) * 1e-9
      println("Total time (secs): " + sec)
      result
    }
    
    val file = "hdfs://hadoop-master:8020/user/spark/datasets/higgs/HIGGS.csv"
    val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "false")
      .option("inferSchema", "true").load(file).repartition(npart)
    
    
    import org.apache.spark.ml.feature.VectorAssembler 
    val featureAssembler = new VectorAssembler().setInputCols(df.columns.drop(1)).setOutputCol("features")
    val processedDf = featureAssembler.transform(df).cache()
    
    print("Num. elements: " + processedDf.count)
    
    // Trains a k-means model.
    import org.apache.spark.ml.clustering.KMeans
    val kmeans = new KMeans().setSeed(1L)
    val cmodel = time(kmeans.fit(processedDf.select("features")))    
    
    //RANDOM FOREST
    import org.apache.spark.ml.classification.RandomForestClassifier
    val labelCol = df.columns.head
    
    val indexer = new StringIndexer().setInputCol(labelCol).setOutputCol("labelIndexed")
    val imodel = indexer.fit(processedDf)
    val indexedDF = imodel.transform(processedDf)
    
    val rf = new RandomForestClassifier().setFeaturesCol("features").setLabelCol("labelIndexed")
    val model = time(rf.fit(indexedDF))
  }
} 
Author: sramirez, Project: scalabilityTestSpark, Lines: 54, Source: Test.scala


Note: The org.apache.spark.ml.feature.VectorAssembler class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use are governed by each project's license. Please do not reproduce without permission.