本文整理汇总了Scala中org.apache.spark.ml.clustering.KMeans类的典型用法代码示例。如果您正苦于以下问题：Scala KMeans类的具体用法？Scala KMeans怎么用？Scala KMeans使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了KMeans类的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: dataToDouble
//设置package包名称以及导入依赖的类
package es.us.cluster
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
// Fragment of a larger routine: `sparkSession`, `origen` and `numIterations` are declared
// outside the visible code.
// Read the input at `origen` through the "com.databricks.spark.csv" format.
val dataset = sparkSession.read
.format("com.databricks.spark.csv")
.option("header", "true") // Use first line of all files as header
.option("inferSchema", "true") // Automatically infer data types
.load(origen)
// Configure k-means with k = 2, seed 1 and `numIterations` as the iteration cap, then fit it.
// NOTE(review): no feature column is assembled or configured in the visible code — presumably
// `dataset` already exposes the column KMeans reads; confirm against the input file.
val kmeans = new KMeans().setK(2).setSeed(1L).setMaxIter(numIterations)
val model = kmeans.fit(dataset)
// Evaluate clustering by computing Within Set Sum of Squared Errors.
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")
// Print every cluster center of the fitted model, one per line.
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
// Stop the session once all results have been printed.
sparkSession.stop()
}
/**
 * Parses a textual field into a `Double`, mapping missing data to zero.
 *
 * @param s raw field text
 * @return `0.0` when `s` is empty or contains only whitespace, otherwise the parsed value
 * @throws NumberFormatException if `s` is non-blank but is not a valid floating-point literal
 */
def dataToDouble(s: String): Double = {
  // Trim first so a whitespace-only field counts as missing instead of failing to parse.
  val field = s.trim
  if (field.isEmpty) 0.0 else field.toDouble
}
}
示例2: s
//设置package包名称以及导入依赖的类
// Script-style K-Means walkthrough: loads /tmp/iris.data, fits a three-cluster model on a
// four-value feature vector, then lists up to three rows for every `name` value.
Seq(
  "**************************",
  "K-Means sample",
  "**************************"
).foreach(println)
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

Seq(
  "",
  "Define the input data",
  "=====================",
  ""
).foreach(println)
// Schema inference is on and no header option is set; the columns are addressed as C0..C4 below.
val csvReader = sqlContext.read.format("com.databricks.spark.csv").option("inferSchema", "true")
val data = csvReader.load("/tmp/iris.data")
// Registered under the SQL name "toVector": packs four doubles into one dense vector.
val packFeatures = (a: Double, b: Double, c: Double, d: Double) => Vectors.dense(a, b, c, d)
sqlContext.udf.register("toVector", packFeatures)

Seq(
  "",
  "Create feature vectors and generate a K-Means model",
  "===================================================",
  ""
).foreach(println)
val featureExprs = Seq("toVector(C0, C1, C2, C3) as feature", "C4 as name")
val features = data.selectExpr(featureExprs: _*)
val kmeans = new KMeans().setK(3).setFeaturesCol("feature").setPredictionCol("prediction")
val model = kmeans.fit(features)

Seq(
  "",
  "Calculate cluster centers",
  "=========================",
  ""
).foreach(println)
// Append the cluster assignment to every row and expose the result to SQL as "predicted".
val predicted = model.transform(features)
predicted.show()
predicted.registerTempTable("predicted")
import org.apache.spark.sql.expressions.Window

Seq(
  "",
  "Find top three objects",
  "======================",
  ""
).foreach(println)
// ROW_NUMBER() numbers the rows inside each `name` partition; rows numbered 1 to 3 are kept.
val top3Query = "SELECT * FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY name) AS rn FROM predicted) x WHERE rn <= 3"
val top3 = sqlContext.sql(top3Query)
top3.show()
示例3: trainModel
//设置package包名称以及导入依赖的类
package com.infosupport.recommendedcontent.core
import akka.actor.{Actor, ActorLogging, Props}
import org.apache.spark.SparkContext
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorAssembler}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions._
import com.datastax.spark.connector._
/**
 * Clusters users by transaction amount and stores each user's cluster in Cassandra.
 *
 * Reads `events.transaction_data`, assembles `user_id` and `original_amount` into a feature
 * vector, fits a seeded k-means model, prints the assignments and the model cost, and
 * writes the result to `events.userid_by_amount_cluster`.
 *
 * @param cluster number of clusters (k) for the k-means model
 */
private def trainModel(cluster: Int) = {
  val spark = SparkSession.builder().getOrCreate()
  import spark.implicits._

  // Source table: events.transaction_data.
  val cassandraSource = Map("table" -> "transaction_data", "keyspace" -> "events")
  val transactions = spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(cassandraSource)
    .load()

  // Keep only the two clustering inputs; user_id is cast to an integer first.
  val amountsByUser = transactions
    .select("user_id", "original_amount")
    .withColumn("user_id", $"user_id".cast(IntegerType))

  val featureAssembler = new VectorAssembler()
    .setInputCols(Array("user_id", "original_amount"))
    .setOutputCol("features")
  val training = featureAssembler.transform(amountsByUser)

  val model = new KMeans().setK(cluster).setSeed(1L).fit(training)

  // Evaluate clustering by computing Within Set Sum of Squared Errors.
  val WSSSE = model.computeCost(training)
  val transformed = model.transform(training)

  // Print the assignments with user_id passed through convertUserId, then the cost.
  val displayed = transformed
    .select("user_id", "original_amount", "prediction")
    .withColumn("user_id", convertUserId($"user_id"))
  displayed.show(false)
  println(s"Within Set Sum of Squared Errors = $WSSSE")

  // Turn every row into a UserCluster and persist the collection.
  val userClusters = transformed
    .map(row => UserCluster(row.getAs("prediction"), row.getAs("user_id"), row.getAs("original_amount")))
    .rdd
  userClusters.saveToCassandra("events", "userid_by_amount_cluster")
}
}
示例4:
//设置package包名称以及导入依赖的类
// Script-style K-Means walkthrough on the census file at `dataUrl`: builds a three-value
// feature vector from the dIncome1..dIncome3 columns and fits a three-cluster model.
Seq(
  "**************************",
  "Large K-Means sample",
  "**************************"
).foreach(println)
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

Seq(
  "",
  "Define the input data",
  "=====================",
  ""
).foreach(println)
val dataUrl = "s3n://xxxxxxxxxx/USCensus1990.data.txt"
// The first line is read as the header and column types are inferred.
val csvReader = sqlContext.read.format("com.databricks.spark.csv").option("inferSchema", "true").option("header", "true")
val data = csvReader.load(dataUrl)
data.registerTempTable("input")

Seq(
  "",
  "Create a K-Means model and calculate cluster centers",
  "====================================================",
  ""
).foreach(println)
// Registered under the SQL name "toVector": widens three integers into one dense vector.
val packIncome = (i1: Integer, i2: Integer, i3: Integer) => Vectors.dense(i1.toDouble, i2.toDouble, i3.toDouble)
sqlContext.udf.register("toVector", packIncome)
val featureExprs = Seq("toVector(dIncome1, dIncome2, dIncome3) as feature", "caseid")
val features = data.selectExpr(featureExprs: _*)
val kmeans = new KMeans().setK(3).setFeaturesCol("feature").setPredictionCol("prediction")
val model = kmeans.fit(features)
// Append the cluster assignment to every row and display the result.
val predicted = model.transform(features)
predicted.show()
示例5: KMeansExample
//设置package包名称以及导入依赖的类
package org.lavenderx.tutorial.spark.ml
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.lavenderx.tutorial.spark.SparkConnector
/**
 * Minimal k-means example: clusters ten hand-written 3-dimensional points into three groups
 * and prints the resulting cluster centers.
 *
 * NOTE(review): `sparkSQLContext` and `sparkContext` are presumably supplied by
 * `SparkConnector`, which is not visible here — confirm.
 */
object KMeansExample extends SparkConnector {

  /**
   * Builds the sample DataFrame, fits the model and prints the centers.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The context is stopped in `finally` so it is released even when training fails.
    try {
      // $example on$
      // Creates a DataFrame of (id, features) rows
      val dataset: DataFrame = sparkSQLContext.createDataFrame(Seq(
        (1, Vectors.dense(0.0, 0.0, 0.0)),
        (2, Vectors.dense(1.1, 4.5, 0.3)),
        (3, Vectors.dense(0.2, 0.2, 0.2)),
        (4, Vectors.dense(9.0, 9.0, 9.0)),
        (5, Vectors.dense(9.1, 0.8, 9.1)),
        (6, Vectors.dense(9.2, 1.2, 6.2)),
        (7, Vectors.dense(3.5, 6.2, 7.2)),
        (8, Vectors.dense(7.2, 9.2, 5.8)),
        (9, Vectors.dense(5.6, 4.4, 9.2)),
        (10, Vectors.dense(4.9, 8.6, 2.9))
      )).toDF("id", "features")

      // Trains a k-means model
      val kmeans = new KMeans()
        .setK(3)
        .setFeaturesCol("features")
        .setPredictionCol("prediction")
      val model = kmeans.fit(dataset)

      // Shows the result
      println("Final Centers: ")
      model.clusterCenters.foreach(println)
      // $example off$
    } finally {
      sparkContext.stop()
    }
  }
}
示例6: Test
//设置package包名称以及导入依赖的类
package org.apache.spark.test
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.StringIndexer
/**
 * Benchmark driver: fits a k-means model and a random forest on the HIGGS CSV file and
 * prints the wall-clock time of each fit.
 */
object Test {

  /**
   * Loads the data set, assembles every column except the first into a feature vector, then
   * times a k-means fit followed by a random-forest fit that uses the first column as label.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    // Stop the context even when loading or training fails.
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      //KMEANS
      val npart = 216

      // Evaluates `a` once, prints the elapsed wall-clock seconds and returns the result.
      def time[A](a: => A): A = {
        val start = System.nanoTime
        val result = a
        val sec = (System.nanoTime - start) * 1e-9
        println(s"Total time (secs): $sec")
        result
      }

      val file = "hdfs://hadoop-master:8020/user/spark/datasets/higgs/HIGGS.csv"
      val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "false")
        .option("inferSchema", "true").load(file).repartition(npart)

      import org.apache.spark.ml.feature.VectorAssembler
      // Every column except the first becomes part of the feature vector.
      val featureAssembler = new VectorAssembler().setInputCols(df.columns.drop(1)).setOutputCol("features")
      val processedDf = featureAssembler.transform(df).cache()
      // println (not print) so the timing line that follows starts on its own line.
      println(s"Num. elements: ${processedDf.count()}")

      // Trains a k-means model.
      import org.apache.spark.ml.clustering.KMeans
      val kmeans = new KMeans().setSeed(1L)
      val cmodel = time(kmeans.fit(processedDf.select("features")))

      //RANDOM FOREST
      import org.apache.spark.ml.classification.RandomForestClassifier
      // The first column is used as the label and indexed before training.
      val labelCol = df.columns.head
      val indexer = new StringIndexer().setInputCol(labelCol).setOutputCol("labelIndexed")
      val imodel = indexer.fit(processedDf)
      val indexedDF = imodel.transform(processedDf)
      val rf = new RandomForestClassifier().setFeaturesCol("features").setLabelCol("labelIndexed")
      val model = time(rf.fit(indexedDF))
    } finally {
      sc.stop()
    }
  }
}