This article collects and organizes typical usage examples of the Scala class org.apache.spark.mllib.stat.Statistics. If you are wondering what the Statistics class is for, or how to use it, the curated class code examples below may help.
Seven code examples of the Statistics class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
Example 1: SimpleApp
// Set the package name and import the dependent classes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.{Vector, Vectors}

object SimpleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)

    // Column statistics over a single-column dataset
    val data = Array(1, 2, 3)
    val distData = sc.parallelize(data)
    val vectorData = distData.map(x => Vectors.dense(x.toDouble))
    val summary = Statistics.colStats(vectorData)
    println("mean is: %s".format(summary.mean))
    println("max is: %s".format(summary.max))
    println("min is: %s".format(summary.min))

    // Find the correlation matrix of the exam scores
    // student, exam1, exam2, exam3
    val examData = sc.parallelize(Array("111, 60, 65, 73", "222, 98,95,88", "333, 56,67,62"))
    val vectorRdd = examData.map((line: String) => line.split(",").drop(1).map((ele: String) => ele.toDouble)).map(Vectors.dense)
    val corrMatrix = Statistics.corr(vectorRdd)
    println(corrMatrix)
  }
}
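Statistics.corr(vectorRdd) computes the full Pearson correlation matrix across the exam columns. When only two numeric series need to be compared, Statistics.corr also accepts two RDD[Double] arguments plus a method name and returns a single Double. A minimal sketch, assuming the same sc as above (the series names and values are invented for illustration):

// Correlate two score columns directly instead of building the full matrix
val exam1Scores = sc.parallelize(Array(60.0, 98.0, 56.0)) // exam1 column from the sample data
val exam2Scores = sc.parallelize(Array(65.0, 95.0, 67.0)) // exam2 column from the sample data
// "pearson" is the default method; "spearman" is also supported
val examCorr = Statistics.corr(exam1Scores, exam2Scores, "pearson")
println("corr(exam1, exam2) = " + examCorr)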
Example 2: StatisticsApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StatisticsApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => {
      val stats = Statistics.colStats(rdd)
      println("Count: " + stats.count)
      println("Max: " + stats.max.toArray.mkString(" "))
      println("Min: " + stats.min.toArray.mkString(" "))
      println("Mean: " + stats.mean.toArray.mkString(" "))
      println("L1-Norm: " + stats.normL1.toArray.mkString(" "))
      println("L2-Norm: " + stats.normL2.toArray.mkString(" "))
      println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" "))
      println("Variance: " + stats.variance.toArray.mkString(" "))
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 3: ChiSqApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object ChiSqApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
      .filter(f => f(0) == 4.0 || f(0) == 5.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
      .foreachRDD(rdd => {
        Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2)))
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
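Statistics.chiSqTest(rdd) on an RDD[LabeledPoint] returns one ChiSqTestResult per feature column, so the results can also be filtered rather than only printed, for example to keep features whose p-value falls below a threshold. A minimal sketch of that idea, assuming it runs inside the same foreachRDD block (the 0.05 cutoff is an arbitrary illustration, not part of the original example):

// Indices of feature columns whose chi-squared p-value is below 0.05
val selected = Statistics.chiSqTest(rdd).zipWithIndex
  .filter { case (result, _) => result.pValue < 0.05 }
  .map { case (_, columnIndex) => columnIndex }
println("selected feature columns: " + selected.mkString(", "))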
Example 4: CorrelationApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CorrelationApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))

    walkingOrRunning.map(f => f.features).foreachRDD(rdd => {
      val corrSpearman = Statistics.corr(rdd, "spearman")
      val corrPearson = Statistics.corr(rdd, "pearson")
      println("Correlation Spearman: \n" + corrSpearman)
      println("Correlation Pearson: \n" + corrPearson)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
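Both Statistics.corr calls return an org.apache.spark.mllib.linalg.Matrix, so individual entries can be read with apply(i, j) when only one pair of features matters. A short sketch that would sit inside the same foreachRDD block (the indices are chosen purely for illustration):

// Correlation between the first and second feature columns (0-based indices)
println("Pearson corr(feature0, feature1): " + corrPearson(0, 1))
println("Spearman corr(feature0, feature1): " + corrSpearman(0, 1))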
Example 5:
// Set the package name and import the dependent classes
// spark-shell style script: assumes an existing hiveContext; `result` is re-bound after each query
val result = hiveContext.sql("select max(login_times) from model_input_loyal_t") // max login_times
val max_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(login_times) from model_input_loyal_t") // min login_times
val min_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_login_times = max_login_times - min_login_times
val result = hiveContext.sql("select max(stay_time) from model_input_loyal_t") // max stay_time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = hiveContext.sql("select min(stay_time) from model_input_loyal_t") // min stay_time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = max_stay_time - min_stay_time
val result = hiveContext.sql("select max(view_days) from model_input_loyal_t") // max view_days
val max_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(view_days) from model_input_loyal_t") // min view_days
val min_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_view_days = max_view_days - min_view_days
val result = hiveContext.sql("select max(pv) from model_input_loyal_t") // max pv
val max_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(pv) from model_input_loyal_t") // min pv
val min_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_pv = max_pv - min_pv
val result = hiveContext.sql("select max(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val max_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // max last_viewtime (unix timestamp)
val result = hiveContext.sql("select min(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val min_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // min last_viewtime (unix timestamp)
val region_last_viewtime = max_last_viewtime - min_last_viewtime
// weights: login_times 0.2, stay_time 0.3, view_days 0.3, pv 0.15, last_viewtime 0.05
val normalization = hiveContext.sql("select t1.cookie , (((t1.login_times - "+min_login_times+") * 0.2/"+region_login_times+") + ((t1.stay_time- "+min_stay_time+") * 0.3/"+region_stay_time+") +((t1.view_days - "+min_view_days+")* 0.3/"+region_view_days+") +((t1.pv - "+min_pv+")* 0.15/"+region_pv+") +((unix_timestamp(t1.last_viewtime,'yyyy-MM-dd')- "+min_last_viewtime+")*0.05 / " + region_last_viewtime + "))*100 as loyalty_score from model_input_loyal_t t1")
normalization.registerTempTable("temporary_points") // register the weighted, normalized scores as a temporary table
// Column statistics over the loyalty scores
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vectors
val rdd = normalization.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[Double].toDouble))
val summary = Statistics.colStats(rdd)
println(summary.mean)
val means = summary.mean(0)
println(summary.variance)
// colStats reports the variance; take the square root to obtain the standard deviation
val standard_deviation = math.sqrt(summary.variance(0))
// clamp the lower bound at 0 if mean - 5 * stddev falls below 0
val r = means - standard_deviation * 5
val low_bound = if (r > 0) r else 0
val up_bound = means + standard_deviation * 5
// assign a loyalty_level per score band (the level labels are left empty in this snippet)
val loyalty_temporary = hiveContext.sql("(select t1.lenovo_id,t1.loyalty_score,t1.loyalty_level from model_output_loyal_t t1 where 1=0) union all (select t2.cookie, t2.loyalty_score,(case when t2.loyalty_score <= "+low_bound+" then '' when t2.loyalty_score < "+up_bound+" then '' else '' end)as loyalty_level from temporary_points t2)")
loyalty_temporary.registerTempTable("temporary_loyalty")
hiveContext.sql("insert overwrite table data.model_output_loyal_t partition (l_day='2016-10-01') select * from temporary_loyalty")
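The ten separate max/min queries above could also be collapsed into a single pass by reading the five metrics into an RDD of vectors and reusing Statistics.colStats, the same helper applied to the loyalty scores later in the script. A rough alternative sketch, not part of the original; the column types are inferred from the asInstanceOf casts above, and null handling is omitted for brevity:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
// One pass over model_input_loyal_t: column order login_times, stay_time, view_days, pv, last_viewtime
val metrics = hiveContext.sql("select login_times, stay_time, view_days, pv, unix_timestamp(last_viewtime,'yyyy-MM-dd') from model_input_loyal_t")
  .rdd
  .map(r => Vectors.dense(r.getLong(0).toDouble, r.getFloat(1).toDouble, r.getLong(2).toDouble, r.getLong(3).toDouble, r.getLong(4).toDouble))
val metricStats = Statistics.colStats(metrics)
val mins = metricStats.min.toArray // per-column minima, same column order as above
val maxs = metricStats.max.toArray // per-column maxima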
Example 6: ChiSqLearning
// Set the package name and import the dependent classes
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.{ Matrix, Matrices, Vectors }
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{ SparkConf, SparkContext }

object ChiSqLearning {

  def main(args: Array[String]) {
    // Goodness-of-fit test on a dense vector (against a uniform expected distribution by default)
    val vd = Vectors.dense(1, 2, 3, 4, 5)
    val vdResult = Statistics.chiSqTest(vd)
    println(vd)
    println(vdResult)
    println("-------------------------------")

    // Independence test on a contingency matrix
    val mtx = Matrices.dense(3, 2, Array(1, 3, 5, 2, 4, 6))
    val mtxResult = Statistics.chiSqTest(mtx)
    println(mtx)
    println(mtxResult)
    // The printed result includes the chi-squared statistic, the degrees of freedom and the p-value
    println("-------------------------------")
    val mtx2 = Matrices.dense(2, 2, Array(19.0, 34, 24, 10.0))
    printChiSqTest(mtx2)
    printChiSqTest(Matrices.dense(2, 2, Array(26.0, 36, 7, 2.0)))
    // val mtxResult2 = Statistics.chiSqTest(mtx2)
    // println(mtx2)
    // println(mtxResult2)
  }

  def printChiSqTest(matrix: Matrix): Unit = {
    println("-------------------------------")
    val mtxResult2 = Statistics.chiSqTest(matrix)
    println(matrix)
    println(mtxResult2)
  }
}
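Instead of printing the whole result object, the individual fields of a ChiSqTestResult can be inspected directly. A small sketch, reusing the mtx2 contingency matrix defined above:

// Each ChiSqTestResult exposes the statistic, degrees of freedom, p-value and method used
val testResult = Statistics.chiSqTest(mtx2)
println("statistic = " + testResult.statistic)
println("degrees of freedom = " + testResult.degreesOfFreedom)
println("p-value = " + testResult.pValue)
println("method = " + testResult.method) // e.g. "pearson"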
Example 7: RandomDistribution
// Set the package name and import the dependent classes
package net.akmorrow13.endive.featurizers

import breeze.linalg.DenseVector
import breeze.stats.distributions.{Gaussian, Poisson}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD

object RandomDistribution {

  def poisson(data: RDD[DenseVector[Double]]): Poisson = {
    val summary: MultivariateStatisticalSummary = Statistics.colStats(data.map(r => Vectors.dense(r.toArray)))
    val mean = summary.mean.toArray.sum
    new Poisson(mean)
  }

  def gaussian(data: RDD[DenseVector[Double]]): Gaussian = {
    val summary: MultivariateStatisticalSummary = Statistics.colStats(data.map(r => Vectors.dense(r.toArray)))
    val mean = summary.mean.toArray.sum
    val variance = summary.variance.toArray.sum
    println(s"mean: ${mean} variance ${variance}")
    // Breeze's Gaussian is parameterized by the standard deviation, not the variance
    new Gaussian(mean, math.sqrt(variance))
  }
}
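A brief usage sketch for RandomDistribution, assuming an existing SparkContext named sc; the sample vectors are invented for illustration:

// Fit simple distributions to a toy RDD of Breeze vectors and draw one sample from each
import breeze.linalg.DenseVector
val toyData = sc.parallelize(Seq(
  DenseVector(1.0, 2.0),
  DenseVector(2.0, 3.0),
  DenseVector(3.0, 4.0)))
val fittedGaussian = RandomDistribution.gaussian(toyData)
val fittedPoisson = RandomDistribution.poisson(toyData)
println("sample from the fitted Gaussian: " + fittedGaussian.draw())
println("sample from the fitted Poisson: " + fittedPoisson.draw())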