本文整理汇总了Scala中org.apache.spark.sql.functions.avg类的典型用法代码示例。如果您正苦于以下问题:Scala avg类的具体用法?Scala avg怎么用?Scala avg使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了avg类的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: Spark
//设置package包名称以及导入依赖的类
package uk.co.bitcat.streaming.spark
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.avg
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
object Spark {

  /** Single pollution reading parsed from a Kafka CSV record of the form "&lt;time&gt;,&lt;pollution&gt;". */
  private case class Measurement(time: String, pollution: Int)

  // Average pollution level above which an alert is printed for a batch.
  private val AlertThreshold = 75.0

  /**
   * Entry point: subscribes to the "pollution" Kafka topic, averages the
   * pollution readings over each 10-second micro-batch and prints an alert
   * whenever the batch average exceeds [[AlertThreshold]].
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Pollution Monitor").setMaster("local[*]")

    // Setting the batch interval over which we perform our pollution average calculation
    val streamingContext = new StreamingContext(conf, Seconds(10))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "pollution_consumer",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Creating a stream to read from Kafka
    val topics = Array("pollution")
    val stream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // Calculate the pollution average over the last interval.
    // Fix: the original bound this pipeline to `val row`, but `foreach` returns
    // Unit, so the binding was dead and its name shadowed the lambda parameter.
    stream.foreachRDD { rdd =>
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      rdd
        .map(_.value.split(","))
        .map(attributes => Measurement(attributes(0).trim, attributes(1).trim.toInt))
        .toDF()
        .agg(avg($"pollution") as "pollutionAverage")
        .filter($"pollutionAverage" > AlertThreshold)
        // Runs on the executors; println goes to executor stdout.
        .foreach(row => println("Raise alert for pollution level: " + row(0)))
    }

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
示例2: NestedStructureTest
//设置package包名称以及导入依赖的类
package me.invkrh.showcase.nested
import scala.util.Random
import me.invkrh.showcase.{JsonSerde, SparkJobSpec}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.avg
object NestedStructureTest {

  /** Employee role; the generated fixtures alternate between DEV and OPS. */
  object Position extends Enumeration { val DEV, OPS = Value }

  /** Fixture person; `employer` is randomly absent to exercise nested-null handling. */
  case class Person(name: String, age: Int, position: String, employer: Option[Employer])
  case class Employer(name: String, city: String)

  // Eleven fixtures "Hao0".."Hao10", all aged 20, alternating positions, each
  // with a ~50% chance of having an employer in one of four "Paris" cities.
  private val input = (0 to 10).map { idx =>
    val maybeEmployer =
      if (Random.nextBoolean()) Some(Employer("criteo", "Paris" + idx % 4)) else None
    Person("Hao" + idx, 20, Position(idx % 2).toString, maybeEmployer)
  }

  // JSON-serialized form of the fixtures (None fields are dropped by the serde).
  private val ser = input.map(JsonSerde.serialize)
}
class NestedStructureTest extends SparkJobSpec {
  import spark.implicits._
  import NestedStructureTest._

  // DataFrame built by round-tripping the companion's fixture people through JSON.
  private val df = spark.sparkContext
    .makeRDD(ser)
    .map(p => JsonSerde.deserialize[Person](p))
    .toDF()

  // Prints the schema and the full (untruncated) contents of the given DataFrame.
  // NOTE(review): not called anywhere in this visible chunk — presumably kept as
  // a debugging helper; confirm before removing.
  def withJsonSerde(df: DataFrame): Unit = {
    df.printSchema()
    df.show(false)
  }

  "NestedStructure" can {
    "show the case" in {
      showCase("Nested Structure") {
        note("Input is a List of Person object")
        input foreach println
        note("Serialized to string, fields of Option.None are ignored")
        ser foreach println
        note("Converted to DataFrame with all fields for each row")
        df.show(false)
        note("GroupBy employer.name, if the nested field is null, the key will be null")
        // Grouping by a field of an Option-backed struct: rows whose employer
        // was None deserialize to a null struct, yielding a null group key.
        val res = df
          .groupBy('employer getField "name" as "company_name")
          .agg(avg($"age"))
        res.show(false)
      }
    }
  }
}