This article collects typical usage examples of the Scala class org.apache.spark.sql.streaming.OutputMode. If you have been wondering what OutputMode is for, how it is used, or what working code with it looks like, the curated examples below should help.
Six code examples of the OutputMode class are shown in the sections that follow.
Example 1: HttpTextSinkProvider
// Package name and imported dependencies
package org.apache.spark.sql.execution.streaming
import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.internal.Logging
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode
import Params._
class HttpTextSinkProvider extends StreamSinkProvider with DataSourceRegister {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new HttpTextSink(parameters.getString("httpServletUrl"), parameters.getString("topic"), parameters.getBool("useGzipCompress", true))
  }

  def shortName(): String = "httpText"
}
class HttpTextSink(httpPostURL: String, topic: String, useGzipCompress: Boolean) extends Sink with Logging {
  val sender = new HttpTextSender(httpPostURL)
  val RETRY_TIMES = 5
  val SLEEP_TIME = 100

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    // send the micro-batch to the HTTP server, retrying with a linear back-off on failure
    var success = false
    var retried = 0
    while (!success && retried < RETRY_TIMES) {
      try {
        retried += 1
        sender.sendTextArray(topic, batchId, data.collect().map { _.get(0).asInstanceOf[String] }, useGzipCompress)
        success = true
      } catch {
        case e: Throwable => {
          success = false
          super.logWarning("failed to send", e)
          if (retried < RETRY_TIMES) {
            val sleepTime = SLEEP_TIME * retried
            super.logWarning(s"will retry to send after ${sleepTime}ms")
            Thread.sleep(sleepTime)
          } else {
            throw e
          }
        }
      }
    }
  }
}
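As a usage illustration (not part of the original example), a sink provider like this is typically attached with DataStreamWriter.format, either by its fully qualified class name or by the short name "httpText" if the provider is registered through META-INF/services/org.apache.spark.sql.sources.DataSourceRegister. The option names below mirror the keys read in createSink; the socket source, URL and topic values are placeholders:

// Minimal usage sketch; `spark` is an active SparkSession and the endpoint is hypothetical.
val lines = spark.readStream
  .format("socket")
  .option("host", "localhost")
  .option("port", 9999)
  .load()  // yields a single string column named "value"

val query = lines.writeStream
  .format("org.apache.spark.sql.execution.streaming.HttpTextSinkProvider")
  .option("httpServletUrl", "http://localhost:8080/ingest")  // placeholder endpoint
  .option("topic", "demo-topic")                             // placeholder topic name
  .option("useGzipCompress", "true")
  .outputMode(OutputMode.Append())
  .start()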
Example 2: CustomSinkProvider
// Package name and imported dependencies
package com.knockdata.spark.highcharts
import com.knockdata.spark.highcharts.model.Highcharts
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode
class CustomSinkProvider extends StreamSinkProvider {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {
        val chartId = parameters("chartId")
        val chartParagraphId = parameters("chartParagraphId")

        println(s"batchId: $batchId, chartId: $chartId, chartParagraphId: $chartParagraphId")
        // data.show(3)

        val z = Registry.get(s"$chartId-z").asInstanceOf[ZeppelinContextHolder]
        val seriesHolder = Registry.get(s"$chartId-seriesHolder").asInstanceOf[SeriesHolder]
        val outputMode = Registry.get(s"$chartId-outputMode").asInstanceOf[CustomOutputMode]

        seriesHolder.dataFrame = data
        val result = seriesHolder.result
        val (normalSeriesList, drilldownSeriesList) = outputMode.result(result._1, result._2)

        val chart = new Highcharts(normalSeriesList, seriesHolder.chartId)
          .drilldown(drilldownSeriesList)

        val plotData = chart.plotData
        // val escaped = plotData.replace("%angular", "")
        // println(s" put $chartParagraphId $escaped")
        z.put(chartParagraphId, plotData)
        println(s"run $chartParagraphId")
        z.run(chartParagraphId)
      }
    }
  }
}
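A hedged usage sketch (not part of the original source): attaching this sink to a streaming DataFrame by its fully qualified class name. It assumes the Registry entries ($chartId-z, $chartId-seriesHolder, $chartId-outputMode) have already been populated by the surrounding spark-highcharts/Zeppelin integration, and streamingDF stands in for any streaming DataFrame:

// Usage sketch; `streamingDF` and both option values are placeholders.
val query = streamingDF.writeStream
  .format("com.knockdata.spark.highcharts.CustomSinkProvider")
  .option("chartId", "chart-1")               // must match the keys used when populating Registry
  .option("chartParagraphId", "paragraph-1")  // Zeppelin paragraph that renders the chart
  .outputMode(OutputMode.Append())
  .start()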
Example 3: StructuredStreamingKafka
// Package name and imported dependencies
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode
object StructuredStreamingKafka extends App {
  Logger.getLogger("org").setLevel(Level.ERROR)

  val spark = SparkSession
    .builder
    .appName("StructuredStreamingKafka")
    .master("local[*]")
    .getOrCreate()

  import spark.implicits._

  spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "data")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
    .groupBy(window($"timestamp", "10 seconds"))
    .count()
    .selectExpr(
      "date_format(window.end, \"y-MM-dd hh:mm:ss\") AS key",
      "CAST(count AS STRING) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "stats")
    .option("checkpointLocation", "checkpoints")
    .outputMode(OutputMode.Update())
    .start().awaitTermination()
}
Author: lightbend-reference-architectures, project: structured-streaming-kafka, source file: StructuredStreamingKafka.scala
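As a quick way to verify the pipeline (an addition, not part of the original project), the aggregated records written to the "stats" topic can be read back and printed to the console with a second query against the same local broker:

// Verification sketch: tail the "stats" output topic on the console.
val statsToConsole = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "stats")
  .option("startingOffsets", "earliest")
  .load()
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .writeStream
  .format("console")
  .outputMode(OutputMode.Append())
  .start()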
Example 4: CustomSinkProvider
// Package name and imported dependencies
package com.rockiey.kafka
import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode
class CustomSinkProvider extends StreamSinkProvider {
  def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new Sink {
      override def addBatch(batchId: Long, data: DataFrame): Unit = {
        data.printSchema()
        data.show()
        println(s"count ${data.count()}")
      }
    }
  }
}
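A short usage sketch (not in the original example): since this sink only prints the schema, rows and row count of every micro-batch, it can be attached to any streaming DataFrame for debugging, here referred to by the placeholder name df:

// Debugging sketch; `df` is any streaming DataFrame.
val query = df.writeStream
  .format("com.rockiey.kafka.CustomSinkProvider")
  .outputMode(OutputMode.Append())
  .start()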
Example 5: ClickHouseSinkProvider
// Package name and imported dependencies
package io.clickhouse.ext.spark.streaming
import io.clickhouse.ext.ClickHouseUtils
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Encoders, SQLContext}
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode
import scala.reflect.{ClassTag, classTag}
import scala.reflect.runtime.universe.TypeTag
abstract class ClickHouseSinkProvider[T <: Product: ClassTag](implicit tag: TypeTag[T]) extends StreamSinkProvider with Serializable with Logging {

  def clickHouseServers: Seq[(String, Int)]
  def dbName: String
  def tableName: Option[String] = None
  def eventDateColumnName: String
  def indexColumns: Seq[String]
  def partitionFunc: (org.apache.spark.sql.Row) => java.sql.Date

  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): ClickHouseSink[T] = {

    val typeEncoder = Encoders.product[T]
    val schema = typeEncoder.schema
    // fall back to the case class name when no explicit table name is provided
    val _tableName = tableName.getOrElse(classTag[T].runtimeClass.getSimpleName)

    val createTableSql = ClickHouseUtils.createTableIfNotExistsSql(
      schema,
      dbName,
      _tableName,
      eventDateColumnName,
      indexColumns
    )

    log.info("create new table sql:")
    log.info(createTableSql)

    val connection = ClickHouseUtils.createConnection(getConnectionString())
    try {
      connection.createStatement().execute(createTableSql)
    } finally {
      connection.close()
      log.info(s"ClickHouse table ${dbName}.${_tableName} created")
    }

    log.info("Creating ClickHouse sink")
    new ClickHouseSink[T](dbName, _tableName, eventDateColumnName)(getConnectionString)(partitionFunc)
  }

  def getConnectionString(): (String, Int) = clickHouseServers.head
}
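Because ClickHouseSinkProvider is abstract, a concrete subclass has to supply the cluster, database and schema details. Below is a hedged sketch of such a subclass, assuming a simple case class for the events; the host, port, database and column names are all placeholders:

// Sketch of a concrete provider; every literal value is a placeholder.
case class ClickEvent(eventDate: java.sql.Date, userId: String, url: String)

class ClickEventSinkProvider extends ClickHouseSinkProvider[ClickEvent] {
  override def clickHouseServers: Seq[(String, Int)] = Seq(("localhost", 8123))
  override def dbName: String = "default"
  override def tableName: Option[String] = Some("click_events")
  override def eventDateColumnName: String = "eventDate"
  override def indexColumns: Seq[String] = Seq("userId")
  override def partitionFunc: (org.apache.spark.sql.Row) => java.sql.Date =
    row => row.getAs[java.sql.Date]("eventDate")
}

The subclass can then be referenced from writeStream.format by its fully qualified class name, in the same way as the providers in the earlier examples.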
Example 6: Kafka topic to console
// Package name and imported dependencies
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
val fromKafkaTopic1ToConsole = spark.
  readStream.
  format("kafka").
  option("subscribe", "topic1").
  option("kafka.bootstrap.servers", "localhost:9092").
  option("startingoffsets", "earliest"). // latest, earliest or JSON with {"topicA":{"part":offset,"p1":-1},"topicB":{"0":-2}}
  load.
  select($"key" cast "string", $"value" cast "string"). // deserialize records
  as[(String, String)].
  writeStream.
  trigger(Trigger.ProcessingTime("10 seconds")).
  queryName("from-kafka-to-console").
  outputMode(OutputMode.Append).
  format("console").
  start

// ...after some time
fromKafkaTopic1ToConsole.stop
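As a small follow-up (not part of the original snippet), the value returned by start is a StreamingQuery, so while the query is still running its state can be inspected before calling stop:

// Inspect the running query; `spark.streams` lists all active queries in the session.
println(fromKafkaTopic1ToConsole.status)
println(fromKafkaTopic1ToConsole.lastProgress)
spark.streams.active.foreach(q => println(q.name))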