This article collects typical usage examples of the Scala class org.apache.spark.streaming.kafka.KafkaUtils. If you have been wondering what KafkaUtils is for, how to use it, or what working examples look like, the curated class code examples below should help.
The following shows 15 code examples of the KafkaUtils class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
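All of the examples below target the Spark Streaming Kafka 0.8 integration (the org.apache.spark.streaming.kafka package). As a rough orientation, a build definition along the following lines pulls in the required artifacts; the version numbers are illustrative assumptions and should be matched to your own Spark and Scala versions:

// build.sbt (sketch; versions are assumptions, adjust to your cluster)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"            % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming"       % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3"
)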
Example 1: StreamingApp
// Set the package name and import the dependent classes
package spark.test

import data.processing.avro.AvroDecoder
import kafka.serializer.StringDecoder
import kafka.serializer.DefaultDecoder
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka.KafkaUtils

object StreamingApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Streaming Application")
    val ssc = new StreamingContext(conf, Seconds(1))

    val topicsSet = "test".split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")

    // Direct (receiver-less) stream: String keys, raw Avro byte-array values
    val directKafkaStream = KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topicsSet
    )

    directKafkaStream.foreachRDD(rdd =>
      rdd.foreachPartition(partitionOfRecords => {
        // One decoder per partition, so it never has to be serialized with the closure
        val avroDecoder = new AvroDecoder("/event-record.json")
        partitionOfRecords.map(m => (m._1, avroDecoder.decode(m._2))).foreach(m => println(m))
      }))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 2: KafkaPayload
// Set the package name and import the dependent classes
package tools

import kafka.serializer.DefaultDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils

// Thin wrapper around the raw bytes of a Kafka message value
case class KafkaPayload(value: Array[Byte])

class KafkaDStreamSource(config: Map[String, String]) {
  def createSource(ssc: StreamingContext, topic: String): DStream[KafkaPayload] = {
    val kafkaParams = config
    val kafkaTopics = Set(topic)

    KafkaUtils.
      createDirectStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
        ssc,
        kafkaParams,
        kafkaTopics).
      map(record => KafkaPayload(record._2)) // keep only the message value, drop the key
  }
}

object KafkaDStreamSource {
  def apply(config: Map[String, String]): KafkaDStreamSource = new KafkaDStreamSource(config)
}
Example 3: StationJourneyCountCustomApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel

object StationJourneyCountCustomApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](
      topic -> 1)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    // Receiver-based stream with explicit Kafka parameters and storage level
    KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 4: createStream
// Set the package name and import the dependent classes
package it.agilelab.bigdata.wasp.consumers.readers

import it.agilelab.bigdata.wasp.core.WaspSystem
import it.agilelab.bigdata.wasp.core.WaspSystem._
import it.agilelab.bigdata.wasp.core.kafka.CheckOrCreateTopic
import it.agilelab.bigdata.wasp.core.logging.WaspLogger
import it.agilelab.bigdata.wasp.core.models.{DefaultConfiguration, TopicModel}
import it.agilelab.bigdata.wasp.core.utils.{AvroToJsonUtil, ConfigManager, JsonToByteArrayUtil}
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils

// NOTE: only this method survived the snippet extraction; the enclosing reader object,
// which also provides the `logger` used below, was elided from the original.
//TODO: check warning (not understood)
def createStream(group: String, topic: TopicModel)(implicit ssc: StreamingContext): DStream[String] = {

  val kafkaConfig = ConfigManager.getKafkaConfig
  val kafkaConfigMap: Map[String, String] = Map(
    "zookeeper.connect" -> kafkaConfig.zookeeper.toString,
    "zookeeper.connection.timeout.ms" -> kafkaConfig.zookeeper.timeout.getOrElse(DefaultConfiguration.timeout).toString
  )

  // Ask the Kafka admin actor to check (or create) the topic before attaching a receiver
  if (??[Boolean](WaspSystem.getKafkaAdminActor, CheckOrCreateTopic(topic.name, topic.partitions, topic.replicas))) {

    // Receiver-based stream: String keys, raw byte-array values
    val receiver = KafkaUtils.createStream[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc,
      kafkaConfigMap + ("group.id" -> group),
      Map(topic.name -> 3),
      StorageLevel.MEMORY_AND_DISK_2
    )

    // Decode the value according to the topic's declared data type
    topic.topicDataType match {
      case "avro" => receiver.map(x => (x._1, AvroToJsonUtil.avroToJson(x._2))).map(_._2)
      case "json" => receiver.map(x => (x._1, JsonToByteArrayUtil.byteArrayToJson(x._2))).map(_._2)
      case _ => receiver.map(x => (x._1, AvroToJsonUtil.avroToJson(x._2))).map(_._2)
    }
  } else {
    logger.error(s"Topic not found on Kafka: $topic")
    throw new Exception(s"Topic not found on Kafka: $topic")
  }
}
Example 5: StationJourneyCountDirectApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Set(topic)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    // Direct (receiver-less) stream; offsets are tracked by Spark rather than ZooKeeper
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 6: StationJourneyCountApp
// Set the package name and import the dependent classes
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountApp {

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](
      topic -> 1)

    // Receiver-based stream using the simple ZooKeeper-quorum overload
    KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 7: SparkJob
// Set the package name and import the dependent classes
package de.codecentric.dcos_intro.spark

import de.codecentric.dcos_intro.{Tweet, TweetDecoder}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.datastax.spark.connector.streaming._

object SparkJob {

  def main(args: Array[String]) {
    val consumerTopic = args(0)
    val sparkConf = new SparkConf()
      .setAppName(getClass.getName)
      .set("spark.cassandra.connection.host", s"${args(1)}")
      .set("spark.cassandra.connection.port", s"${args(2)}")
    val consumerProperties = Map("bootstrap.servers" -> args(3), "auto.offset.reset" -> "smallest")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Direct stream with a custom value decoder that deserializes each record into a Tweet
    val kafkaStream = KafkaUtils.createDirectStream[String, Tweet, StringDecoder, TweetDecoder](
      ssc,
      consumerProperties,
      Set(consumerTopic)
    )

    // Persist the decoded tweets into the dcos.tweets Cassandra table
    kafkaStream.map(tuple => tuple._2).saveToCassandra("dcos", "tweets")

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
Example 8: ApplicationContext
// Set the package name and import the dependent classes
package com.playing.contexts

import com.playing.utils.SparkConfig
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

object ApplicationContext {

  def main(args: Array[String]): Unit = {
    val Array(brokers, topics) = args

    // Create context with 2 second batch interval
    val ssc = SparkConfig.ssc

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
Example 9: WeKafka
// Set the package name and import the dependent classes
package com.tuyoo.kafka

import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.hdfs.server.common.Storage
import org.apache.spark.storage.StorageLevel

class WeKafka extends Serializable {

  def getKafkaDStream(args: Array[String], ssc: StreamingContext): DStream[(String, String)] = {
    val Array(zkQuorum, group, topics, numThreads) = args
    // Map each topic to the number of receiver threads that should consume it
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val numInputDStreams = 10

    // val kafkaDStreams = (1 to numInputDStreams).map(_ => KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).persist(StorageLevel.MEMORY_AND_DISK).map(_._2).persist(StorageLevel.MEMORY_AND_DISK)
    //   .map(_.replaceAll("\u0000", "")).persist(StorageLevel.MEMORY_AND_DISK))
    // ssc.union(kafkaDStreams)

    KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).persist(StorageLevel.MEMORY_AND_DISK) //.map(_._2).persist(StorageLevel.MEMORY_AND_DISK)
    //.map(_.replaceAll("\u0000", "")).persist(StorageLevel.MEMORY_AND_DISK)
  }
}
Example 10: SparkStreamingOnKafkaReceiver
// Set the package name and import the dependent classes
package com.jjzhk.sparkexamples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.api.java.StorageLevels
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, StreamingContext}

object SparkStreamingOnKafkaReceiver {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SparkStreamingOnKafkaReceiver")
    val sc = new StreamingContext(conf, Durations.seconds(30))

    // key: topic name, value: number of receiver threads for that topic
    val topicMap = Map[String, Int]("HelloKafka" -> 1)

    val lines = KafkaUtils.createStream(sc, "Master:2181,Worker1:2181,Worker2:2181", "MyFirstConsumerGroup", topicMap,
      StorageLevels.MEMORY_AND_DISK_SER_2)

    val words = lines.flatMap(_._2.split(" ")).map((_, 1))
    val wordCounts = words.reduceByKey(_ + _)
    wordCounts.print()

    sc.start()
    sc.awaitTermination()
  }
}
Example 11: InputManager
// Set the package name and import the dependent classes
package iomanager

import com.typesafe.config.Config
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.JavaConversions._

object InputManager {

  def createInputStream(ssc: StreamingContext, config: Config): DStream[Set[Int]] = {
    val windowDuration = Seconds(config.getInt("input.observationWindow"))

    val topics = config.getStringList("input.kafka.topics")
    val brokers = config.getStringList("input.kafka.brokers").reduce(_ + "," + _)
    val group = config.getString("input.kafka.group")

    // Broadcast the event-name -> event-id lookup table to all executors
    val eventIndex = ssc.sparkContext.broadcast(config.getStringList("eventIndex")
      .map(_.split(",")).map(x => x.last -> x.head.toInt).toMap)

    val topicMap = topics.map((_, 2)).toMap

    val eventsStream =
      KafkaUtils.createStream(ssc, brokers, group, topicMap, StorageLevel.MEMORY_ONLY_2)
        .transform((data) => {
          // Keep only known events and map each one to its numeric id
          data.map(_._2).filter(eventIndex.value.contains)
            .map(eventIndex.value.get(_).get).distinct.map(x => Set(x))
        })
        .reduce(_ ++ _)
        .persist(StorageLevel.MEMORY_ONLY_2)

    eventsStream
      .window(windowDuration)
      .persist(StorageLevel.MEMORY_ONLY_2)
  }
}
Example 12: KafkaSource
// Set the package name and import the dependent classes
package com.ippontech.kafka

import com.ippontech.kafka.stores.OffsetsStore
import com.typesafe.scalalogging.slf4j.LazyLogging
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

import scala.reflect.ClassTag

object KafkaSource extends LazyLogging {

  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, kafkaParams: Map[String, String], offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] = {

    val topics = Set(topic)

    val storedOffsets = offsetsStore.readOffsets(topic)
    val kafkaStream = storedOffsets match {
      case None =>
        // start from the latest offsets
        KafkaUtils.createDirectStream[K, V, KD, VD](ssc, kafkaParams, topics)
      case Some(fromOffsets) =>
        // start from previously saved offsets
        val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)
        KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](ssc, kafkaParams, fromOffsets, messageHandler)
    }

    // save the offsets
    kafkaStream.foreachRDD(rdd => offsetsStore.saveOffsets(topic, rdd))

    kafkaStream
  }

  // Kafka input stream
  def kafkaStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
    (ssc: StreamingContext, brokers: String, offsetsStore: OffsetsStore, topic: String): InputDStream[(K, V)] =
    kafkaStream(ssc, Map("metadata.broker.list" -> brokers), offsetsStore, topic)
}
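Example 12 delegates offset persistence to an OffsetsStore, whose definition is not included in the snippet. As an illustration only, the sketch below implements the two methods the example calls; their signatures are inferred from the call sites, and a non-durable in-memory store stands in for a real backend:

// Hypothetical in-memory OffsetsStore, inferred from how Example 12 uses the trait.
// A real implementation would persist the offsets to ZooKeeper, Kafka, or a database.
package com.ippontech.kafka.stores

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges

class InMemoryOffsetsStore extends OffsetsStore {
  private var offsets: Option[Map[TopicAndPartition, Long]] = None

  // Return the last saved offsets, or None to start from the latest offsets
  def readOffsets(topic: String): Option[Map[TopicAndPartition, Long]] = offsets

  // Direct-stream RDDs carry the offset ranges they were built from
  def saveOffsets(topic: String, rdd: RDD[_]): Unit = {
    val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsets = Some(ranges.map(r => TopicAndPartition(r.topic, r.partition) -> r.untilOffset).toMap)
  }
}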
Example 13: DataConsumer
// Set the package name and import the dependent classes
package com.safak

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DataConsumer {

  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(
        s"""
           |Usage: DirectKafkaWordCount <brokers> <topics>
           |  <brokers> is a list of one or more Kafka brokers
           |  <topics> is a list of one or more kafka topics to consume from
           |
        """.stripMargin)
      System.exit(1)
    }

    val Array(brokers, topics) = args

    val sparkConf = new SparkConf()
      .setAppName("StreamingApp")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)

    // Create a new stream which can decode byte arrays.
    val messageStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc,
      kafkaParams,
      topicsSet)

    messageStream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        val rdd2 = rdd.map(_._2)
        println("New Batch")
        println("Rdd Count: " + rdd2.count())
        rdd2.collect().foreach(println)
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 14: Run
// Set the package name and import the dependent classes
import Schemas.{Sales_v2, Shipments_v1}
import io.confluent.kafka.serializers.KafkaAvroDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkConf
import redis.embedded.RedisServer

object Run extends App {

  // Start embedded Redis Server
  val redisServer = new RedisServer(6379)
  redisServer.start()

  val sparkConf = new SparkConf()
    .setAppName("e-commerce-demo-inventory")
    .setMaster("local[2]")

  val ssc = new StreamingContext(sparkConf, Seconds(4))

  val kafkaParams = Map[String, String](
    "auto.offset.reset" -> "smallest",
    "zookeeper.connect" -> "cloudera.landoop.com:22181",
    "group.id" -> "group111112",
    "metadata.broker.list" -> "cloudera.landoop.com:29092",
    "schema.registry.url" -> "http://cloudera.landoop.com:28081")

  // Two Avro-encoded topics, both decoded with the Confluent KafkaAvroDecoder
  val salesTopic = Set("generator-sales")
  val salesStream = KafkaUtils.createDirectStream[Object, Object, KafkaAvroDecoder, KafkaAvroDecoder](ssc, kafkaParams, salesTopic)

  val shipmentsTopic = Set("generator-shipments")
  val shipmentsStream = KafkaUtils.createDirectStream[Object, Object, KafkaAvroDecoder, KafkaAvroDecoder](ssc, kafkaParams, shipmentsTopic)

  val sales = salesStream.map[Sales_v2](AvroConverter.getSale(_))
  // Shipments are decoded from shipmentsStream (the original snippet mapped salesStream here, leaving shipmentsStream unused)
  val shipment = shipmentsStream.map[Shipments_v1](AvroConverter.getShipment(_))

  sales.map(x => 1).reduce(_ + _)
  sales.print()
  shipment.map(x => 1).reduce(_ + _)
    .print()

  sys.ShutdownHookThread {
    println("Gracefully stopping Spark Streaming Application")
    ssc.stop(stopSparkContext = true, stopGracefully = true)
    redisServer.stop()
    println("Application stopped")
  }

  ssc.start()
  ssc.awaitTermination()
}
Example 15: KafkaDStreamSource
// Set the package name and import the dependent classes
package org.yuboxu.spark

import kafka.serializer.{StringDecoder, DefaultDecoder}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils

// Assumed definition: the original snippet uses KafkaPayload but does not include it
case class KafkaPayload(key: Option[String], value: String)

class KafkaDStreamSource(config: Map[String, String]) {
  def createSource(ssc: StreamingContext, topic: String): DStream[KafkaPayload] = {
    val kafkaParams = config
    val kafkaTopics = Set(topic)

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      // spark streaming context
      ssc,
      // kafka configuration parameters
      kafkaParams,
      // names of the topics to consume
      kafkaTopics).map(record => KafkaPayload(Option(record._1), record._2))
  }
}

object KafkaDStreamSource {
  def apply(config: Map[String, String]): KafkaDStreamSource = new KafkaDStreamSource(config)
}