本文整理汇总了Scala中org.apache.spark.streaming.StreamingContext类的典型用法代码示例。如果您正苦于以下问题:Scala StreamingContext类的具体用法?Scala StreamingContext怎么用?Scala StreamingContext使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StreamingContext类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: KafkaUtility
//设置package包名称以及导入依赖的类
package com.knoldus.streaming.kafka
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object KafkaUtility {
//TODO It should read from config
private val kafkaParams = Map(
"bootstrap.servers" -> "localhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"auto.offset.reset" -> "earliest",
"group.id" -> "tweet-consumer"
)
private val preferredHosts = LocationStrategies.PreferConsistent
def createDStreamFromKafka(ssc: StreamingContext, topics: List[String]): InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream[String, String](
ssc,
preferredHosts,
ConsumerStrategies.Subscribe[String, String](topics.distinct, kafkaParams)
)
}
示例2: KMeansClusteringApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
object KMeansClusteringApp {
def main(args: Array[String]) {
if (args.length != 4) {
System.err.println(
"Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>")
System.exit(1)
}
val Seq(appName, batchInterval, hostname, port) = args.toSeq
val conf = new SparkConf()
.setAppName(appName)
.setJars(SparkContext.jarOfClass(this.getClass).toSeq)
val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))
val substream = ssc.socketTextStream(hostname, port.toInt)
.filter(!_.contains("NaN"))
.map(_.split(" "))
.filter(f => f(1) != "0")
val orientationStream = substream
.map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray)
.map(arr => arr.map(_.toDouble))
.filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0)
.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length))))
val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0))
val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache()
val model = new StreamingKMeans()
.setK(3)
.setDecayFactor(0)
.setRandomCenters(18, 0.0)
model.trainOn(train.map(v => v.features))
val prediction = model.predictOnValues(test.map(v => (v.label, v.features)))
ssc.start()
ssc.awaitTermination()
}
}
示例3: RedditVariationApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
object RedditVariationApp {
def main(args: Array[String]) {
if (args.length != 2) {
System.err.println(
"Usage: RedditVariationApp <appname> <input_path>")
System.exit(1)
}
val Seq(appName, inputPath) = args.toSeq
val LOG = LogManager.getLogger(this.getClass)
val conf = new SparkConf()
.setAppName(appName)
.setJars(SparkContext.jarOfClass(this.getClass).toSeq)
val ssc = new StreamingContext(conf, Seconds(1))
LOG.info("Started at %d".format(ssc.sparkContext.startTime))
val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)
val merged = comments.union(comments)
val repartitionedComments = comments.repartition(4)
val rddMin = comments.glom().map(arr =>
arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt)))
ssc.start()
ssc.awaitTermination()
}
}
示例4: Consumer
//设置package包名称以及导入依赖的类
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession
object Consumer {
def main(args: Array[String]): Unit = {
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "localhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "use_a_separate_group_id_for_each_stream",
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("streaming")
val sparkConf = new SparkConf().setMaster("local[8]").setAppName("KafkaTest")
val streamingContext = new StreamingContext(sparkConf, Seconds(1))
// Create a input direct stream
val kafkaStream = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val sc = SparkSession.builder().master("local[8]").appName("KafkaTest").getOrCreate()
val model = SVMModel.load(sc.sparkContext, "/home/xiaoyu/model")
val result = kafkaStream.map(record => (record.key, record.value))
result.foreachRDD(
patient => {
patient.collect().toBuffer.foreach(
(x: (Any, String)) => {
val features = x._2.split(',').map(x => x.toDouble).tail
println(model.predict(Vectors.dense(features)))
}
)
}
)
streamingContext.start()
streamingContext.awaitTermination()
}
}
示例5: User
//设置package包名称以及导入依赖的类
package services.users
import java.io.File
import java.nio.charset.Charset
import java.sql.Timestamp
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.StreamingContext
import services.events.EventStream
import services.Util
case class User(userId: String, testFinishTime: Timestamp, nickname: String, gender: String)
object User {
val DELIMITER = ','
val USER_FEED = "/Users/mahesh/data/affinitas/feeds/users/"
val USER_DATA = "/Users/mahesh/data/affinitas/tables/users/"
var ssc: StreamingContext = null
var sql: SparkSession = null
lazy val usersFeedDF = sql.read
.format("com.databricks.spark.csv")
.option("header", false)
.schema(StructType(Array(
StructField("userId", StringType, true),
StructField("nickname", StringType, true),
StructField("gender", StringType, true)
)
)).load(User.USER_FEED)
//EventStream.testFinishStream.print()
lazy val usersMap = usersFeedDF.rdd.map(record => (record.getString(0), (record.getString(1), record.getString(2))))
def initialize(sscO: StreamingContext, sqlO: SparkSession) = {
ssc = sscO
sql = sqlO
new File(USER_FEED).mkdirs()
new File(USER_DATA).mkdirs()
EventStream.testFinishStream.foreachRDD( {
rdd => {
val testFinishMap = rdd.map(record => (record.userId, record.timestamp))
val userData = testFinishMap.join(usersMap)
.map(record => Array(record._1, record._2._1, record._2._2._1, record._2._2._2))
.collect()
Util.writeCsvToDir(userData, DELIMITER.toString, USER_DATA)
}
})
}
}
示例6: VeChallengeIngest
//设置package包名称以及导入依赖的类
package io.github.adrianulbona.ve
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import twitter4j.{GeoLocation, Place, Status}
object VeChallengeIngest {
case class Location(latitude: Double, longitude: Double)
case class Tweet(time: Long, text: String, user: String, isRetweet: Boolean, country: String, location: Location)
def main(args: Array[String]) {
val spark = SparkSession.builder
.master("local[*]")
.appName("ve-challenge")
.getOrCreate()
import spark.sqlContext.implicits._
val ssc = new StreamingContext(spark.sparkContext, Minutes(2))
val stream = TwitterUtils.createStream(ssc, None, Seq("challenge"))
stream.map(extract).map(normalize).foreachRDD((batch, time) => {
val batchDF: DataFrame = batch.toDF.cache
batchDF.groupBy($"country").count().toDF("country", "count").orderBy($"count".desc).show(6)
batchDF.coalesce(1).write.parquet("tweets/batch=" + time.milliseconds)
batchDF.unpersist()
})
ssc.start()
ssc.awaitTermination()
spark.stop()
}
def extract(status: Status): (Long, String, String, Boolean, Option[Place], Option[GeoLocation]) = {
(status.getCreatedAt.getTime,
status.getText,
status.getUser.getName,
status.isRetweet,
Option(status.getPlace),
Option(status.getGeoLocation))
}
def normalize(extract: (Long, String, String, Boolean, Option[Place], Option[GeoLocation])): Tweet = extract match {
case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), Some(geoLoc: GeoLocation)) =>
Tweet(time, text, user, isRetweet, place.getCountryCode, Location(geoLoc.getLatitude, geoLoc.getLongitude))
case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), None) =>
Tweet(time, text, user, isRetweet, place.getCountryCode, Location(Double.NaN, Double.NaN))
case (time: Long, text: String, user: String, isRetweet: Boolean, None, Some(geoLoc: GeoLocation)) =>
Tweet(time, text, user, isRetweet, "unknown", Location(geoLoc.getLatitude, geoLoc.getLongitude))
case (time: Long, text: String, user: String, isRetweet: Boolean, None, None) =>
Tweet(time, text, user, isRetweet, "unknown", Location(Double.NaN, Double.NaN))
}
}
示例7: KafkaPayload
//设置package包名称以及导入依赖的类
package tools
import kafka.serializer.DefaultDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
case class KafkaPayload(value: Array[Byte])
class KafkaDStreamSource(config: Map[String, String]) {
def createSource(ssc: StreamingContext, topic: String): DStream[KafkaPayload] = {
val kafkaParams = config
val kafkaTopics = Set(topic)
KafkaUtils.
createDirectStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
ssc,
kafkaParams,
kafkaTopics).
map(dStream => KafkaPayload(dStream._2))
}
}
object KafkaDStreamSource {
def apply(config: Map[String, String]): KafkaDStreamSource = new KafkaDStreamSource(config)
}
示例8: Collector
//设置package包名称以及导入依赖的类
package com.databricks.apps.twitterClassifier
import com.google.gson.Gson
import org.apache.spark.SparkContext
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.twitter.TwitterUtils
object Collector {
def doIt(options: CollectOptions, sc: SparkContext, ssc: StreamingContext) {
val tweetStream: DStream[String] = TwitterUtils.createStream(ssc, maybeTwitterAuth)
.map(new Gson().toJson(_))
var numTweetsCollected = 0L
tweetStream.foreachRDD { (rdd, time) =>
val count = rdd.count
if (count > 0) {
rdd.saveAsTextFile(options.tweetDirectory.getAbsolutePath)
numTweetsCollected += count
if (numTweetsCollected > options.numTweetsToCollect) System.exit(0)
}
}
ssc.start()
ssc.awaitTermination()
}
}
示例9: Predict
//设置package包名称以及导入依赖的类
package com.databricks.apps.twitterClassifier
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Predict extends App {
import SparkSetup._
val options = PredictOptions.parse(args)
val ssc = new StreamingContext(sc, Seconds(options.intervalInSecs))
Predictor.doIt(options, sc, ssc)
}
object Predictor {
def doIt(options: PredictOptions, sc: SparkContext, ssc: StreamingContext) {
println("Initializing the the KMeans model...")
val model: KMeansModel = new KMeansModel(sc.objectFile[Vector](options.modelDirectory.getCanonicalPath).collect)
println("Materializing Twitter stream...")
TwitterUtils.createStream(ssc, maybeTwitterAuth)
.map(_.getText)
.foreachRDD { rdd =>
rdd.filter(t => model.predict(featurize(t)) == options.clusterNumber)
.foreach(print) // register DStream as an output stream and materialize it
}
println("Initialization complete, starting streaming computation.")
ssc.start()
ssc.awaitTermination()
}
}
示例10: StreamingWordCount
//设置package包名称以及导入依赖的类
package org.examples.scala.examples
import org.apache.spark.streaming.{Seconds, StreamingContext}
import StreamingContext._
import org.apache.spark._
import org.apache.spark.SparkContext._
object StreamingWordCount {
def run(args: Array[String]) {
if (args.length < 2) {
System.err.println("Usage BasicStreamingExample <master> <output>")
}
val Array(master, output) = args.take(2)
val conf = new SparkConf().setMaster(master).setAppName("BasicStreamingExample")
val ssc = new StreamingContext(conf, Seconds(30))
val lines = ssc.socketTextStream("localhost" , 7777)
val words = lines.flatMap(_.split(" "))
val wc = words.map(x => (x, 1)).reduceByKey((x, y) => x + y)
wc.saveAsTextFiles(output)
wc.print
println("pandas: sscstart")
ssc.start()
println("pandas: awaittermination")
ssc.awaitTermination()
println("pandas: done!")
}
}
示例11: KafkaStreaming
//设置package包名称以及导入依赖的类
package org.myorganization.spark.streaming
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.kafka._
import kafka.serializer.StringDecoder
object KafkaStreaming {
def main(args: Array[String]): Unit = {
val (batchDuration, topics, bootstrapServers) = getParams(args)
val conf = new SparkConf().setAppName("gpKafkaStreaming")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchDuration))
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, String]("bootstrap.servers" -> bootstrapServers, "auto.offset.reset" -> "smallest")
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
val data = messages.map(_._2)
val loggerSerializerLogs = data.map(_.split("""\s+"""))
.filter(x => x.length > 6)
.map(x => (x(0), x(6)))
.filter(filterLogLines)
.map(x => x._1)
val logCounts = loggerSerializerLogs.map(x => (x, 1L)).reduceByKey(_ + _)
logCounts.print(10)
ssc.start()
ssc.awaitTermination()
}
def filterLogLines(line: Tuple2[String, String]): Boolean = {
val pattern = """logger.+"""
line._2.matches(pattern)
}
def getParams(args: Array[String]): Tuple3[Int, String, String] = {
if (args.length !=3 ) {
System.err.println(s"""
|Usage: spark-kafka.sh <sampling-period> <topics> <bootstrap-servers>
| <sampling-period> is the duration of each batch (in seconds)
| <topics> is a list of one or more kafka topics to consume from
| <bootstrap-servers> is a list of one or more Kafka bootstrap servers
|
""".stripMargin)
System.exit(1)
}
Tuple3[Int, String, String](args(0).toInt, args(1), args(2))
}
}
示例12: TestUpdateStateByKey
//设置package包名称以及导入依赖的类
package examples.streaming
import org.apache.spark.streaming.{StreamingContext, Duration}
import org.apache.spark.SparkConf
object TestUpdateStateByKey {
val checkpointDir: String = "hdfs://localhost:9000/user/hduser/spark-chkpt"
def main(args: Array[String]): Unit = {
val ssc = StreamingContext.getOrCreate(checkpointDir, createFunc _)
ssc.start()
ssc.awaitTermination()
}
def updateFunc(values: Seq[Int], state: Option[Int]): Option[Int] = {
Some(values.size + state.getOrElse(0))
}
def createFunc(): StreamingContext = {
val ssc = new StreamingContext(new SparkConf().setAppName("TestUpdateStateByKeyJob"),
Duration(2000))
ssc.checkpoint(checkpointDir)
ssc.socketTextStream("localhost", 9999)
.flatMap(_.split(" "))
.map((_, 1))
.updateStateByKey(updateFunc _)
.checkpoint(Duration(10000))
.print()
ssc
}
}
示例13: TwitterStreaming
//设置package包名称以及导入依赖的类
package local.riverside
import org.apache.lucene.analysis.ja.JapaneseAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.spark.SparkConf
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Durations, StreamingContext}
object TwitterStreaming {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Twitter Streaming")
val ssc = new StreamingContext(conf, Durations.minutes(1L))
val filter = if (args.isEmpty) Nil else args.toList
val stream = TwitterUtils.createStream(ssc, None, filter)
stream
.flatMap { status =>
val text = status.getText
val analyzer = new JapaneseAnalyzer
val tokenStream = analyzer.tokenStream("", text)
val charAttr = tokenStream.addAttribute(classOf[CharTermAttribute])
tokenStream.reset()
try {
Iterator.continually(tokenStream.incrementToken())
.takeWhile(identity)
.map(_ => charAttr.toString)
.toVector
} finally {
tokenStream.end()
tokenStream.close()
}
}
.map(word => (word, 1))
.reduceByKey((a, b) => a + b)
.saveAsTextFiles("output/tweet")
ssc.start()
ssc.awaitTermination()
}
}
示例14: VeChallengeRealTime
//设置package包名称以及导入依赖的类
package io.github.adrianulbona.ve
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, StreamingContext}
import twitter4j.Place
object VeChallengeRealTime {
def main(args: Array[String]) {
val spark = SparkSession.builder
.master("local[*]")
.appName("ve-challenge")
.getOrCreate()
val ssc = new StreamingContext(spark.sparkContext, Minutes(2))
val stream = TwitterUtils.createStream(ssc, None, Seq("challenge"))
val places: DStream[Place] = stream.map(status => Option(status.getPlace))
.filter(optionPlace => optionPlace.isDefined)
.map(place => place.get)
places.map(place => place.getCountryCode)
.countByValue()
.foreachRDD(batch => printStats(batch.sortBy({ case (_, count) => count }, ascending = false).take(5)))
ssc.start()
ssc.awaitTermination()
spark.stop()
}
def printStats(top5Countries: Array[(String, Long)]) {
println()
println(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()))
top5Countries.foreach(println)
}
}
示例15: Main
//设置package包名称以及导入依赖的类
import Fqueue.{FqueueReceiver, FqueueSender}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Main {
private def sendData() = {
val fqueuSender = new FqueueSender("localhost:18740", 4, 4000)
fqueuSender.connect()
while (true) {
val ret = fqueuSender.enQueue("track_BOdao2015*", "123")
Thread.sleep(1000)
}
fqueuSender.stop()
}
private def getData() = {
val fqueueReceiver = new FqueueReceiver("localhost:18740", 4, 4000)
fqueueReceiver.connect()
val data = fqueueReceiver.deQueue("track_BOdao2015*")
println(data.getOrElse("null"))
fqueueReceiver.stop()
}
def main(args: Array[String]) {
new Thread("fqueue sender") {
override def run() { sendData() }
}.start()
val config = new SparkConf().setAppName("testfqueue").setMaster("local[2]")
val ssc = new StreamingContext(config, Seconds(5))
val lines = ssc.receiverStream(new FqueueStreamingReceiver("localhost:18740", 4, 4000))
lines.print()
ssc.start()
ssc.awaitTermination()
}
}