This article collects typical usage examples of the Scala class org.apache.spark.storage.StorageLevel. If you are wondering what the StorageLevel class does and how to use it, the selected code examples below should help.
The following 15 code examples of the StorageLevel class are shown, sorted by popularity by default.
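Before the examples, here is a minimal sketch of what StorageLevel controls (illustrative only; it assumes an already-created SparkContext named sc): the level decides whether persisted partitions are kept in memory or on disk, serialized or not, and how many replicas are stored.

import org.apache.spark.storage.StorageLevel
// illustrative sketch; sc is an existing SparkContext
val numbers = sc.parallelize(1 to 1000)
numbers.persist(StorageLevel.MEMORY_AND_DISK_SER) // keep serialized blocks in memory, spill to disk if needed
numbers.count()                                   // the first action materializes the persisted data
numbers.unpersist()                               // release the blocks when no longer needed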
Example 1: StationJourneyCountCustomApp
// Set the package name and import the required classes
package org.apress.prospark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel
object StationJourneyCountCustomApp {
def main(args: Array[String]) {
if (args.length != 7) {
System.err.println(
"Usage: StationJourneyCountApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
System.exit(1)
}
val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq
val conf = new SparkConf()
.setAppName(appName)
.setJars(SparkContext.jarOfClass(this.getClass).toSeq)
//.set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(10))
ssc.checkpoint(checkpointDir)
val topics = Map[String, Int](
topic -> 1)
val params = Map[String, String](
"zookeeper.connect" -> zkQuorum,
"group.id" -> consumerGroupId,
"bootstrap.servers" -> brokerUrl)
KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
.map(rec => rec.split(","))
.map(rec => ((rec(3), rec(7)), 1))
.reduceByKey(_ + _)
.repartition(1)
.map(rec => (rec._2, rec._1))
.transform(rdd => rdd.sortByKey(ascending = false))
.saveAsTextFiles(outputPath)
ssc.start()
ssc.awaitTermination()
}
}
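Example 1 keeps a single serialized copy of each received block (MEMORY_ONLY_SER). If the commented-out write-ahead-log option were enabled, durability would come from the log itself, and the usual recommendation is to pair it with a non-replicated level that can spill to disk. A sketch of that variant, reusing the names from the example:

// sketch of the write-ahead-log variant; appName, params, topics and ssc are the values from the example
val confWithWal = new SparkConf()
  .setAppName(appName)
  .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
// with the WAL enabled, storage-level replication is usually unnecessary
KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
  ssc, params, topics, StorageLevel.MEMORY_AND_DISK_SER)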
Example 2: FqueueStreamingReceiver
// Set the package name and import the required classes
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import Fqueue.FqueueReceiver
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
class FqueueStreamingReceiver(val address: String, val connectionPoolSize: Int, val timeOut: Int)
extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {
private var receiver: Option[FqueueReceiver] = None
def onStart() {
new Thread("Socket Receiver") {
override def run() { receive() }
}.start()
}
def onStop(): Unit = {
receiver foreach { _.stop() }
}
private def receive(): Unit = {
val fqueueReceiver = new FqueueReceiver(address, connectionPoolSize, timeOut)
receiver = Some(fqueueReceiver)
receiver foreach { _.connect() }
try
{
var stop = false
while (!isStopped() && !stop) {
val data = fqueueReceiver.deQueue("track_BOdao2015*")
data match {
case Some(str) => store(str)
case None => Thread.sleep(1000) // stop = true
}
}
receiver foreach { _.stop() }
} catch {
case e: Exception =>
println("get data from fqueue err! pleace sure the server is live")
println(e.getMessage)
println(e.getStackTraceString)
receiver foreach { _.stop() }
}
}
}
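A custom Receiver such as FqueueStreamingReceiver is attached to a StreamingContext with receiverStream. A minimal usage sketch follows; the address, pool size, and timeout values are placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// usage sketch for the receiver above; connection parameters are placeholders
val ssc = new StreamingContext(new SparkConf().setAppName("FqueueDemo").setMaster("local[2]"), Seconds(5))
val lines = ssc.receiverStream(new FqueueStreamingReceiver("127.0.0.1:18740", connectionPoolSize = 4, timeOut = 3000))
lines.count().print()
ssc.start()
ssc.awaitTermination()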
Example 3: SimpleDataStream
// Set the package name and import the required classes
package com.fortysevendeg.log
import com.fortysevendeg.log.models._
import com.fortysevendeg.log.utils.Regex._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scala.language.postfixOps
object SimpleDataStream {
def main(args: Array[String]) = {
// run:
// $ adb logcat -v time | nc -lk 9999
// Spark configuration
val conf = new SparkConf().setMaster("local[2]").setAppName("SimpleDataStream")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Milliseconds(1000))
ssc.checkpoint("/tmp")
val logLines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
val logs = logLines.flatMap { line =>
for {
typePlusAppPlusPid <- typePlusAppPlusPid.findFirstIn(line)
data = extractTypeAppPid(typePlusAppPlusPid)
logType = data._1
app <- data._2
pid <- data._3
date <- date.findFirstIn(line)
message <- message.findFirstIn(line)
} yield {
LogLine(LogInfo(app, pid, logType, date), message.substring(2))
}
}
logs foreachRDD (_.foreach { log =>
println(s"${log.info.logType}: ${log.info.app}: ${log.message}")
})
ssc.start()
ssc.awaitTermination()
}
}
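Worth noting: socketTextStream defaults to the replicated level MEMORY_AND_DISK_SER_2, so the example passes MEMORY_AND_DISK_SER explicitly to skip replication in a single local process. Side by side, reusing the ssc from the example:

// default overload: serialized, spills to disk, replicated to two executors
val replicated = ssc.socketTextStream("localhost", 9999)
// explicit single-copy level, as used in the example above
val singleCopy = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)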
Example 4: appl
// Set the package name and import the required classes
package model
import model.{DataFramesBuilder, IndexDataFramesSearch}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import play.api.libs.json._
import play.api.libs.functional.syntax._
object appl {
case class Node(ID: Long, Name: String)
case class fatherNode(ID:Long, Name:String, children:List[Node])
def search(spark: SparkSession, nodeID:Long) = {
val path = "/Users/mali/Downloads/taxdmp/"
val edgesPath = path + "nodes.dmp"
val verticesPath = path + "names.dmp"
implicit val nodesWrites: Writes[Node] = (
(JsPath \ "ID").write[Long] and
(JsPath \ "Name").write[String]
)(unlift(Node.unapply))
implicit val fatherNodeWrites: Writes[fatherNode] = (
(JsPath \ "ID").write[Long] and
(JsPath \ "Name").write[String] and
(JsPath \ "children").write[List[Node]]
)(unlift(fatherNode.unapply))
val edParentDF = DataFramesBuilder.getEdgesParentDF(edgesPath, spark)
val veDF = DataFramesBuilder.getVerticesDF(verticesPath, spark)
val df = edParentDF.getOrElse(spark.createDataFrame(List())).persist(StorageLevel.MEMORY_ONLY).cache()
val bv = DataFramesBuilder.buildPathToRootDF(df, spark, 3)
// val edParentDF = spark.read.parquet(path + "edParentDF").persist(StorageLevel.MEMORY_ONLY).cache()
// val veDF = spark.read.parquet(path + "veDF").persist(StorageLevel.MEMORY_ONLY).cache()
val indexDF = spark.read.parquet(path + "pathToRootDF").persist(StorageLevel.MEMORY_ONLY).cache()
println(IndexDataFramesSearch.getChildren(indexDF, nodeID))
println("success???")
val result = IndexDataFramesSearch.getChildren(indexDF, nodeID)
val nodeFather = new Node(nodeID,DataFramesSearch.findVNameByID(indexDF,nodeID))
println(nodeFather.ID+" : "+nodeFather.Name)
val childs = result.map(i => Node(i,DataFramesSearch.findVNameByID(indexDF,i)))
println(childs)
val tree = new fatherNode(nodeID,DataFramesSearch.findVNameByID(indexDF,nodeID),childs)
val res = tree
val json = Json.toJson(res)
json
}
}
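One small point about the persist calls in this example: persist(StorageLevel.MEMORY_ONLY) followed by cache() is redundant, because cache() is simply persist() with the default level, and Spark only logs a warning that the data is already cached. A single call is enough:

// a single persist call is sufficient; the trailing cache() adds nothing
val indexDF = spark.read.parquet(path + "pathToRootDF").persist(StorageLevel.MEMORY_ONLY)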
Example 5: streaming
// Set the package name and import the required classes
package edu.uchicago.cs.data.client
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
object streaming {
def main(args: Array[String]) {
if (args.length < 2) {
System.err.println("Usage: NetworkWorkCount <hostname> <port>")
System.exit(1)
}
//StreamingExamples.setStreamingLogLevels()
val sparkConf = new SparkConf().setAppName("NetworkWorkCount")
val ssc = new StreamingContext(sparkConf, Seconds(1))
// Create a socket stream on target ip:port and count the
// words in input stream of \n delimited text (e.g. generated by 'nc').
// Note: the storage level is not replicated here because this example runs locally;
// replication would be needed in a distributed deployment for fault tolerance.
val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x,1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
}
}
Example 6: JMSInputDStream
// Set the package name and import the required classes
package com.redhat.spark.streaming.jms
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.receiver.Receiver
private[streaming]
class JMSInputDStream(
@transient ssc_ : StreamingContext,
brokerURL: String,
username: String,
password: String,
queuename: String,
selector: String,
storageLevel: StorageLevel
) extends ReceiverInputDStream[JMSEvent](ssc_) {
override def getReceiver(): Receiver[JMSEvent] = {
new JMSReceiver(brokerURL, username, password, queuename, selector, storageLevel)
}
}
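Since the class is marked private[streaming], user code would normally reach it through a small factory in the same package. The object below is a hypothetical sketch (JMSUtils and its createStream method are not part of the original snippet) showing how such a factory would forward the storage level:

package com.redhat.spark.streaming.jms

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream

// hypothetical helper; only JMSInputDStream above comes from the original code
object JMSUtils {
  def createStream(ssc: StreamingContext,
                   brokerURL: String,
                   username: String,
                   password: String,
                   queuename: String,
                   selector: String = "",
                   storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2): ReceiverInputDStream[JMSEvent] = {
    new JMSInputDStream(ssc, brokerURL, username, password, queuename, selector, storageLevel)
  }
}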
Example 7: AppConfigs
// Set the package name and import the required classes
package com.groupon.dse.configs
import java.util.Properties
import org.apache.spark.storage.StorageLevel
object AppConfigs {
val SparkReceivers = ("spark.num.receivers", "1")
val SparkStorageLevel = ("spark.storage.level", "MEMORY_AND_DISK_SER_2")
val Topics = ("topics", "")
val TopicsBlackList = ("topics.blacklist", "")
val TopicsEnableBlockingConsumption = ("topic.consumption.blocking", "false")
val TopicConsumptionPolicy = ("topic.consumption.policy", "OFFSET")
val TopicConsumptionOffsetThreshold = ("topic.consumption.offset.threshold", "0")
val TopicConsumptionTimeThresholdMs = ("topic.consumption.time.threshold.ms", "1000")
val TopicFetchSizeBytes = ("topic.fetch.size.bytes", "1048576")
val TopicRepartitionFactor = ("topic.repartition.factor", "1")
val TopicStartOffset = ("topic.start.offset", "-1") //-1: Max, -2: Min, Other: Actual offset value
val PartitionRefreshIntervalMs = ("partition.refresh.interval.ms", "30000")
val PartitionWarmUpRefreshIntervalMs = ("partition.warmup.refresh.interval.ms", "10000")
val ReceiverRestIntervalOnFailMs = ("receiver.rest.interval.fail.ms", "2500")
val ReceiverRestIntervalOnSuccessMs = ("receiver.rest.interval.success.ms", "100")
val KafkaBrokerConnect = ("kafka.broker.zk.connect", "")
val KafkaSocketTimeoutMs = ("kafka.socket.timeout", "10000")
val KafkaSocketBufferSizeBytes = ("kafka.socket.buffer.size", "1048576")
val KafkaZkSessionTimeoutMs = ("kafka.zk.session.timeout.ms", "10000")
val KafkaZkConnectionTimeoutMs = ("kafka.zk.connection.timeout.ms", "10000")
val StateControllerType = ("statecontroller.type", "MEMORY")
val ZookeeperStateControllerConnect = ("statecontroller.zk.connect", "")
val ZookeeperStateControllerRoot = ("statecontroller.zk.root", "/baryon")
val ZookeeperStateControllerConnTimeoutMs = ("statecontroller.zk.conn.timeout.ms", "120000")
val ZookeeperStateControllerSessionTimeoutMs = ("statecontroller.zk.session.timeout.ms", "60000")
val TopicFetcherType = ("topics.fetcher.type", "LOCAL")
val HDFSTopicSource = ("topics.fetcher.hdfs.source", "")
val HTTPTopicSource = ("topics.fetcher.http.source", "")
def validatedBooleanConfig(
properties: Properties,
propertyName: String,
propertyDefault: String)
: Boolean = {
properties.getProperty(propertyName, propertyDefault) match {
case "true" => true
case "false" => false
case _ => throw InvalidConfigException(s"$propertyName should be set to true or false")
}
}
case class MissingConfigException(message: String) extends Exception(message)
case class InvalidConfigException(message: String) extends Exception(message)
}
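A quick sketch of validatedBooleanConfig in use, with the (key, default) tuples defined above:

import java.util.Properties
import com.groupon.dse.configs.AppConfigs

// sketch: validate the blocking-consumption flag against its declared default
val props = new Properties()
props.setProperty(AppConfigs.TopicsEnableBlockingConsumption._1, "true")
val blocking = AppConfigs.validatedBooleanConfig(
  props,
  AppConfigs.TopicsEnableBlockingConsumption._1, // "topic.consumption.blocking"
  AppConfigs.TopicsEnableBlockingConsumption._2) // default "false"
// any value other than "true"/"false" raises InvalidConfigException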
Example 8: NetCDF
// Set the package name and import the required classes
package se.kth.climate.fast.netcdf
import se.kth.climate.fast.common.Metadata
import se.kth.climate.fast.netcdf.hadoop._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import com.typesafe.scalalogging.LazyLogging
import scala.concurrent.Future
import scala.concurrent.ExecutionContext
import com.google.gson.Gson
import ucar.nc2.NetcdfFile
object NetCDF extends LazyLogging {
def metaDataAsync(path: String)(implicit sc: SparkContext, ec: ExecutionContext): Future[Metadata] = {
Future {
NetCDF.metaData(path)
}
}
def metaData(path: String)(implicit sc: SparkContext): Metadata = {
val metaSRDD = sc.textFile(path + "/metadata.json", 1);
val metaS = metaSRDD.collect().mkString;
val gson = new Gson();
gson.fromJson(metaS, classOf[Metadata]);
}
def rawData(path: String)(implicit sc: SparkContext): RDD[NetcdfFile] = {
val rdd = sc.newAPIHadoopFile[Void, NCWritable, NetCDFFileFormat](path)
val ncrdd = rdd.map {
case (_, v) => {
val ncfile = v.get;
//ncfile.setImmutable(); // can't write them out, so don't let anyone mutate them
ncfile
}
}
//ncrdd.persist(StorageLevel.MEMORY_ONLY_SER);
ncrdd.cache();
ncrdd
}
}
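The commented-out persist line hints at a trade-off: cache() keeps the deserialized NetcdfFile objects in memory, while a serialized level trades CPU for a smaller footprint. The alternative would look like the line below; it assumes NetcdfFile instances can be handled by the configured serializer, an assumption the original author may not have wanted to rely on:

// serialized alternative to cache(); assumes NetcdfFile is serializable by the configured serializer
ncrdd.persist(StorageLevel.MEMORY_ONLY_SER)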
Example 9: RedditUtils
// Set the package name and import the required classes
package com.github.catalystcode.fortis.spark.streaming.reddit
import com.github.catalystcode.fortis.spark.streaming.reddit.client.RedditClient
import com.github.catalystcode.fortis.spark.streaming.reddit.dto.RedditObject
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
object RedditUtils {
def createPageStream(redditAuth: RedditAuth,
keywords: Seq[String],
ssc: StreamingContext,
storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
pollingPeriodInSeconds: Int = 3,
subredit: Option[String] = None,
searchLimit: Int = 25,
searchResultType: Option[String] = Option("link")
): ReceiverInputDStream[RedditObject] = {
return new RedditInputDStream(
client = new RedditClient(redditAuth.applicationId, redditAuth.secret),
keywords = keywords,
ssc = ssc,
storageLevel = storageLevel,
subredit = subredit,
searchLimit = searchLimit,
searchResultType = searchResultType,
pollingPeriodInSeconds = pollingPeriodInSeconds)
}
}
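A usage sketch for createPageStream; the RedditAuth construction below assumes it is a simple case class carrying the applicationId and secret read in the method body, and all credential values are placeholders:

// usage sketch; RedditAuth shape and credential values are assumptions
val auth = RedditAuth(applicationId = "my-app-id", secret = "my-secret")
val posts = RedditUtils.createPageStream(
  auth,
  keywords = Seq("spark", "scala"),
  ssc = ssc,
  storageLevel = StorageLevel.MEMORY_AND_DISK_SER)
posts.map(_.toString).print()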
Example 10: GraphProviders
// Set the package name and import the required classes
package ml.sparkling.graph.loaders.csv.providers
import ml.sparkling.graph.loaders.csv.types.Types
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession;
import scala.reflect.ClassTag
object GraphProviders {
val defaultStorageLevel=StorageLevel.MEMORY_ONLY
def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
vertexProvider: Row => Seq[(VertexId, VD)],
edgeProvider: Row => Seq[Edge[ED]],
edgeStorageLevel: StorageLevel = defaultStorageLevel,
vertexStorageLevel: StorageLevel =defaultStorageLevel)
(dataFrame: DataFrame): Graph[VD, ED] = {
def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = {
dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => {
rowIterator.flatMap { case row => mappingFunction(row) }
})
}
val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider)
val edges: RDD[Edge[ED]] = mapRows(edgeProvider)
defaultVertex match{
case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel)
case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel)
}
}
def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)],
edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]],
columnsToIndex: Seq[Int],
edgeStorageLevel: StorageLevel = defaultStorageLevel,
vertexStorageLevel: StorageLevel = defaultStorageLevel)
(dataFrame: DataFrame): Graph[VD, ED] = {
val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap
def extractIdFromIndex(vertex: VD) = index(vertex)
simpleGraphBuilder(defaultVertex,
vertexProvider(_: Row, extractIdFromIndex _),
edgeProvider(_: Row, extractIdFromIndex _),
edgeStorageLevel,
vertexStorageLevel)(dataFrame)
}
}
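A minimal sketch of simpleGraphBuilder applied to a DataFrame whose first two columns hold numeric source and destination ids (that column layout is an assumption made for illustration):

import org.apache.spark.graphx.Edge
import org.apache.spark.sql.Row

// sketch: edgesDF is assumed to be a DataFrame whose first two columns are Long vertex ids
val graph = GraphProviders.simpleGraphBuilder[String, Int](
  defaultVertex = Some("unknown"),
  vertexProvider = (row: Row) => Seq((row.getLong(0), s"v${row.getLong(0)}"), (row.getLong(1), s"v${row.getLong(1)}")),
  edgeProvider = (row: Row) => Seq(Edge(row.getLong(0), row.getLong(1), 1)),
  edgeStorageLevel = StorageLevel.MEMORY_AND_DISK,
  vertexStorageLevel = StorageLevel.MEMORY_AND_DISK)(edgesDF)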
Example 11: createStream
// Set the package name and import the required classes
package it.agilelab.bigdata.wasp.consumers.readers
import it.agilelab.bigdata.wasp.core.WaspSystem
import it.agilelab.bigdata.wasp.core.WaspSystem._
import it.agilelab.bigdata.wasp.core.kafka.CheckOrCreateTopic
import it.agilelab.bigdata.wasp.core.logging.WaspLogger
import it.agilelab.bigdata.wasp.core.models.{DefaultConfiguration, TopicModel}
import it.agilelab.bigdata.wasp.core.utils.{AvroToJsonUtil, ConfigManager, JsonToByteArrayUtil}
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
//TODO: check warning (not understood)
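// NOTE: this snippet was extracted without its enclosing object declaration (and the logger it references);
// the closing brace at the end of the example belongs to that omitted object.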
def createStream(group: String, topic: TopicModel)(implicit ssc: StreamingContext): DStream[String] = {
val kafkaConfig = ConfigManager.getKafkaConfig
val kafkaConfigMap: Map[String, String] = Map(
"zookeeper.connect" -> kafkaConfig.zookeeper.toString,
"zookeeper.connection.timeout.ms" -> kafkaConfig.zookeeper.timeout.getOrElse(DefaultConfiguration.timeout).toString
)
if (??[Boolean](WaspSystem.getKafkaAdminActor, CheckOrCreateTopic(topic.name, topic.partitions, topic.replicas))) {
val receiver = KafkaUtils.createStream[String, Array[Byte], StringDecoder, DefaultDecoder](
ssc,
kafkaConfigMap + ("group.id" -> group),
Map(topic.name -> 3),
StorageLevel.MEMORY_AND_DISK_2
)
topic.topicDataType match {
case "avro" => receiver.map(x => (x._1, AvroToJsonUtil.avroToJson(x._2))).map(_._2)
case "json" => receiver.map(x => (x._1, JsonToByteArrayUtil.byteArrayToJson(x._2))).map(_._2)
case _ => receiver.map(x => (x._1, AvroToJsonUtil.avroToJson(x._2))).map(_._2)
}
} else {
logger.error(s"Topic not found on Kafka: $topic")
throw new Exception(s"Topic not found on Kafka: $topic")
}
}
}
Example 12: TwitterInputDStream
// Set the package name and import the required classes
package com.aluxian.tweeather.streaming
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import twitter4j.auth.{Authorization, OAuthAuthorization}
import twitter4j.conf.ConfigurationBuilder
import twitter4j.{FilterQuery, Status}
class TwitterInputDStream(@transient ssc: StreamingContext,
twitterAuth: Option[Authorization],
filterQuery: Option[FilterQuery],
storageLevel: StorageLevel
) extends ReceiverInputDStream[Status](ssc) {
private val authorization = twitterAuth.getOrElse(createOAuthAuthorization())
private def createOAuthAuthorization(): Authorization = {
new OAuthAuthorization(new ConfigurationBuilder().build())
}
override def getReceiver(): Receiver[Status] = {
new TwitterReceiver(authorization, filterQuery, storageLevel)
}
}
Example 13: TwitterUtils
// Set the package name and import the required classes
package com.aluxian.tweeather.streaming
import java.util.Properties
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import twitter4j.auth.{AccessToken, Authorization}
import twitter4j.{FilterQuery, Status, TwitterFactory}
object TwitterUtils {
def createMultiStream(ssc: StreamingContext,
queryBuilder: () => FilterQuery = () => null,
credentials: Seq[Authorization] = loadDefaultCredentials(),
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER
): DStream[Status] = {
credentials
.map(auth => createStream(ssc, Some(queryBuilder()), Some(auth)))
.reduce { (accStream, stream) => accStream.union(stream) }
}
private def loadDefaultCredentials(): Seq[Authorization] = {
val props = loadTwitterProperties()
val num = props.getProperty("twitter.credentials").toInt
1.to(num).map(i => {
val twitter = new TwitterFactory().getInstance()
twitter.setOAuthConsumer(
props.getProperty(s"twitter.credentials.$i.consumerKey"),
props.getProperty(s"twitter.credentials.$i.consumerSecret")
)
twitter.setOAuthAccessToken(new AccessToken(
props.getProperty(s"twitter.credentials.$i.token"),
props.getProperty(s"twitter.credentials.$i.tokenSecret")
))
twitter.getAuthorization
})
}
private def loadTwitterProperties(): Properties = {
val properties = new Properties()
val stream = getClass.getResourceAsStream("/com/aluxian/tweeather/res/twitter.properties")
properties.load(stream)
stream.close()
properties
}
}
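A usage sketch for createMultiStream, assuming the twitter.properties resource described in loadTwitterProperties is on the classpath and reusing the ssc from the earlier examples:

import twitter4j.FilterQuery

// sketch: one receiver per credential, union-ed into a single DStream of statuses
val tweets = TwitterUtils.createMultiStream(
  ssc,
  queryBuilder = () => new FilterQuery().track("spark", "scala"),
  storageLevel = StorageLevel.MEMORY_AND_DISK_SER)
tweets.map(_.getText).print()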
Example 14: FacebookPostReceiver
// Set the package name and import the required classes
package com.github.catalystcode.fortis.spark.streaming.facebook
import java.util.Date
import com.github.catalystcode.fortis.spark.streaming.facebook.client.FacebookPageClient
import com.github.catalystcode.fortis.spark.streaming.facebook.dto.FacebookPost
import com.github.catalystcode.fortis.spark.streaming.{PollingReceiver, PollingSchedule}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
private class FacebookPostReceiver(
clients: Set[FacebookPageClient],
pollingSchedule: PollingSchedule,
storageLevel: StorageLevel,
pollingWorkers: Int
) extends PollingReceiver[FacebookPost](pollingSchedule, pollingWorkers, storageLevel) with Logger {
@volatile private var lastIngestedDate: Option[Date] = None
override protected def poll(): Unit = {
clients.par.foreach(_
.loadNewFacebookPosts(lastIngestedDate)
.filter(x => {
logDebug(s"Got facebook ${x.post.getPermalinkUrl} from page ${x.pageId} time ${x.post.getCreatedTime}")
isNew(x)
})
.foreach(x => {
logInfo(s"Storing facebook ${x.post.getPermalinkUrl}")
store(x)
markStored(x)
})
)
}
private def isNew(item: FacebookPost) = {
lastIngestedDate.isEmpty || item.post.getCreatedTime.after(lastIngestedDate.get)
}
private def markStored(item: FacebookPost): Unit = {
if (isNew(item)) {
lastIngestedDate = Some(item.post.getCreatedTime)
logDebug(s"Updating last ingested date to ${item.post.getCreatedTime}")
}
}
}
class FacebookPostInputDStream(
ssc: StreamingContext,
clients: Set[FacebookPageClient],
pollingSchedule: PollingSchedule,
pollingWorkers: Int,
storageLevel: StorageLevel
) extends ReceiverInputDStream[FacebookPost](ssc) {
override def getReceiver(): Receiver[FacebookPost] = {
logDebug("Creating facebook receiver")
new FacebookPostReceiver(clients, pollingSchedule, storageLevel, pollingWorkers)
}
}
Example 15: FacebookCommentsReceiver
// Set the package name and import the required classes
package com.github.catalystcode.fortis.spark.streaming.facebook
import java.util.Date
import com.github.catalystcode.fortis.spark.streaming.facebook.client.FacebookPageClient
import com.github.catalystcode.fortis.spark.streaming.facebook.dto.FacebookComment
import com.github.catalystcode.fortis.spark.streaming.{PollingReceiver, PollingSchedule}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
private class FacebookCommentsReceiver(
clients: Set[FacebookPageClient],
pollingSchedule: PollingSchedule,
storageLevel: StorageLevel,
pollingWorkers: Int
) extends PollingReceiver[FacebookComment](pollingSchedule, pollingWorkers, storageLevel) with Logger {
@volatile private var lastIngestedDate: Option[Date] = None
override protected def poll(): Unit = {
clients.par.foreach(_
.loadNewFacebookComments(lastIngestedDate)
.filter(x => {
logDebug(s"Got comment with id ${x.comment.getId} from page ${x.pageId}")
isNew(x)
})
.foreach(x => {
logInfo(s"Storing comment ${x.comment.getId} from page ${x.pageId}")
store(x)
markStored(x)
})
)
}
private def isNew(item: FacebookComment) = {
lastIngestedDate.isEmpty || item.comment.getCreatedTime.after(lastIngestedDate.get)
}
private def markStored(item: FacebookComment): Unit = {
if (isNew(item)) {
lastIngestedDate = Some(item.comment.getCreatedTime)
logDebug(s"Updating last ingested date to ${lastIngestedDate.get}")
}
}
}
class FacebookCommentsInputDStream(
ssc: StreamingContext,
clients: Set[FacebookPageClient],
pollingSchedule: PollingSchedule,
pollingWorkers: Int,
storageLevel: StorageLevel
) extends ReceiverInputDStream[FacebookComment](ssc) {
override def getReceiver(): Receiver[FacebookComment] = {
logDebug("Creating facebook receiver")
new FacebookCommentsReceiver(clients, pollingSchedule, storageLevel, pollingWorkers)
}
}