This article collects typical usage examples of the Java method org.apache.spark.streaming.api.java.JavaStreamingContext.checkpoint. If you have been wondering what JavaStreamingContext.checkpoint does, how exactly it is used, or what real code that calls it looks like, the hand-picked examples below should help. You can also explore further usage examples of the enclosing class, org.apache.spark.streaming.api.java.JavaStreamingContext.
The following 14 code examples of JavaStreamingContext.checkpoint are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
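Before diving into the examples, here is a minimal sketch of the pattern they all share: JavaStreamingContext.checkpoint points the streaming context at a durable directory and must be called before the context is started, and before any stateful or windowed operation (mapWithState, updateStateByKey, reduceByKeyAndWindow with an inverse function) that requires checkpointing. The application name, checkpoint path, host, and port below are placeholders chosen for illustration, not values taken from the projects quoted later.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class CheckpointSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("CheckpointSketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // Enable checkpointing before start() and before any operation that needs it.
    // Use a fault-tolerant file system (HDFS, S3) in production; a local path is fine for testing.
    jssc.checkpoint("/tmp/spark-checkpoint"); // placeholder path
    JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999); // placeholder source
    // reduceByKeyAndWindow with an inverse function requires the checkpoint set above.
    JavaPairDStream<String, Integer> counts = lines
        .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKeyAndWindow((a, b) -> a + b, (a, b) -> a - b,
            Durations.seconds(30), Durations.seconds(10));
    counts.print();
    jssc.start();
    jssc.awaitTermination();
  }
}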
Example 1: checkpoint
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
private void checkpoint(JavaStreamingContext jssc, CheckpointDir checkpointDir) {
  Path rootCheckpointPath = checkpointDir.getRootCheckpointDir();
  Path sparkCheckpointPath = checkpointDir.getSparkCheckpointDir();
  Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();
  try {
    FileSystem fileSystem =
        rootCheckpointPath.getFileSystem(jssc.sparkContext().hadoopConfiguration());
    if (!fileSystem.exists(rootCheckpointPath)) {
      fileSystem.mkdirs(rootCheckpointPath);
    }
    if (!fileSystem.exists(sparkCheckpointPath)) {
      fileSystem.mkdirs(sparkCheckpointPath);
    }
    if (!fileSystem.exists(beamCheckpointPath)) {
      fileSystem.mkdirs(beamCheckpointPath);
    }
  } catch (IOException e) {
    throw new RuntimeException("Failed to create checkpoint dir", e);
  }
  jssc.checkpoint(sparkCheckpointPath.toString());
}
Example 2: processMQTT
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
/**
 * Starts the Spark stream that reads from the MQTT queue.
 *
 * @param broker     MQTT broker URL
 * @param topic      MQTT topic name
 * @param numSeconds batch interval, in seconds
 */
public void processMQTT(final String broker, final String topic, final int numSeconds) {
  LOG.info("************ SparkStreamingMQTTOutside.processMQTT start");
  // Create the Spark application and set the name to MQTT
  SparkConf sparkConf = new SparkConf().setAppName("MQTT");
  // Create the Spark streaming context with a 'numSeconds'-second batch size
  jssc = new JavaStreamingContext(sparkConf, Durations.seconds(numSeconds));
  jssc.checkpoint(checkpointDirectory);
  LOG.info("************ SparkStreamingMQTTOutside.processMQTT about to read the MQTTUtils.createStream");
  // 2. Use MQTTUtils to collect MQTT messages
  JavaReceiverInputDStream<String> messages = MQTTUtils.createStream(jssc, broker, topic);
  LOG.info("************ SparkStreamingMQTTOutside.processMQTT about to do foreachRDD");
  // Process the messages on the queue and save them to the database
  messages.foreachRDD(new SaveRDD());
  LOG.info("************ SparkStreamingMQTTOutside.processMQTT prior to context.start");
  // Start the context
  jssc.start();
  jssc.awaitTermination();
}
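The method above relies on instance fields (jssc, checkpointDirectory, LOG) defined elsewhere in its class, so invoking it is a one-liner once the object exists. A minimal usage sketch, assuming a usable constructor; the driver class name, broker URL, topic, and batch interval are placeholders, not values from the original project.

// Hypothetical driver; SparkStreamingMQTTOutside is the class the log messages above refer to,
// but its constructor and checkpointDirectory wiring are assumed here for illustration.
public class MqttStreamDriver {
  public static void main(String[] args) {
    SparkStreamingMQTTOutside streamer = new SparkStreamingMQTTOutside();
    // tcp://host:port is the usual MQTT broker URL scheme; 10-second batches here.
    streamer.processMQTT("tcp://localhost:1883", "sensors/temperature", 10);
  }
}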
Example 3: sparkInit
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public void sparkInit() {
  PropertyConfigurator.configure(SparkTwitterStreaming.class.getClassLoader().getResource("log4j.properties"));
  // note: import org.apache.log4j.Logger;
  // note: import org.apache.log4j.Level;
  // Logger.getLogger("org").setLevel(Level.WARN);
  // Logger.getLogger("akka").setLevel(Level.WARN);
  // Set up the Spark streaming context
  ssc = new JavaStreamingContext(
      "local[2]", "SparkTwitterStreamingJava",
      new Duration(1000), System.getenv("SPARK_HOME"),
      JavaStreamingContext.jarOfClass(SparkTwitterStreaming.class));
  // HDFS directory for checkpointing
  /*
   * checkpoint saves the RDD to an HDFS file
   * http://apache-spark-user-list.1001560.n3.nabble.com/checkpoint-and-not-running-out-of-disk-space-td1525.html
   * dfs.namenode.checkpoint.dir -> hdfs-site.xml
   */
  // String checkpointDir = TutorialHelper.getHdfsUrl() + "/checkpoint/";
  String checkpointDir = "file:///Users/feiyu/workspace/Hadoop/hdfs/namesecondary/checkpoint";
  ssc.checkpoint(checkpointDir);
}
Example 4: setup
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
@Override
public void setup() {
  // Create a StreamingContext with a SparkConf configuration
  SparkConf sparkConf = new SparkConf(false)
      .setAppName("JaiSpark")
      .setSparkHome("target/sparkhome")
      .setMaster("local")
      .set("spark.executor.memory", "128m")
      .set("spark.local.dir",
          new File("target/sparkhome/tmp").getAbsolutePath())
      .set("spark.cores.max", "2").set("spark.akka.threads", "2")
      .set("spark.akka.timeout", "60").set("spark.logConf", "true")
      .set("spark.cleaner.delay", "3700")
      .set("spark.cleaner.ttl", "86400")
      .set("spark.shuffle.spill", "false")
      .set("spark.driver.host", "localhost")
      .set("spark.driver.port", "43214");
  jssc = new JavaStreamingContext(sparkConf, new Duration(5000));
  String checkpointDir = hadoopClusterService.getHDFSUri()
      + "/sparkcheckpoint";
  jssc.checkpoint(checkpointDir);
  startFlumeStream();
}
Example 5: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
  System.setProperty("hadoop.home.dir", "E:\\hadoop");
  SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  streamingContext.checkpoint("E:\\hadoop\\checkpoint");
  // Initial state RDD input to mapWithState
  @SuppressWarnings("unchecked")
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream("10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1)).reduceByKey((count1, count2) -> count1 + count2);
  // Cumulative count update function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
      new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
        @Override
        public Tuple2<String, Integer> call(String word, Optional<Integer> one,
            State<Integer> state) {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          Tuple2<String, Integer> output = new Tuple2<>(word, sum);
          state.update(sum);
          return output;
        }
      };
  // DStream of cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));
  stateDstream.print();
  streamingContext.start();
  streamingContext.awaitTermination();
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 40, Source: WordCountSocketStateful.java
Example 6: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws InterruptedException {
  System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
  SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example")
      .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();
  JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
      Durations.milliseconds(1000));
  JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
  jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");
  JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readValue(x, FlightDetails.class);
  });
  JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream
      .mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f));
  Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc =
      (flightId, curFlightDetail, state) -> {
        List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>();
        boolean isLanded = false;
        if (curFlightDetail.isPresent()) {
          details.add(curFlightDetail.get());
          if (curFlightDetail.get().isLanded()) {
            isLanded = true;
          }
        }
        Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0);
        if (isLanded) {
          state.remove();
        } else {
          state.update(details);
        }
        return new Tuple2<String, Double>(flightId, avgSpeed);
      };
  JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream
      .mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));
  streamWithState.print();
  jssc.start();
  jssc.awaitTermination();
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 52, Source: StateFulProcessingExample.java
Example 7: createContext
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
  SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
  JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  streamingContext.checkpoint(checkpointDirectory);
  // Initial state RDD input to mapWithState
  @SuppressWarnings("unchecked")
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip, port, StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
      .reduceByKey((count1, count2) -> count1 + count2);
  // Cumulative count update function
  Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
    @Override
    public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
      int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
      Tuple2<String, Integer> output = new Tuple2<>(word, sum);
      state.update(sum);
      return output;
    }
  };
  // DStream of cumulative counts that get updated in every batch
  JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
      .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));
  stateDstream.print();
  return streamingContext;
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 35, Source: WordCountRecoverableEx.java
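The createContext factory above only pays off when the driver obtains its context through JavaStreamingContext.getOrCreate, which rebuilds the context and its DStream graph from the checkpoint directory after a restart and invokes the factory only when no checkpoint exists yet. Below is a minimal sketch of that wiring, assumed to sit in the same class as createContext; the host, port, and checkpoint path are placeholders, not values from the original project.

// Hypothetical entry point for Example 7; the concrete values are placeholders.
public static void main(String[] args) throws InterruptedException {
  final String ip = "localhost";
  final int port = 9000;
  final String checkpointDirectory = "/tmp/wordcount-checkpoint";
  // Recover from an existing checkpoint, or call the factory to build a fresh context.
  JavaStreamingContext streamingContext = JavaStreamingContext.getOrCreate(checkpointDirectory,
      () -> createContext(ip, port, checkpointDirectory));
  streamingContext.start();
  streamingContext.awaitTermination();
}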
Example 8: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("SparkJoinTest");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));
  ssc.checkpoint("checkpoint");
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream("127.0.0.1", 9999);
  JavaPairDStream<String, Long> nameStream = lines.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String l) throws Exception {
      return Arrays.asList(l.split(" "));
    }
  }).mapToPair(new PairFunction<String, String, Long>() {
    public Tuple2<String, Long> call(String w) throws Exception {
      return new Tuple2<>(w, 1L);
    }
  }).window(Durations.seconds(30), Durations.seconds(10));
  JavaReceiverInputDStream<String> lines2 = ssc.socketTextStream("127.0.0.1", 9998);
  JavaPairDStream<String, Long> nameAgeStream = lines2.mapToPair(new PairFunction<String, String, Long>() {
    @Override
    public Tuple2<String, Long> call(String s) throws Exception {
      String[] list = s.split(" ");
      String name = list[0];
      long age = 0L;
      if (list.length > 1)
        age = Long.parseLong(list[1]);
      return new Tuple2<String, Long>(name, age);
    }
  }).window(Durations.seconds(11), Durations.seconds(11));
  // nameStream.print();
  // nameAgeStream.print();
  JavaPairDStream<String, Tuple2<Long, Long>> joinedStream = nameStream.join(nameAgeStream);
  joinedStream.print();
  ssc.start();
  ssc.awaitTermination();
}
Example 9: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) {
  // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");
  //Logger rootLogger = LogManager.getRootLogger();
  //rootLogger.setLevel(Level.WARN);
  SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
  streamingContext.checkpoint("E:\\hadoop\\checkpoint");
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);
  List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
  JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);
  JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream("10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1)).reduceByKey((count1, count2) -> count1 + count2);
  wordCounts.print();
  wordCounts.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  wordCounts.window(Durations.minutes(8), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  wordCounts.window(Durations.minutes(12), Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  wordCounts.window(Durations.minutes(2), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  wordCounts.window(Durations.minutes(12), Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  // Comment out the next two operations to make the job run: their window/slide durations are not multiples of the 2-minute batch interval
  wordCounts.window(Durations.minutes(5), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  wordCounts.window(Durations.minutes(10), Durations.minutes(1)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  streamingContext.start();
  try {
    streamingContext.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 51, Source: WindowBatchInterval.java
Example 10: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) {
  // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");
  //Logger rootLogger = LogManager.getRootLogger();
  //rootLogger.setLevel(Level.WARN);
  SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
  streamingContext.checkpoint("E:\\hadoop\\checkpoint");
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);
  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", StringDeserializer.class);
  kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
  kafkaParams.put("auto.offset.reset", "latest");
  // kafkaParams.put("enable.auto.commit", false);
  Collection<String> topics = Arrays.asList("mytopic", "anothertopic");
  final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(),
      ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
  JavaPairDStream<String, String> pairRDD = stream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
  pairRDD.foreachRDD(pRDD -> { pRDD.foreach(tuple -> System.out.println(new Date() + " :: Kafka msg key ::" + tuple._1() + " the val is ::" + tuple._2())); });
  JavaDStream<String> tweetRDD = pairRDD.map(x -> x._2()).map(new TweetText());
  tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " :: " + x)));
  JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt -> Arrays.stream(twt.split(" ")).filter(str -> str.contains("#")).collect(Collectors.toList()).iterator());
  hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(x)));
  JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();
  cntByVal.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The count tag is ::" + x._1() + " and the val is ::" + x._2())));
  /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30))
      .countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30))
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println("The window&count tag is ::" + x._1() + " and the val is ::" + x._2())));
  */
  hashtagRDD.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(8), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(12), Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(2), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  hashtagRDD.window(Durations.minutes(12), Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));
  /* hashtagRDD.window(Durations.minutes(5), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2()))); */
  /* hashtagRDD.window(Durations.minutes(10), Durations.minutes(1)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2()))); */
  streamingContext.start();
  try {
    streamingContext.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Example 11: setUp
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
@Before
public void setUp() {
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("TestingSparkStreaming");
  jssc = new JavaStreamingContext(conf, new Duration(1000));
  jssc.checkpoint("checkpoint");
}
Example 12: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
  // Read Spark and Cassandra properties and create SparkConf
  Properties prop = PropertyFileReader.readPropertyFile();
  SparkConf conf = new SparkConf()
      .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
      .setMaster(prop.getProperty("com.iot.app.spark.master"))
      .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
      .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
      .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));
  // Batch interval of 5 seconds for the incoming stream
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
  // Add the checkpoint directory
  jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));
  // Read and set Kafka properties
  Map<String, String> kafkaParams = new HashMap<String, String>();
  kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
  kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
  String topic = prop.getProperty("com.iot.app.kafka.topic");
  Set<String> topicsSet = new HashSet<String>();
  topicsSet.add(topic);
  // Create the direct Kafka stream
  JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
      jssc,
      String.class,
      IoTData.class,
      StringDecoder.class,
      IoTDataDecoder.class,
      kafkaParams,
      topicsSet
  );
  logger.info("Starting Stream Processing");
  // We need the non-filtered stream for POI traffic data calculation
  JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());
  // We need the filtered stream for total and window traffic data calculation
  JavaPairDStream<String, IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String, IoTData>(iot.getVehicleId(), iot)).reduceByKey((a, b) -> a);
  // Check whether a vehicle id has already been processed
  JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData, Boolean>> iotDStreamWithStatePairs = iotDataPairStream
      .mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600))); // maintain state for one hour
  // Filter out processed vehicle ids and keep the un-processed ones
  JavaDStream<Tuple2<IoTData, Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
      .filter(tuple -> tuple._2.equals(Boolean.FALSE));
  // Get the stream of IoTData
  JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);
  // Cache the stream as it is used in both total and window-based computation
  filteredIotDataStream.cache();
  // Process data
  IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
  iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
  iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);
  // POI data
  POIData poiData = new POIData();
  poiData.setLatitude(33.877495);
  poiData.setLongitude(-95.50238);
  poiData.setRadius(30); // 30 km
  // Broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
  Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues = jssc.sparkContext().broadcast(new Tuple3<>(poiData, "Route-37", "Truck"));
  // Call the method to process the stream
  iotTrafficProcessor.processPOIData(nonFilteredIotDataStream, broadcastPOIValues);
  // Start the context
  jssc.start();
  jssc.awaitTermination();
}
Example 13: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Stateful Network Word Count");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));
  ssc.checkpoint("checkpoint");
  ssc.addStreamingListener(new PerformanceStreamingListener());
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream("127.0.0.1", 9999);
  JavaPairDStream<String, Long> wordCounts = lines.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String l) throws Exception {
      return Arrays.asList(l.split(" "));
    }
  }).mapToPair(new PairFunction<String, String, Long>() {
    public Tuple2<String, Long> call(String w) throws Exception {
      return new Tuple2<>(w, 1L);
    }
  })
      .reduceByKey(new Function2<Long, Long, Long>() {
        @Override
        public Long call(Long aLong, Long aLong2) throws Exception {
          return aLong + aLong2;
        }
      })
      .updateStateByKey(new Function2<List<Long>, Optional<Long>, Optional<Long>>() {
        public Optional<Long> call(List<Long> values, Optional<Long> state) throws Exception {
          if (values == null || values.isEmpty()) {
            return state;
          }
          long sum = 0L;
          for (Long v : values) {
            sum += v;
          }
          return Optional.of(state.or(0L) + sum);
        }
      });
  // .updateStateByKey(new Function2<List<Iterable<Long>>, Optional<Long>, Optional<Long>>() {
  //   @Override
  //   public Optional<Long> call(List<Iterable<Long>> iterables, Optional<Long> longOptional) throws Exception {
  //     if (iterables == null || iterables.isEmpty()) {
  //       return longOptional;
  //     }
  //     long sum = 0L;
  //     for (Iterable<Long> iterable : iterables) {
  //       for (Long l : iterable)
  //         sum += l;
  //     }
  //     return Optional.of(longOptional.or(0L) + sum);
  //   }
  // });
  wordCounts.print();
  wordCounts.foreach(new Function2<JavaPairRDD<String, Long>, Time, Void>() {
    @Override
    public Void call(JavaPairRDD<String, Long> stringLongJavaPairRDD, Time time) throws Exception {
      return null;
    }
  });
  ssc.start();
  ssc.awaitTermination();
}
Example 14: main
import org.apache.spark.streaming.api.java.JavaStreamingContext; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
  Flags.setFromCommandLineArgs(THE_OPTIONS, args);
  // Start up the Spark conf.
  SparkConf conf = new SparkConf()
      .setAppName("A Databricks Reference Application: Logs Analysis with Spark");
  JavaStreamingContext jssc = new JavaStreamingContext(conf,
      Flags.getInstance().getSlideInterval());
  // Checkpointing must be enabled to use the updateStateByKey function and windowed operations.
  jssc.checkpoint(Flags.getInstance().getCheckpointDirectory());
  // This method monitors a directory for new files to read in for streaming.
  JavaDStream<String> logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory());
  JavaDStream<ApacheAccessLog> accessLogsDStream
      = logData.map(new Functions.ParseFromLogLine()).cache();
  final LogAnalyzerTotal logAnalyzerTotal = new LogAnalyzerTotal();
  final LogAnalyzerWindowed logAnalyzerWindowed = new LogAnalyzerWindowed();
  // Process the DStream which gathers stats for all of time.
  logAnalyzerTotal.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
  // Calculate statistics for the last time interval.
  logAnalyzerWindowed.processAccessLogs(Flags.getInstance().getOutputDirectory(), accessLogsDStream);
  // Render the output each time there is a new RDD in the accessLogsDStream.
  final Renderer renderer = new Renderer();
  accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
    public Void call(JavaRDD<ApacheAccessLog> rdd) {
      // Call this to output the stats.
      try {
        renderer.render(logAnalyzerTotal.getLogStatistics(),
            logAnalyzerWindowed.getLogStatistics());
      } catch (Exception e) {
      }
      return null;
    }
  });
  // Start the streaming server.
  jssc.start();            // Start the computation
  jssc.awaitTermination(); // Wait for the computation to terminate
}