This article collects and summarizes typical usage examples of the Java class org.apache.spark.streaming.api.java.JavaPairInputDStream. If you have been wondering what exactly JavaPairInputDStream is for and how to use it, the curated class examples below may help.
JavaPairInputDStream belongs to the package org.apache.spark.streaming.api.java. Fifteen code examples of the class are listed below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
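Before diving into the project examples, here is a minimal, self-contained sketch (editorial, not taken from any of the projects below) of the simplest way to obtain a JavaPairInputDStream: monitoring a directory with JavaStreamingContext.fileStream. The application name and the /tmp/watched-dir path are placeholders.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class FileStreamSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("pair-input-dstream-sketch").setMaster("local[*]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(5000));
        // fileStream returns a key/value input stream typed after the Hadoop InputFormat
        JavaPairInputDStream<LongWritable, Text> lines = jssc.fileStream(
            "/tmp/watched-dir", LongWritable.class, Text.class, TextInputFormat.class);
        lines.map(t -> t._2().toString()).print();
        jssc.start();
        jssc.awaitTermination();
    }
}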
Example 1: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName("kafka-sandbox")
.setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
Set<String> topics = Collections.singleton("mytopic");
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", "localhost:9092");
JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
directKafkaStream.foreachRDD(rdd -> {
System.out.println("--- New RDD with " + rdd.partitions().size()
+ " partitions and " + rdd.count() + " records");
rdd.foreach(record -> System.out.println(record._2));
});
ssc.start();
ssc.awaitTermination();
}
Example 2: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) throws InterruptedException {
SparkConf sc = new SparkConf().setAppName("POC-Kafka-New");
try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
Collections.singleton(EXAMPLE_TOPIC));
JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));
jsc.start();
jsc.awaitTermination();
}
}
Example 3: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) throws InterruptedException, IOException {
SparkConf sc = new SparkConf().setAppName("POC-BigQuery");
try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {
JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), Collections.singleton(EXAMPLE_TOPIC));
Configuration conf = new Configuration();
BigQueryConfiguration.configureBigQueryOutput(conf, BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA);
conf.set("mapreduce.job.outputformat.class", BigQueryOutputFormat.class.getName());
JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
records.foreachRDD(rdd -> {
System.out.printf("Amount of XMLs: %d\n", rdd.count());
long time = System.currentTimeMillis();
rdd.mapToPair(new PrepToBQ()).saveAsNewAPIHadoopDataset(conf);
System.out.printf("Sent to BQ in %fs\n", (System.currentTimeMillis()-time)/1000f);
});
jsc.start();
jsc.awaitTermination();
}
}
Example 4: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) throws IOException {
Flags.setFromCommandLineArgs(THE_OPTIONS, args);
// Initialize the Spark conf.
SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
SQLContext sqlContext = new SQLContext(sc);
// Initialize parameters
HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());
// Get data from the Kafka stream
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = 5266880065425088203L;
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
List<ApacheAccessLog> list = new ArrayList<>();
try {
// Map each line
list.add(ApacheAccessLog.parseFromLogLine(line));
return list;
} catch (RuntimeException e) {
return list;
}
}).cache();
accessLogsDStream.foreachRDD(rdd -> {
// rdd to DataFrame
DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
// Write out Parquet files
df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());
return null;
});
// Start the Streaming server
jssc.start(); // Start the computation
jssc.awaitTermination(); // Wait for termination
}
Example 5: kafka
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
private static <K, V> TransformEvaluator<KafkaIO.Read.Unbound<K, V>> kafka() {
return new TransformEvaluator<KafkaIO.Read.Unbound<K, V>>() {
@Override
public void evaluate(KafkaIO.Read.Unbound<K, V> transform, EvaluationContext context) {
StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
JavaStreamingContext jssc = sec.getStreamingContext();
Class<K> keyClazz = transform.getKeyClass();
Class<V> valueClazz = transform.getValueClass();
Class<? extends Decoder<K>> keyDecoderClazz = transform.getKeyDecoderClass();
Class<? extends Decoder<V>> valueDecoderClazz = transform.getValueDecoderClass();
Map<String, String> kafkaParams = transform.getKafkaParams();
Set<String> topics = transform.getTopics();
JavaPairInputDStream<K, V> inputPairStream = KafkaUtils.createDirectStream(jssc, keyClazz,
valueClazz, keyDecoderClazz, valueDecoderClazz, kafkaParams, topics);
JavaDStream<WindowedValue<KV<K, V>>> inputStream =
inputPairStream.map(new Function<Tuple2<K, V>, KV<K, V>>() {
@Override
public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
return KV.of(t2._1(), t2._2());
}
}).map(WindowingHelpers.<KV<K, V>>windowFunction());
sec.setStream(transform, inputStream);
}
};
}
Example 6: create
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public JavaStreamingContext create() {
sparkConf.set("spark.streaming.kafka.maxRatePerPartition", String.valueOf(maxRatePerPartition));
JavaStreamingContext result = new JavaStreamingContext(sparkConf, new Duration(duration));
Map<String, String> props = new HashMap<>();
if (!autoOffsetValue.isEmpty()) {
props.put(AbstractStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
}
logMessage("topic list " + topic, isRunningInMesos);
logMessage("Auto offset reset is set to " + autoOffsetValue, isRunningInMesos);
props.putAll(extraKafkaConfigs);
for (Map.Entry<String, String> map : props.entrySet()) {
logMessage(Utils.format("Adding extra kafka config, {}:{}", map.getKey(), map.getValue()), isRunningInMesos);
}
props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
JavaPairInputDStream<byte[], byte[]> dStream;
if (offsetHelper.isSDCCheckPointing()) {
JavaInputDStream stream =
KafkaUtils.createDirectStream(
result,
byte[].class,
byte[].class,
Tuple2.class,
props,
MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions),
MESSAGE_HANDLER_FUNCTION
);
ClassTag<byte[]> byteClassTag = scala.reflect.ClassTag$.MODULE$.apply(byte[].class);
dStream = JavaPairInputDStream.fromInputDStream(stream.inputDStream(), byteClassTag, byteClassTag);
} else {
dStream =
KafkaUtils.createDirectStream(result, byte[].class, byte[].class,
props, new HashSet<>(Arrays.asList(topic.split(","))));
}
Driver$.MODULE$.foreach(dStream.dstream(), MaprStreamsOffsetManagerImpl.get());
return result;
}
Example 7: startNewStream
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
private JavaPairDStream<String, String> startNewStream(JavaStreamingContext jsc) {
JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
ImmutableMap.of("metadata.broker.list", kafka, "auto.offset.reset", "smallest"),
Collections.singleton(topic));
return stream.transformToPair(new ToPairWithOffsets<>(tuple -> tuple._2()));
}
Example 8: startNewStream
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
private static JavaPairDStream<String, String> startNewStream(JavaStreamingContext jsc) {
JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(jsc, String.class, String.class,
StringDecoder.class, StringDecoder.class, Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
Collections.singleton(EXAMPLE_TOPIC));
return stream.transformToPair(new ToPairWithOffset<>(tuple -> tuple._2()));
}
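The ToPairWithOffsets / ToPairWithOffset helpers used in Examples 7 and 8 are not shown in these excerpts. As a rough, hypothetical stand-in for what such a transform can do, the function below reads the Kafka offset ranges of each batch through the HasOffsetRanges interface implemented by the RDDs of the direct stream, logs them, and passes the pairs through unchanged; the real helpers in those projects presumably fold the offsets into the output instead. It would be plugged in via stream.transformToPair(new LogOffsetsTransform()).

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.OffsetRange;

// Hypothetical stand-in for ToPairWithOffsets: logs each partition's offset range and
// returns the batch unchanged.
class LogOffsetsTransform implements Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>> {
    private static final long serialVersionUID = 1L;

    @Override
    public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) {
        // RDDs produced by KafkaUtils.createDirectStream implement HasOffsetRanges.
        OffsetRange[] ranges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
        for (OffsetRange range : ranges) {
            System.out.println(range.topic() + "-" + range.partition()
                + ": " + range.fromOffset() + " .. " + range.untilOffset());
        }
        return rdd;
    }
}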
Example 9: processStream
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
System.out.println("--> Processing stream");
Map<String, String> props = new HashMap<>();
props.put("bootstrap.servers", "localhost:9092");
props.put("schema.registry.url", "http://localhost:8081");
props.put("group.id", "spark");
props.put("specific.avro.reader", "true");
props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));
JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);
stream.foreachRDD(rdd -> {
rdd.foreachPartition(iterator -> {
while (iterator.hasNext()) {
Tuple2<String, Object> next = iterator.next();
Model model = (Model) next._2();
System.out.println(next._1() + " --> " + model);
}
});
});
}
Example 10: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName("kafka-sandbox")
.setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
Set<String> topics = Collections.singleton("mytopic");
Map<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", "localhost:9092");
JavaPairInputDStream<String, byte[]> directKafkaStream = KafkaUtils.createDirectStream(ssc,
String.class, byte[].class, StringDecoder.class, DefaultDecoder.class, kafkaParams, topics);
directKafkaStream
.map(message -> recordInjection.invert(message._2).get())
.foreachRDD(rdd -> {
rdd.foreach(record -> {
System.out.println("str1= " + record.get("str1")
+ ", str2= " + record.get("str2")
+ ", int1=" + record.get("int1"));
});
});
ssc.start();
ssc.awaitTermination();
}
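The recordInjection field used in Example 10 is defined outside the excerpt. It is presumably a Twitter Bijection Injection<GenericRecord, byte[]> built from an Avro schema; a hedged sketch, with a hypothetical schema whose fields match the str1/str2/int1 reads above, could look like this:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;

// Hypothetical Avro schema matching the fields read in the example (str1, str2, int1).
private static final String USER_SCHEMA = "{"
    + "\"type\":\"record\",\"name\":\"myrecord\",\"fields\":["
    + "{\"name\":\"str1\",\"type\":\"string\"},"
    + "{\"name\":\"str2\",\"type\":\"string\"},"
    + "{\"name\":\"int1\",\"type\":\"int\"}]}";

// invert(bytes) returns a Try<GenericRecord>, hence the .get() call in the example.
private static final Injection<GenericRecord, byte[]> recordInjection =
    GenericAvroCodecs.toBinary(new Schema.Parser().parse(USER_SCHEMA));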
Example 11: stringStreamFromKafkaWithTime
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
@Override
public SparkWorkloadOperator<WithTime<String>> stringStreamFromKafkaWithTime(String zkConStr,
String kafkaServers,
String group,
String topics,
String offset,
String componentId,
int parallelism) {
HashSet<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
HashMap<String, String> kafkaParams = new HashMap<>();
kafkaParams.put("metadata.broker.list", kafkaServers);
kafkaParams.put("auto.offset.reset", offset);
kafkaParams.put("zookeeper.connect", zkConStr);
kafkaParams.put("group.id", group);
// Create direct kafka stream with brokers and topics
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
JavaDStream<WithTime<String>> lines = messages.map(mapFunctionWithTime);
return new SparkWorkloadOperator<>(lines, parallelism);
}
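Neither mapFunctionWithTime nor the WithTime wrapper used in Example 11 is shown here. Assuming WithTime<T> is simply a value paired with the time it was observed (a guess about the benchmark's helper, not its actual definition), the map function might look roughly like:

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Assumption: WithTime<T> offers a (T value, long timestamp) constructor.
private static final Function<Tuple2<String, String>, WithTime<String>> mapFunctionWithTime =
    tuple -> new WithTime<>(tuple._2(), System.currentTimeMillis());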
Example 12: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) {
if (args.length < 2) {
System.err.println("Usage: DirectKafkaWordCount <brokers> <topics>\n" + " <brokers> is a list of one or more Kafka brokers\n"
+ " <topics> is a list of one or more kafka topics to consume from\n\n");
System.exit(1);
}
String brokers = args[0];
String topics = args[1];
// Create context with 2 second batch interval
SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);
// Create direct kafka stream with brokers and topics
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
// Get the lines, split them into words, count the words and print
JavaDStream<String> lines = messages.map(tuple2 -> tuple2._2());
JavaDStream<String> words = lines.flatMap(x -> Lists.newArrayList(SPACE.split(x)));
JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<String, Integer>(s, 1)).reduceByKey(
(i1, i2) -> i1 + i2);
wordCounts.print();
// Start the computation
jssc.start();
jssc.awaitTermination();
}
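SPACE and the Lists helper in Example 12 come from outside the excerpt; in the stock Spark JavaDirectKafkaWordCount example the pattern is just a single-space regex, roughly:

import java.util.regex.Pattern;
import com.google.common.collect.Lists; // Guava, provides Lists.newArrayList used above

private static final Pattern SPACE = Pattern.compile(" ");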
Example 13: getKafkaInputStream
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public JavaPairInputDStream<String, String> getKafkaInputStream(Set<String> topics){
return KafkaUtils.createDirectStream(jssc, String.class, String.class,
StringDecoder.class, StringDecoder.class, // key/value decoders required by the Kafka 0.8 direct-stream API
Config.getConfig().getPrefixedMap("kafka."),
topics);
}
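A hypothetical call site for this helper (the enclosing class, its jssc field, and the Config utility are assumed to exist as in the original project):

// Hypothetical usage: subscribe to two topics and print a few records of each batch.
Set<String> topics = new HashSet<>(Arrays.asList("topicA", "topicB"));
JavaPairInputDStream<String, String> input = getKafkaInputStream(topics);
input.print();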
Example 14: main
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
public static void main(String[] args) throws InterruptedException {
SparkConf sparkConf = new SparkConf()
.setAppName("BenchmarkSpark")
.set("spark.streaming.backpressure.enabled","true")
// uncomment it to set physical limits of processing
// .set("spark.streaming.receiver.maxRate", "10000")
// .set("spark.streaming.kafka.maxRatePerPartition", "10000")
.setMaster("local");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Milliseconds.apply(TIME/2));
// see: http://spark.apache.org/docs/latest/streaming-kafka-integration.html
Set<String> topicMap = new HashSet<>(Arrays.asList(KAFKA_TOPIC));
Map<String, String> kafkaParams = new HashMap<String, String>() {
{
put("metadata.broker.list", "localhost:9092");
//put("auto.offset.reset", "smallest");
put("auto.offset.reset", "largest");
}
};
JavaPairInputDStream<String, String> messages =
KafkaUtils.createDirectStream(jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicMap);
messages
.map(x -> {
Message message = gson.fromJson(x._2(), Message.class);
// time delay emulation
int count = 0;
byte[] array = message.getUid().getBytes();
for (int j = 0; j < delayFactor; j++) {
for (int i = 0; i < array.length; i++) {
if (array[0] == array[i]) count++;
}
}
return new Tuple5<String, Message, Long, Long, Integer>(
x._2(), message, System.currentTimeMillis(), 1L, count);
})
.window(Milliseconds.apply(TIME), Milliseconds.apply(TIME))
.reduce((x1, x2) -> new Tuple5<String, Message, Long, Long, Integer>(
x1._1(),
x1._2(),
x1._3(),
x1._4() + x2._4(),
x1._5()))
.foreachRDD(rdd -> rdd.foreachPartition(
partitionOfRecords -> partitionOfRecords.forEachRemaining(x -> {
System.out.println(
"***************************************************************************"
+ "\nProcessing time: " + Long.toString(System.currentTimeMillis() - x._3())
+ "\nExpected time: " + Long.toString(TIME)
+ "\nProcessed messages: " + Long.toString(x._4())
+ "\nMessage example: " + x._1()
+ "\nRecovered json: " + x._2()
);
}
)
)
);
jssc.start();
jssc.awaitTermination();
}
Example 15: readData
import org.apache.spark.streaming.api.java.JavaPairInputDStream; // import the required package/class
/**
* Method to read in the data from an allowed input format, filter, and return a JavaDStream of MapWritable data elements
*/
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readData() throws IOException, PIRException
{
logger.info("Reading data ");
Job job = Job.getInstance();
String baseQuery = SystemConfiguration.getProperty("pir.baseQuery");
String jobName = "pirSpark_base_" + baseQuery + "_" + System.currentTimeMillis();
job.setJobName(jobName);
job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
job.getConfiguration().set("query", baseQuery);
job.getConfiguration().set("dataSchemaName", qSchema.getDataSchemaName());
job.getConfiguration().set("data.schemas", SystemConfiguration.getProperty("data.schemas"));
// Set the inputFormatClass based upon the baseInputFormat property
String classString = SystemConfiguration.getProperty("pir.baseInputFormat");
Class<? extends BaseInputFormat<Text,MapWritable>> inputClass;
try
{
inputClass = (Class<? extends BaseInputFormat<Text,MapWritable>>) Class.forName(classString);
} catch (ClassNotFoundException | ClassCastException e)
{
throw new PIRException(classString + " cannot be instantiated or does not extend BaseInputFormat", e);
}
job.setInputFormatClass(inputClass);
FileInputFormat.setInputPaths(job, inputData);
// Read data from hdfs
logger.info("useQueueStream = " + useQueueStream);
JavaDStream<MapWritable> mwStream;
if (useQueueStream)
{
Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), inputClass, Text.class, MapWritable.class).values()
.coalesce(numDataPartitions);
rddQueue.add(rddIn);
mwStream = jssc.queueStream(rddQueue);
}
else
{
JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, inputClass);
mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
{
private static final long serialVersionUID = 1L;
@Override
public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
{
return pair.values();
}
}).repartition(numDataPartitions);
}
// Filter out by the provided stopListFile entries
if (qSchema.getFilter() != null)
{
return mwStream.filter(new FilterData(accum, bVars));
}
return mwStream;
}