This article collects typical usage examples of the Java class org.apache.spark.SparkConf. If you have been wondering how the SparkConf class is used in practice, or are looking for concrete SparkConf examples, the curated class examples below may help.
The SparkConf class belongs to the org.apache.spark package. Fifteen code examples of the SparkConf class are shown below, sorted by popularity by default.
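Before the individual examples, here is a minimal sketch of the common pattern, assuming a local master and placeholder names; it only shows how a SparkConf is built and handed to a JavaSparkContext.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkConfSketch {
    public static void main(String[] args) {
        // Build the configuration; the master URL and app name are placeholders.
        SparkConf conf = new SparkConf()
                .setMaster("local[*]")
                .setAppName("SparkConfSketch");
        // Individual settings can also be supplied as key/value pairs.
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        // The configuration is passed to the context that drives the job.
        JavaSparkContext jsc = new JavaSparkContext(conf);
        System.out.println("Running " + jsc.appName() + " against " + jsc.master());
        jsc.stop();
    }
}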
Example 1: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    //jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
    //jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");
    System.out.println(System.getenv("AWS_ACCESS_KEY_ID"));
    JavaRDD<String> textFile = jsc.textFile("s3a://" + "trust" + "/" + "MOCK_DATA.csv");
    // textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>(x, 1))
    //     .reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://" + "trust" + "/" + "out.txt");
    textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator())
            .mapToPair(x -> new Tuple2<>(x, 1))
            .reduceByKey((x, y) -> x + y)
            .saveAsTextFile("s3a://" + "trust" + "/" + "out.txt");
}
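Note that the commented-out lines above set s3n-style credentials while the paths use the s3a scheme. A minimal sketch with the matching s3a keys from hadoop-aws might look as follows; the bucket name and credential values are placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class S3AReadSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("S3A Example");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // s3a credentials go on the Hadoop configuration; the values are placeholders.
        jsc.hadoopConfiguration().set("fs.s3a.access.key", "YOUR_ACCESS_KEY");
        jsc.hadoopConfiguration().set("fs.s3a.secret.key", "YOUR_SECRET_KEY");
        JavaRDD<String> lines = jsc.textFile("s3a://trust/MOCK_DATA.csv");
        System.out.println("Lines read: " + lines.count());
        jsc.stop();
    }
}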
Example 2: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) {
    SparkConf conf = new SparkConf()
            .setAppName("kafka-sandbox")
            .setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));
    Set<String> topics = Collections.singleton("mytopic");
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", "localhost:9092");
    JavaPairInputDStream<String, String> directKafkaStream = KafkaUtils.createDirectStream(ssc,
            String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
    directKafkaStream.foreachRDD(rdd -> {
        System.out.println("--- New RDD with " + rdd.partitions().size()
                + " partitions and " + rdd.count() + " records");
        rdd.foreach(record -> System.out.println(record._2));
    });
    ssc.start();
    ssc.awaitTermination();
}
Example 3: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println(" SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }
    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);
    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
Example 4: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws InterruptedException {
    SparkConf sc = new SparkConf().setAppName("POC-Kafka-New");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
        JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
                jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT),
                Collections.singleton(EXAMPLE_TOPIC));
        JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
        records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));
        jsc.start();
        jsc.awaitTermination();
    }
}
Example 5: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws InterruptedException {
    SparkConf sc = new SparkConf().setAppName("POC-Streaming");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
        //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
        // textFileStream processes files line by line, so each XML document would have to fit on a single line; the alternative is below
        JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
        Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
        rddQueue.add(files);
        JavaDStream<String> records = jsc.queueStream(rddQueue);
        records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));
        jsc.start();
        jsc.awaitTermination();
    }
}
Example 6: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws InterruptedException, IOException {
    SparkConf sc = new SparkConf().setAppName("POC-BigQuery");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(60000))) {
        JavaPairInputDStream<String, String> stream = KafkaUtils.createDirectStream(
                jsc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                Collections.singletonMap("metadata.broker.list", KAFKA_HOST_PORT), Collections.singleton(EXAMPLE_TOPIC));
        Configuration conf = new Configuration();
        BigQueryConfiguration.configureBigQueryOutput(conf, BQ_EXAMPLE_TABLE, BQ_EXAMPLE_SCHEMA);
        conf.set("mapreduce.job.outputformat.class", BigQueryOutputFormat.class.getName());
        JavaDStream<ExampleXML> records = stream.map(t -> t._2()).map(new ParseXML());
        records.foreachRDD(rdd -> {
            System.out.printf("Amount of XMLs: %d\n", rdd.count());
            long time = System.currentTimeMillis();
            rdd.mapToPair(new PrepToBQ()).saveAsNewAPIHadoopDataset(conf);
            System.out.printf("Sent to BQ in %fs\n", (System.currentTimeMillis() - time) / 1000f);
        });
        jsc.start();
        jsc.awaitTermination();
    }
}
Example 7: wordCountJava8
import org.apache.spark.SparkConf; // import the required package/class
public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);
    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y);
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
}
Example 8: test
import org.apache.spark.SparkConf; // import the required package/class
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";
    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd = context.textFile(hdfsPath);
    rdd.foreach((str) -> System.out.println(str));
}
Example 9: Spark
import org.apache.spark.SparkConf; // import the required package/class
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 *
 * @param appName
 *            the name of the app that will be used with this Spark
 *            connection
 * @param database
 *            name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {
    // TODO check what will happen if the same app name is already in use
    this.sparkConfiguration = new SparkConf().setAppName(appName);
    this.javaContext = new JavaSparkContext(sparkConfiguration);
    this.hiveContext = new HiveContext(javaContext);
    // TODO check what kind of exception can be thrown here if there is a
    // problem with the Spark connection
    this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
    // TODO check what kind of exception is thrown if the database already exists
    // use the created database
    this.hiveContext.sql(String.format("USE %s", database));
}
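A hypothetical call site for this constructor, assuming the enclosing class is named Spark as shown above; both arguments are placeholders.

// Hypothetical usage; the app name and database name are placeholders.
Spark spark = new Spark("spark-sql-example", "example_db");
// The constructor has already created and switched to the database,
// so Spark SQL statements issued through this instance run against it.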
Example 10: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: JavaSleep <seconds>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaSleep");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    Integer parallel = sparkConf.getInt("spark.default.parallelism", ctx.defaultParallelism());
    Integer seconds = Integer.parseInt(args[0]);
    Integer[] init_val = new Integer[parallel];
    Arrays.fill(init_val, seconds);
    JavaRDD<Integer> workload = ctx.parallelize(Arrays.asList(init_val), parallel).map(new Function<Integer, Integer>() {
        @Override
        public Integer call(Integer s) throws InterruptedException {
            Thread.sleep(s * 1000);
            return 0;
        }
    });
    List<Integer> output = workload.collect();
    ctx.stop();
}
Example 11: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws IOException {
    Flags.setFromCommandLineArgs(THE_OPTIONS, args);
    // Initialize the Spark conf.
    SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
    SQLContext sqlContext = new SQLContext(sc);
    // Initialize the parameters
    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());
    // Pull data from the Kafka stream
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
            StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        private static final long serialVersionUID = 5266880065425088203L;

        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });
    JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
        List<ApacheAccessLog> list = new ArrayList<>();
        try {
            // Parse each line into an access-log record
            list.add(ApacheAccessLog.parseFromLogLine(line));
            return list;
        } catch (RuntimeException e) {
            return list;
        }
    }).cache();
    accessLogsDStream.foreachRDD(rdd -> {
        // Convert the RDD to a DataFrame
        DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
        // Write out as Parquet files
        df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());
        return null;
    });
    // Start the streaming job
    jssc.start(); // start the computation
    jssc.awaitTermination(); // wait for the computation to terminate
}
Example 12: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) {
    SparkConf conf = new SparkConf();
    conf.setAppName("Wordcount Background");
    conf.setMaster("local");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(15));
    JavaDStream<String> lines = ssc.textFileStream("/home/rahul/DATASET");
    JavaDStream<String> words = lines.flatMap(WORDS_EXTRACTOR);
    JavaPairDStream<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
    JavaPairDStream<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
    counter.print();
    ssc.start();
    ssc.awaitTermination();
    /*JavaRDD<String> file = context.textFile("/home/rahul/Desktop/palestine.txt");
    JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
    JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
    JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);
    counter.saveAsTextFile("/home/rahul/Desktop/wc");
    context.close();*/
}
Example 13: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws InterruptedException {
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);
    Collection<String> topics = Arrays.asList("data-in");
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaSpark");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(5));
    final JavaInputDStream<ConsumerRecord<String, String>> stream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
            );
    JavaPairDStream<String, Integer> countOfMessageKeys = stream
            .map((ConsumerRecord<String, String> record) -> record.key())
            .mapToPair((String s) -> new Tuple2<>(s, 1))
            .reduceByKey((Integer i1, Integer i2) -> i1 + i2);
    countOfMessageKeys.print();
    // Start the computation
    streamingContext.start();
    streamingContext.awaitTermination();
}
Example 14: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }
    SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
    //conf.set("spark.default.parallelism", String.valueOf(args[2]));
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());
    JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));
    repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
    sc.stop();
}
Example 15: main
import org.apache.spark.SparkConf; // import the required package/class
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option("in", true, "Path to fastq file in hdfs.");
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    options.addOption(new Option("partitions", true, "Divide or merge to n partitions"));
    options.addOption(pathOpt);
    options.addOption(opOpt);
    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse(options, args);
    } catch (ParseException exp) {
        // oops, something went wrong
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
    }
    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;
    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");
    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v -> ">" + v.trim()).repartition(Integer.valueOf(partitions));
    crdd.saveAsTextFile(out);
    sc.stop();
}