This article collects typical usage examples of the JavaSparkContext.stop method from the Java class org.apache.spark.api.java.JavaSparkContext. If you are wondering how JavaSparkContext.stop is used, what it is for, or are looking for examples of calling it, the curated code samples here may help. You can also explore further usage examples of the enclosing class, org.apache.spark.api.java.JavaSparkContext.
The sections below present 12 code examples of JavaSparkContext.stop, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
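Before diving into the examples, here is a minimal sketch of the pattern they all share: create a JavaSparkContext, run a job, and call stop() in a finally block so the context is released even if the job fails. The local master, app name, and sample data below are illustrative placeholders, not taken from any of the examples.
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class StopSketch {
    public static void main(String[] args) {
        // Illustrative placeholders: local master and a tiny in-memory dataset
        SparkConf conf = new SparkConf().setAppName("StopSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            long count = sc.parallelize(Arrays.asList(1, 2, 3, 4)).count();
            System.out.println("count = " + count);
        } finally {
            sc.stop(); // always release the SparkContext and its cluster resources
        }
    }
}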
Example 1: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
if (args.length < 3) {
System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
System.exit(1);
}
SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
//conf.set("spark.default.parallelism", String.valueOf(args[2]));
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());
JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));
repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
sc.stop();
}
Example 2: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
Options options = new Options();
Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
options.addOption( new Option( "partitions", "Divide or merge to n partitions" ) );
options.addOption( pathOpt );
options.addOption( opOpt );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
// parse the command line arguments
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
// oops, something went wrong
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;
SparkConf conf = new SparkConf().setAppName("SplitFasta");
JavaSparkContext sc = new JavaSparkContext(conf);
sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");
JavaRDD<String> rdd = sc.textFile(in);
JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));
crdd.saveAsTextFile(out);
sc.stop();
}
Example 3: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public void run() {
long microsLower = day.toInstant().toEpochMilli() * 1000;
long microsUpper = day.plus(Period.ofDays(1)).toInstant().toEpochMilli() * 1000 - 1;
log.info("Running Dependencies job for {}: {} ≤ Span.timestamp {}", day, microsLower, microsUpper);
JavaSparkContext sc = new JavaSparkContext(conf);
try {
JavaPairRDD<String, Iterable<Span>> traces = javaFunctions(sc)
.cassandraTable(keyspace, "traces", mapRowTo(Span.class))
.where("start_time < ? AND start_time > ?", microsUpper, microsLower)
.mapToPair(span -> new Tuple2<>(span.getTraceId(), span))
.groupByKey();
List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces);
store(sc, dependencyLinks);
log.info("Done, {} dependency objects created", dependencyLinks.size());
} finally {
sc.stop();
}
}
Example 4: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
void run(String spanResource, String depResource) {
log.info("Running Dependencies job for {}, reading from {} index, result storing to {}", day, spanResource ,depResource);
JavaSparkContext sc = new JavaSparkContext(conf);
try {
JavaPairRDD<String, Iterable<Span>> traces = JavaEsSpark.esJsonRDD(sc, spanResource)
.map(new ElasticTupleToSpan())
.groupBy(Span::getTraceId);
List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces);
store(sc, dependencyLinks, depResource);
log.info("Done, {} dependency objects created", dependencyLinks.size());
} finally {
sc.stop();
}
}
Example 5: run
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public void run() throws IOException {
FileSystem fs = DistributedFileSystem.get(new Configuration());
Path inpath = new Path(input);
Path outpath = new Path(output);
if (!fs.exists(inpath)) {
throw new IllegalArgumentException("Input file not found: " + inpath);
}
if (fs.exists(outpath)) {
throw new IllegalArgumentException("Output file exists, Not overwriting it: " + inpath);
}
SparkConf conf = new SparkConf();
conf.setMaster(sparkMaster);
conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
JavaSparkContext ctx = new JavaSparkContext(conf);
//STEP1: READ
JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
//.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));
//STEP2: PARSE
JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
(PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
Metadata md = new Metadata();
try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
String content = TikaHolder.tika.parseToString(stream, md);
md.add("CONTENT", content);
}
return new Tuple2<>(rec._1(), md);
});
//STEP3: FORMAT
JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
String key = rec._1().toString();
Metadata metadata = rec._2();
JSONObject object = new JSONObject();
for (String name : metadata.names()) {
if (metadata.isMultiValued(name)) {
JSONArray arr = new JSONArray();
for (String val : metadata.getValues(name)) {
arr.add(val);
}
object.put(name, arr);
} else {
object.put(name, metadata.get(name));
}
}
return key + "\t\t" + object.toJSONString();
});
//STEP4: SAVE
LOG.info("Saving at " + outpath);
outRDD.saveAsTextFile(output);
LOG.info("Stopping");
ctx.stop();
}
Example 6: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("DecompressInterleave");
//conf.set("spark.scheduler.mode", "FAIR");
//conf.set("spark.scheduler.allocation.file", "/opt/cloudera/parcels/CDH-5.10.0-1.cdh5.10.0.p0.41/etc/hadoop/conf.dist/pools.xml");
JavaSparkContext sc = new JavaSparkContext(conf);
//sc.setLocalProperty("spark.scheduler.pool", "production");
Options options = new Options();
Option splitDirOpt = new Option( "out", true, "Path to output directory in hdfs." );
Option numsplitsOpt = new Option( "splitsize", true, "Number of reads in split, depends on the size of read file, number of cores and available memory." );
options.addOption( new Option( "decompress", "" ) );
options.addOption( new Option( "temp", true, "" ) );
options.addOption( new Option( "in", true, "" ) );
options.addOption( new Option( "remtemp", "" ) );
options.addOption( new Option( "merge", "" ) );
options.addOption( numsplitsOpt );
options.addOption( splitDirOpt );
options.addOption(new Option( "help", "print this message" ));
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "spark-submit <spark specific args>", options, true );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
// parse the command line arguments
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
// oops, something went wrong
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String input = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
int splitsize = cmd.hasOption("splitsize") ? Integer.parseInt(cmd.getOptionValue("splitsize")) : 0;
boolean merge = cmd.hasOption("merge");
String outpath = cmd.getOptionValue("out");
FileSystem fs = FileSystem.get(new Configuration());
int splitlen = splitsize*4; // a FASTQ record consists of 4 lines
FileStatus[] dirs = fs.listStatus(new Path(input));
Arrays.asList(dirs).forEach(dir ->{
if(dir.isDirectory()){
try {
FileStatus fst = fs.getFileStatus(new Path(input+"/"+dir.getPath().getName()+"/1.fq"));
FileStatus fst2 = fs.getFileStatus(new Path(input+"/"+dir.getPath().getName()+"/2.fq"));
if(merge)
interleaveSplitFastq(fst, fst2, outpath, splitlen, sc);
else //SAVES SEPARATE HDFS DIRECTORIES
interleaveSplitFastq(fst, fst2, outpath+"/"+dir.getPath().getName(), splitlen, sc);
} catch (IOException e) {
e.printStackTrace();
}
}
});
sc.stop();
}
Example 7: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("SamToFastq");
JavaSparkContext sc = new JavaSparkContext(conf);
String in = args[0];
String out = args[1];
JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
//Map to SAMRecord RDD
JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);
fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
sc.stop();
}
Example 8: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
if (args.length < 3) {
System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
System.exit(1);
}
SparkConf conf = new SparkConf().setAppName("MergeFastq");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());
JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));
coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
sc.stop();
}
Example 9: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("Interleave");
//conf.set("spark.scheduler.mode", "FAIR");
//conf.set("spark.scheduler.allocation.file", "/opt/cloudera/parcels/CDH-5.10.0-1.cdh5.10.0.p0.41/etc/hadoop/conf.dist/pools.xml");
JavaSparkContext sc = new JavaSparkContext(conf);
//sc.setLocalProperty("spark.scheduler.pool", "production");
Options options = new Options();
Option pairedOpt = new Option( "paired", "Split paired end reads to separate folders, does not interleave." );
Option intOpt = new Option( "singlesplit", "" );
options.addOption( new Option( "decompress", "" ) );
options.addOption( pairedOpt );
options.addOption( intOpt );
options.addOption(new Option( "help", "print this message" ));
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "spark-submit <spark specific args>", options, true );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
// parse the command line arguments
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
// oops, something went wrong
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String fastq = args[0];
String fastq2 = args[1];
String outdir = args[2];
int splitsize = Integer.valueOf(args[3]);
boolean paired = cmd.hasOption("paired");
boolean singlesplit = cmd.hasOption("singlesplit");
boolean decompress = cmd.hasOption("decompress");
String outdir2 = null;
if(paired)
outdir2 = outdir+"2";
FileSystem fs = FileSystem.get(new Configuration());
if(decompress){
decompress(fs, fastq, "temp1.fq");
decompress(fs, fastq2, "temp2.fq");
fastq = "temp1.fq";
fastq2 = "temp2.fq";
}
//Count split positions
int splitlen = splitsize*4; // a FASTQ record consists of 4 lines
if(singlesplit){
FileStatus fstatus = fs.getFileStatus(new Path(fastq));
splitFastq(fstatus, fastq, outdir, splitlen, sc);
if(paired){
FileStatus fstatus2 = fs.getFileStatus(new Path(fastq2));
splitFastq(fstatus2, fastq2, outdir2, splitlen, sc);
}
}else{
FileStatus fst = fs.getFileStatus(new Path(fastq));
FileStatus fst2 = fs.getFileStatus(new Path(fastq2));
interleaveSplitFastq(fst, fst2, outdir, splitlen, sc);
}
if(decompress){
fs.delete(new Path("temp1.fq"), false);
fs.delete(new Path("temp2.fq"), false);
}
sc.stop();
}
Example 10: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
Options options = new Options();
Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
options.addOption( new Option( "partitions", true,"Divide or merge to n partitions" ) );
options.addOption(new Option( "fa", true, "Include only files with extension given " ));
options.addOption( pathOpt );
options.addOption( opOpt );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
String fastaonly = cmd.hasOption("fa") ? cmd.getOptionValue("fa") : null;
String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;
SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
JavaSparkContext sc = new JavaSparkContext(conf);
sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");
JavaRDD<String> rdd;
if(fastaonly!=null)
rdd = sc.textFile(in+"/*."+fastaonly);
else
rdd = sc.textFile(in); //take whole directory as input
JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length()!=0).map(fasta->{
String[] fseq = fasta.trim().split("\n");
String id = fseq[0].split(" ")[0];
//Give unique id for sequence
String seq_id = id+"_"+UUID.randomUUID().toString();
String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ","").replace("[","").replace("]","");
return ">"+seq_id+"\n"+seq;
});
if(partitions!=null)
crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
else
crdd.saveAsTextFile(out);
sc.stop();
}
Example 11: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new HiveContext(sc.sc());
Options options = new Options();
Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
Option queryOpt = new Option( "query", true, "SQL query string." );
Option baminOpt = new Option( "in", true, "" );
options.addOption( opOpt );
options.addOption( queryOpt );
options.addOption( baminOpt );
CommandLineParser parser = new BasicParser();
CommandLine cmd = null;
try {
cmd = parser.parse( options, args );
}
catch( ParseException exp ) {
System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
}
String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);
//Read BAM/SAM from HDFS
JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
//Map to SAMRecord RDD
JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));
Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
samDF.registerTempTable(tablename);
if(query!=null) {
//Save as parquet file
Dataset<Row> df2 = sqlContext.sql(query);
df2.show(100,false);
if(bwaOutDir!=null)
df2.write().parquet(bwaOutDir);
}else{
if(bwaOutDir!=null)
samDF.write().parquet(bwaOutDir);
}
sc.stop();
}
Example 12: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
boolean isLocal = false;
final String master = isLocal ? "local[4]" : "spark://10.128.184.199:7077";
final String csv = isLocal ? "Z:/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv" : "/pi_nj_57/RCS_SP1/RAW_DATA_MORE/2016_03/TAXI/TAXI_20160301.csv";
final String appName = "SpeedCalculator";
Calculator calculator = new Calculator();
SparkConf conf = new SparkConf()
.set("spark.executor.memory", "4G")
.set("spark.submit.deployMode", "cluster")
.setMaster("spark://10.128.184.199:7077")
.setJars(new String[]{"C:\\Users\\i321761\\Desktop\\git\\github.wdf.sap.corp\\i321761\\hadoop-sample\\target\\hadoopsample-1.0-SNAPSHOT.jar"});
JavaSparkContext sc = new JavaSparkContext(master, appName, conf);
// JavaRDD<String> rdd = sc.textFile(csv, 2);
JavaRDD<String> rdd = sc.parallelize(Arrays.asList("abc", "def"));
long start = System.currentTimeMillis();
System.out.println("Count Start ....");
// Convert csv string to taxi point structure and remove invalid records
JavaRDD<ITaxiMonitor.TaxiPoint> taxiPointRDD = rdd.map(line -> TaxiPointUtil.parseTaxiPoint(line))
.filter(point -> point != null && !point.receiveTime.isEmpty() && point.receiveTime.contains(" 08:"));
JavaPairRDD<Long, List<ITaxiMonitor.TaxiPoint>> slotsIn5 = taxiPointRDD
.keyBy(point -> (DateTimeUtil.parseToMillSecond(point.receiveTime, "UTC+8") / 300000) * 300000)
.combineByKey(
// initialization performed when the first record for a key is seen
v -> {
List<ITaxiMonitor.TaxiPoint> points = new ArrayList<>();
points.add(v);
return points;
},
// operation applied when another record arrives for an existing key
(c, v) -> {
c.add(v);
return c;
},
// a key's records may be spread across multiple tasks; this merges the partial collections for the same key
(c1, c2) -> {
c1.addAll(c2);
return c1;
}
)
.sortByKey();
// each key represents a 5-minute window of traffic data; call the calculator on each window to compute traffic speeds
slotsIn5.map(slot -> calculator.execute(slot._2(), slot._1(), slot._1()))
.collect().forEach(speedResult -> {
speedResult.getTimedEdgeSpeeds().forEach(timedEdgeSpeeds -> {
long t = DateTimeUtil.parseToMillSecond(timedEdgeSpeeds.timestamp, "UTC+0");
timedEdgeSpeeds.edgeSpeeds.forEach(speed -> System.out.println(" * EDGE_SPEED: " + TaxiPointUtil.formatEdgeSpeed(t, speed, ",")));
});
});
slotsIn5.take(10)
.forEach(slot -> System.out.println("slot: " + slot._1() + ", " + DateTimeUtil.formatToUTC(slot._1()) + ", count: " + slot._2().size()));
// .foreach(slot -> System.out.println("slot: " + DateTimeUtil.formatToUTC(slot._1()) + ", count" + slot._2().size()));
sc.stop();
}