This article collects typical usage examples of the JavaRDD.saveAsTextFile method from the Java class org.apache.spark.api.java.JavaRDD. If you have been wondering what JavaRDD.saveAsTextFile does, how to use it, or where to find working examples, the curated code samples below may help. You can also explore further usage examples of the enclosing class, org.apache.spark.api.java.JavaRDD.
The following presents 5 code examples of JavaRDD.saveAsTextFile, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
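Before the full examples, here is a minimal, self-contained sketch of the basic saveAsTextFile pattern. The local master setting and the input/output paths are illustrative placeholders, not taken from any example below:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SaveAsTextFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SaveAsTextFileSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("hdfs:///tmp/input.txt");
        JavaRDD<String> upper = lines.map(String::toUpperCase);

        // saveAsTextFile writes one part-NNNNN file per partition under the
        // given directory and fails if that directory already exists.
        upper.saveAsTextFile("hdfs:///tmp/output");
        sc.stop();
    }
}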
Example 1: main
import org.apache.spark.api.java.JavaRDD; //import the class/package the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option("in", true, "Path to fastq file in HDFS.");
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    options.addOption(new Option("partitions", true, "Divide or merge to n partitions"));
    options.addOption(pathOpt);
    options.addOption(opOpt);
    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse(options, args);
    }
    catch (ParseException exp) {
        // oops, something went wrong
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
        System.exit(1);
    }
    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Treat every '>'-delimited FASTA entry as one input record
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v -> ">" + v.trim()).repartition(Integer.valueOf(partitions));
    crdd.saveAsTextFile(out);
    sc.stop();
}
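A variant worth knowing alongside Example 1: saveAsTextFile also has an overload that takes a Hadoop compression codec class, so each output part file is compressed as it is written. A minimal sketch of the final save call, using gzip as one possible codec (this is not part of the original example):
import org.apache.hadoop.io.compress.GzipCodec;

// Same output as crdd.saveAsTextFile(out), but each part file is gzip-compressed
crdd.saveAsTextFile(out, GzipCodec.class);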
Example 2: main
import org.apache.spark.api.java.JavaRDD; //import the class/package the method depends on
public static void main(String[] args) throws ParseException {
    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs());

    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    // Validate each MARC record and emit its formatted validation errors
    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });

    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
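Because saveAsTextFile writes one part file per partition, a job like Example 2 that produces a small summary can end up scattered across many tiny files. If a single output file is preferred and the data is small enough to pass through one task, the RDD can be coalesced first; a sketch of that alternative save (not in the original code):
// Produces a single part-00000 file at the cost of funnelling all data through one task
baseCountsRDD.coalesce(1).saveAsTextFile(validator.getParameters().getFileName());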
Example 3: run
import org.apache.spark.api.java.JavaRDD; //import the class/package the method depends on
public void run() throws IOException {
    FileSystem fs = DistributedFileSystem.get(new Configuration());
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    if (!fs.exists(inpath)) {
        throw new IllegalArgumentException("Input file not found: " + inpath);
    }
    if (fs.exists(outpath)) {
        throw new IllegalArgumentException("Output file exists, not overwriting it: " + outpath);
    }

    SparkConf conf = new SparkConf();
    conf.setMaster(sparkMaster);
    conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
    JavaSparkContext ctx = new JavaSparkContext(conf);

    //STEP1: READ
    JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
    //.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));

    //STEP2: PARSE - run Tika on each document and collect its metadata plus extracted content
    JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
        (PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
            Metadata md = new Metadata();
            try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
                String content = TikaHolder.tika.parseToString(stream, md);
                md.add("CONTENT", content);
            }
            return new Tuple2<>(rec._1(), md);
        });

    //STEP3: FORMAT - serialize each document's metadata as a JSON object keyed by the record id
    JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
        String key = rec._1().toString();
        Metadata metadata = rec._2();
        JSONObject object = new JSONObject();
        for (String name : metadata.names()) {
            if (metadata.isMultiValued(name)) {
                JSONArray arr = new JSONArray();
                for (String val : metadata.getValues(name)) {
                    arr.add(val);
                }
                object.put(name, arr);
            } else {
                object.put(name, metadata.get(name));
            }
        }
        return key + "\t\t" + object.toJSONString();
    });

    //STEP4: SAVE
    LOG.info("Saving at " + outpath);
    outRDD.saveAsTextFile(output);
    LOG.info("Stopping");
    ctx.stop();
}
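Example 3 refuses to run when the output path already exists, because saveAsTextFile itself fails on an existing directory. If overwriting were acceptable instead, the pre-flight check could delete the old output before the job runs; a hypothetical variant of that check (the recursive delete is an assumption about the desired behaviour, not the original author's choice):
if (fs.exists(outpath)) {
    // Assumption: overwriting is acceptable; remove the old output recursively
    fs.delete(outpath, true);
}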
Example 4: main
import org.apache.spark.api.java.JavaRDD; //import the class/package the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option("in", true, "Path to fastq file in HDFS.");
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    options.addOption(new Option("partitions", true, "Divide or merge to n partitions"));
    options.addOption(new Option("fa", true, "Include only files with the given extension"));
    options.addOption(pathOpt);
    options.addOption(opOpt);
    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    }
    catch (ParseException exp) {
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
        System.exit(1);
    }
    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String fastaonly = cmd.hasOption("fa") ? cmd.getOptionValue("fa") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Treat every '>'-delimited FASTA entry as one input record
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if (fastaonly != null)
        rdd = sc.textFile(in + "/*." + fastaonly);
    else
        rdd = sc.textFile(in); // take the whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length() != 0).map(fasta -> {
        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];
        // Give the sequence a unique id
        String seq_id = id + "_" + UUID.randomUUID().toString();
        // Join the sequence lines back into a single string
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ", "").replace("[", "").replace("]", "");
        return ">" + seq_id + "\n" + seq;
    });

    if (partitions != null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);
    sc.stop();
}
Example 5: writeOutput
import org.apache.spark.api.java.JavaRDD; //import the class/package the method depends on
@Override
public void writeOutput(JavaRDD<Rating> javaRDD) {
    // The output path is left empty here; a real HDFS or local path is required in practice
    javaRDD.saveAsTextFile("");
}