当前位置: 首页>>代码示例>>Java>>正文


Java JavaRDD.saveAsTextFile方法代码示例

本文整理汇总了 Java 中 org.apache.spark.api.java.JavaRDD.saveAsTextFile 方法的典型用法代码示例。如果您正苦于以下问题:Java JavaRDD.saveAsTextFile 方法的具体用法?Java JavaRDD.saveAsTextFile 怎么用?Java JavaRDD.saveAsTextFile 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 org.apache.spark.api.java.JavaRDD 的用法示例。


在下文中一共展示了 JavaRDD.saveAsTextFile 方法的 5 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。

示例1: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Reads text input using {@code ">"} as the record delimiter, re-prefixes every
 * trimmed record with {@code ">"} and saves the result as text files.
 *
 * <p>Command-line options:
 * <ul>
 *   <li>{@code -in <path>}: input path handed to {@code textFile} (required)</li>
 *   <li>{@code -out <path>}: output path handed to {@code saveAsTextFile} (required)</li>
 *   <li>{@code -partitions <n>}: optional; repartition to {@code n} partitions before saving</li>
 * </ul>
 *
 * @param args command-line arguments
 * @throws IOException declared by the original signature
 * @throws NumberFormatException if the {@code -partitions} value is not an integer
 */
public static void main(String[] args) throws IOException {
    Options options = new Options();
    options.addOption( new Option( "in", true, "Path to fastq file in hdfs." ) );
    options.addOption( new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ) );
    // hasArg must be true: with the two-argument constructor the option takes no
    // value and getOptionValue("partitions") would always return null.
    options.addOption( new Option( "partitions", true, "Divide or merge to n partitions" ) );

    CommandLine cmd;
    try {
        // parse the command line arguments
        cmd = new BasicParser().parse( options, args );
    }
    catch( ParseException exp ) {
        // Without a parsed command line nothing below can work, so stop here
        // instead of continuing with a null reference.
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
        return;
    }

    // getOptionValue() already yields null for an absent option.
    String in = cmd.getOptionValue("in");
    String out = cmd.getOptionValue("out");
    String partitions = cmd.getOptionValue("partitions");
    if (in == null || out == null) {
        System.err.println( "Both -in and -out must be given." );
        System.exit(1);
        return;
    }
    // Parse before starting Spark so a malformed number fails fast.
    Integer numPartitions = (partitions != null) ? Integer.valueOf(partitions) : null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

        // NOTE(review): text before the first ">" presumably yields an empty record
        // that is written as a bare ">" line; confirm whether it should be filtered.
        JavaRDD<String> crdd = sc.textFile(in).map(v -> ">" + v.trim());
        if (numPartitions != null) {
            crdd = crdd.repartition(numPartitions);
        }

        crdd.saveAsTextFile(out);
    } finally {
        // Release the context even when the job fails.
        sc.stop();
    }
}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:35,代码来源:SplitFasta.java

示例2: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Validates the records of a text input in parallel with Spark and saves the
 * formatted validation errors as text files.
 *
 * <p>The first positional argument reported by the validator parameters is used as
 * the input path; the output path comes from {@code getFileName()}.
 *
 * @param args command-line arguments passed to the {@code Validator} constructor
 * @throws ParseException declared by the original signature
 */
public static void main(String[] args) throws ParseException {

	final Validator validator = new Validator(args);
	ValidatorParameters params = validator.getParameters();
	validator.setDoPrintInProcessRecord(false);

	// Concatenating the array directly would log its identity hash, not its contents.
	logger.info("Input file is " + java.util.Arrays.toString(params.getArgs()));
	SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
	JavaSparkContext context = new JavaSparkContext(conf);
	try {
		System.err.println(validator.getParameters().formatParameters());

		JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

		// Each input element is read as one MARC record; its validation errors
		// become the output elements.
		JavaRDD<String> baseCountsRDD = inputFile
			.flatMap(content -> {
				MarcReader reader = ReadMarc.getMarcStringReader(content);
				Record marc4jRecord = reader.next();
				MarcRecord marcRecord = MarcFactory.createFromMarc4j(
					marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
				validator.processRecord(marcRecord, 1);
				return ValidationErrorFormatter
					.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
					.iterator();
			}
		);
		baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
	} finally {
		// Release the Spark context whether or not the job succeeded.
		context.stop();
	}
}
 
开发者ID:pkiraly,项目名称:metadata-qa-marc,代码行数:29,代码来源:ParallelValidator.java

示例3: run

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Parses every record of a sequence file with Tika and saves one text line per record.
 *
 * <p>Steps: read {@code (Text, BytesWritable)} pairs from {@code input}, parse each
 * value into a {@code Metadata} object (the extracted text is added under the key
 * {@code "CONTENT"}), format each record as {@code key + "\t\t" + json}, and save the
 * lines to {@code output}.
 *
 * @throws IOException declared by the original signature
 * @throws IllegalArgumentException if the input path does not exist or the output
 *         path already exists
 */
public void run() throws IOException {
    FileSystem fs = DistributedFileSystem.get(new Configuration());
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    if (!fs.exists(inpath)) {
        throw new IllegalArgumentException("Input file not found: " + inpath);
    }
    if (fs.exists(outpath)) {
        // Report the path that actually exists (previously the input path was printed).
        throw new IllegalArgumentException("Output file exists, Not overwriting it: " + outpath);
    }

    SparkConf conf = new SparkConf();
    conf.setMaster(sparkMaster);
    conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
    JavaSparkContext ctx = new JavaSparkContext(conf);

    try {
        //STEP1: READ
        JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
        //STEP2: PARSE
        JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
                (PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
                    Metadata md = new Metadata();
                    BytesWritable value = rec._2();
                    // getBytes() exposes the backing buffer, which may be longer than the
                    // payload; only the first getLength() bytes are valid data.
                    try (ByteArrayInputStream stream =
                                 new ByteArrayInputStream(value.getBytes(), 0, value.getLength())) {
                        String content = TikaHolder.tika.parseToString(stream, md);
                        md.add("CONTENT", content);
                    }
                    return new Tuple2<>(rec._1(), md);
                });
        //STEP3: FORMAT
        JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
            String key = rec._1().toString();
            Metadata metadata = rec._2();
            JSONObject object = new JSONObject();
            for (String name : metadata.names()) {
                if (metadata.isMultiValued(name)) {
                    // Multi-valued fields become JSON arrays, single values plain entries.
                    JSONArray arr = new JSONArray();
                    for (String val : metadata.getValues(name)) {
                        arr.add(val);
                    }
                    object.put(name, arr);
                } else {
                    object.put(name, metadata.get(name));
                }
            }
            return key + "\t\t" + object.toJSONString();
        });
        //STEP4: SAVE
        LOG.info("Saving at " + outpath);
        outRDD.saveAsTextFile(output);
    } finally {
        // Stop the context even if reading, parsing or saving failed.
        LOG.info("Stopping");
        ctx.stop();
    }
}
 
开发者ID:thammegowda,项目名称:tika-dl4j-spark-imgrec,代码行数:54,代码来源:TikaSpark.java

示例4: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Reads text input using {@code ">"} as the record delimiter, gives every record a
 * unique identifier and saves the result as text files.
 *
 * <p>For each non-empty record the first whitespace-separated token of the first line
 * is taken as the id, {@code "_" + UUID} is appended, and the remaining lines are
 * concatenated into a single sequence line.
 *
 * <p>Command-line options:
 * <ul>
 *   <li>{@code -in <path>}: input path (required)</li>
 *   <li>{@code -out <path>}: output path (required)</li>
 *   <li>{@code -fa <ext>}: optional; read only {@code in/*.ext}</li>
 *   <li>{@code -partitions <n>}: optional; repartition to {@code n} partitions before saving</li>
 * </ul>
 *
 * @param args command-line arguments
 * @throws IOException declared by the original signature
 * @throws NumberFormatException if the {@code -partitions} value is not an integer
 */
public static void main(String[] args) throws IOException {
    Options options = new Options();
    options.addOption( new Option( "in", true, "Path to fastq file in hdfs." ) );
    options.addOption( new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." ) );
    options.addOption( new Option( "partitions", true,"Divide or merge to n partitions" ) );
    options.addOption( new Option( "fa", true, "Include only files with extension given " ) );

    CommandLine cmd;
    try {
        cmd = new BasicParser().parse( options, args );
    }
    catch( ParseException exp ) {
        // Continuing would dereference a null command line, so stop here.
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
        return;
    }

    // getOptionValue() already yields null for an absent option.
    String out = cmd.getOptionValue("out");
    String in = cmd.getOptionValue("in");
    String fastaonly = cmd.getOptionValue("fa");
    String partitions = cmd.getOptionValue("partitions");
    if (in == null || out == null) {
        System.err.println( "Both -in and -out must be given." );
        System.exit(1);
        return;
    }
    // Parse before starting Spark so a malformed number fails fast.
    Integer numPartitions = (partitions != null) ? Integer.valueOf(partitions) : null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

        // Without an extension filter the whole directory is taken as input.
        String inputPath = (fastaonly != null) ? in + "/*." + fastaonly : in;
        JavaRDD<String> rdd = sc.textFile(inputPath);

        JavaRDD<String> crdd = rdd
            .filter(f -> f.trim().split("\n")[0].length() != 0)
            .map(fasta -> {
                String[] fseq = fasta.trim().split("\n");
                String id = fseq[0].split(" ")[0];

                //Give unique id for sequence
                String seq_id = id + "_" + UUID.randomUUID().toString();
                // Join the sequence lines directly; going through Arrays.toString() and
                // stripping "[", "]" and ", " would also remove those characters from the data.
                String seq = String.join("", Arrays.copyOfRange(fseq, 1, fseq.length));

                return ">" + seq_id + "\n" + seq;
            });

        if (numPartitions != null) {
            crdd = crdd.repartition(numPartitions);
        }
        crdd.saveAsTextFile(out);
    } finally {
        // Release the context even when the job fails.
        sc.stop();
    }
}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:53,代码来源:RenameContigsUniq.java

示例5: writeOutput

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Saves the given ratings as text files via {@code saveAsTextFile}.
 *
 * @param javaRDD the ratings to write
 */
@Override
public void writeOutput(JavaRDD<Rating> javaRDD) {
    // NOTE(review): the target path is an empty string, presumably a placeholder;
    // confirm the intended output location.
    final String targetPath = "";
    javaRDD.saveAsTextFile(targetPath);
}
 
开发者ID:cosminseceleanu,项目名称:movie-recommender,代码行数:5,代码来源:RatingsFileIo.java


注:本文中的org.apache.spark.api.java.JavaRDD.saveAsTextFile方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。