

Java JavaSparkContext.textFile Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.textFile. If you are wondering what JavaSparkContext.textFile does, how to call it, or what real-world usage looks like, the curated code examples below should help. You can also browse further usage examples for the enclosing class, org.apache.spark.api.java.JavaSparkContext.


The sections below present 12 code examples of the JavaSparkContext.textFile method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
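Before the individual examples, here is a minimal, self-contained sketch of the basic call pattern: textFile reads a text file (from the local filesystem, HDFS, S3, and so on) and returns a JavaRDD<String> with one element per line, which can then be transformed like any other RDD. The local[*] master and the input.txt path below are placeholder assumptions for illustration only.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TextFileSketch {
    public static void main(String[] args) {
        // local[*] master and input.txt are assumptions made for this sketch
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("textFile sketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // textFile returns one RDD element per line of the input file
        JavaRDD<String> lines = sc.textFile("input.txt");

        // simple transformation + action: count the non-empty lines
        long nonEmpty = lines.filter(line -> !line.trim().isEmpty()).count();
        System.out.println("Non-empty lines: " + nonEmpty);

        sc.close();
    }
}

textFile also accepts a comma-separated list of paths, wildcard patterns, and an optional minPartitions argument; several of the examples below rely on these variants.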

Example 1: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf =new SparkConf().setMaster("local").setAppName("S3 Example");
		JavaSparkContext jsc=new JavaSparkContext(conf);
		//jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
		//jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");
		
		
		System.out.println(System.getenv("AWS_ACCESS_KEY_ID"));
		JavaRDD<String> textFile = jsc.textFile("s3a://"+"trust"+"/"+"MOCK_DATA.csv");
		
//		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://"+"trust"+"/"+"out.txt");
		
		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3a://"+"trust"+"/"+"out.txt");
	}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 18, Source: S3Example.java

Example 2: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
 
Developer: hazelcast, Project: big-data-benchmark, Lines: 23, Source: SparkWordCount.java

Example 3: wordCountJava8

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
   // TODO here a change has happened 
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<Object, Object> counts = words.mapToPair( t -> new Tuple2( t, 1 ) ).reduceByKey( (x, y) -> (int)x + (int)y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 22, Source: WordCount.java

Example 4: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 19, Source: SparkWordCount.java

Example 5: test

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";

    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd = context.textFile(hdfsPath);
    rdd.foreach((str) -> System.out.println(str));
}
 
Developer: hays2hong, Project: stonk, Lines: 10, Source: SparkHDFSTest.java

Example 6: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption( new Option( "partitions", true, "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    }
    catch( ParseException exp ) {
        // oops, something went wrong
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
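    // record delimiter '>' makes textFile split the input into one record per FASTA entry instead of per line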
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
 
Developer: NGSeq, Project: ViraPipe, Lines: 35, Source: SplitFasta.java

Example 7: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf =new SparkConf().setMaster("local").setAppName("Local File System Example");
		
		
		JavaSparkContext jsc=new JavaSparkContext(conf);
	//	jsc.hadoopConfiguration().setLong("dfs.block.size",20000);
	  
	//	jsc.hadoopConfiguration().setLong("fs.local.block.size",20000);
		  
//		
//		JavaRDD<String> localFile=jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt");
//		localFile.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path");
		
		
//		JavaRDD<String> localFile1 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
//		
//		System.out.println(localFile1.getNumPartitions());
//		localFile1.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path1");
		
		JavaRDD<String> localFile2 =jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*");
		System.out.println(localFile2.getNumPartitions());
//		localFile2.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path2");
////	   
//        JavaRDD<String> localFile3 =jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*,C:\\Users\\sgulati\\Documents\\Result\\test5\\*");
//		
//        localFile3.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path3");
//        
//        JavaPairRDD<String, String> localFileWhole = jsc.wholeTextFiles("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
//        System.out.println(localFileWhole.collect());
        
		jsc.close();
		
	}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 39, Source: LFSExample.java

Example 8: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws ParseException {

		final Validator validator = new Validator(args);
		ValidatorParameters params = validator.getParameters();
		validator.setDoPrintInProcessRecord(false);

		logger.info("Input file is " + params.getArgs());
		SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
		JavaSparkContext context = new JavaSparkContext(conf);

		System.err.println(validator.getParameters().formatParameters());

		JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

		JavaRDD<String> baseCountsRDD = inputFile
			.flatMap(content -> {
				MarcReader reader = ReadMarc.getMarcStringReader(content);
				Record marc4jRecord = reader.next();
				MarcRecord marcRecord = MarcFactory.createFromMarc4j(
					marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
				validator.processRecord(marcRecord, 1);
				return ValidationErrorFormatter
					.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
					.iterator();
			}
		);
		baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
	}
 
Developer: pkiraly, Project: metadata-qa-marc, Lines: 29, Source: ParallelValidator.java

Example 9: readFeaturesRDD

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
  return featureLines.mapToPair(line -> {
    List<?> update = TextUtils.readJSON(line, List.class);
    String key = update.get(0).toString();
    float[] vector = TextUtils.convertViaJSON(update.get(1), float[].class);
    return new Tuple2<>(key, vector);
  });
}
 
Developer: oncewang, Project: oryx2, Lines: 11, Source: ALSUpdate.java

Example 10: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", true,"Divide or merge to n partitions" ) );
    options.addOption(new Option( "fa", true, "Include only files with extension given " ));
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse( options, args );
    }
    catch( ParseException exp ) {
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String fastaonly = (cmd.hasOption("fa")==true)? cmd.getOptionValue("fa"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
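    // record delimiter '>' makes textFile split the input into one record per FASTA entry instead of per line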
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if(fastaonly!=null)
        rdd = sc.textFile(in+"/*."+fastaonly);
    else
        rdd = sc.textFile(in); //take whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length()!=0).map(fasta->{

        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];

        //Give unique id for sequence
        String seq_id = id+"_"+UUID.randomUUID().toString();
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ","").replace("[","").replace("]","");

        return ">"+seq_id+"\n"+seq;
    });

    if(partitions!=null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);

    sc.stop();
}
 
Developer: NGSeq, Project: ViraPipe, Lines: 53, Source: RenameContigsUniq.java

Example 11: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String []args) throws Exception {

        String modelName = args[0];
        String configPath = args[1];
        String configFile = args[2];
        String pyTransformScript = args[3];
        boolean needPyTransform = Boolean.parseBoolean(args[4]);
        String loginName = args[5];
        String hostName = args[6];
        int hostPort = Integer.parseInt(args[7]);
        int slaveNum = Integer.parseInt(args[8]);
        int threadNum = Integer.parseInt(args[9]);

        LOG.info("configFile:" + configFile);
        LOG.info("loginName:" + loginName);
        LOG.info("hostName:" + hostName + ", hostPort:" + hostPort);
        LOG.info("slaveNum:" + slaveNum + ", threadNum:" + threadNum);
        LOG.info("modelName:" + modelName);

        SparkConf conf = new SparkConf();
        SparkTrainWorker worker = new SparkTrainWorker(
                conf,
                modelName,
                configPath,
                configFile,
                pyTransformScript,
                needPyTransform,
                loginName,
                hostName,
                hostPort,
                slaveNum,
                threadNum);
        JavaSparkContext sc = new JavaSparkContext(conf);
        String trainDataPath = worker.getTrainDataPath();
        JavaRDD<String> trainRDD = sc.textFile(trainDataPath);
        LOG.info("trainDataPath:" + trainDataPath);

        if (!worker.sparkTrain(trainRDD)) {
            throw new Exception("spark train exception!");
        }

        System.exit(0);
    }
 
Developer: yuantiku, Project: ytk-learn, Lines: 44, Source: SparkTrainWorker.java

Example 12: main

import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
	
	SparkConf conf =new SparkConf().setMaster("local").setAppName("S3 Example");
	JavaSparkContext jsc=new JavaSparkContext(conf);
	jsc.hadoopConfiguration().setLong("dfs.blocksize",2);
	//jsc.hadoopConfiguration().setLong("fs.local.block.size",2);
	
	JavaRDD<String> hadoopRdd = jsc.textFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/test1",2);
	
	System.out.println(hadoopRdd.getNumPartitions());
	//hadoopRdd.saveAsTextFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/testout");
	
	
}
 
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 15, Source: HdfsExample.java


Note: The org.apache.spark.api.java.JavaSparkContext.textFile examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please follow each project's license when using or redistributing the code, and do not reproduce this compilation without permission.