This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.textFile. If you are unsure what JavaSparkContext.textFile does, how to call it, or want to see it used in real code, the curated examples below may help. You can also explore the enclosing class, org.apache.spark.api.java.JavaSparkContext, for more of its usage.
A total of 12 code examples of JavaSparkContext.textFile are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
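Before the collected examples, here is a minimal, self-contained sketch of the basic call pattern. The class name TextFileSketch and the path input/sample.txt are placeholders invented for illustration and do not come from the examples below: textFile reads a text file (local path, HDFS, S3, or a glob) and returns a JavaRDD&lt;String&gt; with one element per line; an optional second argument suggests a minimum number of partitions.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TextFileSketch {
    public static void main(String[] args) {
        // Local master and a hypothetical input path, purely for illustration.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("textFile sketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // One RDD element per line of the file.
            JavaRDD<String> lines = sc.textFile("input/sample.txt");
            // The optional second argument hints at a minimum number of partitions:
            // JavaRDD<String> lines = sc.textFile("input/sample.txt", 4);
            System.out.println("Line count: " + lines.count());
        }
    }
}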
Example 1: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    // jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
    // jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");
    System.out.println(System.getenv("AWS_ACCESS_KEY_ID"));
    JavaRDD<String> textFile = jsc.textFile("s3a://" + "trust" + "/" + "MOCK_DATA.csv");
    // textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
    //         .reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://" + "trust" + "/" + "out.txt");
    textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
            .reduceByKey((x, y) -> x + y).saveAsTextFile("s3a://" + "trust" + "/" + "out.txt");
}
Example 2: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println(" SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs = words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts = pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
Example 3: wordCountJava8
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);
    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts =
            words.mapToPair(t -> new Tuple2<String, Integer>(t, 1)).reduceByKey((x, y) -> x + y);
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
}
Example 4: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);
    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile(outputPath);

    sc.close();
}
Example 5: test
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";
    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd = context.textFile(hdfsPath);
    rdd.foreach(str -> System.out.println(str));
}
Example 6: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option("in", true, "Path to fastq file in hdfs.");
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    // "partitions" takes a value, so it must be declared with hasArg = true
    options.addOption(new Option("partitions", true, "Divide or merge to n partitions"));
    options.addOption(pathOpt);
    options.addOption(opOpt);

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse(options, args);
    } catch (ParseException exp) {
        // oops, something went wrong
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
    }

    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v -> ">" + v.trim()).repartition(Integer.valueOf(partitions));
    crdd.saveAsTextFile(out);
    sc.stop();
}
Example 7: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Local File System Example");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // jsc.hadoopConfiguration().setLong("dfs.block.size", 20000);
    // jsc.hadoopConfiguration().setLong("fs.local.block.size", 20000);

    // JavaRDD<String> localFile = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt");
    // localFile.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
    //         .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path");

    // JavaRDD<String> localFile1 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
    // System.out.println(localFile1.getNumPartitions());
    // localFile1.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
    //         .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path1");

    JavaRDD<String> localFile2 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*");
    System.out.println(localFile2.getNumPartitions());
    // localFile2.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
    //         .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path2");

    // JavaRDD<String> localFile3 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*,C:\\Users\\sgulati\\Documents\\Result\\test5\\*");
    // localFile3.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
    //         .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path3");

    // JavaPairRDD<String, String> localFileWhole = jsc.wholeTextFiles("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
    // System.out.println(localFileWhole.collect());

    jsc.close();
}
Example 8: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws ParseException {
    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getArgs());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);
    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);
    JavaRDD<String> baseCountsRDD = inputFile
            .flatMap(content -> {
                MarcReader reader = ReadMarc.getMarcStringReader(content);
                Record marc4jRecord = reader.next();
                MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                        marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
                validator.processRecord(marcRecord, 1);
                return ValidationErrorFormatter
                        .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                        .iterator();
            });

    baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
Example 9: readFeaturesRDD
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
private static JavaPairRDD<String, float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
    log.info("Loading features RDD from {}", path);
    JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
    return featureLines.mapToPair(line -> {
        List<?> update = TextUtils.readJSON(line, List.class);
        String key = update.get(0).toString();
        float[] vector = TextUtils.convertViaJSON(update.get(1), float[].class);
        return new Tuple2<>(key, vector);
    });
}
Example 10: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option("in", true, "Path to fastq file in hdfs.");
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    options.addOption(new Option("partitions", true, "Divide or merge to n partitions"));
    options.addOption(new Option("fa", true, "Include only files with extension given "));
    options.addOption(pathOpt);
    options.addOption(opOpt);

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
    }

    String out = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
    String fastaonly = cmd.hasOption("fa") ? cmd.getOptionValue("fa") : null;
    String partitions = cmd.hasOption("partitions") ? cmd.getOptionValue("partitions") : null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if (fastaonly != null)
        rdd = sc.textFile(in + "/*." + fastaonly);
    else
        rdd = sc.textFile(in); // take whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length() != 0).map(fasta -> {
        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];
        // Give unique id for sequence
        String seq_id = id + "_" + UUID.randomUUID().toString();
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ", "").replace("[", "").replace("]", "");
        return ">" + seq_id + "\n" + seq;
    });

    if (partitions != null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);

    sc.stop();
}
Example 11: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String modelName = args[0];
    String configPath = args[1];
    String configFile = args[2];
    String pyTransformScript = args[3];
    boolean needPyTransform = Boolean.parseBoolean(args[4]);
    String loginName = args[5];
    String hostName = args[6];
    int hostPort = Integer.parseInt(args[7]);
    int slaveNum = Integer.parseInt(args[8]);
    int threadNum = Integer.parseInt(args[9]);

    LOG.info("configFile:" + configFile);
    LOG.info("loginName:" + loginName);
    LOG.info("hostName:" + hostName + ", hostPort:" + hostPort);
    LOG.info("slaveNum:" + slaveNum + ", threadNum:" + threadNum);
    LOG.info("modelName:" + modelName);

    SparkConf conf = new SparkConf();
    SparkTrainWorker worker = new SparkTrainWorker(
            conf,
            modelName,
            configPath,
            configFile,
            pyTransformScript,
            needPyTransform,
            loginName,
            hostName,
            hostPort,
            slaveNum,
            threadNum);

    JavaSparkContext sc = new JavaSparkContext(conf);
    String trainDataPath = worker.getTrainDataPath();
    JavaRDD<String> trainRDD = sc.textFile(trainDataPath);
    LOG.info("trainDataPath:" + trainDataPath);

    if (!worker.sparkTrain(trainRDD)) {
        throw new Exception("spark train exception!");
    }
    System.exit(0);
}
Example 12: main
import org.apache.spark.api.java.JavaSparkContext; // import the package/class the method depends on
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    jsc.hadoopConfiguration().setLong("dfs.blocksize", 2);
    // jsc.hadoopConfiguration().setLong("fs.local.block.size", 2);

    JavaRDD<String> hadoopRdd = jsc.textFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/test1", 2);
    System.out.println(hadoopRdd.getNumPartitions());
    // hadoopRdd.saveAsTextFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/testout");
}