This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaRDD.foreach. If you are wondering exactly how to use JavaRDD.foreach, what it is for, or where to find working examples of it, the curated method examples below may help. You can also explore further usage examples of the enclosing class, org.apache.spark.api.java.JavaRDD.
The following presents 8 code examples of the JavaRDD.foreach method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: test
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";
    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");
    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<String> rdd = context.textFile(hdfsPath);
    rdd.foreach((str) -> System.out.println(str));
}
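A note on this example: foreach is an action, so the lambda runs on the executors and the println output goes to executor stdout; with setMaster("local[3]") that happens to be the same JVM as the driver, which is why the rows show up in the test console. A minimal hedged sketch of the driver-side alternative, reusing the rdd defined above (only safe when the RDD is small enough to fit in driver memory):

// Hypothetical variant, not part of the original test: collect() ships all rows
// to the driver first, and the printing then happens locally.
java.util.List<String> rows = rdd.collect();
rows.forEach(System.out::println);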
Example 2: assertRDDEquals
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public <T> void assertRDDEquals(String message, JavaRDD<T> expected, JavaRDD<T> result) {
    Option<Tuple3<T, Integer, Integer>> diff = JavaRDDComparisons.compareRDD(expected, result);
    if (diff.isDefined()) {
        log.error("EXPECTED");
        expected.foreach(row -> log.error(row.toString()));
        log.error("RESULT");
        result.foreach(row -> log.error(row.toString()));
        log.error("FIRST DIFF");
        Tuple3<T, Integer, Integer> diffTriple = diff.get();
        log.error(diffTriple.toString());
        if (diffTriple._2() == 0) {
            log.error("(row not expected but present in result {} times)", diffTriple._3());
        }
        if (diffTriple._3() == 0) {
            log.error("(row expected {} times but not present)", diffTriple._2());
        }
        throw new AssertionError(message);
    }
}
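A hedged usage sketch of the helper above; the JavaSparkContext field jsc and the surrounding JUnit test class are assumptions for illustration, not part of the original code:

@Test
public void identicalRddsCompareEqual() {
    JavaRDD<Integer> expected = jsc.parallelize(Arrays.asList(1, 2, 3));
    JavaRDD<Integer> result = jsc.parallelize(Arrays.asList(3, 2, 1));
    // compareRDD does not require a particular row order; a genuine mismatch is
    // logged row by row and then surfaces as an AssertionError with this message.
    assertRDDEquals("expected and result should contain the same rows", expected, result);
}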
Example 3: splitFastq
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    // TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
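One detail worth calling out: the lambda passed to foreach is serialized and shipped to the executors, so everything it captures (splitDir and ns here) has to be serializable. Hadoop's Configuration is not, which is presumably why a fresh new Configuration() is created inside the lambda rather than captured from the driver. A hedged counter-example of what would fail, assuming the same splitRDD:

// Anti-pattern sketch (not in the original): driverConf lives on the driver, and
// org.apache.hadoop.conf.Configuration is not Serializable, so Spark rejects this
// closure with a "Task not serializable" error.
Configuration driverConf = sc.hadoopConfiguration();
splitRDD.foreach(split -> System.out.println(driverConf.get("fs.defaultFS")));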
Example 4: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);
    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));
    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
            DataTypes.createStructField("county", DataTypes.StringType, true),
            DataTypes.createStructField("firstName", DataTypes.StringType, true),
            DataTypes.createStructField("sex", DataTypes.StringType, true),
            DataTypes.createStructField("year", DataTypes.StringType, true),
            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });

    /* StructType pep = new StructType(new StructField[] {
            new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
            new StructField("County", DataTypes.StringType, true, Metadata.empty()),
            new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
            new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();
    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
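One tweak a reader might consider: the map above builds a new ObjectMapper for every JSON line. A hedged alternative sketch using mapPartitions on Spark 2.x, so one mapper is reused per partition (same textFile and PersonDetails as above; requires java.util.ArrayList, java.util.List imports):

JavaRDD<PersonDetails> parsed = textFile.toJavaRDD().mapPartitions(lines -> {
    ObjectMapper mapper = new ObjectMapper();   // one mapper per partition instead of per record
    List<PersonDetails> out = new ArrayList<>();
    while (lines.hasNext()) {
        out.add(mapper.readValue(lines.next(), PersonDetails.class));
    }
    return out.iterator();
});
parsed.foreach(t -> System.out.println(t));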
Example 5: splitFastq
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 6: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    JavaRDD<Movie> moviesRDD = sparkSession
            .read().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv")
            .javaRDD().filter(str -> !(null == str))
            .filter(str -> !(str.length() == 0))
            .filter(str -> !str.contains("movieId"))
            .map(str -> Movie.parseRating(str));
    moviesRDD.foreach(m -> System.out.println(m));

    Dataset<Row> csv_read = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
    csv_read.printSchema();
    csv_read.show();

    StructType customSchema = new StructType(new StructField[] {
            new StructField("movieId", DataTypes.LongType, true, Metadata.empty()),
            new StructField("title", DataTypes.StringType, true, Metadata.empty()),
            new StructField("genres", DataTypes.StringType, true, Metadata.empty())
    });

    Dataset<Row> csv_custom_read = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .schema(customSchema)
            .load("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
    csv_custom_read.printSchema();
    csv_custom_read.show();

    csv_custom_read.write()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
            .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMovies.csv");
}
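On Spark 2.x (which the SparkSession above already implies), the external com.databricks.spark.csv format is built into the DataFrameReader as csv. A minimal hedged sketch of the same schema-driven read with the native reader, reusing sparkSession and customSchema from above:

Dataset<Row> csvNative = sparkSession.read()
        .option("header", "true")
        .schema(customSchema)
        .csv("C:/Users/sumit.kumar/git/learning/src/main/resources/movies.csv");
csvNative.printSchema();
csvNative.show();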
Example 7: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
 * @param args
 */
public static void main(String[] args) {
    // C:\Users\sumit.kumar\Downloads\bin\warehouse
    // System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads");
    String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sparkContext = new JavaSparkContext(conf);
    JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();
    JavaRDD<Integer> evenRDD = rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
        @Override
        public Boolean call(Integer v1) throws Exception {
            return ((v1 % 2) == 0) ? true : false;
        }
    });
    evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
    evenRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println("The value of RDD are :" + t);
        }
    });
    // unpersisting the RDDs
    evenRDD.unpersist();
    rdd.unpersist();

    /* JavaRDD<String> lines = spark.read().textFile(logFile).javaRDD().cache();
    System.out.println("DEBUG: \n" + lines.toDebugString());
    long word = lines.count();
    JavaRDD<String> distinctLines = lines.distinct();
    System.out.println("DEBUG: \n" + distinctLines.toDebugString());
    JavaRDD<String> finalRdd = lines.subtract(distinctLines);
    System.out.println("DEBUG: \n" + finalRdd.toDebugString());
    System.out.println("The count is " + word);
    System.out.println("The count is " + distinctLines.count());
    System.out.println("The count is " + finalRdd.count());
    finalRdd.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            // TODO Auto-generated method stub
            System.out.println(t);
        }
    });
    */
    /* SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    StorageLevel newLevel;
    JavaRDD<String> logData = sc.textFile(logFile).cache();
    long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) { return s.contains("a"); }
    }).count();
    long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) { return s.contains("b"); }
    }).count();
    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
    sc.stop(); */
}
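Since most other examples on this page use Java 8 lambdas, the same filter/persist/foreach sequence can be written more compactly; a hedged sketch assuming the same rdd as above:

JavaRDD<Integer> evenRDD = rdd.filter(v1 -> v1 % 2 == 0);                 // Function<Integer, Boolean> as a lambda
evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
evenRDD.foreach(t -> System.out.println("The value of RDD are :" + t));   // VoidFunction<Integer> as a lambda
evenRDD.unpersist();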
Example 8: splitFastq
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private static void splitFastq(FileStatus fst, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    // TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, new Configuration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/" + split.getPath().getName() + "_" + split.getStart() + ".fq");
    });
}