This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaSparkContext.newAPIHadoopFile. If you are wondering what JavaSparkContext.newAPIHadoopFile does, how to call it, or want to see it used in real code, the curated samples below may help. You can also browse further usage examples of its enclosing class, org.apache.spark.api.java.JavaSparkContext.
The following shows 6 code examples of JavaSparkContext.newAPIHadoopFile, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
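Before the project examples, here is a minimal sketch of the general call pattern (not taken from any of the projects below): newAPIHadoopFile takes the input path, a new-API (org.apache.hadoop.mapreduce) InputFormat class, the key class, the value class, and a Hadoop Configuration, and returns a JavaPairRDD typed accordingly. The sketch assumes a plain-text input read with Hadoop's built-in TextInputFormat; the path is a placeholder.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class NewAPIHadoopFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("NewAPIHadoopFileSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // newAPIHadoopFile(path, InputFormat class, key class, value class, Hadoop Configuration).
        // With TextInputFormat the key is the byte offset of each line and the value is the line itself.
        JavaPairRDD<LongWritable, Text> lines = sc.newAPIHadoopFile(
                "hdfs:///data/input.txt",   // placeholder path
                TextInputFormat.class,
                LongWritable.class,
                Text.class,
                sc.hadoopConfiguration());

        System.out.println("number of lines: " + lines.count());
        sc.stop();
    }
}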
Example 1: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
    //conf.set("spark.default.parallelism", String.valueOf(args[2]));
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read FASTQ records as (Text, SequencedFragment) pairs via the new Hadoop API
    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    // Shuffle into the requested number of partitions and write the result back out
    JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));
    repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 2: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setAppName("SamToFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    String in = args[0];
    String out = args[1];

    // Read BAM/SAM records as (LongWritable, SAMRecordWritable) pairs via the new Hadoop API
    JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());

    // Map to SAMRecord RDD
    JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());

    // mapSAMRecordsToFastq is a helper defined elsewhere in the class
    JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);
    fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 3: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("MergeFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Read FASTQ records as (Text, SequencedFragment) pairs via the new Hadoop API
    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    // Merge into fewer partitions (coalesce avoids a full shuffle) and write the result back out
    JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));
    coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 4: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new HiveContext(sc.sc());

    Options options = new Options();
    Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
    Option queryOpt = new Option("query", true, "SQL query string.");
    Option baminOpt = new Option("in", true, "");
    options.addOption(opOpt);
    options.addOption(queryOpt);
    options.addOption(baminOpt);

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Parsing failed. Reason: " + exp.getMessage());
        System.exit(1);
    }

    String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
    String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
    String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

    sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

    // Read BAM/SAM from HDFS via the new Hadoop API
    JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
    // Map to SAMRecord RDD
    JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
    JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

    Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
    samDF.registerTempTable(tablename); // tablename is a constant defined elsewhere in the class

    if (query != null) {
        // Save as parquet file
        Dataset<Row> df2 = sqlContext.sql(query);
        df2.show(100, false);
        if (bwaOutDir != null)
            df2.write().parquet(bwaOutDir);
    } else {
        if (bwaOutDir != null)
            samDF.write().parquet(bwaOutDir);
    }

    sc.stop();
}
Example 5: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    // inputPath, outputPath, pattern and sampleFraction are class fields set by parseArguments()
    boolean argumentsValid = parseArguments(args);
    if (!argumentsValid) {
        printHelp();
        System.exit(1);
    }

    final Configuration hadoopConfig = new Configuration(true);
    final FileSystem hdfs = FileSystem.get(hadoopConfig);
    if (hdfs.exists(outputPath)) {
        System.out.printf("output path '%s' already exists in HDFS!%n", outputPath);
        System.exit(1);
    }

    System.out.printf("reading from: %s%n", inputPath);
    System.out.printf("writing to: %s%n", outputPath);
    System.out.printf("pattern: %s%n", pattern.pattern());
    System.out.printf("sample fraction: %f%n", sampleFraction);
    System.out.printf("...%n");
    System.out.printf("%n");

    SparkConf sparkConfig = new SparkConf().setAppName(String.format("Reading sample (fraction %f) from '%s'", sampleFraction, inputPath));
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig);

    LogfileInputFormat.setPattern(hadoopConfig, pattern);

    // Read each log record as ((file path, byte offset), record text) via the new Hadoop API
    JavaPairRDD<Tuple2<Path, Long>, Text> rdd = sparkContext.newAPIHadoopFile(
            inputPath.toString(),
            LogfileInputFormat.class,
            LogfileInputFormat.KEY_CLASS,
            Text.class,
            hadoopConfig);

    rdd.sample(false, sampleFraction)
            .map(tuple -> String.format("%s@%016d:%n%n%s%n%n", tuple._1._1.toString(), tuple._1._2, tuple._2.toString()))
            .repartition(1)
            .saveAsTextFile(outputPath.toString());

    sparkContext.close();
}
Example 6: main
import org.apache.spark.api.java.JavaSparkContext; // import for the package/class this method depends on

public static void main(String[] args) throws IOException {
    // hdfs, directory, logDir, logDirGz, FILE_EXT_LOG, FILE_EXT_GZ and the helper methods are defined elsewhere in the class
    final Configuration hadoopConfig = new Configuration(true);
    hdfs = FileSystem.get(hadoopConfig);

    if (!parseArguments(args)) {
        printHelp();
        System.exit(1);
    }

    if (hdfs.exists(directory)) {
        if (!hdfs.isDirectory(directory)) {
            System.out.printf("'%s' exists in HDFS, but is not a directory!%n", directory);
            System.exit(1);
        }
        FileStatus[] fileStatus = hdfs.listStatus(directory);
        if (fileStatus.length > 0) {
            System.out.printf("'%s' exists in HDFS, but is not empty!%n", directory);
            System.exit(1);
        }
    }

    createDirectories();

    System.out.printf("Creating test data in '%s'. This may take a while...%n", directory.toString());

    Map<String, LogfileType> logfileTypeByPath = new HashMap<>();
    LogfileSummary summary = writeLogFiles(logfileTypeByPath);

    SparkConf sparkConfig = new SparkConf().setAppName("Testing LogfileInputFormat.");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig);

    // Register a first-line pattern per log file, plus a default pattern
    logfileTypeByPath.forEach((path, type) -> {
        LogfileInputFormat.setPattern(hadoopConfig, path, type.getFirstlinePattern());
    });
    LogfileInputFormat.setPattern(hadoopConfig, LogfileType.A.getFirstlinePattern());

    JavaPairRDD<Tuple2<Path, Long>, Text> rdd;
    JavaRDD<Tuple2<LocalDateTime, LogLevel>> logRecords;

    // Read the uncompressed *.log files via the new Hadoop API
    rdd = sparkContext.newAPIHadoopFile(logDir + "/*" + FILE_EXT_LOG, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

    Function<Tuple2<Tuple2<Path, Long>, Text>, Tuple2<LocalDateTime, LogLevel>> mappingFunction = mappingFunction(logfileTypeByPath);

    logRecords = rdd.map(mappingFunction).cache();
    long totalCountLog = logRecords.count();
    long infoCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
    long warnCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
    long errorCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

    // Read the gzip-compressed *.log.gz files the same way
    rdd = sparkContext.newAPIHadoopFile(logDirGz + "/*" + FILE_EXT_GZ, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

    logRecords = rdd.map(mappingFunction).cache();
    long totalCountGz = logRecords.count();
    long infoCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
    long warnCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
    long errorCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

    long totalCountExpected = summary.getRecordCount();
    long infoCountExpected = summary.getRecordCount(LogLevel.INFO);
    long warnCountExpected = summary.getRecordCount(LogLevel.WARN);
    long errorCountExpected = summary.getRecordCount(LogLevel.ERROR);

    System.out.printf("%n%n%n%30s %15s %15s %15s %15s%n%n", "", "expected", "from *.log", "from *.log.gz", "test result");
    System.out.printf("%30s %15d %15d %15d %15s%n", "total # of log records",
            totalCountExpected, totalCountLog, totalCountGz,
            ((totalCountExpected == totalCountLog && totalCountLog == totalCountGz) ? "SUCCESS" : "FAILURE"));
    System.out.printf("%30s %15d %15d %15d %15s%n", "# of INFO level records",
            infoCountExpected, infoCountLog, infoCountGz,
            ((infoCountExpected == infoCountLog && infoCountLog == infoCountGz) ? "SUCCESS" : "FAILURE"));
    System.out.printf("%30s %15d %15d %15d %15s%n", "# of WARN level records",
            warnCountExpected, warnCountLog, warnCountGz,
            ((warnCountExpected == warnCountLog && warnCountLog == warnCountGz) ? "SUCCESS" : "FAILURE"));
    System.out.printf("%30s %15d %15d %15d %15s%n%n%n", "# of ERROR level records",
            errorCountExpected, errorCountLog, errorCountGz,
            ((errorCountExpected == errorCountLog && errorCountLog == errorCountGz) ? "SUCCESS" : "FAILURE"));

    sparkContext.close();
}