This page collects typical usage examples of the Java class org.apache.spark.api.java.JavaRDD. If you are wondering what exactly JavaRDD does, how to use it, or what JavaRDD code looks like in practice, the curated class code examples below may help.
The JavaRDD class belongs to the org.apache.spark.api.java package. Fifteen code examples of the JavaRDD class are shown below, sorted by popularity by default.
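Before the examples, here is a minimal, self-contained sketch of typical JavaRDD usage (creating an RDD from a local collection, applying lazy transformations, and triggering an action). The application name, master setting, and input values are illustrative only.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRDDBasics {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("javardd-basics").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Create an RDD from a local collection.
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
            // Transformations are lazy; nothing executes until an action is called.
            JavaRDD<Integer> squares = numbers.map(x -> x * x);
            List<Integer> even = squares.filter(x -> x % 2 == 0).collect(); // collect() is the action
            System.out.println(even); // prints [4, 16]
        }
    }
}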
Example 1: interleaveSplitFastq
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
        writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example 2: main
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public static void main(String[] args) throws InterruptedException {
    SparkConf sc = new SparkConf().setAppName("POC-Streaming");
    try (JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
        //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
        //textFileStream processes files line by line, so each XML document would have to fit on a single line; the alternative below uses wholeTextFiles instead
        JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
        Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
        rddQueue.add(files);
        JavaDStream<String> records = jsc.queueStream(rddQueue);
        records.foreachRDD(rdd -> System.out.printf("Amount of XMLs: %d\n", rdd.count()));
        jsc.start();
        jsc.awaitTermination();
    }
}
Example 3: setPartitionHeaders
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public static JavaRDD<SAMRecord> setPartitionHeaders(final JavaRDD<SAMRecord> reads, final Broadcast<SAMFileHeader> header) {
    return reads.mapPartitions(records -> {
        //header.getValue().setTextHeader(header.getValue().getTextHeader()+"\\n@SQ\\tSN:"+records..getReferenceName());
        //record.setHeader(header);
        BAMHeaderOutputFormat.setHeader(header.getValue());
        return records;
    });
}
Example 4: main
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));
    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");
    JavaRDD<String> rdd = sc.textFile(inputPath);
    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<>(x, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile(outputPath);
    sc.close();
}
Example 5: run
import org.apache.spark.api.java.JavaRDD; // import the required package/class
/**
 *
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates,
        JavaRDD<String> rawTriples1,
        JavaRDD<String> rawTriples2,
        String SEPARATOR,
        JavaRDD<String> entityIds1,
        JavaRDD<String> entityIds2,
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N,
        JavaSparkContext jsc) {
    Map<Integer, IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    Broadcast<Map<Integer, IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
    return topKneighborCandidates;
}
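A hedged sketch of how this run method might be invoked from a driver. The enclosing class name (NeighborsRank), the input paths, the space separator, and the threshold/K/N values are assumptions made for illustration; the fastutil types (Int2FloatLinkedOpenHashMap, IntArrayList) match the signature above.

// Hypothetical driver code; NeighborsRank stands in for the class that declares run(...) above.
JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("neighbor-candidates"));
JavaRDD<String> rawTriples1 = jsc.textFile("hdfs:///input/collection1.nt");
JavaRDD<String> rawTriples2 = jsc.textFile("hdfs:///input/collection2.nt");
JavaRDD<String> entityIds1 = jsc.textFile("hdfs:///input/entityIds1");
JavaRDD<String> entityIds2 = jsc.textFile("hdfs:///input/entityIds2");

// topKvalueCandidates would normally come from the preceding value-similarity step;
// a one-entry RDD stands in here so the call is self-contained.
Int2FloatLinkedOpenHashMap valueSims = new Int2FloatLinkedOpenHashMap();
valueSims.put(42, 0.9f);
JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKvalueCandidates =
        jsc.parallelizePairs(Arrays.asList(new Tuple2<>(7, valueSims)));

JavaPairRDD<Integer, IntArrayList> topKNeighborCandidates =
        new NeighborsRank().run(topKvalueCandidates, rawTriples1, rawTriples2, " ",
                entityIds1, entityIds2, 0.01f, 10, 3, jsc);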
Example 6: buildModel
import org.apache.spark.api.java.JavaRDD; // import the required package/class
/**
 * @param sparkContext active Spark Context
 * @param trainData training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int numClusters = (Integer) hyperParameters.get(0);
    Preconditions.checkArgument(numClusters > 1);
    log.info("Building KMeans Model with {} clusters", numClusters);
    JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
    KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
            numberOfRuns, initializationStrategy);
    return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
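A hedged sketch of calling buildModel directly, outside whatever framework normally drives it. The class name KMeansBuilder, the training path, and the single hyperparameter value are assumptions; candidatePath is assumed to be an org.apache.hadoop.fs.Path, matching the signature above.

// Hypothetical call site; KMeansBuilder stands in for the class that declares buildModel above.
JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("kmeans-build"));
JavaRDD<String> trainData = sparkContext.textFile("hdfs:///data/train.csv");
List<?> hyperParameters = Collections.singletonList(10); // numClusters; must be > 1
Path candidatePath = new Path("hdfs:///models/candidate");
PMML pmml = new KMeansBuilder().buildModel(sparkContext, trainData, hyperParameters, candidatePath);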
Example 7: findBestModel
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public TrainedModel findBestModel(JavaRDD<Rating> ratings) {
    double[] weights = {6, 2, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);
    JavaRDD<Rating> trainingRdd = randomRatings[0];
    JavaRDD<Rating> validationRdd = randomRatings[1];
    JavaRDD<Rating> testRdd = randomRatings[2];
    TrainConfig trainConfig = findBestTrainingParameters(trainingRdd, validationRdd);
    TrainedModel model = ModelFactory.create(trainingRdd, testRdd, trainConfig.getRankNr(),
            trainConfig.getIterationsNr());
    logger.info("best model has RMSE = " + model.getError());
    return model;
}
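A hedged sketch of feeding MovieLens-style ratings into findBestModel. The RecommendationEngine class name, the file path, and the :: field separator are assumptions; Rating is Spark MLlib's org.apache.spark.mllib.recommendation.Rating, and getError() comes from the TrainedModel type used above.

// Hypothetical call site; RecommendationEngine stands in for the class that declares findBestModel above.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("als-model-selection"));
JavaRDD<Rating> ratings = sc.textFile("hdfs:///data/ratings.dat")
        .map(line -> {
            String[] parts = line.split("::");
            return new Rating(Integer.parseInt(parts[0]),
                    Integer.parseInt(parts[1]),
                    Double.parseDouble(parts[2]));
        });
TrainedModel best = new RecommendationEngine().findBestModel(ratings);
System.out.println("Selected model RMSE: " + best.getError());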
Example 8: parseBlockCollection
import org.apache.spark.api.java.JavaRDD; // import the required package/class
public JavaPairRDD<Integer, IntArrayList> parseBlockCollection(JavaRDD<String> blockingInput) {
    System.out.println("Parsing the blocking collection...");
    return blockingInput
            .map(line -> line.split("\t")) //split to [blockId, [entityIds]]
            .filter(line -> line.length == 2) //only keep lines of this format
            .mapToPair(pair -> {
                int blockId = Integer.parseInt(pair[0]);
                String[] entities = pair[1].replaceFirst(";", "").split("#");
                if (entities == null || entities.length == 0) {
                    return null;
                }
                List<Integer> outputEntities = new ArrayList<>(); //possible (but not really probable) cause of OOM (memory errors) if huge blocks exist
                for (String entity : entities) {
                    if (entity.isEmpty()) continue; //in case the last entityId finishes with '#'
                    Integer entityId = Integer.parseInt(entity);
                    outputEntities.add(entityId);
                }
                return new Tuple2<>(blockId, new IntArrayList(outputEntities.stream().mapToInt(i -> i).toArray()));
            })
            .filter(x -> x != null);
}
Example 9: splitFastq
import org.apache.spark.api.java.JavaRDD; // import the required package/class
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 10: getUniqueKmers
import org.apache.spark.api.java.JavaRDD; // import the required package/class
private static JavaRDD<String> getUniqueKmers(JavaPairRDD<Text, SequencedFragment> fastqRDD, int k) {
    JavaRDD<String> rdd = fastqRDD.mapPartitions(records -> {
        HashSet<String> umer_set = new HashSet<String>();
        while (records.hasNext()) {
            Tuple2<Text, SequencedFragment> fastq = records.next();
            String seq = fastq._2.getSequence().toString();
            //HashSet<String> umer_in_seq = new HashSet<String>();
            for (int i = 0; i <= seq.length() - k; i++) { //i <= length - k, so the final k-mers of the read are not skipped
                String kmer = seq.substring(i, i + k);
                umer_set.add(kmer);
            }
        }
        return umer_set.iterator();
    });
    JavaRDD<String> umersRDD = rdd.distinct();
    //umersRDD.sortBy(s -> s, true, 4);
    return umersRDD;
}
Example 11: mapSAMRecordsToFastq
import org.apache.spark.api.java.JavaRDD; // import the required package/class
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
    JavaPairRDD<Text, SequencedFragment> fastqRDD = bamRDD.mapToPair(read -> {
        String name = read.getReadName();
        if (read.getReadPairedFlag()) {
            if (read.getFirstOfPairFlag())
                name = name + "/1";
            if (read.getSecondOfPairFlag())
                name = name + "/2";
        }
        //TODO: check values
        Text t = new Text(name);
        SequencedFragment sf = new SequencedFragment();
        sf.setSequence(new Text(read.getReadString()));
        sf.setQuality(new Text(read.getBaseQualityString()));
        return new Tuple2<Text, SequencedFragment>(t, sf);
    });
    return fastqRDD;
}
Example 12: testFilterWithoutIgnoreTypes
import org.apache.spark.api.java.JavaRDD; // import the required package/class
@Test
public void testFilterWithoutIgnoreTypes() {
    InstanceFilterConfig config = new InstanceFilterConfig()
            .setIgnoreOtherTypes(false)
            .setTypes(Arrays.asList("http://example.com/a/type", "http://example.com/b/type"));
    InstanceFilter filter = new InstanceFilter(config);
    JavaRDD<Instance> result = filter.filter(testRDD, typeIndex);
    List<Instance> rdd = new ArrayList<>();
    Instance one = new Instance();
    one.addType(typeIndex.getIndex("http://example.com/a/type"));
    one.setUri("http://example.com/one");
    rdd.add(one);
    Instance two = new Instance();
    two.addType(typeIndex.getIndex("http://example.com/b/type"));
    two.addType(typeIndex.getIndex("http://example.com/c/type"));
    two.setUri("http://example.com/two");
    rdd.add(two);
    JavaRDD<Instance> expected = jsc().parallelize(rdd);
    assertRDDEquals("Expected instances without restricted types were filtered out", expected, result);
}
Example 13: getEntityIdsMapping
import org.apache.spark.api.java.JavaRDD; // import the required package/class
/**
 * Maps an entity url to its entity id, that is also used by blocking.
 * @deprecated use {@link #readEntityIdsMapping(JavaRDD)} instead, to get the entity mappings used in blocking
 * @param rawTriples the rdf triples of an entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples file
 * @return a map from an entity url to its entity id, that is also used by blocking.
 */
public static Object2IntOpenHashMap<String> getEntityIdsMapping(JavaRDD<String> rawTriples, String SEPARATOR) {
    LinkedHashSet<String> subjectsSet =
            new LinkedHashSet<>(rawTriples
                    .map(line -> line.split(SEPARATOR)[0])
                    .collect()
            ); //convert list to set (to remove duplicates)
    Object2IntOpenHashMap<String> result = new Object2IntOpenHashMap<>(subjectsSet.size());
    result.defaultReturnValue(-1);
    int index = 0;
    for (String subject : subjectsSet) {
        result.put(subject, index++);
    }
    return result;
}
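Because getEntityIdsMapping is static, it can be called with nothing more than a triples RDD and the separator. The Utils class name, the input path, and the space separator below are assumptions; note that the javadoc above recommends readEntityIdsMapping for the mappings actually used in blocking.

// Hypothetical call site; Utils stands in for the class that declares getEntityIdsMapping above.
JavaRDD<String> rawTriples = jsc.textFile("hdfs:///input/collection1.nt");
Object2IntOpenHashMap<String> urlToId = Utils.getEntityIdsMapping(rawTriples, " ");
int id = urlToId.getInt("http://example.org/entity/1"); // returns -1 (the default return value) if the url is unknown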
Example 14: getLatestVersions
import org.apache.spark.api.java.JavaRDD; // import the required package/class
/**
 * Returns the latest versions of a given set of concept maps.
 *
 * @param urls a set of URLs to retrieve the latest version for, or null to load them all.
 * @param includeExperimental flag to include concept maps marked as experimental
 *
 * @return a map of concept map URLs to the latest version for them.
 */
public Map<String,String> getLatestVersions(final Set<String> urls,
        boolean includeExperimental) {
    // Reduce by the concept map URI to return only the latest version
    // per concept map. Spark's provided max aggregation function
    // only works on numeric types, so we jump into RDDs and perform
    // the reduce by hand.
    JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"),
            col("version"),
            col("experimental"))
            .toJavaRDD()
            .filter(row -> (urls == null || urls.contains(row.getString(0)))
                    && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
            .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
            .reduceByKey((leftVersion, rightVersion) ->
                    leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
            .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));
    return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER)
            .collectAsList()
            .stream()
            .collect(Collectors.toMap(UrlAndVersion::getUrl,
                    UrlAndVersion::getVersion));
}
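A hedged usage sketch, assuming conceptMaps is an already-constructed instance of the class that declares getLatestVersions; the concept map URL below is illustrative only.

// Hypothetical call site; conceptMaps is an instance of the class declaring getLatestVersions above.
Set<String> urls = new HashSet<>(Arrays.asList("http://hl7.org/fhir/ConceptMap/example"));
Map<String, String> latest = conceptMaps.getLatestVersions(urls, false); // false: exclude experimental maps
latest.forEach((url, version) -> System.out.println(url + " -> " + version));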
Example 15: evaluate
import org.apache.spark.api.java.JavaRDD; // import the required package/class
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
    // Intra-cluster distance is mean distance to centroid
    double maxIntraClusterDistance =
            fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();
    // Inter-cluster distance is distance between centroids
    double minInterClusterDistance = Double.POSITIVE_INFINITY;
    List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
    DistanceFn<double[]> distanceFn = getDistanceFn();
    for (int i = 0; i < clusters.size(); i++) {
        double[] centerI = clusters.get(i).getCenter();
        // Distances are symmetric, hence d(i,j) == d(j,i)
        for (int j = i + 1; j < clusters.size(); j++) {
            double[] centerJ = clusters.get(j).getCenter();
            minInterClusterDistance = Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
        }
    }
    return minInterClusterDistance / maxIntraClusterDistance;
}