This article collects typical usage examples of the Java class org.apache.spark.api.java.JavaPairRDD. If you have been wondering what JavaPairRDD is for, how to use it, or what real-world usage looks like, the curated examples below may help.
The JavaPairRDD class belongs to the org.apache.spark.api.java package. Fifteen code examples are shown below, sorted by popularity by default.
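Before the collected examples, here is a minimal, self-contained sketch of the basic JavaPairRDD workflow (the word list, app name and local master are illustrative assumptions): build key/value pairs with mapToPair, aggregate them with reduceByKey, and collect the result.
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class JavaPairRDDBasics {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("JavaPairRDD basics");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // pair each word with 1, then sum the counts per word
        JavaPairRDD<String, Integer> counts = sc
                .parallelize(Arrays.asList("a", "b", "a", "c", "b", "a"))
                .mapToPair(w -> new Tuple2<>(w, 1))
                .reduceByKey(Integer::sum);
        for (Tuple2<String, Integer> t : counts.collect()) {
            System.out.println(t._1() + " -> " + t._2());
        }
        sc.close();
    }
}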
Example 1: interleaveSplitFastq
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
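A possible way to call interleaveSplitFastq (a sketch only; the file paths, output directory, split length and local master are assumptions, not part of the original example):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

Configuration hadoopConf = new Configuration();
FileSystem fs = FileSystem.get(hadoopConf);
FileStatus reads1 = fs.getFileStatus(new Path("reads_1.fq")); // first read file (assumed path)
FileStatus reads2 = fs.getFileStatus(new Path("reads_2.fq")); // second read file (assumed path)
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setMaster("local[*]").setAppName("InterleaveFastq"));
interleaveSplitFastq(reads1, reads2, "splits", 100000, sc); // 100000 lines per split (assumed)
sc.stop();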
Example 2: rmse
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* Computes root mean squared error of {@link Rating#rating()} versus predicted value.
*/
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
@SuppressWarnings("unchecked")
RDD<Tuple2<Object,Object>> testUserProducts =
(RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
double mse = predictions.mapToPair(
rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
double diff = valuePrediction._1() - valuePrediction._2();
return diff * diff;
}).mean();
return Math.sqrt(mse);
}
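A hedged usage sketch for rmse (the ratings file format, the 90/10 split and the ALS hyperparameters are assumptions; ALS, MatrixFactorizationModel and Rating come from org.apache.spark.mllib.recommendation):
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;

JavaRDD<Rating> ratings = sc.textFile("ratings.csv")   // lines like "user,product,rating" (assumed)
    .map(line -> {
        String[] f = line.split(",");
        return new Rating(Integer.parseInt(f[0]), Integer.parseInt(f[1]), Double.parseDouble(f[2]));
    });
JavaRDD<Rating>[] split = ratings.randomSplit(new double[]{0.9, 0.1});
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(split[0]), 10, 10, 0.01); // rank, iterations, lambda (assumed)
System.out.println("RMSE = " + rmse(model, split[1]));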
Example 3: fetchClusterMetrics
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* @param evalData points to cluster for evaluation
* @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances to centroid,
* and sum of squared distances
*/
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
return evalData.mapToPair(vector -> {
double closestDist = Double.POSITIVE_INFINITY;
int minClusterID = Integer.MIN_VALUE;
double[] vec = vector.toArray();
for (ClusterInfo cluster : clusters.values()) {
double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
if (distance < closestDist) {
closestDist = distance;
minClusterID = cluster.getID();
}
}
Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
}).reduceByKey(ClusterMetric::add);
}
Example 4: wordCountJava8
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
public static void wordCountJava8( String filename )
{
// Define a configuration to use to interact with Spark
SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
// Create a Java version of the Spark Context from the configuration
JavaSparkContext sc = new JavaSparkContext(conf);
// Load the input data, which is a text file read from the command line
JavaRDD<String> input = sc.textFile( filename );
// Java 8 with lambdas: split the input string into words
// TODO here a change has happened
JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );
// Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );
// Save the word count back out to a text file, causing evaluation.
counts.saveAsTextFile( "output" );
}
Example 5: main
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
public static void main(String[] args) throws Exception {
System.out.println(System.getProperty("hadoop.home.dir"));
String inputPath = args[0];
String outputPath = args[1];
FileUtils.deleteQuietly(new File(outputPath));
JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");
JavaRDD<String> rdd = sc.textFile(inputPath);
JavaPairRDD<String, Integer> counts = rdd
.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
.mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
.reduceByKey((x, y) -> x + y);
counts.saveAsTextFile(outputPath);
sc.close();
}
Example 6: getUniqueKmers
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
private static JavaRDD<String> getUniqueKmers(JavaPairRDD<Text, SequencedFragment> fastqRDD, int k) {
JavaRDD<String> rdd = fastqRDD.mapPartitions(records -> {
HashSet<String> umer_set = new HashSet<String>();
while (records.hasNext()) {
Tuple2<Text, SequencedFragment> fastq = records.next();
String seq = fastq._2.getSequence().toString();
//HashSet<String> umer_in_seq = new HashSet<String>();
for (int i = 0; i <= seq.length() - k; i++) {
String kmer = seq.substring(i, i + k);
umer_set.add(kmer);
}
}
return umer_set.iterator();
});
JavaRDD<String> umersRDD = rdd.distinct();
//umersRDD.sortBy(s -> s, true, 4);
return umersRDD;
}
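A usage sketch for getUniqueKmers (the input path, the value of k and the output path are assumptions; the fastq RDD is loaded the same way as in Example 7 below):
JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile("reads.fq",
        FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());
JavaRDD<String> kmers = getUniqueKmers(fastqRDD, 31); // k = 31 is an arbitrary choice
kmers.saveAsTextFile("unique_kmers");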
Example 7: main
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
public static void main(String[] args) throws IOException {
if (args.length < 3) {
System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
System.exit(1);
}
SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
//conf.set("spark.default.parallelism", String.valueOf(args[2]));
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());
JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));
repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
sc.stop();
}
Example 8: mapSAMRecordsToFastq
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
JavaPairRDD<Text, SequencedFragment> fastqRDD = bamRDD.mapToPair(read -> {
String name = read.getReadName();
if(read.getReadPairedFlag()){
if(read.getFirstOfPairFlag())
name = name+"/1";
if(read.getSecondOfPairFlag())
name = name+"/2";
}
//TODO: check values
Text t = new Text(name);
SequencedFragment sf = new SequencedFragment();
sf.setSequence(new Text(read.getReadString()));
sf.setQuality(new Text(read.getBaseQualityString()));
return new Tuple2<Text, SequencedFragment>(t, sf);
});
return fastqRDD;
}
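A sketch of how the BAM input could be loaded and the result written back as FASTQ (the Hadoop-BAM classes AnySAMInputFormat and SAMRecordWritable, as well as the paths, are assumptions and not part of the original example):
import org.apache.hadoop.io.LongWritable;
import org.seqdoop.hadoop_bam.AnySAMInputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

JavaRDD<SAMRecord> bamRDD = sc
    .newAPIHadoopFile("input.bam", AnySAMInputFormat.class, LongWritable.class,
        SAMRecordWritable.class, sc.hadoopConfiguration())
    .map(record -> record._2().get()); // unwrap the writable into a SAMRecord
JavaPairRDD<Text, SequencedFragment> fastqRDD = mapSAMRecordsToFastq(bamRDD);
fastqRDD.saveAsNewAPIHadoopFile("output_fastq", Text.class, SequencedFragment.class,
    FastqOutputFormat.class, sc.hadoopConfiguration());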
Example 9: partition
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* Partition instances by the specified partitioning (e.g. by instance type)
*
* @param instances RDD of instances to partition
* @return partitioned RDD if requested, original RDD if no partitioning is specified
*/
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
if (!config.isRepartitionByType()) {
return instances;
}
log.info("Getting counts by type hash");
Map<Integer, Long> typeCounts = getApproximateTypeHashCounts(instances);
int numPartitions = instances.getNumPartitions();
long totalInstances = instances.count();
long instancesPerPartition = totalInstances / numPartitions + 1;
JavaPairRDD<Integer, Instance> instanceWithPartitions = instances.mapToPair(instance -> {
int typeHash = getTypeHash(instance);
int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition);
return new Tuple2<>(typeHash + splitIncrement, instance);
});
log.info("Partitioning instances by type");
return instanceWithPartitions
.partitionBy(new HashPartitioner(numPartitions))
.values();
}
Example 10: getAllWordsInDoc
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* getAllWordsInDoc: Extract all unique terms from all docs.
*
* @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from
* that doc.
* @return unique term list
*/
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public Iterator<String> call(List<String> list) {
return list.iterator();
}
}).distinct();
return wordRDD;
}
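With Java 8 lambdas the same transformation can be written more compactly (an equivalent sketch, not the original author's code):
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
    // flatten every document's term list into one RDD of terms, then deduplicate
    return docwordRDD.values().flatMap(List::iterator).distinct();
}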
Example 11: execute
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* Generate a CSV file which is a term-metadata matrix generated from the original
* metadata.
*
* @see DiscoveryStepAbstract#execute()
*/
@Override
public Object execute() {
LOG.info("Metadata matrix started");
startTime = System.currentTimeMillis();
String metadataMatrixFile = props.getProperty("metadataMatrix");
try {
MetadataExtractor extractor = new MetadataExtractor();
JavaPairRDD<String, List<String>> metadataTermsRDD = extractor.loadMetadata(this.es, this.spark.sc, props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE));
LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(metadataTermsRDD);
MatrixUtil.exportToCSV(wordDocMatrix.rowMatrix, wordDocMatrix.rowkeys, wordDocMatrix.colkeys, metadataMatrixFile);
} catch (Exception e) {
LOG.error("Error during Metadata matrix generaion: {}", e);
}
endTime = System.currentTimeMillis();
LOG.info("Metadata matrix finished time elapsed: {}s", (endTime - startTime) / 1000);
return null;
}
Example 12: readAndConvertFeatureRDD
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
JavaPairRDD<String,float[]> javaRDD,
Broadcast<Map<String,Integer>> bIdToIndex) {
RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
).mapValues(f -> {
double[] d = new double[f.length];
for (int i = 0; i < d.length; i++) {
d[i] = f[i];
}
return d;
}
).rdd();
// This mimics the persistence level established by the ALS training methods
scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());
@SuppressWarnings("unchecked")
RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
return objKeyRDD;
}
Example 13: getSVDMatrix
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* getSVDMatrix: Create an SVD matrix CSV file from the original CSV file.
*
* @param csvFileName each row is a term, and each column is a document.
* @param svdDimention Dimension of SVD matrix
* @param svdMatrixFileName CSV file name of SVD matrix
*/
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {
JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);
JavaRDD<Vector> vectorRDD = importRDD.values();
RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd());
RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix);
RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention);
List<String> rowKeys = importRDD.keys().collect();
List<String> colKeys = new ArrayList<>();
for (int i = 0; i < svdDimention; i++) {
colKeys.add("dimension" + i);
}
MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName);
}
Example 14: getLabelBlocks
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
* Return an RDD whose keys are label values and whose values are the ids of entities (from a single collection) that have this label
* @param inputTriples
* @param labelAtts
* @param entityIds
* @param SEPARATOR
* @param positiveIds
* @return
*/
private JavaPairRDD<String,Integer> getLabelBlocks(JavaRDD<String> inputTriples, Set<String> labelAtts, JavaRDD<String> entityIds, String SEPARATOR, boolean positiveIds) {
Object2IntOpenHashMap<String> urls1 = Utils.readEntityIdsMapping(entityIds, positiveIds);
return inputTriples.mapToPair(line -> {
String[] spo = line.toLowerCase().replaceAll(" \\.$", "").split(SEPARATOR); //lose the ending " ." from valid .nt files
if (spo.length < 3) {
return null;
}
if (labelAtts.contains(spo[1])) {
String labelValue = line.substring(line.indexOf(spo[1])+spo[1].length()+SEPARATOR.length())
.toLowerCase().replaceAll("[^a-z0-9 ]", "").trim();
int subjectId = urls1.getInt(Utils.encodeURIinUTF8(spo[0])); //replace subject url with entity id
if (!positiveIds) {
subjectId = -subjectId;
}
return new Tuple2<String,Integer>(labelValue,subjectId);
} else {
return null;
}
})
.filter(x-> x!= null)
.distinct();
}
Example 15: run
import org.apache.spark.api.java.JavaPairRDD; // import the required package/class
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
//JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}