本文整理汇总了Java中org.apache.spark.api.java.JavaRDD.mapToPair方法的典型用法代码示例。如果您正苦于以下问题：Java JavaRDD.mapToPair方法的具体用法？Java JavaRDD.mapToPair怎么用？Java JavaRDD.mapToPair使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.spark.api.java.JavaRDD的用法示例。
在下文中一共展示了JavaRDD.mapToPair方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Counts the occurrences of every token that {@code LineIterator} produces from the lines of
 * a text file and saves the counts as text.
 *
 * <p>Exactly two arguments are required; otherwise a usage message is printed and the JVM
 * exits with status 1. The output path is the second argument followed by {@code _} and the
 * start time in milliseconds.
 *
 * @param args {@code <sourceFile> <targetFile>}
 */
public static void main(String[] args) {
  if (args.length != 2) {
    System.err.println("Usage:");
    System.err.println(" SparkWordCount <sourceFile> <targetFile>");
    System.exit(1);
  }

  SparkConf conf = new SparkConf()
      .setAppName("Word Count");

  // JavaSparkContext is Closeable: try-with-resources stops the context on every exit path,
  // including when the job throws, instead of leaving it open until the JVM dies.
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);

    // Pair every token with 1 and sum the ones per token.
    JavaPairRDD<String, Long> pairs = words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts = pairs.reduceByKey(Long::sum);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    // The start timestamp is appended to the target path given on the command line.
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
  }
}
示例2: rmse
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 *
 * <p>The model is asked to predict a rating for every (user, product) pair in the test data.
 * Predictions and actual ratings are then joined on that pair, and the squared differences
 * are averaged.
 *
 * @param mfModel model used to produce the predictions
 * @param testData ratings holding the actual values to compare against
 * @return square root of the mean squared difference between predicted and actual ratings
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Actual ratings keyed by (user, product).
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct = testData.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // predict() wants an RDD of (Object, Object) tuples, hence the double cast of the keys.
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();

  // Predicted ratings keyed the same way as the actual ones.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> predictedByUserProduct =
      testData.wrapRDD(mfModel.predict(userProducts)).mapToPair(
          r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  double meanSquaredError = predictedByUserProduct
      .join(actualByUserProduct)
      .values()
      .mapToDouble(predictedAndActual -> {
        double error = predictedAndActual._1() - predictedAndActual._2();
        return error * error;
      })
      .mean();

  return Math.sqrt(meanSquaredError);
}
示例3: mapSAMRecordsToFastq
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Converts SAM records into (name, fragment) pairs.
 *
 * <p>The key is the read name. For reads flagged as paired, {@code /1} is appended when the
 * first-of-pair flag is set and {@code /2} when the second-of-pair flag is set. The value
 * carries the read's base string and base quality string.
 *
 * @param bamRDD reads to convert
 * @return one (name, fragment) pair per input read
 */
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
  return bamRDD.mapToPair(read -> {
    String fragmentName = read.getReadName();
    // The pair flags are consulted only for reads that are flagged as paired.
    if (read.getReadPairedFlag()) {
      if (read.getFirstOfPairFlag()) {
        fragmentName += "/1";
      }
      if (read.getSecondOfPairFlag()) {
        fragmentName += "/2";
      }
    }

    // TODO: check values
    Text key = new Text(fragmentName);
    SequencedFragment fragment = new SequencedFragment();
    fragment.setSequence(new Text(read.getReadString()));
    fragment.setQuality(new Text(read.getBaseQualityString()));
    return new Tuple2<>(key, fragment);
  });
}
示例4: mapSAMRecordsToFastq
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Maps every SAM record to a pair of read name and {@link SequencedFragment}.
 *
 * @param bamRDD reads to convert
 * @return one (name, fragment) pair per input read
 */
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
  JavaPairRDD<Text, SequencedFragment> namedFragments =
      bamRDD.mapToPair(read -> samRecordToFastqEntry(read));
  return namedFragments;
}

/**
 * Builds the (name, fragment) pair for a single read. Paired reads get a {@code /1} suffix
 * when flagged first of pair and a {@code /2} suffix when flagged second of pair.
 */
private static Tuple2<Text, SequencedFragment> samRecordToFastqEntry(SAMRecord read) {
  String readName = read.getReadName();
  if (read.getReadPairedFlag()) {
    if (read.getFirstOfPairFlag()) {
      readName = readName + "/1";
    }
    if (read.getSecondOfPairFlag()) {
      readName = readName + "/2";
    }
  }

  // TODO: check values
  Text nameText = new Text(readName);
  SequencedFragment sequenced = new SequencedFragment();
  sequenced.setSequence(new Text(read.getReadString()));
  sequenced.setQuality(new Text(read.getBaseQualityString()));
  return new Tuple2<Text, SequencedFragment>(nameText, sequenced);
}
示例5: readsToWritable
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Pairs every read with a {@link SAMRecordWritable} wrapping it, after attaching the broadcast
 * header to the read.
 *
 * <p>Before the header is attached, it is given a sequence dictionary if it has none, and the
 * read's reference name is added to that dictionary if it is not yet present.
 *
 * @param records reads to wrap
 * @param header broadcast SAM file header to set on every read
 * @return pairs of (read, writable wrapping that read)
 */
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
  return records.mapToPair(read -> {
    SAMFileHeader samHeader = header.getValue();

    // The sequence dictionary has to be set up here for the alignment because it is not
    // supplied through a header file (set in the alignment-to-SAM map phase).
    if (samHeader.getSequenceDictionary() == null) {
      samHeader.setSequenceDictionary(new SAMSequenceDictionary());
    }
    String referenceName = read.getReferenceName();
    if (samHeader.getSequenceDictionary().getSequence(referenceName) == null) {
      samHeader.getSequenceDictionary().addSequence(new SAMSequenceRecord(referenceName));
    }

    read.setHeaderStrict(samHeader);

    final SAMRecordWritable writable = new SAMRecordWritable();
    writable.set(read);
    return new Tuple2<>(read, writable);
  });
}
示例6: partition
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Partition instances by the specified partitioning (e.g. by instance type).
 *
 * <p>When repartitioning by type is enabled, every instance is keyed by its type hash plus a
 * split increment derived from the instance id, the approximate count for that type hash and
 * the target number of instances per partition. The keyed instances are hash-partitioned into
 * the same number of partitions the input had.
 *
 * @param instances RDD of instances to partition
 * @return partitioned RDD if requested, original RDD if no partitioning is specified
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
  if (!config.isRepartitionByType()) {
    return instances;
  }

  log.info("Getting counts by type hash");
  Map<Integer, Long> approxCountsByTypeHash = getApproximateTypeHashCounts(instances);

  int partitionCount = instances.getNumPartitions();
  long instanceTotal = instances.count();
  long targetPartitionSize = instanceTotal / partitionCount + 1;

  JavaPairRDD<Integer, Instance> keyedInstances = instances.mapToPair(instance -> {
    int typeHash = getTypeHash(instance);
    int splitIncrement = getSplitIncrement(
        instance.getId(), approxCountsByTypeHash.get(typeHash), targetPartitionSize);
    return new Tuple2<>(typeHash + splitIncrement, instance);
  });

  log.info("Partitioning instances by type");
  HashPartitioner partitioner = new HashPartitioner(partitionCount);
  return keyedInstances.partitionBy(partitioner).values();
}
示例7: readsToWritableNoRef
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Pairs every read with a {@link SAMRecordWritable} wrapping it, after re-setting the read's
 * own header on it.
 *
 * @param records reads to wrap
 * @return pairs of (read, writable wrapping that read)
 */
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoRef(JavaRDD<SAMRecord> records) {
  return records.mapToPair(record -> {
    // NOTE(review): re-applies the header the record already carries, using the non-strict
    // setter; presumably this refreshes header-dependent state. Confirm before removing.
    record.setHeader(record.getHeader());

    final SAMRecordWritable writable = new SAMRecordWritable();
    writable.set(record);
    return new Tuple2<>(record, writable);
  });
}
示例8: readsToWritableNoHeader
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Pairs every read with a {@link SAMRecordWritable} wrapping it, leaving the read's header
 * untouched.
 *
 * @param records reads to wrap
 * @return pairs of (read, writable wrapping that read)
 */
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoHeader(JavaRDD<SAMRecord> records) {
  return records.mapToPair(record -> {
    final SAMRecordWritable writable = new SAMRecordWritable();
    writable.set(record);
    return new Tuple2<>(record, writable);
  });
}
示例9: parallizeData
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Distributes the given key/value tuples as a pair RDD.
 *
 * @param spark driver whose {@code sc} context is used to parallelize the data
 * @param datasetContent tuples to distribute
 * @return pair RDD holding the given tuples unchanged
 */
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {
  JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);
  // Identity mapping expressed as a lambda: unlike an anonymous inner class declared in an
  // instance method, it holds no reference to the enclosing object, so Spark does not have to
  // serialize that object together with the task.
  return datasetContentRDD.mapToPair(term -> term);
}
示例10: getGroundTruthIds
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Return the ground truth in an RDD format, each entity represented with an integer entity id.
 *
 * <p>Entity ids are looked up in mappings built from the two raw triples inputs. Each result
 * pair holds the negated id of the second field of a ground truth line (looked up in the
 * mapping of {@code rawTriples2}) followed by the id of the first field (looked up in the
 * mapping of {@code rawTriples1}). A field with no mapping yields {@code -1} on either side.
 *
 * @param rawTriples1 raw triples from which the entity id mapping of the first field is built
 * @param rawTriples2 raw triples from which the entity id mapping of the second field is built
 * @param RAW_TRIPLES_SEPARATOR separator handed to {@code getEntityIdsMapping} for both inputs
 * @param gt ground truth lines, each holding two fields separated by {@code GT_SEPARATOR}
 * @param GT_SEPARATOR regular expression used to split every ground truth line
 * @return pairs of (negated id from the second mapping, id from the first mapping)
 * @deprecated use {@link #getGroundTruthIdsFromEntityIds(JavaRDD, JavaRDD,JavaRDD, String)} instead
 */
@Deprecated
public static JavaPairRDD<Integer,Integer> getGroundTruthIds (JavaRDD<String> rawTriples1, JavaRDD<String> rawTriples2, String RAW_TRIPLES_SEPARATOR, JavaRDD<String> gt, String GT_SEPARATOR) {
  Object2IntOpenHashMap<String> entityIds1 = getEntityIdsMapping(rawTriples1, RAW_TRIPLES_SEPARATOR);
  Object2IntOpenHashMap<String> entityIds2 = getEntityIdsMapping(rawTriples2, RAW_TRIPLES_SEPARATOR);

  return gt.mapToPair(line -> {
    String[] parts = line.split(GT_SEPARATOR);
    // Negative id first: the default of 1 becomes -1 once negated.
    int negatedId2 = -entityIds2.getOrDefault(parts[1], 1);
    // Positive id second: -1 when the entity is unknown.
    int id1 = entityIds1.getOrDefault(parts[0], -1);
    return new Tuple2<>(negatedId2, id1);
  });
}
示例11: readGroundTruthIds
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Return the ground truth in an RDD format, each entity represented with an integer entity id.
 *
 * <p>Every line is split into two integer ids. The result pair holds the second id mapped to
 * {@code -id - 1}, followed by the first id unchanged.
 *
 * @param gt a ground truth file containing matching entities' ids, separated by GT_SEPARATOR
 * @param GT_SEPARATOR regular expression used to split every line
 * @return pairs of (second id mapped to {@code -id - 1}, first id)
 */
public static JavaPairRDD<Integer,Integer> readGroundTruthIds (JavaRDD<String> gt, String GT_SEPARATOR) {
  return gt.mapToPair(groundTruthLine -> {
    String[] fields = groundTruthLine.split(GT_SEPARATOR);
    int firstId = Integer.parseInt(fields[0]);
    int secondId = Integer.parseInt(fields[1]);
    int shiftedNegativeId = -secondId - 1;
    return new Tuple2<>(shiftedNegativeId, firstId);
  });
}
示例12: parsedToRatingRDD
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Turns parsed input into {@link Rating}s, optionally decayed and thresholded, sorted by
 * timestamp.
 *
 * <p>Token 0 is resolved through the user index map, token 1 through the item index map,
 * token 2 is the rating value (empty meaning 'delete', carried as NaN) and token 3 is the
 * timestamp. When {@code decayFactor} is below 1.0 every rating goes through
 * {@code decayRating}; when {@code decayZeroThreshold} is positive only ratings strictly above
 * it are kept.
 *
 * @param parsedRDD parsed input as {@code String[]}
 * @param bUserIDToIndex broadcast map from user ID to the index used in the {@link Rating}
 * @param bItemIDToIndex broadcast map from item ID to the index used in the {@link Rating}
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> ratingsByTime = parsedRDD.mapToPair(tokens -> {
    try {
      Long timestamp = Long.valueOf(tokens[3]);
      int userIndex = bUserIDToIndex.value().get(tokens[0]);
      int itemIndex = bItemIDToIndex.value().get(tokens[1]);
      // Empty value means 'delete'; propagate as NaN
      double value = tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2]);
      return new Tuple2<>(timestamp, new Rating(userIndex, itemIndex, value));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      // Log the offending record, then let the failure propagate.
      log.warn("Bad input: {}", Arrays.toString(tokens));
      throw e;
    }
  });

  if (decayFactor < 1.0) {
    // Field value and clock reading are copied into locals, so the lambda below works on
    // fixed values; presumably also to avoid reading the field from inside the closure.
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    ratingsByTime = ratingsByTime.mapToPair(entry -> {
      long timestamp = entry._1();
      Rating decayed = decayRating(entry._2(), timestamp, now, factor);
      return new Tuple2<>(timestamp, decayed);
    });
  }

  if (decayZeroThreshold > 0.0) {
    double theThreshold = decayZeroThreshold;
    ratingsByTime = ratingsByTime.filter(entry -> entry._2().rating() > theThreshold);
  }

  return ratingsByTime.sortByKey().values();
}
示例13: aggregateScores
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Combines {@link Rating}s with the same user/item into one.
 *
 * <p>For implicit data the scores of a user/item pair are combined with
 * {@code MLFunctions.SUM_WITH_NAN}; otherwise the last score wins. Pairs whose combined score
 * is NaN are dropped. When {@code logStrength} is set, the resulting score is replaced by
 * {@code log1p(score / epsilon)}.
 *
 * @param original ratings to aggregate
 * @param epsilon divisor applied to the score before {@code log1p}; only used with
 *  {@code logStrength}
 * @return one {@link Rating} per remaining user/item pair
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> scoreByUserItem = original.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (!implicit) {
    // For non-implicit, last wins.
    combined = scoreByUserItem.foldByKey(Double.NaN, (current, next) -> next);
  } else {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = scoreByUserItem.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> withoutNaN =
      combined.filter(entry -> !Double.isNaN(entry._2()));

  if (!logStrength) {
    return withoutNaN.map(entry -> {
      Tuple2<Integer,Integer> userItem = entry._1();
      return new Rating(userItem._1(), userItem._2(), entry._2());
    });
  }
  return withoutNaN.map(entry -> {
    Tuple2<Integer,Integer> userItem = entry._1();
    return new Rating(userItem._1(), userItem._2(), Math.log1p(entry._2() / epsilon));
  });
}
示例14: readFeaturesRDD
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Loads feature vectors from text files of JSON lines.
 *
 * <p>Every line is read as a JSON list: element 0 becomes the key (via {@code toString()}),
 * element 1 is converted to a {@code float[]}.
 *
 * @param sparkContext context used to read the text files
 * @param path location of the feature files
 * @return pairs of (key, feature vector), one per input line
 */
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  return sparkContext.textFile(path.toString()).mapToPair(json -> {
    List<?> idAndVector = TextUtils.readJSON(json, List.class);
    String id = idAndVector.get(0).toString();
    float[] features = TextUtils.convertViaJSON(idAndVector.get(1), float[].class);
    return new Tuple2<>(id, features);
  });
}
示例15: knownsRDD
import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Builds, per user or per item, the set of IDs currently known for it.
 *
 * <p>Input is sorted by the numeric value of element 3. Element 0 is the user, element 1 the
 * item, and an empty element 2 marks a delete. Within each group, a non-delete entry adds its
 * ID to the set and a delete entry removes it.
 *
 * @param allData parsed input records
 * @param knownItems if true, result is keyed by user and holds item IDs; otherwise it is keyed
 *  by item and holds user IDs
 * @return pairs of (key, IDs remaining after applying adds and deletes)
 */
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  int partitionCount = allData.partitions().size();
  JavaRDD<String[]> byTimestamp =
      allData.sortBy(datum -> Long.valueOf(datum[3]), true, partitionCount);

  JavaPairRDD<String,Tuple2<String,Boolean>> keyed = byTimestamp.mapToPair(datum -> {
    String user = datum[0];
    String item = datum[1];
    Boolean delete = datum[2].isEmpty();
    String key = knownItems ? user : item;
    String id = knownItems ? item : user;
    return new Tuple2<>(key, new Tuple2<>(id, delete));
  });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return keyed.groupByKey().mapValues(idDeletes -> {
    Collection<String> known = new HashSet<>();
    for (Tuple2<String,Boolean> idDelete : idDeletes) {
      String id = idDelete._1();
      if (!idDelete._2()) {
        known.add(id);
      } else {
        known.remove(id);
      }
    }
    return known;
  });
}