本文整理汇总了Java中org.apache.spark.api.java.JavaPairRDD.filter方法的典型用法代码示例。如果您正苦于以下问题:Java JavaPairRDD.filter方法的具体用法?Java JavaPairRDD.filter怎么用?Java JavaPairRDD.filter使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.spark.api.java.JavaPairRDD
的用法示例。
在下文中一共展示了JavaPairRDD.filter方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parsedToRatingRDD
import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
* @param parsedRDD parsed input as {@code String[]}
* @return {@link Rating}s ordered by timestamp
*/
/**
 * Translates parsed input rows into {@link Rating}s keyed and ordered by timestamp.
 *
 * @param parsedRDD parsed input as {@code String[]} of [user, item, value, timestamp]
 * @param bUserIDToIndex broadcast map from user ID to model index
 * @param bItemIDToIndex broadcast map from item ID to model index
 * @return {@link Rating}s ordered by timestamp; an empty value field becomes
 *  {@code NaN}, which downstream treats as a delete
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
    Broadcast<Map<String,Integer>> bUserIDToIndex,
    Broadcast<Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> byTimestamp = parsedRDD.mapToPair(fields -> {
    try {
      // Parse timestamp first, then look up indices, then parse the value,
      // matching the original evaluation order of the tuple construction.
      long timestamp = Long.parseLong(fields[3]);
      Rating rating = new Rating(
          bUserIDToIndex.value().get(fields[0]),
          bItemIDToIndex.value().get(fields[1]),
          // Empty value means 'delete'; propagate as NaN
          fields[2].isEmpty() ? Double.NaN : Double.parseDouble(fields[2]));
      return new Tuple2<>(timestamp, rating);
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(fields));
      throw e;
    }
  });
  if (decayFactor < 1.0) {
    // Copy fields into locals so the lambda closure is serializable without this
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    byTimestamp = byTimestamp.mapToPair(pair -> {
      long timestamp = pair._1();
      return new Tuple2<>(timestamp, decayRating(pair._2(), timestamp, now, factor));
    });
  }
  if (decayZeroThreshold > 0.0) {
    double threshold = decayZeroThreshold;
    byTimestamp = byTimestamp.filter(pair -> pair._2().rating() > threshold);
  }
  return byTimestamp.sortByKey().values();
}
示例2: aggregateScores
import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
* Combines {@link Rating}s with the same user/item into one, with score as the sum of
* all of the scores.
*/
/**
 * Collapses {@link Rating}s that share the same user/item pair into one rating.
 * In the implicit case, the combined score is the sum of the scores (with NaN
 * 'delete' markers honored); otherwise the last score seen wins. Pairs whose
 * combined score is NaN (deleted) are dropped entirely.
 *
 * @param original ratings possibly containing duplicate user/item pairs
 * @param epsilon scale divisor applied before {@code log1p} when log strength is on
 * @return one {@link Rating} per surviving user/item pair
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> keyedScores = original.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = keyedScores.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    combined = keyedScores.foldByKey(Double.NaN, (current, next) -> next);
  }

  // NaN marks a deleted pair; drop those before emitting ratings
  JavaPairRDD<Tuple2<Integer,Integer>,Double> defined =
      combined.filter(entry -> !Double.isNaN(entry._2()));

  if (logStrength) {
    return defined.map(entry -> new Rating(
        entry._1()._1(),
        entry._1()._2(),
        Math.log1p(entry._2() / epsilon)));
  }
  return defined.map(entry -> new Rating(
      entry._1()._1(),
      entry._1()._2(),
      entry._2()));
}