This article collects typical usages of the Java method org.apache.spark.api.java.JavaRDD.cache. If you are unsure what JavaRDD.cache does or how to use it, the curated code examples below may help; you can also read further about the enclosing class, org.apache.spark.api.java.JavaRDD.
The following presents 4 code examples of JavaRDD.cache, sorted by popularity by default.
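Before the examples, a quick recap of what cache() does: it marks an RDD to be stored in memory (it is shorthand for persist(StorageLevel.MEMORY_ONLY())), so the RDD is computed once by the first action and reused by later actions instead of being recomputed from its lineage. A minimal, self-contained sketch (the class name, app name, and local master are illustrative, not taken from the examples below):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRDDCacheSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("cache-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> squares = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5))
                                         .map(x -> x * x);
            squares.cache();                        // lazy: only marks the RDD for caching
            long n = squares.count();               // first action computes and stores it
            int sum = squares.reduce(Integer::sum); // served from the cache, no recompute
            System.out.println(n + " values, sum = " + sum);
            squares.unpersist();                    // release the cached partitions
        }
    }
}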
Example 1: rddPreProcessing
import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                        GaussianMixtureModelSummary gaussianMixtureModelSummary) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
    int numberOfTargetValue = listOfTargetFeatures.size();

    JavaRDD<Vector> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, Vector>) t -> {
                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }
                        // apply the per-feature weight, if one is configured
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        // take absolute values, if configured
                        if (athenaMLFeatureConfiguration.isAbsolute()) {
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }
                // replace any row containing Inf/NaN with an all-zero vector
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;
                        }
                        return Vectors.dense(values);
                    }
                }
                gaussianMixtureModelSummary.updateSummary(idx, feature);
                return Vectors.dense(values);
            }
    );

    Normalizer normalizer = new Normalizer();
    JavaRDD<Vector> normed;
    if (athenaMLFeatureConfiguration.isNormalization()) {
        normed = normalizer.transform(parsedData);
    } else {
        normed = parsedData;
    }
    // cache the result: downstream training will iterate over these vectors repeatedly
    normed.cache();
    return normed;
}
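The cache() call at the end matters because the returned vectors feed an iterative MLlib trainer, which scans the data once per iteration. A hedged sketch of how a caller might consume the result (the GaussianMixture setup and variable names below are assumptions for illustration, not part of the example's source):

// Hypothetical caller; mongoRDD, config, and summary are assumed to be in scope.
JavaRDD<Vector> training = rddPreProcessing(mongoRDD, config, summary);
GaussianMixtureModel gmm = new GaussianMixture()
        .setK(3)                // number of mixture components (illustrative)
        .setMaxIterations(100)
        .run(training.rdd());   // each EM iteration re-reads the cached vectors
training.unpersist();           // free executor memory once training is done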
Example 2: rddPreProcessing
import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                        KmeansModelSummary kmeansModelSummary) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();
    int numberOfTargetValue = listOfTargetFeatures.size();

    JavaRDD<Vector> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, Vector>) t -> {
                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }
                        // apply the per-feature weight, if one is configured
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        // take absolute values, if configured
                        if (athenaMLFeatureConfiguration.isAbsolute()) {
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }
                // replace any row containing Inf/NaN with an all-zero vector
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;
                        }
                        return Vectors.dense(values);
                    }
                }
                kmeansModelSummary.updateSummary(idx, feature);
                return Vectors.dense(values);
            }
    );

    Normalizer normalizer = new Normalizer();
    JavaRDD<Vector> normed;
    if (athenaMLFeatureConfiguration.isNormalization()) {
        normed = normalizer.transform(parsedData);
    } else {
        normed = parsedData;
    }
    // cache the result: k-means will pass over these vectors once per iteration
    normed.cache();
    return normed;
}
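This variant is identical to Example 1 except for the summary type: here the cached vectors feed k-means, which likewise makes one full pass per iteration. A hedged usage sketch (the k and iteration values are illustrative assumptions):

// Hypothetical caller; mongoRDD, config, and kmeansSummary are assumed to be in scope.
JavaRDD<Vector> training = rddPreProcessing(mongoRDD, config, kmeansSummary);
KMeansModel clusters = KMeans.train(training.rdd(), 4, 20); // k = 4, 20 iterations (illustrative)
double wssse = clusters.computeCost(training.rdd());        // extra pass, served from the cache
training.unpersist();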
Example 3: buildModel
import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int features = (Integer) hyperParameters.get(0);
    double lambda = (Double) hyperParameters.get(1);
    double alpha = (Double) hyperParameters.get(2);
    double epsilon = Double.NaN;
    if (logStrength) {
        epsilon = (Double) hyperParameters.get(3);
    }
    Preconditions.checkArgument(features > 0);
    Preconditions.checkArgument(lambda >= 0.0);
    Preconditions.checkArgument(alpha > 0.0);
    if (logStrength) {
        Preconditions.checkArgument(epsilon > 0.0);
    }

    JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
    // cached because it is scanned twice below, once per ID-index mapping
    parsedRDD.cache();

    Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
    Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

    log.info("Broadcasting ID-index mappings for {} users, {} items",
             userIDIndexMap.size(), itemIDIndexMap.size());

    Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
    Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

    JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
    trainRatingData = aggregateScores(trainRatingData, epsilon);

    ALS als = new ALS()
        .setRank(features)
        .setIterations(iterations)
        .setLambda(lambda)
        .setCheckpointInterval(5);
    if (implicit) {
        als = als.setImplicitPrefs(true).setAlpha(alpha);
    }

    // cache the ratings: ALS iterates over them many times, then unpersist promptly
    RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
    trainingRatingDataRDD.cache();
    MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
    trainingRatingDataRDD.unpersist(false);

    bUserIDToIndex.unpersist();
    bItemIDToIndex.unpersist();
    parsedRDD.unpersist();

    Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
    Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

    PMML pmml = mfModelToPMML(model,
                              features,
                              lambda,
                              alpha,
                              epsilon,
                              implicit,
                              logStrength,
                              candidatePath,
                              bUserIndexToID,
                              bItemIndexToID);
    unpersist(model);
    bUserIndexToID.unpersist();
    bItemIndexToID.unpersist();
    return pmml;
}
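Note the lifecycle discipline here: parsedRDD is cached because two full scans build the user and item mappings, the rating RDD is cached because ALS is iterative, and each cache() is paired with an unpersist() as soon as its last consumer has run. The same pattern in isolation (a minimal sketch; the variable input and the CSV layout are illustrative assumptions):

JavaRDD<String[]> parsed = input.map(line -> line.split(","));
parsed.cache();                                        // reused by the two counts below
long users = parsed.map(t -> t[0]).distinct().count(); // first scan, populates the cache
long items = parsed.map(t -> t[1]).distinct().count(); // second scan, reads from the cache
parsed.unpersist();                                    // last consumer done; free the memory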
Example 4: evaluate
import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
    JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
    // cached because it is scanned several times while building the mappings below
    parsedTestRDD.cache();

    Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
    Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

    log.info("Broadcasting ID-index mappings for {} users, {} items",
             userIDToIndex.size(), itemIDToIndex.size());

    Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
    Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

    JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
    double epsilon = Double.NaN;
    if (logStrength) {
        epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
    }
    testRatingData = aggregateScores(testRatingData, epsilon);

    MatrixFactorizationModel mfModel =
        pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

    parsedTestRDD.unpersist();

    double eval;
    if (implicit) {
        double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
        log.info("AUC: {}", auc);
        eval = auc;
    } else {
        double rmse = Evaluation.rmse(mfModel, testRatingData);
        log.info("RMSE: {}", rmse);
        // RMSE is negated so that a higher eval score always means better
        eval = -rmse;
    }
    unpersist(mfModel);
    bUserIDToIndex.unpersist();
    bItemIDToIndex.unpersist();
    return eval;
}
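A closing note on the API: cache() is exactly persist(StorageLevel.MEMORY_ONLY()). When the cached data may not fit in memory, calling persist with an explicit storage level is the usual alternative; a minimal sketch, assuming lines is any existing JavaRDD<String>:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;

static JavaRDD<String> cacheWithSpill(JavaRDD<String> lines) {
    // Same reuse semantics as cache(), but partitions that do not fit in
    // memory are spilled to local disk instead of being recomputed later.
    return lines.persist(StorageLevel.MEMORY_AND_DISK());
}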