This article summarizes typical usage of the Java method org.apache.spark.api.java.JavaRDD.map. If you are wondering what JavaRDD.map does, how to call it, or want to see it in real code, the curated examples below should help. You can also look at further usage examples of the enclosing class, org.apache.spark.api.java.JavaRDD.
The following 15 code examples of JavaRDD.map are shown, ordered by popularity by default.
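Before the project-specific examples, here is a minimal, self-contained sketch of what JavaRDD.map does: it applies a function to every element of an RDD and returns a new RDD holding exactly one result per input element. The class name and input data below are illustrative only and are not taken from any of the examples that follow.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class JavaRddMapDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaRddMapDemo").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> lines = sc.parallelize(Arrays.asList("1,2", "3,4"));
      // map produces exactly one output element per input element
      JavaRDD<Integer> sums = lines.map(line -> {
        String[] parts = line.split(",");
        return Integer.parseInt(parts[0]) + Integer.parseInt(parts[1]);
      });
      System.out.println(sums.collect()); // [3, 7]
    }
  }
}
For one-to-many or row-dropping transformations, flatMap is the usual alternative (see the note after Example 5).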
Example 1: execute
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public Object execute() {
  LOG.info("Starting to generate ranking train data.");
  startTime = System.currentTimeMillis();
  String rankingTrainFile = "E:\\Mudrod_input_data\\Testing_Data_4_1monthLog+Meta+Onto\\traing.txt";
  try {
    SessionExtractor extractor = new SessionExtractor();
    JavaRDD<RankingTrainData> rankingTrainDataRDD = extractor.extractRankingTrainData(this.props, this.es, this.spark);
    JavaRDD<String> rankingTrainData_JsonRDD = rankingTrainDataRDD.map(f -> f.toJson());
    rankingTrainData_JsonRDD.coalesce(1, true).saveAsTextFile(rankingTrainFile);
  } catch (Exception e) {
    e.printStackTrace();
  }
  endTime = System.currentTimeMillis();
  LOG.info("Ranking train data generation complete. Time elapsed {} seconds.", (endTime - startTime) / 1000);
  return null;
}
Example 2: addDefaultInstanceTypes
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
* Add a default type to instances with no type
*
* @param instances instances to add type to
* @return instances with default type added where no type was specified
*/
private JavaRDD<Instance> addDefaultInstanceTypes(JavaRDD<Instance> instances) {
final int defaultType = schema.value().getTypeIndex().getIndex(IRI_TYPE_DEFAULT);
return instances.map(instance -> {
if (!instance.hasType()) {
instance.addType(defaultType);
}
return instance;
});
}
Example 3: filter
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
* Filter an RDD of {@link Instance}s based on the specified config.
*
* @param instances RDD of instances to filter
* @param typeIndex index mapping type URIs to integers
* @return filtered RDD of instances
*/
public JavaRDD<Instance> filter(JavaRDD<Instance> instances, IndexMap<String> typeIndex) {
if (config.getTypes().isEmpty()) {
return instances;
}
// get indexes of accepted type URIs
Set<Integer> acceptedTypes = config.getTypes().stream()
.map(typeIndex::getIndex)
.collect(Collectors.toSet());
instances = instances.filter(instance -> !Collections.disjoint(instance.getTypes(), acceptedTypes));
if (config.isIgnoreOtherTypes()) {
// remove other than accepted types from each instance
instances = instances.map(instance -> {
Set<Integer> intersect = Sets.intersection(instance.getTypes(), acceptedTypes).immutableCopy();
instance.getTypes().clear();
instance.getTypes().addAll(intersect);
return instance;
});
}
return instances;
}
Example 4: saveUserRecommendations
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public void saveUserRecommendations(TrainedModel model, IoOperation<UserRecommendations> ioOperation) {
logger.info("start saving user recommendations");
JavaRDD<Tuple2<Object, Rating[]>> recommendations = model.getMatrixModel()
.recommendProductsForUsers(20)
.toJavaRDD();
logger.info("recommendations count " + recommendations.count());
JavaRDD<UserRecommendations> userRecommendationsRDD = recommendations.map(tuple -> {
Set<Integer> products = new HashSet<>();
for (Rating rating : tuple._2) {
products.add(rating.product());
}
return new UserRecommendations((int) tuple._1(), products);
});
ioOperation.writeOutput(userRecommendationsRDD);
}
Example 5: parsedToVectorRDD
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
return parsedRDD.map(data -> {
try {
return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
log.warn("Bad input: {}", Arrays.toString(data));
throw e;
}
});
}
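Example 5 deliberately rethrows on malformed rows, so the job fails on the first bad record. If silently dropping bad rows is acceptable for your data, a common alternative is flatMap with an empty iterator for unparsable input. The sketch below is not part of the original project; it assumes Spark 2.x (where flatMap expects an Iterator) and reuses the same KMeansUtils.featuresFromTokens helper, inputSchema field, and log instance as the example above. The only extra import needed is java.util.Collections.
// Sketch only: skip malformed rows instead of failing fast.
private JavaRDD<Vector> parsedToVectorRDDSkippingBad(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.flatMap(data -> {
    try {
      Vector v = Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
      return Collections.singleton(v).iterator();
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Skipping bad input: {}", Arrays.toString(data));
      return Collections.<Vector>emptyIterator();
    }
  });
}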
Example 6: parseToLabeledPointRDD
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(
    JavaRDD<String[]> parsedRDD,
    CategoricalValueEncodings categoricalValueEncodings) {
  return parsedRDD.map(data -> {
    try {
      double[] features = new double[inputSchema.getNumPredictors()];
      double target = Double.NaN;
      for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
        double encoded;
        if (inputSchema.isNumeric(featureIndex)) {
          encoded = Double.parseDouble(data[featureIndex]);
        } else if (inputSchema.isCategorical(featureIndex)) {
          Map<String,Integer> valueEncoding =
              categoricalValueEncodings.getValueEncodingMap(featureIndex);
          encoded = valueEncoding.get(data[featureIndex]);
        } else {
          continue;
        }
        if (inputSchema.isTarget(featureIndex)) {
          target = encoded;
        } else {
          features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
        }
      }
      Preconditions.checkState(!Double.isNaN(target));
      return new LabeledPoint(target, Vectors.dense(features));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
Example 7: execute
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public void execute() {
RatingsFileIo ratingsIo = new RatingsFileIo();
ratingsIo.setSparkContext(sparkContext);
JavaRDD<Rating> ratings = ratingsIo.readInput();
JavaRDD<RawRating> rawRatingRdd = ratings.map(rating -> RawRating.fromSparkRating(rating));
saveToCassandra(rawRatingRdd);
}
Example 8: readInput
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public JavaRDD<Rating> readInput() {
JavaRDD<String> data = sparkContext.textFile(ratingsPath);
return data.map(line -> {
String[] lineParts = line.split(",");
int userId = Integer.parseInt(lineParts[0]);
int movieId = Integer.parseInt(lineParts[1]);
double rating = Double.parseDouble(lineParts[2]);
return new Rating(userId, movieId, rating);
});
}
Example 9: create
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static TrainedModel create(JavaRDD<Rating> trainingRdd, JavaRDD<Rating> testRdd, int rank, int iterationsNr) {
logger.info(String.format("Train with parameters -> iterations: %d, rank: %d", iterationsNr, rank));
JavaRDD<Tuple2<Object, Object>> testForPredict = testRdd.map(rating ->
new Tuple2<>(rating.user(), rating.product())
);
TimeKeeper timeKeeper = new TimeKeeper();
timeKeeper.start();
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(trainingRdd), rank, iterationsNr, 0.1);
timeKeeper.end().print(logger, "als model trained in ").reset();
Double error = getError(testRdd, rank, iterationsNr, testForPredict, timeKeeper, model);
return new TrainedModel(error, model);
}
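For context, here is one hypothetical way the pieces from Examples 7 to 9 could be wired together. The ModelFactory class name, the 80/20 split, and the rank/iteration values are assumptions made for illustration; randomSplit is the standard JavaRDD API, and the ratings path is assumed to have already been configured on RatingsFileIo.
// Hypothetical glue code: read ratings (Example 8), split, then train (Example 9).
RatingsFileIo ratingsIo = new RatingsFileIo();
ratingsIo.setSparkContext(sparkContext);                                // as in Example 7
JavaRDD<Rating> ratings = ratingsIo.readInput();                        // Example 8
JavaRDD<Rating>[] splits = ratings.randomSplit(new double[]{0.8, 0.2}, 42L);
TrainedModel model = ModelFactory.create(splits[0], splits[1], 10, 10); // hypothetical enclosing class of Example 9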
Example 10: loadHistoricalRaces
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
/**
* Loads in historical data and stores it in Hazelcast IMDG. This is mostly to
* provide a source of horses for the bet simulation.
*
* @throws IOException
*/
public void loadHistoricalRaces() throws IOException {
filePath = Utils.unpackDataToTmp("historical_races.json");
final JavaRDD<String> eventsText = sc.textFile(filePath.toString());
final JavaRDD<Event> events
= eventsText.map(s -> JSONSerializable.parse(s, Event::parseBag));
final JavaPairRDD<Horse, Integer> winners
= events.mapToPair(e -> new Tuple2<>(e.getRaces().get(0).getWinner().orElse(Horse.PALE), 1))
.reduceByKey((a, b) -> a + b);
final HazelcastRDDFunctions accessToHC = javaPairRddFunctions(winners);
accessToHC.saveToHazelcastMap("winners");
}
Example 11: GetD_IRW
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
private static IndexedRowMatrix GetD_IRW(IndexedRowMatrix A, boolean inverseValues, JavaSparkContext jsc) {
  JavaRDD<IndexedRow> rows = A.rows().toJavaRDD().cache();
  final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
  JavaRDD<IndexedRow> LURows = rows.map(new Function<IndexedRow, IndexedRow>() {
    @Override
    public IndexedRow call(IndexedRow indexedRow) throws Exception {
      long index = indexedRow.index();
      DenseVector vect = indexedRow.vector().toDense();
      boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();
      double[] newValues = new double[vect.size()];
      for (int i = 0; i < vect.size(); i++) {
        if (i == index) {
          if (inverseValuesValue) {
            newValues[i] = 1.0 / vect.apply(i);
          } else {
            newValues[i] = vect.apply(i);
          }
        } else {
          newValues[i] = 0.0;
        }
      }
      DenseVector newVector = new DenseVector(newValues);
      return new IndexedRow(index, newVector);
    }
  });
  IndexedRowMatrix newMatrix = new IndexedRowMatrix(LURows.rdd());
  return newMatrix;
}
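On Java 8 and later, the anonymous Function above can be written as a lambda with no change in behavior. A minimal sketch of the same diagonal-extraction map, using the same classes and variables as the example (only the syntax differs):
// Sketch: the anonymous Function rewritten as a lambda (Java 8+), same semantics.
JavaRDD<IndexedRow> LURows = rows.map(indexedRow -> {
  long index = indexedRow.index();
  DenseVector vect = indexedRow.vector().toDense();
  boolean inverse = inverseValuesBC.getValue();
  double[] newValues = new double[vect.size()];
  for (int i = 0; i < vect.size(); i++) {
    // keep only the diagonal entry, optionally inverted; zero elsewhere
    newValues[i] = (i == index) ? (inverse ? 1.0 / vect.apply(i) : vect.apply(i)) : 0.0;
  }
  return new IndexedRow(index, new DenseVector(newValues));
});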
Example 12: main
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");
  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option("out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS.");
  Option queryOpt = new Option("query", true, "SQL query string.");
  Option baminOpt = new Option("in", true, "");
  options.addOption(opOpt);
  options.addOption(queryOpt);
  options.addOption(baminOpt);

  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Parsing failed. Reason: " + exp.getMessage());
    return; // bail out instead of dereferencing a null CommandLine below
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  // Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  // Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if (query != null) {
    // Save query result as a Parquet file
    Dataset<Row> df2 = sqlContext.sql(query);
    df2.show(100, false);
    if (bwaOutDir != null)
      df2.write().parquet(bwaOutDir);
  } else {
    if (bwaOutDir != null)
      samDF.write().parquet(bwaOutDir);
  }
  sc.stop();
}
Example 13: buildModel
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
JavaRDD<String> trainData,
List<?> hyperParameters,
Path candidatePath) {
int features = (Integer) hyperParameters.get(0);
double lambda = (Double) hyperParameters.get(1);
double alpha = (Double) hyperParameters.get(2);
double epsilon = Double.NaN;
if (logStrength) {
epsilon = (Double) hyperParameters.get(3);
}
Preconditions.checkArgument(features > 0);
Preconditions.checkArgument(lambda >= 0.0);
Preconditions.checkArgument(alpha > 0.0);
if (logStrength) {
Preconditions.checkArgument(epsilon > 0.0);
}
JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
parsedRDD.cache();
Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);
log.info("Broadcasting ID-index mappings for {} users, {} items",
userIDIndexMap.size(), itemIDIndexMap.size());
Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);
JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
trainRatingData = aggregateScores(trainRatingData, epsilon);
ALS als = new ALS()
.setRank(features)
.setIterations(iterations)
.setLambda(lambda)
.setCheckpointInterval(5);
if (implicit) {
als = als.setImplicitPrefs(true).setAlpha(alpha);
}
RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
trainingRatingDataRDD.unpersist(false);
bUserIDToIndex.unpersist();
bItemIDToIndex.unpersist();
parsedRDD.unpersist();
Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));
PMML pmml = mfModelToPMML(model,
features,
lambda,
alpha,
epsilon,
implicit,
logStrength,
candidatePath,
bUserIndexToID,
bItemIndexToID);
unpersist(model);
bUserIndexToID.unpersist();
bItemIndexToID.unpersist();
return pmml;
}
Example 14: evaluate
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public double evaluate(JavaSparkContext sparkContext,
PMML model,
Path modelParentPath,
JavaRDD<String> testData,
JavaRDD<String> trainData) {
JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
parsedTestRDD.cache();
Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);
log.info("Broadcasting ID-index mappings for {} users, {} items",
userIDToIndex.size(), itemIDToIndex.size());
Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);
JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
double epsilon = Double.NaN;
if (logStrength) {
epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
}
testRatingData = aggregateScores(testRatingData, epsilon);
MatrixFactorizationModel mfModel =
pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);
parsedTestRDD.unpersist();
double eval;
if (implicit) {
double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
log.info("AUC: {}", auc);
eval = auc;
} else {
double rmse = Evaluation.rmse(mfModel, testRatingData);
log.info("RMSE: {}", rmse);
eval = -rmse;
}
unpersist(mfModel);
bUserIDToIndex.unpersist();
bItemIDToIndex.unpersist();
return eval;
}
Example 15: buildModel
import org.apache.spark.api.java.JavaRDD; // import the package/class the method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
JavaRDD<String> trainData,
List<?> hyperParameters,
Path candidatePath) {
int maxSplitCandidates = (Integer) hyperParameters.get(0);
int maxDepth = (Integer) hyperParameters.get(1);
String impurity = (String) hyperParameters.get(2);
Preconditions.checkArgument(maxSplitCandidates >= 2,
"max-split-candidates must be at least 2");
Preconditions.checkArgument(maxDepth > 0,
"max-depth must be at least 1");
JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
CategoricalValueEncodings categoricalValueEncodings =
new CategoricalValueEncodings(getDistinctValues(parsedRDD));
JavaRDD<LabeledPoint> trainPointData =
parseToLabeledPointRDD(parsedRDD, categoricalValueEncodings);
Map<Integer,Integer> categoryInfo = categoricalValueEncodings.getCategoryCounts();
categoryInfo.remove(inputSchema.getTargetFeatureIndex()); // Don't specify target count
// Need to translate indices to predictor indices
Map<Integer,Integer> categoryInfoByPredictor = new HashMap<>(categoryInfo.size());
categoryInfo.forEach((k, v) -> categoryInfoByPredictor.put(inputSchema.featureToPredictorIndex(k), v));
int seed = RandomManager.getRandom().nextInt();
RandomForestModel model;
if (inputSchema.isClassification()) {
int numTargetClasses =
categoricalValueEncodings.getValueCount(inputSchema.getTargetFeatureIndex());
model = RandomForest.trainClassifier(trainPointData,
numTargetClasses,
categoryInfoByPredictor,
numTrees,
"auto",
impurity,
maxDepth,
maxSplitCandidates,
seed);
} else {
model = RandomForest.trainRegressor(trainPointData,
categoryInfoByPredictor,
numTrees,
"auto",
impurity,
maxDepth,
maxSplitCandidates,
seed);
}
List<Map<Integer,Long>> treeNodeIDCounts = treeNodeExampleCounts(trainPointData, model);
Map<Integer,Long> predictorIndexCounts = predictorExampleCounts(trainPointData, model);
return rdfModelToPMML(model,
categoricalValueEncodings,
maxDepth,
maxSplitCandidates,
impurity,
treeNodeIDCounts,
predictorIndexCounts);
}