

Java JavaRDD.map Method Code Examples

This article collects typical usage examples of the Java method org.apache.spark.api.java.JavaRDD.map, compiled from open-source projects. If you are wondering what JavaRDD.map does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the enclosing class, org.apache.spark.api.java.JavaRDD.


The 15 code examples of the JavaRDD.map method shown below are sorted by popularity by default.
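Before the examples, here is a minimal, self-contained sketch of what JavaRDD.map does: it applies a function to every element of an RDD and returns a new RDD of the results. The class name and sample data are illustrative only, not taken from the examples below.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

public class JavaRddMapDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaRddMapDemo").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> lines = sc.parallelize(Arrays.asList("1", "2", "3"));
      // map transforms each element one-to-one into a new RDD
      JavaRDD<Integer> numbers = lines.map(Integer::parseInt);
      System.out.println(numbers.collect()); // [1, 2, 3]
    }
  }
}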

Example 1: execute

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public Object execute() {
  LOG.info("Starting to generate ranking training data.");
  startTime = System.currentTimeMillis();

  String rankingTrainFile = "E:\\Mudrod_input_data\\Testing_Data_4_1monthLog+Meta+Onto\\traing.txt";
  try {
    SessionExtractor extractor = new SessionExtractor();
    JavaRDD<RankingTrainData> rankingTrainDataRDD = extractor.extractRankingTrainData(this.props, this.es, this.spark);

    JavaRDD<String> rankingTrainData_JsonRDD = rankingTrainDataRDD.map(RankingTrainData::toJson);

    rankingTrainData_JsonRDD.coalesce(1, true).saveAsTextFile(rankingTrainFile);

  } catch (Exception e) {
    e.printStackTrace();
  }

  endTime = System.currentTimeMillis();
  LOG.info("Ranking train data generation complete. Time elapsed {} seconds.", (endTime - startTime) / 1000);
  return null;
}
 
Developer ID: apache, Project: incubator-sdap-mudrod, Lines of code: 24, Source file: RankingTrainDataGenerator.java

Example 2: addDefaultInstanceTypes

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Add a default type to instances with no type
 *
 * @param instances instances to add type to
 * @return instances with default type added where no type was specified
 */
private JavaRDD<Instance> addDefaultInstanceTypes(JavaRDD<Instance> instances) {
    final int defaultType = schema.value().getTypeIndex().getIndex(IRI_TYPE_DEFAULT);
    return instances.map(instance -> {
        if (!instance.hasType()) {
            instance.addType(defaultType);
        }
        return instance;
    });
}
 
Developer ID: Merck, Project: rdf2x, Lines of code: 16, Source file: InstanceAggregator.java

Example 3: filter

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Filter an RDD of {@link Instance}s based on the specified config
 *
 * @param instances RDD of instances to filter
 * @param typeIndex index mapping type URIs to integers
 * @return filtered RDD of instances
 */
public JavaRDD<Instance> filter(JavaRDD<Instance> instances, IndexMap<String> typeIndex) {
    if (config.getTypes().isEmpty()) {
        return instances;
    }
    // get indexes of accepted type URIs
    Set<Integer> acceptedTypes = config.getTypes().stream()
            .map(typeIndex::getIndex)
            .collect(Collectors.toSet());

    instances = instances.filter(instance -> !Collections.disjoint(instance.getTypes(), acceptedTypes));

    if (config.isIgnoreOtherTypes()) {
        // remove other than accepted types from each instance
        instances = instances.map(instance -> {
            Set<Integer> intersect = Sets.intersection(instance.getTypes(), acceptedTypes).immutableCopy();
            instance.getTypes().clear();
            instance.getTypes().addAll(intersect);
            return instance;
        });
    }

    return instances;
}
 
Developer ID: Merck, Project: rdf2x, Lines of code: 31, Source file: InstanceFilter.java

Example 4: saveUserRecommendations

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public void saveUserRecommendations(TrainedModel model, IoOperation<UserRecommendations> ioOperation) {
    logger.info("start saving user recommendations");
    JavaRDD<Tuple2<Object, Rating[]>> recommendations = model.getMatrixModel()
            .recommendProductsForUsers(20)
            .toJavaRDD();

    logger.info("recommendations count " + recommendations.count());

    JavaRDD<UserRecommendations> userRecommendationsRDD = recommendations.map(tuple -> {
        Set<Integer> products = new HashSet<>();
        for (Rating rating : tuple._2) {
            products.add(rating.product());
        }

        return new UserRecommendations((int) tuple._1(), products);
    });
    ioOperation.writeOutput(userRecommendationsRDD);
}
 
Developer ID: cosminseceleanu, Project: movie-recommender, Lines of code: 19, Source file: RecommendationEngine.java

Example 5: parsedToVectorRDD

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(data -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Developer ID: oncewang, Project: oryx2, Lines of code: 11, Source file: KMeansUpdate.java

Example 6: parseToLabeledPointRDD

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
private JavaRDD<LabeledPoint> parseToLabeledPointRDD(
    JavaRDD<String[]> parsedRDD,
    CategoricalValueEncodings categoricalValueEncodings) {

  return parsedRDD.map(data -> {
    try {
      double[] features = new double[inputSchema.getNumPredictors()];
      double target = Double.NaN;
      for (int featureIndex = 0; featureIndex < data.length; featureIndex++) {
        double encoded;
        if (inputSchema.isNumeric(featureIndex)) {
          encoded = Double.parseDouble(data[featureIndex]);
        } else if (inputSchema.isCategorical(featureIndex)) {
          Map<String,Integer> valueEncoding =
              categoricalValueEncodings.getValueEncodingMap(featureIndex);
          encoded = valueEncoding.get(data[featureIndex]);
        } else {
          continue;
        }
        if (inputSchema.isTarget(featureIndex)) {
          target = encoded;
        } else {
          features[inputSchema.featureToPredictorIndex(featureIndex)] = encoded;
        }
      }
      Preconditions.checkState(!Double.isNaN(target));
      return new LabeledPoint(target, Vectors.dense(features));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Developer ID: oncewang, Project: oryx2, Lines of code: 34, Source file: RDFUpdate.java

Example 7: execute

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public void execute() {
    RatingsFileIo ratingsIo = new RatingsFileIo();
    ratingsIo.setSparkContext(sparkContext);
    JavaRDD<Rating> ratings = ratingsIo.readInput();
    JavaRDD<RawRating> rawRatingRdd = ratings.map(RawRating::fromSparkRating);
    saveToCassandra(rawRatingRdd);
}
 
Developer ID: cosminseceleanu, Project: movie-recommender, Lines of code: 9, Source file: ImportRatingsJob.java

Example 8: readInput

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public JavaRDD<Rating> readInput() {
    JavaRDD<String> data = sparkContext.textFile(ratingsPath);

    return data.map(line -> {
        String[] lineParts = line.split(",");
        int userId = Integer.parseInt(lineParts[0]);
        int movieId = Integer.parseInt(lineParts[1]);
        double rating = Double.parseDouble(lineParts[2]);

        return new Rating(userId, movieId, rating);
    });
}
 
Developer ID: cosminseceleanu, Project: movie-recommender, Lines of code: 14, Source file: RatingsFileIo.java
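A note on Example 8: the map function above throws on malformed lines (missing fields or non-numeric values), failing the whole job. A defensive variant could pre-filter before mapping; the following is a sketch only, not part of the original project, and the method name is made up:

// Hypothetical variant of readInput; not part of the original RatingsFileIo class.
public JavaRDD<Rating> readInputSkippingBadLines() {
    JavaRDD<String> data = sparkContext.textFile(ratingsPath);

    // drop lines that do not have at least userId, movieId and rating fields
    return data.filter(line -> line.split(",").length >= 3)
            .map(line -> {
                String[] lineParts = line.split(",");
                return new Rating(Integer.parseInt(lineParts[0]),
                        Integer.parseInt(lineParts[1]),
                        Double.parseDouble(lineParts[2]));
            });
}

Non-numeric fields would still throw NumberFormatException; whether to skip bad records or fail fast on them is a per-job design choice.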

Example 9: create

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public static TrainedModel create(JavaRDD<Rating> trainingRdd, JavaRDD<Rating> testRdd, int rank, int iterationsNr) {
    logger.info(String.format("Train with parameters -> iterations: %d, rank: %d", iterationsNr, rank));
    JavaRDD<Tuple2<Object, Object>> testForPredict = testRdd.map(rating ->
        new Tuple2<>(rating.user(), rating.product())
    );
    TimeKeeper timeKeeper = new TimeKeeper();
    timeKeeper.start();

    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(trainingRdd), rank, iterationsNr, 0.1);
    timeKeeper.end().print(logger, "als model trained in ").reset();

    Double error = getError(testRdd, rank, iterationsNr, testForPredict, timeKeeper, model);

    return new TrainedModel(error, model);
}
 
Developer ID: cosminseceleanu, Project: movie-recommender, Lines of code: 16, Source file: ModelFactory.java
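The getError helper called in Example 9 is not shown. A plausible implementation (a sketch only; the project's actual signature also takes rank, iterationsNr and a TimeKeeper, presumably for logging) would compute the RMSE by predicting on the test pairs and joining predictions with the actual ratings, again via map-style transformations:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;

// Hypothetical sketch of getError; not the project's actual code.
private static Double getError(JavaRDD<Rating> testRdd,
                               JavaRDD<Tuple2<Object, Object>> testForPredict,
                               MatrixFactorizationModel model) {
    // key predictions by (user, product)
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = model
            .predict(JavaRDD.toRDD(testForPredict))
            .toJavaRDD()
            .mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

    // join actual ratings with predicted ratings on the same (user, product) key
    JavaRDD<Tuple2<Double, Double>> ratesAndPreds = testRdd
            .mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))
            .join(predictions)
            .values();

    // root mean squared error over (actual, predicted) pairs
    double mse = ratesAndPreds.mapToDouble(pair -> {
        double diff = pair._1() - pair._2();
        return diff * diff;
    }).mean();
    return Math.sqrt(mse);
}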

Example 10: loadHistoricalRaces

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
/**
 * Loads in historical data and stores in Hazelcast IMDG. This is mostly to 
 * provide a source of horses for the bet simulation.
 * 
 * @throws IOException 
 */
public void loadHistoricalRaces() throws IOException {
    filePath = Utils.unpackDataToTmp("historical_races.json");

    final JavaRDD<String> eventsText = sc.textFile(filePath.toString());
    final JavaRDD<Event> events
            = eventsText.map(s -> JSONSerializable.parse(s, Event::parseBag));

    final JavaPairRDD<Horse, Integer> winners
            = events.mapToPair(e -> new Tuple2<>(e.getRaces().get(0).getWinner().orElse(Horse.PALE), 1))
            .reduceByKey((a, b) -> a + b);

    final HazelcastRDDFunctions accessToHC = javaPairRddFunctions(winners);
    accessToHC.saveToHazelcastMap("winners");
}
 
Developer ID: hazelcast, Project: betleopard, Lines of code: 21, Source file: LiveBetMain.java

Example 11: GetD_IRW

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
private static IndexedRowMatrix GetD_IRW(IndexedRowMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD().cache();

    final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
    JavaRDD<IndexedRow> LURows = rows.map(indexedRow -> {
        long index = indexedRow.index();
        DenseVector vect = indexedRow.vector().toDense();
        boolean inverse = inverseValuesBC.getValue();

        // keep only the diagonal entry of each row (optionally inverted);
        // every off-diagonal entry becomes zero
        double[] newValues = new double[vect.size()];
        for (int i = 0; i < vect.size(); i++) {
            if (i == index) {
                newValues[i] = inverse ? 1.0 / vect.apply(i) : vect.apply(i);
            } else {
                newValues[i] = 0.0;
            }
        }

        return new IndexedRow(index, new DenseVector(newValues));
    });

    return new IndexedRowMatrix(LURows.rdd());
}
 
Developer ID: jmabuin, Project: BLASpark, Lines of code: 45, Source file: OtherOperations.java

Example 12: main

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );
  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1); // without this, cmd stays null and the lines below throw NPE
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset<Row> df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Developer ID: NGSeq, Project: ViraPipe, Lines of code: 56, Source file: SQLQueryBAM.java

Example 13: buildModel

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
 
Developer ID: oncewang, Project: oryx2, Lines of code: 73, Source file: ALSUpdate.java
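Examples 13 and 14 both call a parsedToRatingRDD helper that is not reproduced here. Conceptually it is another JavaRDD.map: each parsed token array becomes a Rating by looking the user and item IDs up in the broadcast index maps. The following is a simplified sketch, assuming a user,item,strength token layout and that an empty strength means 1; the real Oryx implementation differs in detail:

import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.mllib.recommendation.Rating;

// Hypothetical simplification of parsedToRatingRDD; see the Oryx source for the real version.
private static JavaRDD<Rating> parsedToRatingRDD(
    JavaRDD<String[]> parsedRDD,
    Broadcast<Map<String,Integer>> bUserIDToIndex,
    Broadcast<Map<String,Integer>> bItemIDToIndex) {
  return parsedRDD.map(tokens -> new Rating(
      bUserIDToIndex.value().get(tokens[0]),                       // user ID -> int index
      bItemIDToIndex.value().get(tokens[1]),                       // item ID -> int index
      tokens[2].isEmpty() ? 1.0 : Double.parseDouble(tokens[2]))); // strength
}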

Example 14: evaluate

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {

  JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
  parsedTestRDD.cache();

  Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
  Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDToIndex.size(), itemIDToIndex.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

  JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
  }
  testRatingData = aggregateScores(testRatingData, epsilon);

  MatrixFactorizationModel mfModel =
      pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

  parsedTestRDD.unpersist();

  double eval;
  if (implicit) {
    double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
    log.info("AUC: {}", auc);
    eval = auc;
  } else {
    double rmse = Evaluation.rmse(mfModel, testRatingData);
    log.info("RMSE: {}", rmse);
    eval = -rmse;
  }
  unpersist(mfModel);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  return eval;
}
 
Developer ID: oncewang, Project: oryx2, Lines of code: 49, Source file: ALSUpdate.java

Example 15: buildModel

import org.apache.spark.api.java.JavaRDD; // import the package/class this method depends on
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {

  int maxSplitCandidates = (Integer) hyperParameters.get(0);
  int maxDepth = (Integer) hyperParameters.get(1);
  String impurity = (String) hyperParameters.get(2);
  Preconditions.checkArgument(maxSplitCandidates >= 2,
                              "max-split-candidates must be at least 2");
  Preconditions.checkArgument(maxDepth > 0,
                              "max-depth must be at least 1");

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  CategoricalValueEncodings categoricalValueEncodings =
      new CategoricalValueEncodings(getDistinctValues(parsedRDD));
  JavaRDD<LabeledPoint> trainPointData =
      parseToLabeledPointRDD(parsedRDD, categoricalValueEncodings);

  Map<Integer,Integer> categoryInfo = categoricalValueEncodings.getCategoryCounts();
  categoryInfo.remove(inputSchema.getTargetFeatureIndex()); // Don't specify target count
  // Need to translate indices to predictor indices
  Map<Integer,Integer> categoryInfoByPredictor = new HashMap<>(categoryInfo.size());
  categoryInfo.forEach((k, v) -> categoryInfoByPredictor.put(inputSchema.featureToPredictorIndex(k), v));

  int seed = RandomManager.getRandom().nextInt();

  RandomForestModel model;
  if (inputSchema.isClassification()) {
    int numTargetClasses =
        categoricalValueEncodings.getValueCount(inputSchema.getTargetFeatureIndex());
    model = RandomForest.trainClassifier(trainPointData,
                                         numTargetClasses,
                                         categoryInfoByPredictor,
                                         numTrees,
                                         "auto",
                                         impurity,
                                         maxDepth,
                                         maxSplitCandidates,
                                         seed);
  } else {
    model = RandomForest.trainRegressor(trainPointData,
                                        categoryInfoByPredictor,
                                        numTrees,
                                        "auto",
                                        impurity,
                                        maxDepth,
                                        maxSplitCandidates,
                                        seed);
  }

  List<Map<Integer,Long>> treeNodeIDCounts = treeNodeExampleCounts(trainPointData, model);
  Map<Integer,Long> predictorIndexCounts = predictorExampleCounts(trainPointData, model);

  return rdfModelToPMML(model,
                        categoricalValueEncodings,
                        maxDepth,
                        maxSplitCandidates,
                        impurity,
                        treeNodeIDCounts,
                        predictorIndexCounts);
}
 
Developer ID: oncewang, Project: oryx2, Lines of code: 64, Source file: RDFUpdate.java


Note: The org.apache.spark.api.java.JavaRDD.map examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects; copyright in the source code remains with the original authors, and distribution and use must follow the corresponding project's license. Do not reproduce without permission.