当前位置: 首页>>代码示例>>Java>>正文


Java JavaPairRDD.map方法代码示例

本文整理汇总了Java中org.apache.spark.api.java.JavaPairRDD.map方法的典型用法代码示例。如果您正苦于以下问题:Java JavaPairRDD.map方法的具体用法?Java JavaPairRDD.map怎么用?Java JavaPairRDD.map使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.spark.api.java.JavaPairRDD的用法示例。


在下文中一共展示了JavaPairRDD.map方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
public static void main(String[] args) {

	// Local Spark session; the warehouse directory is pinned for a Windows host.
	SparkSession session = SparkSession.builder()
			.master("local")
			.appName("My App")
			.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
			.getOrCreate();

	JavaSparkContext context = new JavaSparkContext(session.sparkContext());

	// Large side of the join: userId -> cityId.
	JavaPairRDD<String, String> userIdToCityId = context.parallelizePairs(Arrays.asList(
			new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"),
			new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"),
			new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"),
			new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103")));

	// Small side of the join: cityId -> cityName.
	JavaPairRDD<String, String> cityIdToCityName = context.parallelizePairs(Arrays.asList(
			new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"),
			new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA")));

	// Map-side join: collect the small table and ship it to executors as a broadcast map.
	Broadcast<Map<String, String>> cityLookup = context.broadcast(cityIdToCityName.collectAsMap());

	// Enrich every (userId, cityId) pair with the city name from the broadcast lookup.
	JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map(
			pair -> new Tuple3<String, String, String>(pair._1(), pair._2(), cityLookup.value().get(pair._2())));

	System.out.println(joined.collect());

}
 
开发者ID:PacktPublishing,项目名称:Apache-Spark-2x-for-Java-Developers,代码行数:26,代码来源:MapSideJoinBroadcast.java

示例2: aggregateScores

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Merges {@link Rating}s that share the same (user, item) pair into a single rating.
 * In implicit mode the scores of a pair are summed; otherwise the last score wins.
 * Pairs whose combined score is NaN (a delete marker) are removed entirely.
 *
 * @param original raw ratings, possibly containing duplicate (user, item) keys
 * @param epsilon  scale divisor applied before {@code log1p} when logStrength is enabled
 * @return one {@link Rating} per surviving (user, item) pair
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original, double epsilon) {
  // Key every rating by its (user, product) pair.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> keyed = original.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = keyed.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // Non-implicit: keep only the most recent value seen for each key.
    combined = keyed.foldByKey(Double.NaN, (current, next) -> next);
  }

  // NaN marks a deleted entry; drop those keys.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> kept =
      combined.filter(entry -> !Double.isNaN(entry._2()));

  // Rebuild Ratings, optionally damping the score with log1p(score / epsilon).
  return kept.map(entry -> new Rating(
      entry._1()._1(),
      entry._1()._2(),
      logStrength ? Math.log1p(entry._2() / epsilon) : entry._2()));
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:34,代码来源:ALSUpdate.java

示例3: run

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Reads a Hadoop sequence file of (path, raw document bytes) entries, parses
 * every document with Tika, and writes one {@code key\t\tjson} line per
 * document to the output path.
 *
 * @throws IOException              if HDFS access fails
 * @throws IllegalArgumentException if the input path is missing or the output
 *                                  path already exists
 */
public void run() throws IOException {
    FileSystem fs = DistributedFileSystem.get(new Configuration());
    Path inpath = new Path(input);
    Path outpath = new Path(output);
    if (!fs.exists(inpath)) {
        throw new IllegalArgumentException("Input file not found: " + inpath);
    }
    if (fs.exists(outpath)) {
        // FIX: report the *output* path; this message previously printed inpath.
        throw new IllegalArgumentException("Output file exists, Not overwriting it: " + outpath);
    }

    SparkConf conf = new SparkConf();
    conf.setMaster(sparkMaster);
    conf.setAppName(getClass().getSimpleName() + "::" + System.currentTimeMillis());
    JavaSparkContext ctx = new JavaSparkContext(conf);

    //STEP1: READ - (key, raw document bytes) pairs from the sequence file
    JavaPairRDD<Text, BytesWritable> rdd = ctx.sequenceFile(input, Text.class, BytesWritable.class);
            //.mapToPair(rec -> new Tuple2<>(new Text(rec._1()), new BytesWritable(rec._2().getBytes())));
    //STEP2: PARSE - run Tika on each document; extracted text is stored under "CONTENT"
    JavaPairRDD<Text, Metadata> parsedRDD = rdd.mapToPair(
            (PairFunction<Tuple2<Text, BytesWritable>, Text, Metadata>) rec -> {
                Metadata md = new Metadata();
                try (ByteArrayInputStream stream = new ByteArrayInputStream(rec._2().getBytes())) {
                    String content = TikaHolder.tika.parseToString(stream, md);
                    md.add("CONTENT", content);
                }
                return new Tuple2<>(rec._1(), md);
            });
    //STEP3: FORMAT - serialize each metadata set as a JSON object keyed by the document path
    JavaRDD<String> outRDD = parsedRDD.map((Function<Tuple2<Text, Metadata>, String>) rec -> {
        String key = rec._1().toString();
        Metadata metadata = rec._2();
        JSONObject object = new JSONObject();
        for (String name : metadata.names()) {
            // Multi-valued metadata fields become JSON arrays; single values map directly.
            if (metadata.isMultiValued(name)) {
                JSONArray arr = new JSONArray();
                for (String val : metadata.getValues(name)) {
                    arr.add(val);
                }
                object.put(name, arr);
            } else {
                object.put(name, metadata.get(name));
            }
        }
        return key + "\t\t" + object.toJSONString();
    });
    //STEP4: SAVE
    LOG.info("Saving at " + outpath);
    outRDD.saveAsTextFile(output);
    LOG.info("Stopping");
    ctx.stop();
}
 
开发者ID:thammegowda,项目名称:tika-dl4j-spark-imgrec,代码行数:54,代码来源:TikaSpark.java

示例4: main

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
public static void main(String[] args) throws IOException {
  // Spark context is held in the class-level field `sc`.
  SparkConf sparkConf = new SparkConf().setAppName("SamToFastq");
  sc = new JavaSparkContext(sparkConf);

  String inputPath = args[0];
  String outputPath = args[1];

  // Read the BAM/SAM input as (file offset, record) pairs via Hadoop-BAM.
  JavaPairRDD<LongWritable, SAMRecordWritable> alignments =
      sc.newAPIHadoopFile(inputPath, AnySAMInputFormat.class, LongWritable.class,
          SAMRecordWritable.class, sc.hadoopConfiguration());

  // Keep only the SAMRecord payload of each pair.
  JavaRDD<SAMRecord> records = alignments.map(pair -> pair._2().get());

  // Convert the alignments back into FASTQ fragments and write them out.
  JavaPairRDD<Text, SequencedFragment> fastq = mapSAMRecordsToFastq(records);
  fastq.saveAsNewAPIHadoopFile(outputPath, Text.class, SequencedFragment.class,
      FastqOutputFormat.class, sc.hadoopConfiguration());

  sc.stop();

}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:19,代码来源:SamToFastq.java

示例5: main

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Loads a BAM/SAM file into a Spark DataFrame of alignments and optionally runs
 * a SQL query over it, writing the result (or the whole table) out as Parquet.
 *
 * Command line: {@code -in <bam path> [-query <sql>] [-out <parquet dir>]}
 *
 * @throws IOException on HDFS access failure
 */
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    // FIX: previously execution fell through with cmd == null and crashed with a
    // NullPointerException at the first cmd.hasOption(...) call below.
    sc.stop();
    return;
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  // Keep mate pairs together in the same input split.
  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  // Project each record into a serializable bean so a DataFrame can be built from it.
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    // Run the query, show a preview, and optionally save the result as Parquet.
    Dataset df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    // No query given: optionally save the whole alignment table as Parquet.
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:56,代码来源:SQLQueryBAM.java

示例6: calculateSimilarityFromVector

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Computes a pairwise similarity between every two (term, vector) entries of
 * the input RDD using a cartesian self-product.
 *
 * @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD} of
 *                  (term, vector) pairs to compare
 * @param simType   the similarity to compute, e.g.
 *                  {@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_PEARSON} or
 *                  {@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_HELLINGER};
 *                  any other value leaves the weight at 0.0
 * @return a {@link org.apache.spark.api.java.JavaRDD} with one
 *         {@link LinkageTriple} per ordered pair of distinct terms
 */
public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) {
  // Flatten the pair RDD into plain tuples so it can be crossed with itself.
  JavaRDD<Tuple2<String, Vector>> asTuples = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2));
  JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> crossed = asTuples.cartesian(asTuples);

  return crossed.map(pair -> {
    String keyA = pair._1._1;
    String keyB = pair._2._1;

    // A term paired with itself carries no information: emit null, dropped below.
    if (keyA.equals(keyB)) {
      return null;
    }

    Vector vecA = pair._1._2;
    Vector vecB = pair._2._2;
    Double weight = 0.0;

    if (simType == SimilarityUtil.SIM_PEARSON) {
      weight = SimilarityUtil.pearsonDistance(vecA, vecB);
    } else if (simType == SimilarityUtil.SIM_HELLINGER) {
      weight = SimilarityUtil.hellingerDistance(vecA, vecB);
    }
    // NOTE(review): SIM_COSINE appears in the javadoc but has no branch here, so
    // its weight stays 0.0 — confirm whether cosine is handled elsewhere.

    LinkageTriple triple = new LinkageTriple();
    triple.keyA = keyA;
    triple.keyB = keyB;
    triple.weight = weight;
    return triple;
  }).filter(triple -> triple != null);
}
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:65,代码来源:SimilarityUtil.java

示例7: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the Gaussian-mixture training set: maps each raw Mongo (key, BSON)
 * record to a dense vector of the configured target features, then optionally
 * L2-normalizes and caches the result. Records containing Inf/NaN are zeroed
 * out and skipped by the summary update.
 *
 * @param mongoRDD                     raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration target-feature list, weights, and the
 *                                     absolute/normalization flags
 * @param gaussianMixtureModelSummary  summary object updated per clean record
 * @return cached RDD of dense feature vectors
 */
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                        GaussianMixtureModelSummary gaussianMixtureModelSummary) {
    List<AthenaFeatureField> targetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> featureWeights = athenaMLFeatureConfiguration.getWeight();

    int featureCount = targetFeatures.size();

    // One dense vector per input record.
    JavaRDD<Vector> vectors = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, Vector>) record -> {

                BSONObject featureDoc = (BSONObject) record._2().get(AthenaFeatureField.FEATURE);
                BSONObject fullDoc = (BSONObject) record._2();

                double[] values = new double[featureCount];
                for (int j = 0; j < featureCount; j++) {
                    AthenaFeatureField field = targetFeatures.get(j);
                    if (!featureDoc.containsField(field.getValue())) {
                        continue; // missing features stay at 0
                    }
                    // Coerce the BSON value to a double; unrecognized types become 0.
                    Object raw = featureDoc.get(field.getValue());
                    values[j] = (raw instanceof Long) ? (Long) raw
                            : (raw instanceof Double) ? (Double) raw
                            : (raw instanceof Boolean) ? ((Boolean) raw ? 1 : 0)
                            : 0;

                    // Apply the per-feature weight when one is configured.
                    if (featureWeights.containsKey(field)) {
                        values[j] *= featureWeights.get(field);
                    }

                    // Optionally fold sign into magnitude.
                    if (athenaMLFeatureConfiguration.isAbsolute()) {
                        values[j] = Math.abs(values[j]);
                    }
                }

                // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                for (double v : values) {
                    if (Double.isInfinite(v) || Double.isNaN(v)) {
                        for (int j = 0; j < featureCount; j++) {
                            values[j] = 0;
                        }
                        return Vectors.dense(values);
                    }
                }
                // NOTE(review): this mutates a driver-side summary object inside a Spark
                // transformation; the mutation happens on executors and may not be
                // visible on the driver — confirm intended.
                gaussianMixtureModelSummary.updateSummary(fullDoc, featureDoc);
                return Vectors.dense(values);
            }
    );

    // Optionally L2-normalize each vector before caching.
    Normalizer normalizer = new Normalizer();
    JavaRDD<Vector> result = athenaMLFeatureConfiguration.isNormalization()
            ? normalizer.transform(vectors)
            : vectors;

    result.cache();
    return result;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:67,代码来源:GaussianMixtureDistJob.java

示例8: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the k-means training set: maps each raw Mongo (key, BSON) record to a
 * dense vector of the configured target features, then optionally L2-normalizes
 * and caches the result. Records containing Inf/NaN are zeroed out and skipped
 * by the summary update.
 *
 * @param mongoRDD                     raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration target-feature list, weights, and the
 *                                     absolute/normalization flags
 * @param kmeansModelSummary           summary object updated per clean record
 * @return cached RDD of dense feature vectors
 */
public JavaRDD<Vector> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                            AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                            KmeansModelSummary kmeansModelSummary) {
        List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
        Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();

        int numberOfTargetValue = listOfTargetFeatures.size();
//        int numberOfTargetValue = 5;

        // Map each (key, BSON document) record to a dense feature vector.
        JavaRDD<Vector> parsedData = mongoRDD.map(
                (Function<Tuple2<Object, BSONObject>, Vector>) t -> {

                    BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                    BSONObject idx = (BSONObject) t._2();

                    double[] values = new double[numberOfTargetValue];
                    for (int j = 0; j < numberOfTargetValue; j++) {
                        if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                            // Coerce the BSON value to double; unrecognized types default to 0.
                            Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                            if (obj instanceof Long) {
                                values[j] = (Long) obj;
                            } else if (obj instanceof Double) {
                                values[j] = (Double) obj;
                            } else if (obj instanceof Boolean) {
                                values[j] = (Boolean) obj ? 1 : 0;
                            } else {
                                values[j] = 0;
                            }

                            //check weight
                            if (weight.containsKey(listOfTargetFeatures.get(j))) {
                                values[j] *= weight.get(listOfTargetFeatures.get(j));
                            }
                            //check absolute
                            if (athenaMLFeatureConfiguration.isAbsolute()) {
                                values[j] = Math.abs(values[j]);
                            }
                        }
//                        values[j] = 0;
                    }

//                    //remove errors
                    // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                    for (int i = 0; i < numberOfTargetValue; i++) {
                        if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                            for (int j = 0; j < numberOfTargetValue; j++) {
                                values[j] = 0;

                            }
                            return Vectors.dense(values);
                        }
                    }
                    // NOTE(review): mutates a driver-side summary object inside a Spark map();
                    // the mutation happens on executors and may not be visible on the driver
                    // — confirm intended.
                    kmeansModelSummary.updateSummary(idx, feature);
                    return Vectors.dense(values);
                }
        );

        // Optionally L2-normalize each vector before caching.
        Normalizer normalizer = new Normalizer();
        JavaRDD<Vector> normed;
        if (athenaMLFeatureConfiguration.isNormalization()) {
            normed = normalizer.transform(parsedData);
        } else {
            normed = parsedData;
        }

        normed.cache();
        return normed;
    }
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:KMeansDistJob.java

示例9: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the gradient-boosted-trees training set from raw MongoDB records.
 * Each (key, BSON document) pair becomes a {@link LabeledPoint}: the label is
 * derived from the marking rules and the features are the configured target
 * fields coerced to doubles, with optional weighting, absolute value, and
 * per-sample normalization. Records containing Inf/NaN are zeroed out and
 * skipped by the summary update.
 *
 * @param mongoRDD                          raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration      target-feature list, weights, and flags
 * @param gradientBoostedTreesModelSummary  summary object updated per clean record
 * @param marking                           rules mapping a record to its class label
 * @return RDD of labeled training points (not cached)
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              GradientBoostedTreesModelSummary gradientBoostedTreesModelSummary,
                                              Marking marking) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();

    int numberOfTargetValue = listOfTargetFeatures.size();
    Normalizer normalizer = new Normalizer();

    // Map each (key, BSON document) record to a labeled dense feature vector.
    JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {

                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                int label = marking.checkClassificationMarkingElements(idx, feature);
                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        // Coerce the BSON value to double; unrecognized types default to 0.
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }

                        //check weight
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        //check absolute
                        if (athenaMLFeatureConfiguration.isAbsolute()){
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }

                //remove errors
                // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;

                        }
                        return new LabeledPoint(label, Vectors.dense(values));
                    }
                }


                // Per-sample L2 normalization when enabled.
                Vector normedForVal;
                if (athenaMLFeatureConfiguration.isNormalization()) {
                    normedForVal = normalizer.transform(Vectors.dense(values));
                } else {
                    normedForVal = Vectors.dense(values);
                }

                // NOTE(review): mutates a driver-side summary object inside a Spark map();
                // the mutation happens on executors and may not be visible on the driver
                // — confirm intended.
                gradientBoostedTreesModelSummary.updateSummary(idx, feature);
                return new LabeledPoint(label, normedForVal);
            }
    );

    return parsedData;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:GradientBoostedTreesDistJob.java

示例10: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the random-forest training set from raw MongoDB records.
 * Each (key, BSON document) pair becomes a {@link LabeledPoint}: the label is
 * derived from the marking rules and the features are the configured target
 * fields coerced to doubles, with optional weighting, absolute value, and
 * per-sample normalization. Records containing Inf/NaN are zeroed out and
 * skipped by the summary update.
 *
 * @param mongoRDD                     raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration target-feature list, weights, and flags
 * @param randomForestModelSummary     summary object updated per clean record
 * @param marking                      rules mapping a record to its class label
 * @return RDD of labeled training points (not cached)
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              RandomForestModelSummary randomForestModelSummary,
                                              Marking marking) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();

    int numberOfTargetValue = listOfTargetFeatures.size();
    Normalizer normalizer = new Normalizer();

    // Map each (key, BSON document) record to a labeled dense feature vector.
    JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {

                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                int label = marking.checkClassificationMarkingElements(idx, feature);
                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        // Coerce the BSON value to double; unrecognized types default to 0.
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }

                        //check weight
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        //check absolute
                        if (athenaMLFeatureConfiguration.isAbsolute()){
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }

                //remove errors
                // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;

                        }
                        return new LabeledPoint(label, Vectors.dense(values));
                    }
                }


                // Per-sample L2 normalization when enabled.
                Vector normedForVal;
                if (athenaMLFeatureConfiguration.isNormalization()) {
                    normedForVal = normalizer.transform(Vectors.dense(values));
                } else {
                    normedForVal = Vectors.dense(values);
                }

                // NOTE(review): mutates a driver-side summary object inside a Spark map();
                // the mutation happens on executors and may not be visible on the driver
                // — confirm intended.
                randomForestModelSummary.updateSummary(idx, feature);
                return new LabeledPoint(label, normedForVal);
            }
    );

    return parsedData;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:RandomForestDistJob.java

示例11: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the SVM training set from raw MongoDB records.
 * Each (key, BSON document) pair becomes a {@link LabeledPoint}: the label is
 * derived from the marking rules and the features are the configured target
 * fields coerced to doubles, with optional weighting, absolute value, and
 * per-sample normalization. Records containing Inf/NaN are zeroed out and
 * skipped by the summary update.
 *
 * @param mongoRDD                     raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration target-feature list, weights, and flags
 * @param SVMModelSummary              summary object updated per clean record
 * @param marking                      rules mapping a record to its class label
 * @return RDD of labeled training points (not cached)
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              SVMModelSummary SVMModelSummary,
                                              Marking marking) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();

    int numberOfTargetValue = listOfTargetFeatures.size();
    Normalizer normalizer = new Normalizer();

    // Map each (key, BSON document) record to a labeled dense feature vector.
    JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {

                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                int label = marking.checkClassificationMarkingElements(idx, feature);

                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        // Coerce the BSON value to double; unrecognized types default to 0.
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }

                        //check weight
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        //check absolute
                        if (athenaMLFeatureConfiguration.isAbsolute()){
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }

                //remove errors
                // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;

                        }
                        return new LabeledPoint(label, Vectors.dense(values));
                    }
                }


                // Per-sample L2 normalization when enabled.
                Vector normedForVal;
                if (athenaMLFeatureConfiguration.isNormalization()) {
                    normedForVal = normalizer.transform(Vectors.dense(values));
                } else {
                    normedForVal = Vectors.dense(values);
                }

                // NOTE(review): mutates a driver-side summary object inside a Spark map();
                // the mutation happens on executors and may not be visible on the driver
                // — confirm intended.
                SVMModelSummary.updateSummary(idx, feature);
                return new LabeledPoint(label, normedForVal);
            }
    );

    return parsedData;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:69,代码来源:SVMDistJob.java

示例12: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Builds the logistic-regression training set from raw MongoDB records.
 * Each (key, BSON document) pair becomes a {@link LabeledPoint}: the label is
 * derived from the marking rules and the features are the configured target
 * fields coerced to doubles, with optional weighting, absolute value, and
 * per-sample normalization. Records containing Inf/NaN are zeroed out and
 * skipped by the summary update.
 *
 * @param mongoRDD                       raw (key, document) pairs read from MongoDB
 * @param athenaMLFeatureConfiguration   target-feature list, weights, and flags
 * @param logisticRegressionModelSummary summary object updated per clean record
 * @param marking                        rules mapping a record to its class label
 * @return RDD of labeled training points (not cached)
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              LogisticRegressionModelSummary logisticRegressionModelSummary,
                                              Marking marking) {
    List<AthenaFeatureField> listOfTargetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    Map<AthenaFeatureField, Integer> weight = athenaMLFeatureConfiguration.getWeight();

    int numberOfTargetValue = listOfTargetFeatures.size();
    Normalizer normalizer = new Normalizer();

    // Map each (key, BSON document) record to a labeled dense feature vector.
    JavaRDD<LabeledPoint> parsedData = mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) t -> {

                BSONObject feature = (BSONObject) t._2().get(AthenaFeatureField.FEATURE);
                BSONObject idx = (BSONObject) t._2();
                int label = marking.checkClassificationMarkingElements(idx, feature);
                double[] values = new double[numberOfTargetValue];
                for (int j = 0; j < numberOfTargetValue; j++) {
                    if (feature.containsField(listOfTargetFeatures.get(j).getValue())) {
                        // Coerce the BSON value to double; unrecognized types default to 0.
                        Object obj = feature.get(listOfTargetFeatures.get(j).getValue());
                        if (obj instanceof Long) {
                            values[j] = (Long) obj;
                        } else if (obj instanceof Double) {
                            values[j] = (Double) obj;
                        } else if (obj instanceof Boolean) {
                            values[j] = (Boolean) obj ? 1 : 0;
                        } else {
                            values[j] = 0;
                        }

                        //check weight
                        if (weight.containsKey(listOfTargetFeatures.get(j))) {
                            values[j] *= weight.get(listOfTargetFeatures.get(j));
                        }
                        //check absolute
                        if (athenaMLFeatureConfiguration.isAbsolute()){
                            values[j] = Math.abs(values[j]);
                        }
                    }
                }

                //remove errors
                // Any Inf/NaN poisons the sample: zero the whole vector and skip the summary.
                for (int i = 0; i < numberOfTargetValue; i++) {
                    if (Double.isInfinite(values[i]) || Double.isNaN(values[i])) {
                        for (int j = 0; j < numberOfTargetValue; j++) {
                            values[j] = 0;

                        }
                        return new LabeledPoint(label, Vectors.dense(values));
                    }
                }


                // Per-sample L2 normalization when enabled.
                Vector normedForVal;
                if (athenaMLFeatureConfiguration.isNormalization()) {
                    normedForVal = normalizer.transform(Vectors.dense(values));
                } else {
                    normedForVal = Vectors.dense(values);
                }

                // NOTE(review): mutates a driver-side summary object inside a Spark map();
                // the mutation happens on executors and may not be visible on the driver
                // — confirm intended.
                logisticRegressionModelSummary.updateSummary(idx, feature);
                return new LabeledPoint(label, normedForVal);
            }
    );

    return parsedData;
}
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:LogisticRegressionDistJob.java

示例13: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Converts raw MongoDB records into labeled MLlib points for decision-tree training.
 * <p>
 * For every record, the configured target features are pulled out of the feature
 * sub-document, optionally weighted and made absolute, and assembled into a dense
 * vector. Samples containing a NaN/Infinity component are zeroed out and returned
 * immediately (without normalization and without touching the summary). Valid
 * samples are optionally L2-normalized and registered in the model summary.
 *
 * @param mongoRDD                      key/value pairs read from MongoDB; the value is the BSON document
 * @param athenaMLFeatureConfiguration  selects target features, weights, absolute/normalization flags
 * @param decisionTreeModelSummary      summary updated with every valid (non-error) sample
 * @param marking                       produces the classification label for each record
 * @return RDD of labeled feature vectors ready for training
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              DecisionTreeModelSummary decisionTreeModelSummary,
                                              Marking marking) {
    final List<AthenaFeatureField> targetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    final Map<AthenaFeatureField, Integer> featureWeights = athenaMLFeatureConfiguration.getWeight();
    final int featureCount = targetFeatures.size();
    final Normalizer normalizer = new Normalizer();

    return mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) record -> {
                BSONObject document = (BSONObject) record._2();
                BSONObject featureDoc = (BSONObject) document.get(AthenaFeatureField.FEATURE);
                int label = marking.checkClassificationMarkingElements(document, featureDoc);

                double[] vector = new double[featureCount];
                for (int k = 0; k < featureCount; k++) {
                    AthenaFeatureField field = targetFeatures.get(k);
                    if (!featureDoc.containsField(field.getValue())) {
                        continue; // absent feature stays at the default 0.0
                    }
                    Object raw = featureDoc.get(field.getValue());
                    double component;
                    if (raw instanceof Long) {
                        component = (Long) raw;
                    } else if (raw instanceof Double) {
                        component = (Double) raw;
                    } else if (raw instanceof Boolean) {
                        component = ((Boolean) raw) ? 1 : 0;
                    } else {
                        component = 0; // unsupported BSON type
                    }
                    // Apply optional per-feature weight, then optional absolute value.
                    if (featureWeights.containsKey(field)) {
                        component *= featureWeights.get(field);
                    }
                    if (athenaMLFeatureConfiguration.isAbsolute()) {
                        component = Math.abs(component);
                    }
                    vector[k] = component;
                }

                // A single NaN/Infinity invalidates the whole sample: zero every
                // component and bail out — no normalization, no summary update.
                for (double value : vector) {
                    if (Double.isInfinite(value) || Double.isNaN(value)) {
                        for (int k = 0; k < featureCount; k++) {
                            vector[k] = 0;
                        }
                        return new LabeledPoint(label, Vectors.dense(vector));
                    }
                }

                Vector features = athenaMLFeatureConfiguration.isNormalization()
                        ? normalizer.transform(Vectors.dense(vector))
                        : Vectors.dense(vector);

                decisionTreeModelSummary.updateSummary(document, featureDoc);
                return new LabeledPoint(label, features);
            }
    );
}
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:DecisionTreeDistJob.java

示例14: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Turns MongoDB records into labeled MLlib points for Naive Bayes training.
 * <p>
 * Each record's configured target features are extracted from its feature
 * sub-document, optionally weighted and made absolute, and packed into a dense
 * vector. A sample with any NaN/Infinity component is zeroed and returned
 * right away, skipping both normalization and the summary update. Valid
 * samples may be L2-normalized and are recorded in the model summary.
 *
 * @param mongoRDD                      key/value pairs read from MongoDB; the value is the BSON document
 * @param athenaMLFeatureConfiguration  selects target features, weights, absolute/normalization flags
 * @param naiveBayesModelSummary        summary updated with every valid (non-error) sample
 * @param marking                       produces the classification label for each record
 * @return RDD of labeled feature vectors ready for training
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              NaiveBayesModelSummary naiveBayesModelSummary,
                                              Marking marking) {
    final List<AthenaFeatureField> targetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    final Map<AthenaFeatureField, Integer> featureWeights = athenaMLFeatureConfiguration.getWeight();
    final Normalizer normalizer = new Normalizer();
    final int featureCount = targetFeatures.size();

    return mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) record -> {
                BSONObject document = (BSONObject) record._2();
                BSONObject featureDoc = (BSONObject) document.get(AthenaFeatureField.FEATURE);
                int label = marking.checkClassificationMarkingElements(document, featureDoc);

                double[] vector = new double[featureCount];
                for (int k = 0; k < featureCount; k++) {
                    AthenaFeatureField field = targetFeatures.get(k);
                    if (!featureDoc.containsField(field.getValue())) {
                        continue; // absent feature stays at the default 0.0
                    }
                    Object raw = featureDoc.get(field.getValue());
                    double component;
                    if (raw instanceof Long) {
                        component = (Long) raw;
                    } else if (raw instanceof Double) {
                        component = (Double) raw;
                    } else if (raw instanceof Boolean) {
                        component = ((Boolean) raw) ? 1 : 0;
                    } else {
                        component = 0; // unsupported BSON type
                    }
                    // Apply optional per-feature weight, then optional absolute value.
                    if (featureWeights.containsKey(field)) {
                        component *= featureWeights.get(field);
                    }
                    if (athenaMLFeatureConfiguration.isAbsolute()) {
                        component = Math.abs(component);
                    }
                    vector[k] = component;
                }

                // A single NaN/Infinity invalidates the whole sample: zero every
                // component and bail out — no normalization, no summary update.
                for (double value : vector) {
                    if (Double.isInfinite(value) || Double.isNaN(value)) {
                        for (int k = 0; k < featureCount; k++) {
                            vector[k] = 0;
                        }
                        return new LabeledPoint(label, Vectors.dense(vector));
                    }
                }

                Vector features = athenaMLFeatureConfiguration.isNormalization()
                        ? normalizer.transform(Vectors.dense(vector))
                        : Vectors.dense(vector);

                naiveBayesModelSummary.updateSummary(document, featureDoc);
                return new LabeledPoint(label, features);
            }
    );
}
 
开发者ID:shlee89,项目名称:athena,代码行数:66,代码来源:NaiveBayesDistJob.java

示例15: rddPreProcessing

import org.apache.spark.api.java.JavaPairRDD; //导入方法依赖的package包/类
/**
 * Prepares MongoDB records as labeled MLlib points for ridge-regression training.
 * <p>
 * The configured target features are read from each record's feature
 * sub-document, optionally weighted and made absolute, and collected into a
 * dense vector. Any sample containing a NaN/Infinity component is zeroed and
 * returned immediately — it bypasses normalization and the summary update.
 * Valid samples may be L2-normalized and are registered in the model summary.
 *
 * @param mongoRDD                      key/value pairs read from MongoDB; the value is the BSON document
 * @param athenaMLFeatureConfiguration  selects target features, weights, absolute/normalization flags
 * @param ridgeRegressionModelSummary   summary updated with every valid (non-error) sample
 * @param marking                       produces the classification label for each record
 * @return RDD of labeled feature vectors ready for training
 */
public JavaRDD<LabeledPoint> rddPreProcessing(JavaPairRDD<Object, BSONObject> mongoRDD,
                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                              RidgeRegressionModelSummary ridgeRegressionModelSummary,
                                              Marking marking) {
    final List<AthenaFeatureField> targetFeatures = athenaMLFeatureConfiguration.getListOfTargetFeatures();
    final Map<AthenaFeatureField, Integer> featureWeights = athenaMLFeatureConfiguration.getWeight();
    final int featureCount = targetFeatures.size();
    final Normalizer normalizer = new Normalizer();

    return mongoRDD.map(
            (Function<Tuple2<Object, BSONObject>, LabeledPoint>) record -> {
                BSONObject document = (BSONObject) record._2();
                BSONObject featureDoc = (BSONObject) document.get(AthenaFeatureField.FEATURE);
                int label = marking.checkClassificationMarkingElements(document, featureDoc);

                double[] vector = new double[featureCount];
                for (int k = 0; k < featureCount; k++) {
                    AthenaFeatureField field = targetFeatures.get(k);
                    if (!featureDoc.containsField(field.getValue())) {
                        continue; // absent feature stays at the default 0.0
                    }
                    Object raw = featureDoc.get(field.getValue());
                    double component;
                    if (raw instanceof Long) {
                        component = (Long) raw;
                    } else if (raw instanceof Double) {
                        component = (Double) raw;
                    } else if (raw instanceof Boolean) {
                        component = ((Boolean) raw) ? 1 : 0;
                    } else {
                        component = 0; // unsupported BSON type
                    }
                    // Apply optional per-feature weight, then optional absolute value.
                    if (featureWeights.containsKey(field)) {
                        component *= featureWeights.get(field);
                    }
                    if (athenaMLFeatureConfiguration.isAbsolute()) {
                        component = Math.abs(component);
                    }
                    vector[k] = component;
                }

                // A single NaN/Infinity invalidates the whole sample: zero every
                // component and bail out — no normalization, no summary update.
                for (double value : vector) {
                    if (Double.isInfinite(value) || Double.isNaN(value)) {
                        for (int k = 0; k < featureCount; k++) {
                            vector[k] = 0;
                        }
                        return new LabeledPoint(label, Vectors.dense(vector));
                    }
                }

                Vector features = athenaMLFeatureConfiguration.isNormalization()
                        ? normalizer.transform(Vectors.dense(vector))
                        : Vectors.dense(vector);

                ridgeRegressionModelSummary.updateSummary(document, featureDoc);
                return new LabeledPoint(label, features);
            }
    );
}
 
开发者ID:shlee89,项目名称:athena,代码行数:68,代码来源:RidgeRegressionDistJob.java


注:本文中的org.apache.spark.api.java.JavaPairRDD.map方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。