当前位置: 首页>>代码示例>>Java>>正文


Java JavaRDD.mapToPair方法代码示例

本文整理汇总了Java中org.apache.spark.api.java.JavaRDD.mapToPair方法的典型用法代码示例。如果您正苦于以下问题:Java JavaRDD.mapToPair方法的具体用法?Java JavaRDD.mapToPair怎么用?Java JavaRDD.mapToPair使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.spark.api.java.JavaRDD的用法示例。


在下文中一共展示了JavaRDD.mapToPair方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Spark word count: reads the text file at {@code args[0]}, counts word
 * occurrences, and writes the (word, count) pairs to {@code args[1]}_&lt;millis&gt;.
 * Exits with status 1 when the two required arguments are missing.
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    // try-with-resources: JavaSparkContext is Closeable, so the context is
    // stopped even if the job throws (the original leaked it).
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        JavaRDD<String> textFile = sc.textFile(args[0]);
        JavaRDD<String> words = textFile.flatMap(LineIterator::new);
        JavaPairRDD<String, Long> pairs =
                words.mapToPair(s -> new Tuple2<>(s, 1L));
        // The Function2 cast was redundant: reduceByKey's signature already
        // gives the lambda its target type.
        JavaPairRDD<String, Long> counts = pairs.reduceByKey((a, b) -> a + b);

        System.out.println("Starting task..");
        long t = System.currentTimeMillis();
        // Suffix the output path with a timestamp so reruns don't collide.
        counts.saveAsTextFile(args[1] + "_" + t);
        System.out.println("Time=" + (System.currentTimeMillis() - t));
    }
}
 
开发者ID:hazelcast,项目名称:big-data-benchmark,代码行数:23,代码来源:SparkWordCount.java

示例2: rmse

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Key each held-out rating by its (user, product) pair.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct = testData.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));
  // The Scala predict API takes an RDD of (Object, Object) pairs; the double
  // cast through RDD<?> is the standard way to bridge the erased types.
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();
  JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProducts));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> predictedByUserProduct = predicted.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));
  // Join predicted with actual on (user, product), average the squared
  // differences, and take the square root.
  double meanSquaredError = predictedByUserProduct
      .join(actualByUserProduct)
      .values()
      .mapToDouble(predictedAndActual -> {
        double delta = predictedAndActual._1() - predictedAndActual._2();
        return delta * delta;
      })
      .mean();
  return Math.sqrt(meanSquaredError);
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:19,代码来源:Evaluation.java

示例3: mapSAMRecordsToFastq

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {
    // Turn each aligned read back into a FASTQ (name, sequence+quality) pair.
    return bamRDD.mapToPair(record -> {
      String readName = record.getReadName();
      // Paired-end reads get the conventional /1 (first) and /2 (second) suffixes.
      if (record.getReadPairedFlag()) {
        if (record.getFirstOfPairFlag()) {
          readName = readName + "/1";
        }
        if (record.getSecondOfPairFlag()) {
          readName = readName + "/2";
        }
      }

      //TODO: check values
      SequencedFragment fragment = new SequencedFragment();
      fragment.setSequence(new Text(record.getReadString()));
      fragment.setQuality(new Text(record.getBaseQualityString()));

      return new Tuple2<>(new Text(readName), fragment);
    });
  }
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:23,代码来源:SamToFastq.java

示例4: mapSAMRecordsToFastq

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {

        // Rebuild FASTQ records (read name -> sequence/quality) from SAM records.
        return bamRDD.mapToPair(samRecord -> {

            String fastqName = samRecord.getReadName();
            // Mate information is encoded in the name via /1 and /2 suffixes.
            if (samRecord.getReadPairedFlag()) {
                if (samRecord.getFirstOfPairFlag()) {
                    fastqName = fastqName + "/1";
                }
                if (samRecord.getSecondOfPairFlag()) {
                    fastqName = fastqName + "/2";
                }
            }

            //TODO: check values
            SequencedFragment fragment = new SequencedFragment();
            fragment.setSequence(new Text(samRecord.getReadString()));
            fragment.setQuality(new Text(samRecord.getBaseQualityString()));

            return new Tuple2<>(new Text(fastqName), fragment);
        });
    }
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:24,代码来源:HDFSWriter.java

示例5: readsToWritable

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Pairs each SAM read with a Hadoop-serializable {@code SAMRecordWritable}
 * wrapper of itself, stamping the broadcast header onto every read first.
 *
 * @param records reads to wrap
 * @param header  broadcast SAM file header applied to each read via {@code setHeaderStrict}
 * @return RDD of (original read, writable copy) pairs
 */
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
    return records.mapToPair(read -> {

        //SEQUENCE DICTIONARY must be set here for the alignment because it's not given as header file
        //Set in alignment to sam map phase
        // NOTE(review): these calls mutate the executor-local copy of the
        // broadcast value; Spark does not propagate such changes back to the
        // driver or to other executors -- confirm that per-executor scope is
        // the intended behavior here.
        if(header.getValue().getSequenceDictionary()==null) header.getValue().setSequenceDictionary(new SAMSequenceDictionary());
        // Lazily register each reference sequence the first time a read mentions it.
        if(header.getValue().getSequenceDictionary().getSequence(read.getReferenceName())==null)
            header.getValue().getSequenceDictionary().addSequence(new SAMSequenceRecord(read.getReferenceName()));

        //read.setHeader(read.getHeader());
        read.setHeaderStrict(header.getValue());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:17,代码来源:HDFSWriter.java

示例6: partition

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Partition instances by the specified partitioning (e.g. by instance type)
 *
 * @param instances RDD of instances to partition
 * @return partitioned RDD if requested, original RDD if no partitioning is specified
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (!config.isRepartitionByType()) {
        return instances;
    }
    log.info("Getting counts by type hash");
    Map<Integer, Long> countsByTypeHash = getApproximateTypeHashCounts(instances);
    int partitionCount = instances.getNumPartitions();
    long instanceCount = instances.count();
    // Ceiling-ish target size; the +1 guards against a zero target.
    long targetPerPartition = instanceCount / partitionCount + 1;

    // Key each instance by type hash plus a split increment so that types
    // larger than one partition spread over several partition keys.
    JavaPairRDD<Integer, Instance> keyedByType = instances.mapToPair(instance -> {
        int typeHash = getTypeHash(instance);
        int splitIncrement =
                getSplitIncrement(instance.getId(), countsByTypeHash.get(typeHash), targetPerPartition);
        return new Tuple2<>(typeHash + splitIncrement, instance);
    });

    log.info("Partitioning instances by type");
    return keyedByType
            .partitionBy(new HashPartitioner(partitionCount))
            .values();
}
 
开发者ID:Merck,项目名称:rdf2x,代码行数:28,代码来源:InstancePartitioner.java

示例7: readsToWritableNoRef

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoRef(JavaRDD<SAMRecord> records) {
    // Wrap each read in a Hadoop-serializable SAMRecordWritable, keyed by the read itself.
    return records.mapToPair(samRecord -> {
        // NOTE(review): re-setting the read's own header looks like a no-op;
        // presumably it refreshes header-derived state before serialization -- confirm intent.
        samRecord.setHeader(samRecord.getHeader());
        SAMRecordWritable writable = new SAMRecordWritable();
        writable.set(samRecord);
        return new Tuple2<>(samRecord, writable);
    });
}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:10,代码来源:HDFSWriter.java

示例8: readsToWritableNoHeader

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoHeader(JavaRDD<SAMRecord> records) {
    // Pair every read with a Hadoop-serializable wrapper of itself, leaving
    // whatever header the read already carries untouched.
    return records.mapToPair(samRecord -> {
        SAMRecordWritable writable = new SAMRecordWritable();
        writable.set(samRecord);
        return new Tuple2<>(samRecord, writable);
    });
}
 
开发者ID:NGSeq,项目名称:ViraPipe,代码行数:8,代码来源:HDFSWriter.java

示例9: parallizeData

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Distributes the given (key, value) tuples across the cluster as a pair RDD.
 *
 * @param spark          driver wrapper exposing the JavaSparkContext ({@code spark.sc})
 * @param datasetContent tuples to parallelize
 * @return the same tuples as a {@code JavaPairRDD}
 */
private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) {

    JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent);

    // Identity mapping: each element already is the desired (key, value) pair.
    // mapToPair only converts JavaRDD<Tuple2<...>> into a JavaPairRDD; the
    // original anonymous PairFunction (plus serialVersionUID boilerplate) is
    // replaced by an equivalent lambda, matching the style used elsewhere.
    return datasetContentRDD.mapToPair(term -> term);
  }
 
开发者ID:apache,项目名称:incubator-sdap-mudrod,代码行数:18,代码来源:MetadataOpt.java

示例10: getGroundTruthIds

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * @deprecated use {@link #getGroundTruthIdsFromEntityIds(JavaRDD, JavaRDD,JavaRDD, String)} instead
 * Return the ground truth in an RDD format, each entity represented with an integer entity id. 
 * @param rawTriples1
 * @param rawTriples2
 * @param RAW_TRIPLES_SEPARATOR
 * @param gt
 * @param GT_SEPARATOR
 * @return 
 */
public static JavaPairRDD<Integer,Integer> getGroundTruthIds (JavaRDD<String> rawTriples1, JavaRDD<String> rawTriples2, String RAW_TRIPLES_SEPARATOR, JavaRDD<String> gt, String GT_SEPARATOR) {
    // Build string -> int id mappings for each entity collection.
    Object2IntOpenHashMap<String> entityIds1 = getEntityIdsMapping(rawTriples1, RAW_TRIPLES_SEPARATOR);
    Object2IntOpenHashMap<String> entityIds2 = getEntityIdsMapping(rawTriples2, RAW_TRIPLES_SEPARATOR);

    // Each ground-truth line holds a matching pair "entity1<sep>entity2".
    return gt.mapToPair(line -> {
                String[] matchedPair = line.split(GT_SEPARATOR);
                int negativeId = -entityIds2.getOrDefault(matchedPair[1], 1); //negative id first
                int positiveId = entityIds1.getOrDefault(matchedPair[0], -1); //positive id second
                return new Tuple2<>(negativeId, positiveId);
            });
}
 
开发者ID:vefthym,项目名称:MinoanER,代码行数:21,代码来源:Utils.java

示例11: readGroundTruthIds

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Return the ground truth in an RDD format, each entity represented with an integer entity id.      
 * @param gt a ground truth file containing matching entities' ids, separated by GT_SEPARATOR
 * @param GT_SEPARATOR
 * @return 
 */
public static JavaPairRDD<Integer,Integer> readGroundTruthIds (JavaRDD<String> gt, String GT_SEPARATOR) {
    // Map each "id1<sep>id2" line to (-id2-1, id1); shifting the second id
    // into the strictly-negative range keeps the two id spaces disjoint.
    return gt.mapToPair(line -> {
                String[] idPair = line.split(GT_SEPARATOR);
                int firstEntityId = Integer.parseInt(idPair[0]);
                int secondEntityId = Integer.parseInt(idPair[1]);
                return new Tuple2<>(-secondEntityId - 1, firstEntityId);
            });
}
 
开发者ID:vefthym,项目名称:MinoanER,代码行数:15,代码来源:Utils.java

示例12: parsedToRatingRDD

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * @param parsedRDD parsed input as {@code String[]}
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<Map<String,Integer>> bItemIDToIndex) {
  // Token layout: [userID, itemID, value, timestamp].
  JavaPairRDD<Long,Rating> byTimestamp = parsedRDD.mapToPair(fields -> {
    try {
      Long timestamp = Long.valueOf(fields[3]);
      Rating rating = new Rating(bUserIDToIndex.value().get(fields[0]),
                                 bItemIDToIndex.value().get(fields[1]),
                                 // Empty value means 'delete'; propagate as NaN
                                 fields[2].isEmpty() ? Double.NaN : Double.parseDouble(fields[2]));
      return new Tuple2<>(timestamp, rating);
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      // Log the offending row, then let the malformed-input error propagate.
      log.warn("Bad input: {}", Arrays.toString(fields));
      throw e;
    }
  });

  if (decayFactor < 1.0) {
    // Copy fields to locals so the lambdas capture effectively-final values.
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    byTimestamp = byTimestamp.mapToPair(entry -> {
        long timestamp = entry._1();
        return new Tuple2<>(timestamp, decayRating(entry._2(), timestamp, now, factor));
      });
  }

  if (decayZeroThreshold > 0.0) {
    double threshold = decayZeroThreshold;
    // Drop ratings that decay has pushed below the cutoff.
    byTimestamp = byTimestamp.filter(entry -> entry._2().rating() > threshold);
  }

  return byTimestamp.sortByKey().values();
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:38,代码来源:ALSUpdate.java

示例13: aggregateScores

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<Rating> original, double epsilon) {
  // Key every rating by its (user, product) pair.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> keyed =
      original.mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = keyed.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    combined = keyed.foldByKey(Double.NaN, (current, next) -> next);
  }

  // NaN marks deleted entries; drop them before building Ratings.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> surviving =
      combined.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    // Dampen large scores: log(1 + score/epsilon).
    return surviving.map(entry -> new Rating(
        entry._1()._1(),
        entry._1()._2(),
        Math.log1p(entry._2() / epsilon)));
  }
  return surviving.map(entry -> new Rating(
      entry._1()._1(),
      entry._1()._2(),
      entry._2()));
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:34,代码来源:ALSUpdate.java

示例14: readFeaturesRDD

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  // Each line is a JSON array of the form [key, [f0, f1, ...]].
  return sparkContext.textFile(path.toString()).mapToPair(jsonLine -> {
    List<?> keyAndVector = TextUtils.readJSON(jsonLine, List.class);
    String featureKey = keyAndVector.get(0).toString();
    float[] featureVector = TextUtils.convertViaJSON(keyAndVector.get(1), float[].class);
    return new Tuple2<>(featureKey, featureVector);
  });
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:11,代码来源:ALSUpdate.java

示例15: knownsRDD

import org.apache.spark.api.java.JavaRDD; //导入方法依赖的package包/类
/**
 * Builds, for each user (or item), the collection of items (or users) it is
 * currently associated with, replaying adds and deletes in timestamp order.
 *
 * @param allData    input rows as {@code [user, item, value, timestamp]} tokens
 * @param knownItems if true, key by user and collect items; if false, key by
 *                   item and collect users
 * @return RDD of (ID, set of associated IDs)
 */
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  // Sort globally by timestamp (datum[3]) so later deletes override earlier adds.
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());

  // An empty value field (datum[2]) marks a delete event.
  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
      String user = datum[0];
      String item = datum[1];
      Boolean delete = datum[2].isEmpty();
      return knownItems ?
          new Tuple2<>(user, new Tuple2<>(item, delete)) :
          new Tuple2<>(item, new Tuple2<>(user, delete));
    });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  // NOTE(review): correctness relies on groupByKey presenting values in the
  // sorted (timestamp) order produced above -- the TODO suggests this ordering
  // guarantee is already in doubt; verify before relying on it.
  return tuples.groupByKey().mapValues(idDeletes -> {
      // Replay the event stream: a delete removes a previously added ID.
      Collection<String> ids = new HashSet<>();
      for (Tuple2<String,Boolean> idDelete : idDeletes) {
        if (idDelete._2()) {
          ids.remove(idDelete._1());
        } else {
          ids.add(idDelete._1());
        }
      }
      return ids;
    });
}
 
开发者ID:oncewang,项目名称:oryx2,代码行数:28,代码来源:ALSUpdate.java


注:本文中的org.apache.spark.api.java.JavaRDD.mapToPair方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。