

Java HashPartitioner Class Code Examples

This article collects typical usage examples of the Java class org.apache.spark.HashPartitioner. If you are wondering what exactly the HashPartitioner class does, how to use it, or what real-world usage looks like, the curated code examples below should help.


The HashPartitioner class belongs to the org.apache.spark package. Fifteen code examples of the class are shown below, ordered by popularity.
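All fifteen examples revolve around the same basic call, pairRDD.partitionBy(new HashPartitioner(n)). As a warm-up, here is a minimal, self-contained sketch of that call; the class name and data are illustrative only and are not taken from any of the projects below.

import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class HashPartitionerBasics {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("HashPartitionerBasics");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // A small pair RDD of (word, count) tuples.
            JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));

            // Redistribute into 4 partitions; records whose keys have the same
            // hash always land in the same partition.
            JavaPairRDD<String, Integer> partitioned = pairs.partitionBy(new HashPartitioner(4));

            System.out.println(partitioned.getNumPartitions());        // 4
            System.out.println(partitioned.partitioner().isPresent()); // true
        }
    }
}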

Example 1: partition

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Partition instances by the specified partitioning (e.g. by instance type)
 *
 * @param instances RDD of instances to partition
 * @return partitioned RDD if requested, original RDD if no partitioning is specified
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (!config.isRepartitionByType()) {
        return instances;
    }
    log.info("Getting counts by type hash");
    Map<Integer, Long> typeCounts = getApproximateTypeHashCounts(instances);
    int numPartitions = instances.getNumPartitions();
    long totalInstances = instances.count();
    long instancesPerPartition = totalInstances / numPartitions + 1;

    JavaPairRDD<Integer, Instance> instanceWithPartitions = instances.mapToPair(instance -> {
        int typeHash = getTypeHash(instance);
        int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition);
        return new Tuple2<>(typeHash + splitIncrement, instance);
    });

    log.info("Partitioning instances by type");
    return instanceWithPartitions
            .partitionBy(new HashPartitioner(numPartitions))
            .values();
}
 
Developer: Merck, Project: rdf2x, Lines: 28, Source: InstancePartitioner.java
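The splitIncrement added to the type hash is what keeps one very large type from being squeezed into a single partition: instances of a type normally share a key (and therefore a partition), but a type with more instances than instancesPerPartition is spread over several consecutive keys. The helper below is only a sketch of that idea written for this article, assuming a numeric instance id; it is not the actual rdf2x getSplitIncrement implementation.

// Hypothetical sketch only -- not the rdf2x code.
static int getSplitIncrement(long instanceId, long typeCount, long instancesPerPartition) {
    long splits = typeCount / instancesPerPartition + 1; // number of hash buckets this type needs
    return (int) (instanceId % splits);                  // deterministic bucket for this instance
}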

Example 2: instantiateWorkers

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
 * instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
 */
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(targetBlockStream()
                .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                .collect(Collectors.toList()), numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
 
Developer: broadinstitute, Project: gatk-protected, Lines: 21, Source: CoverageModelEMWorkspace.java

Example 3: getPartitioner

import org.apache.spark.HashPartitioner; // import the required package/class
private Partitioner getPartitioner(JavaPairRDD<Row, Row> keyedArriving) {    
  if (hasPartitioner()) {
    Config partitionerConfig = config.getConfig("partitioner");      
    return PartitionerFactory.create(partitionerConfig, keyedArriving); 
  }
  else {
    return new HashPartitioner(keyedArriving.getNumPartitions());
  }
}
 
Developer: cloudera-labs, Project: envelope, Lines: 10, Source: DataStep.java

Example 4: testHash

import org.apache.spark.HashPartitioner; // import the required package/class
@Test
public void testHash() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "hash");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof HashPartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Developer: cloudera-labs, Project: envelope, Lines: 13, Source: TestPartitionerFactory.java

Example 5: groupByKeyOnly

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * An implementation of
 * {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
 * for the Spark runner.
 */
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(
    JavaRDD<WindowedValue<KV<K, V>>> rdd,
    Coder<K> keyCoder,
    WindowedValueCoder<V> wvCoder) {
  // we use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  JavaPairRDD<ByteArray, byte[]> pairRDD =
      rdd
          .map(new ReifyTimestampsAndWindowsFunction<K, V>())
          .map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction())
          .mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction())
          .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
  // use a default parallelism HashPartitioner.
  Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());

  // using mapPartitions allows to preserve the partitioner
  // and avoid unnecessary shuffle downstream.
  return pairRDD
      .groupByKey(partitioner)
      .mapPartitionsToPair(
          TranslationUtils.pairFunctionToPairFlatMapFunction(
              CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)),
          true)
      .mapPartitions(
          TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true)
      .mapPartitions(
          TranslationUtils.functionToFlatMapFunction(
              WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()),
          true);
}
 
Developer: apache, Project: beam, Lines: 36, Source: GroupCombineFunctions.java
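The comment in the Beam code is the essential point: groupByKey is given an explicit HashPartitioner sized by the default parallelism, and the subsequent mapPartitions calls pass preservesPartitioning=true, so no further shuffle is needed downstream. The stand-alone sketch below illustrates the same idea with mapValues; the class and method names are made up for this article and are not Beam code.

import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionerPreservation {
    /** Groups with an explicit HashPartitioner and keeps it through mapValues. */
    static JavaPairRDD<String, Integer> groupSizes(JavaSparkContext jsc,
                                                   JavaPairRDD<String, Integer> pairs) {
        // Size the partitioner by the cluster's default parallelism, as above.
        Partitioner partitioner = new HashPartitioner(jsc.sc().defaultParallelism());

        // mapValues leaves the keys untouched, so the partitioner set by groupByKey
        // survives and later key-based operations using the same partitioner
        // avoid another shuffle.
        return pairs
                .groupByKey(partitioner)
                .mapValues(values -> {
                    int n = 0;
                    for (Integer ignored : values) {
                        n++;
                    }
                    return n;
                });
    }
}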

Example 6: Unbounded

import org.apache.spark.HashPartitioner; // import the required package/class
public Unbounded(SparkContext sc,
    SerializablePipelineOptions options,
    MicrobatchSource<T, CheckpointMarkT> microbatchSource,
    int initialNumPartitions) {
  super(sc, NIL,
      JavaSparkContext$.MODULE$.<scala.Tuple2<Source<T>, CheckpointMarkT>>fakeClassTag());
  this.options = options;
  this.microbatchSource = microbatchSource;
  this.partitioner = new HashPartitioner(initialNumPartitions);
}
 
Developer: apache, Project: beam, Lines: 11, Source: SourceRDD.java

Example 7: fetchCopyRatioEmissionDataSpark

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Creates a {@link JavaPairRDD} of (sample index, emission data list)
 * @return a {@link JavaPairRDD}
 */
private JavaPairRDD<Integer, List<CoverageModelCopyRatioEmissionData>> fetchCopyRatioEmissionDataSpark() {
    final int numSamples = this.numSamples;

    return computeRDD
            /* flat map workers to a list of [sample index, [target block, emission data on target block]] pairs */
            .flatMapToPair(tuple -> {
                final LinearlySpacedIndexBlock tb = tuple._1;
                final CoverageModelEMComputeBlock cb = tuple._2;
                final List<List<CoverageModelCopyRatioEmissionData>> el = cb.getSampleCopyRatioLatentPosteriorData();
                return IntStream.range(0, numSamples)
                        .mapToObj(si -> new Tuple2<>(si, new Tuple2<>(tb, el.get(si))))
                        .iterator();
            })
            /* combine elements with the same sample index */
            .combineByKey(
                    /* create a new list */
                    Collections::singletonList,
                    /* recipe to add an element to the list */
                    (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                            .collect(Collectors.toList()),
                    /* recipe to concatenate two lists */
                    (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
                    /* repartition with respect to sample indices */
                    new HashPartitioner(numSamples))
            /* sort the [target block, emission data on target block] chunks for each sample into a single list */
            .mapValues(emissionBlocksList -> emissionBlocksList.stream() /* for each partition ... */
                    /* sort the data blocks */
                    .sorted(Comparator.comparingInt(t -> t._1.getBegIndex()))
                    /* remove the LinearlySpacedIndexBlock keys from the sorted emissionBlocksList */
                    .map(p -> p._2)
                    /* flatten */
                    .flatMap(List::stream)
                    /* collect as a single list */
                    .collect(Collectors.toList()));
}
 
Developer: broadinstitute, Project: gatk-protected, Lines: 40, Source: CoverageModelEMWorkspace.java

Example 8: shuffleExamples

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Randomly shuffle the examples in each DataSet object, and recombine them into new DataSet objects
 * with the specified BatchSize
 *
 * @param rdd DataSets to shuffle/recombine
 * @param newBatchSize New batch size for the DataSet objects, after shuffling/recombining
 * @param numPartitions Number of partitions to use when splitting/recombining
 * @return A new {@link JavaRDD<DataSet>}, with the examples shuffled/combined in each
 */
public static JavaRDD<DataSet> shuffleExamples(JavaRDD<DataSet> rdd, int newBatchSize, int numPartitions) {
    //Step 1: split into individual examples, mapping to a pair RDD (random key in range 0 to numPartitions)

    JavaPairRDD<Integer, DataSet> singleExampleDataSets =
                    rdd.flatMapToPair(new SplitDataSetExamplesPairFlatMapFunction(numPartitions));

    //Step 2: repartition according to the random keys
    singleExampleDataSets = singleExampleDataSets.partitionBy(new HashPartitioner(numPartitions));

    //Step 3: Recombine
    return singleExampleDataSets.values().mapPartitions(new BatchDataSetsFunction(newBatchSize));
}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 22, Source: SparkUtils.java
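The three-step pattern in shuffleExamples (attach a random key, partitionBy, recombine) is not specific to DataSet objects. Below is a minimal sketch of the same pattern for an arbitrary element type; the class and method names are illustrative and not part of deeplearning4j.

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

import java.util.Random;

public class RandomShuffleSketch {
    /** Spreads elements roughly evenly across numPartitions partitions. */
    static <T> JavaRDD<T> shuffle(JavaRDD<T> input, int numPartitions) {
        // Step 1: attach a random key in [0, numPartitions).
        JavaPairRDD<Integer, T> keyed = input.mapToPair(
                element -> new Tuple2<>(new Random().nextInt(numPartitions), element));

        // Step 2: move each element to the partition its key hashes to.
        // Step 3: drop the keys again.
        return keyed.partitionBy(new HashPartitioner(numPartitions)).values();
    }
}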

Example 9: extract

import org.apache.spark.HashPartitioner; // import the required package/class
public void extract(JavaPairRDD<Double, Multiset<String>> trainingDocs) {
	// .filter(word -> word._2 > 6 && word._2 < 1000);
	JavaPairRDD<Double, Multiset<String>> wordsClass0 = trainingDocs.filter(row -> row._1.equals(0.0)).cache();
	JavaPairRDD<Double, Multiset<String>> wordsClass1 = trainingDocs.filter(row -> row._1.equals(1.0)).cache();

	JavaPairRDD<String, Integer> idfClass0 = extractWordCount(wordsClass0).cache();
	JavaPairRDD<String, Integer> idfClass1 = extractWordCount(wordsClass1).cache();

	JavaPairRDD<String, Integer> idfWords = idfClass0.fullOuterJoin(idfClass1).mapToPair(joined -> {
		String word = joined._1;
		Integer left = joined._2()._1().orNull();
		Integer right = joined._2()._2().orNull();
		int count;

		if (left != null && right != null) {
			count = left.intValue() + right.intValue();
		} else {
			count = left != null ? left.intValue() : right.intValue();
		}

		return new Tuple2<>(word, count);
	}).partitionBy(new HashPartitioner(100)).filter(word -> word._2 > 10).cache();

	if (isVerbose()) {
		this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idf-final.csv", idfWords);
		this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idfClass0.csv", idfClass0);
		this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idfClass1.csv", idfClass1);
		this.printIDFInfo(idfWords);
	}

	wordsClass0.unpersist();
	wordsClass1.unpersist();
	idfClass0.unpersist();
	idfClass1.unpersist();

	JavaPairRDD<String, Tuple2<Integer, Long>> idfRdd = idfWords.zipWithIndex()
			.mapToPair(row -> new Tuple2<>(row._1._1, new Tuple2<>(row._1._2, row._2)));
	idf = idfRdd.collectAsMap();
	idfWords.unpersist();

	featuresCount = idf.size();
}
 
Developer: mhardalov, Project: news-credibility, Lines: 43, Source: TFIDFTransform.java

Example 10: extractWordCount

import org.apache.spark.HashPartitioner; // import the required package/class
private JavaPairRDD<String, Integer> extractWordCount(JavaPairRDD<Double, Multiset<String>> wordsClass0) {
	return wordsClass0.flatMap(row -> row._2()).mapToPair(word -> new Tuple2<>(word, 1))
			.partitionBy(new HashPartitioner(10)).reduceByKey((a, b) -> a + b);
}
 
Developer: mhardalov, Project: news-credibility, Lines: 5, Source: TFIDFTransform.java
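Both TFIDFTransform examples hash-partition the words immediately before a key-based aggregation. Because reduceByKey called without a partitioner falls back to the parent RDD's partitioner, the pre-partitioned data is aggregated without a second shuffle. The snippet below is a small illustration of that behaviour with made-up names, not project code.

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;

public class PartitionerReuseCheck {
    static void check(JavaPairRDD<String, Integer> wordCounts) {
        JavaPairRDD<String, Integer> partitioned =
                wordCounts.partitionBy(new HashPartitioner(10));

        // No partitioner argument: reduceByKey reuses the parent's HashPartitioner,
        // so the aggregation happens within the existing partitions.
        JavaPairRDD<String, Integer> reduced = partitioned.reduceByKey(Integer::sum);

        // Both RDDs report an equal HashPartitioner (equality is by partition count).
        System.out.println(partitioned.partitioner().get().equals(reduced.partitioner().get()));
    }
}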

Example 11: joinWithWorkersAndMap

import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * A generic function for handling a blockified list of objects to their corresponding compute nodes
 *
 * If Spark is enabled:
 *
 *      Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the provided
 *      map {@code mapper} on the RDD, and the reference to the old RDD will be replaced with the new RDD.
 *
 * If Spark is disabled:
 *
 *      Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 *      {@code mapper} will be called on the value contained in {@code data} and {@link #localComputeBlock}, and
 *      the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 *      by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a mapper binary function that takes a compute block together with an object of type {@code V} and
 *               returns a new compute block
 * @param <V> the type of the object to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
                                       @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
    if (sparkContextIsAvailable) {
        final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD =
                ctx.parallelizePairs(data, numTargetBlocks).partitionBy(new HashPartitioner(numTargetBlocks));
        computeRDD = computeRDD.join(newRDD).mapValues(mapper);
    } else {
        try {
            Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
            localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
        } catch (Exception e) {
            throw new RuntimeException("Can not apply the map function to the local compute block: " + e.getMessage());
        }
    }
}
 
Developer: broadinstitute, Project: gatk-protected, Lines: 37, Source: CoverageModelEMWorkspace.java

Example 12: repartitionReadsByName

import org.apache.spark.HashPartitioner; // import the required package/class
static JavaRDD<GATKRead> repartitionReadsByName(final JavaRDD<GATKRead> reads, final int numPartitions) {
    //Shuffle reads into partitions by read name hash code
    return reads.mapToPair(read -> new Tuple2<>(read.getName(), read))
            .partitionBy(new HashPartitioner(numPartitions))
            .map(Tuple2::_2);
}
 
Developer: broadinstitute, Project: gatk, Lines: 7, Source: PSFilter.java

Example 13: main

import org.apache.spark.HashPartitioner; // import the required package/class
public static void main(String[] args) throws Exception {

		SparkConf sparkConf = new SparkConf();
		//sparkConf.setMaster("local");
		sparkConf.setAppName("TestSpark");		
		JavaSparkContext jsc = new JavaSparkContext(sparkConf);

		//HBaseConnection connection = new HBaseConnection(Consts.HBASE_MASTER, Consts.ZOOKKEEPER_QUORUM);		
		Configuration hbaseConfig = initializeHBaseConfig();
		long start = System.currentTimeMillis();		
		
		//hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_START, "1444194420157");
		hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_START, "0");
		hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_END, "1444194520157");
		
		JavaPairRDD<ImmutableBytesWritable, Result> rdd = jsc.newAPIHadoopRDD(hbaseConfig, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
		rdd = rdd.partitionBy(new HashPartitioner(10));
		
		JavaRDD<Map<String,String>> events = rdd.map(new Function<Tuple2<ImmutableBytesWritable, Result>, Map<String,String>>() {
			@Override
			public Map<String,String> call(Tuple2<ImmutableBytesWritable, Result> rowData) throws Exception {
				SentinelEvent ev = new SentinelEvent(new SentinelEventDecoder().fromBytes(rowData._2.value()));				
				return ev.valueMap;
			}
		});
		events = events.repartition(10);
		
		JavaRDD<Map<String,String>> persist = events.persist(StorageLevel.MEMORY_AND_DISK_2());		
		long total  = persist.count();		
		System.out.println("Toal " + total + " records read");	
		
		try {
			//JavaEsSpark.saveToEs(events, "events2/event");
			System.out.println(persist.take((int)total));
		}
		catch(Exception es) {
			System.err.println("****** Exception: " + es.toString());
		}
		
		long sec = (System.currentTimeMillis() - start)/1000;
		System.out.println("Took " + sec + " seconds at the rate of " + total/sec + " EPS");
			
		persist.unpersist();					
	}
 
Developer: atulsm, Project: Test_Projects, Lines: 45, Source: HbaseReadTimeSeries.java

Example 14: main

import org.apache.spark.HashPartitioner; // import the required package/class
public static void main(String[] args) {
	long start = System.currentTimeMillis();
	SparkConf sparkConf = new SparkConf();
	sparkConf.setMaster("local");
	sparkConf.setAppName("TestSpark");

	JavaSparkContext sc = new JavaSparkContext(sparkConf);
	JavaRDD<String> input = sc.parallelize(data);

	// represent the input data as a key value
	JavaPairRDD<String, Iterable<String>> links = input.mapToPair(
			new PairFunction<String, String, String>() {
				@Override
				public Tuple2<String, String> call(String x)
						throws Exception {
					String[] array = x.split(" ");
					return new Tuple2(array[0], array[1]);
				}
			}).groupByKey();
	links = links.partitionBy(new HashPartitioner(5));

	// initialize ranks
	JavaPairRDD<String, Double> ranks = links.mapValues(x -> 1.0);

	// Calculates and updates URL ranks continuously using PageRank algorithm.
	for (int current = 0; current < 10; current++) {
		// Calculates URL contributions to the rank of other URLs.
		JavaRDD<Tuple2<Iterable<String>, Double>> values = links.join(ranks).values();

		JavaPairRDD<String, Double> contribs = values
				.flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
					@Override
					public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
						int urlCount = Iterables.size(s._1);
						List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
						for (String n : s._1) {
							results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
						}
						return results;
					}
				});

		// Re-calculates URL ranks based on neighbor contributions.
		ranks = contribs
					.reduceByKey((a, b) -> a + b)
					.mapValues(x -> 0.15 + x * 0.85);
		
		//print(ranks);
	}

	print(ranks);

	sc.stop();
	System.out.println((System.currentTimeMillis() - start)/1000 + " seconds");
}
 
Developer: atulsm, Project: Test_Projects, Lines: 56, Source: PageRank.java
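Example 14 partitions links before the loop; a common refinement (not shown in the code above) is to also persist the partitioned links RDD, so it is computed and shuffled once rather than rebuilt from its lineage in every PageRank iteration. A minimal sketch of that preparation step, with illustrative names:

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

public class PartitionedLinks {
    /** Hash-partitions the link graph once and caches the partitioned copy. */
    static JavaPairRDD<String, Iterable<String>> prepare(JavaPairRDD<String, String> edges,
                                                         int numPartitions) {
        return edges
                .groupByKey(new HashPartitioner(numPartitions)) // partition while grouping
                .persist(StorageLevel.MEMORY_ONLY());           // keep it around for the loop
    }
}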

Example 15: hashingBalancedPartitionerDoesBalance

import org.apache.spark.HashPartitioner; // import the required package/class
@Test
public void hashingBalancedPartitionerDoesBalance() {
    // partitionWeightsByClass = [[1.714, .429, .857], [0.9, 0.6, 1.5]]
    List<Double> reds = Arrays.asList(1.714D, 0.429D, .857D);
    List<Double> blues = Arrays.asList(0.9D, 0.6D, 1.5D);
    List<List<Double>> partitionWeights = Arrays.asList(reds, blues);

    HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
    List<Tuple2<Integer, String>> l = new ArrayList<>();

    for (int i = 0; i < 4; i++) {
        l.add(new Tuple2<Integer, String>(0, "red"));
    }
    for (int i = 0; i < 3; i++) {
        l.add(new Tuple2<Integer, String>(0, "blue"));
    }
    for (int i = 0; i < 1; i++) {
        l.add(new Tuple2<Integer, String>(1, "red"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(1, "blue"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(2, "red"));
    }
    for (int i = 0; i < 5; i++) {
        l.add(new Tuple2<Integer, String>(2, "blue"));
    }
    // This should give exactly the sought distribution
    JavaPairRDD<Integer, String> rdd =
                    JavaPairRDD.fromJavaRDD(sc.parallelize(l)).partitionBy(new HashPartitioner(3));

    // Let's reproduce UIDs
    JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId().mapToPair(
                    new PairFunction<Tuple2<Tuple2<Integer, String>, Long>, Tuple2<Long, Integer>, String>() {
                        @Override
                        public Tuple2<Tuple2<Long, Integer>, String> call(
                                        Tuple2<Tuple2<Integer, String>, Long> payLoadNuid) {
                            Long uid = payLoadNuid._2();
                            String value = payLoadNuid._1()._2();
                            Integer elemClass = value.equals("red") ? 0 : 1;
                            return new Tuple2<Tuple2<Long, Integer>, String>(
                                            new Tuple2<Long, Integer>(uid, elemClass), value);
                        }
                    });

    List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

    int[][] colorCountsByPartition = new int[3][2];
    for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
        System.out.println(val);
        Integer partition = hbp.getPartition(val._1());
        System.out.println(partition);

        if (val._2().equals("red"))
            colorCountsByPartition[partition][0] += 1;
        else
            colorCountsByPartition[partition][1] += 1;
    }

    for (int i = 0; i < 3; i++) {
        System.out.println(Arrays.toString(colorCountsByPartition[i]));
    }
    for (int i = 0; i < 3; i++) {
        // avg red per partition : 2.33
        assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4);
        // avg blue per partition : 3.33
        assertTrue(colorCountsByPartition[i][1] >= 2 && colorCountsByPartition[i][1] < 5);
    }

}
 
Developer: deeplearning4j, Project: deeplearning4j, Lines: 72, Source: HashingBalancedPartitionerTest.java


Note: The org.apache.spark.HashPartitioner class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their authors; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not republish without permission.