This article collects typical usage examples of the Java class org.apache.spark.HashPartitioner. If you are wondering what HashPartitioner is for and how to use it, the curated class code examples below should help.
The HashPartitioner class belongs to the org.apache.spark package. 15 code examples are shown below, sorted by popularity by default.
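Before the examples, here is a minimal self-contained sketch of what HashPartitioner does: it places each key of a pair RDD into partition number nonNegativeMod(key.hashCode(), numPartitions), so equal keys always land in the same partition. The local master, app name, and sample data below are assumptions for illustration only, not taken from the examples that follow.

import java.util.Arrays;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class HashPartitionerDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("HashPartitionerDemo");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));
            // Each key is assigned to partition nonNegativeMod(key.hashCode(), 4),
            // so both ("a", 1) and ("a", 3) end up in the same partition.
            JavaPairRDD<String, Integer> partitioned = pairs.partitionBy(new HashPartitioner(4));
            System.out.println("numPartitions = " + partitioned.getNumPartitions()); // prints 4
        }
    }
}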
Example 1: partition
import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Partitions instances according to the configured partitioning scheme (e.g. by instance type).
 *
 * @param instances RDD of instances to partition
 * @return the partitioned RDD if repartitioning is requested, otherwise the original RDD unchanged
*/
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
if (!config.isRepartitionByType()) {
return instances;
}
log.info("Getting counts by type hash");
Map<Integer, Long> typeCounts = getApproximateTypeHashCounts(instances);
int numPartitions = instances.getNumPartitions();
long totalInstances = instances.count();
long instancesPerPartition = totalInstances / numPartitions + 1;
JavaPairRDD<Integer, Instance> instanceWithPartitions = instances.mapToPair(instance -> {
int typeHash = getTypeHash(instance);
int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition);
return new Tuple2<>(typeHash + splitIncrement, instance);
});
log.info("Partitioning instances by type");
return instanceWithPartitions
.partitionBy(new HashPartitioner(numPartitions))
.values();
}
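The interesting part of Example 1 is the composite key typeHash + splitIncrement: a type whose instance count exceeds instancesPerPartition is spread over several adjacent key values, so the HashPartitioner does not pile all of its instances into a single partition. The helper below is a hypothetical sketch of that idea only; the real getSplitIncrement is not shown in the source, and a String id is assumed.

// Hypothetical sketch; not the actual implementation used by the example above.
private static int getSplitIncrement(String instanceId, long typeCount, long instancesPerPartition) {
    // number of partitions this type needs so that none holds (much) more than instancesPerPartition
    long splits = Math.max(1L, typeCount / instancesPerPartition);
    // deterministically scatter instances of the type over [0, splits); splits assumed to fit in an int
    return (int) Math.floorMod((long) instanceId.hashCode(), splits);
}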
Example 2: instantiateWorkers
import org.apache.spark.HashPartitioner; // import the required package/class
/**
* Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
* instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
*/
private void instantiateWorkers() {
if (sparkContextIsAvailable) {
/* initialize the RDD */
logger.info("Initializing an RDD of compute blocks");
computeRDD = ctx.parallelizePairs(targetBlockStream()
.map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
.collect(Collectors.toList()), numTargetBlocks)
.partitionBy(new HashPartitioner(numTargetBlocks))
.cache();
} else {
logger.info("Initializing a local compute block");
localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
}
prevCheckpointedComputeRDD = null;
cacheCallCounter = 0;
}
Example 3: getPartitioner
import org.apache.spark.HashPartitioner; // import the required package/class
private Partitioner getPartitioner(JavaPairRDD<Row, Row> keyedArriving) {
if (hasPartitioner()) {
Config partitionerConfig = config.getConfig("partitioner");
return PartitionerFactory.create(partitionerConfig, keyedArriving);
}
else {
return new HashPartitioner(keyedArriving.getNumPartitions());
}
}
Example 4: testHash
import org.apache.spark.HashPartitioner; // import the required package/class
@Test
public void testHash() throws Exception {
Map<String, Object> configMap = Maps.newHashMap();
configMap.put("type", "hash");
JavaPairRDD<Row, Row> base = getDummyRDD(10);
Config config = ConfigFactory.parseMap(configMap);
Partitioner p = PartitionerFactory.create(config, base);
assertTrue(p instanceof HashPartitioner);
    assertEquals(10, p.numPartitions()); // expected value first, per the JUnit convention
}
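Read together with Example 3, this test pins down the expected contract: a config with type = "hash" yields a HashPartitioner sized to the base RDD's partition count. Below is a minimal sketch of a factory with that behavior, inferred from the test only and not the library's actual implementation (which may support additional partitioner types).

import com.typesafe.config.Config;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;

public class MinimalPartitionerFactory {
    public static Partitioner create(Config config, JavaPairRDD<?, ?> rdd) {
        String type = config.getString("type");
        if ("hash".equals(type)) {
            // size the partitioner to the RDD it will repartition
            return new HashPartitioner(rdd.getNumPartitions());
        }
        throw new IllegalArgumentException("Unsupported partitioner type: " + type);
    }
}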
Example 5: groupByKeyOnly
import org.apache.spark.HashPartitioner; // import the required package/class
/**
* An implementation of
* {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
* for the Spark runner.
*/
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(
JavaRDD<WindowedValue<KV<K, V>>> rdd,
Coder<K> keyCoder,
WindowedValueCoder<V> wvCoder) {
// we use coders to convert objects in the PCollection to byte arrays, so they
// can be transferred over the network for the shuffle.
JavaPairRDD<ByteArray, byte[]> pairRDD =
rdd
.map(new ReifyTimestampsAndWindowsFunction<K, V>())
.map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction())
.mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction())
.mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
// use a default parallelism HashPartitioner.
Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());
    // using mapPartitions preserves the partitioner
    // and avoids an unnecessary shuffle downstream.
return pairRDD
.groupByKey(partitioner)
.mapPartitionsToPair(
TranslationUtils.pairFunctionToPairFlatMapFunction(
CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)),
true)
.mapPartitions(
TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true)
.mapPartitions(
TranslationUtils.functionToFlatMapFunction(
WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()),
true);
}
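As a standalone illustration of the comments in Example 5 (the class name, local master, and toy data are assumptions, not Beam runner code): when the grouping step uses an explicit HashPartitioner and later steps preserve partitioning (mapValues, or mapPartitions with preservesPartitioning = true), a subsequent reduceByKey or join over the same keys and partitioner does not trigger a second shuffle.

import java.util.Arrays;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class PartitionerPreservationSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("PartitionerPreservationSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));
            Partitioner partitioner = new HashPartitioner(sc.defaultParallelism());
            // groupByKey with an explicit partitioner shuffles once ...
            JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey(partitioner);
            // ... and mapValues keeps that partitioner, so later per-key work
            // (reduceByKey, join) with the same partitioner becomes a narrow dependency.
            JavaPairRDD<String, Integer> sums = grouped.mapValues(values -> {
                int sum = 0;
                for (int v : values) sum += v;
                return sum;
            });
            sums.collect().forEach(System.out::println);
        }
    }
}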
Example 6: Unbounded
import org.apache.spark.HashPartitioner; // import the required package/class
public Unbounded(SparkContext sc,
SerializablePipelineOptions options,
MicrobatchSource<T, CheckpointMarkT> microbatchSource,
int initialNumPartitions) {
super(sc, NIL,
JavaSparkContext$.MODULE$.<scala.Tuple2<Source<T>, CheckpointMarkT>>fakeClassTag());
this.options = options;
this.microbatchSource = microbatchSource;
this.partitioner = new HashPartitioner(initialNumPartitions);
}
Example 7: fetchCopyRatioEmissionDataSpark
import org.apache.spark.HashPartitioner; // import the required package/class
/**
* Creates a {@link JavaPairRDD} of (sample index, emission data list)
* @return a {@link JavaPairRDD}
*/
private JavaPairRDD<Integer, List<CoverageModelCopyRatioEmissionData>> fetchCopyRatioEmissionDataSpark() {
final int numSamples = this.numSamples;
return computeRDD
        /* flat map each worker to a list of [sample index, [target block, emission data on target block]] entries */
.flatMapToPair(tuple -> {
final LinearlySpacedIndexBlock tb = tuple._1;
final CoverageModelEMComputeBlock cb = tuple._2;
final List<List<CoverageModelCopyRatioEmissionData>> el = cb.getSampleCopyRatioLatentPosteriorData();
return IntStream.range(0, numSamples)
.mapToObj(si -> new Tuple2<>(si, new Tuple2<>(tb, el.get(si))))
.iterator();
})
/* combine elements with the same sample index */
.combineByKey(
/* create a new list */
Collections::singletonList,
/* recipe to add an element to the list */
(list, element) -> Stream.concat(list.stream(), Stream.of(element))
.collect(Collectors.toList()),
/* recipe to concatenate two lists */
(list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
/* repartition with respect to sample indices */
new HashPartitioner(numSamples))
/* sort the [target block, emission data on target block] chunks for each sample into a single list */
.mapValues(emissionBlocksList -> emissionBlocksList.stream() /* for each partition ... */
/* sort the data blocks */
.sorted(Comparator.comparingInt(t -> t._1.getBegIndex()))
/* remove the LinearlySpacedIndexBlock keys from the sorted emissionBlocksList */
.map(p -> p._2)
/* flatten */
.flatMap(List::stream)
/* collect as a single list */
.collect(Collectors.toList()));
}
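Example 7's combineByKey follows the standard three-function recipe (create a combiner from the first value, merge a value into a combiner, merge two combiners), with the HashPartitioner deciding where each sample index ends up. Below is a minimal standalone sketch of the same pattern on toy data; the class name and data are assumptions, not GATK code.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CombineByKeySketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("CombineByKeySketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>(0, "x"), new Tuple2<>(1, "y"), new Tuple2<>(0, "z")));
            JavaPairRDD<Integer, List<String>> grouped = pairs.combineByKey(
                    // create a new single-element list from the first value seen for a key
                    Collections::singletonList,
                    // add one more value to an existing list (copied, since singletonList is immutable)
                    (List<String> list, String value) -> { List<String> out = new ArrayList<>(list); out.add(value); return out; },
                    // merge the lists built on two different partitions
                    (List<String> l1, List<String> l2) -> { List<String> out = new ArrayList<>(l1); out.addAll(l2); return out; },
                    // one partition per distinct key in this toy example
                    new HashPartitioner(2));
            grouped.collect().forEach(System.out::println);
        }
    }
}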
Example 8: shuffleExamples
import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * Randomly shuffle the examples in each DataSet object, and recombine them into new DataSet objects
 * with the specified batch size.
*
* @param rdd DataSets to shuffle/recombine
* @param newBatchSize New batch size for the DataSet objects, after shuffling/recombining
* @param numPartitions Number of partitions to use when splitting/recombining
 * @return A new {@link JavaRDD<DataSet>} with the examples shuffled and recombined into DataSets of the new batch size
*/
public static JavaRDD<DataSet> shuffleExamples(JavaRDD<DataSet> rdd, int newBatchSize, int numPartitions) {
//Step 1: split into individual examples, mapping to a pair RDD (random key in range 0 to numPartitions)
JavaPairRDD<Integer, DataSet> singleExampleDataSets =
rdd.flatMapToPair(new SplitDataSetExamplesPairFlatMapFunction(numPartitions));
//Step 2: repartition according to the random keys
singleExampleDataSets = singleExampleDataSets.partitionBy(new HashPartitioner(numPartitions));
//Step 3: Recombine
return singleExampleDataSets.values().mapPartitions(new BatchDataSetsFunction(newBatchSize));
}
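The random-key-plus-HashPartitioner trick in steps 1 and 2 is a general way to scatter an RDD's elements roughly evenly across partitions. Below is a minimal generic sketch of just that part; the helper name and the use of ThreadLocalRandom are assumptions, not DL4J code.

import java.util.concurrent.ThreadLocalRandom;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

public class RandomRepartition {
    /** Scatter the elements of any RDD across numPartitions partitions using random keys. */
    public static <T> JavaRDD<T> scatter(JavaRDD<T> rdd, int numPartitions) {
        return rdd
                // key each element with a random partition index (drawn on the executor)
                .mapToPair(x -> new Tuple2<>(ThreadLocalRandom.current().nextInt(numPartitions), x))
                // shuffle by that key; for an Integer key the HashPartitioner is just key mod numPartitions
                .partitionBy(new HashPartitioner(numPartitions))
                // drop the keys again
                .values();
    }
}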
Example 9: extract
import org.apache.spark.HashPartitioner; // import the required package/class
public void extract(JavaPairRDD<Double, Multiset<String>> trainingDocs) {
// .filter(word -> word._2 > 6 && word._2 < 1000);
JavaPairRDD<Double, Multiset<String>> wordsClass0 = trainingDocs.filter(row -> row._1.equals(0.0)).cache();
JavaPairRDD<Double, Multiset<String>> wordsClass1 = trainingDocs.filter(row -> row._1.equals(1.0)).cache();
JavaPairRDD<String, Integer> idfClass0 = extractWordCount(wordsClass0).cache();
JavaPairRDD<String, Integer> idfClass1 = extractWordCount(wordsClass1).cache();
JavaPairRDD<String, Integer> idfWords = idfClass0.fullOuterJoin(idfClass1).mapToPair(joined -> {
String word = joined._1;
Integer left = joined._2()._1().orNull();
Integer right = joined._2()._2().orNull();
int count;
if (left != null && right != null) {
count = left.intValue() + right.intValue();
} else {
count = left != null ? left.intValue() : right.intValue();
}
return new Tuple2<>(word, count);
}).partitionBy(new HashPartitioner(100)).filter(word -> word._2 > 10).cache();
if (isVerbose()) {
this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idf-final.csv", idfWords);
this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idfClass0.csv", idfClass0);
this.saveIDFToFile("/home/momchil/Desktop/master-thesis/datasets/stats/idfClass1.csv", idfClass1);
this.printIDFInfo(idfWords);
}
wordsClass0.unpersist();
wordsClass1.unpersist();
idfClass0.unpersist();
idfClass1.unpersist();
JavaPairRDD<String, Tuple2<Integer, Long>> idfRdd = idfWords.zipWithIndex()
.mapToPair(row -> new Tuple2<>(row._1._1, new Tuple2<>(row._1._2, row._2)));
idf = idfRdd.collectAsMap();
idfWords.unpersist();
featuresCount = idf.size();
}
Example 10: extractWordCount
import org.apache.spark.HashPartitioner; // import the required package/class
private JavaPairRDD<String, Integer> extractWordCount(JavaPairRDD<Double, Multiset<String>> wordsClass0) {
return wordsClass0.flatMap(row -> row._2()).mapToPair(word -> new Tuple2<>(word, 1))
.partitionBy(new HashPartitioner(10)).reduceByKey((a, b) -> a + b);
}
Example 11: joinWithWorkersAndMap
import org.apache.spark.HashPartitioner; // import the required package/class
/**
 * A generic function for dispatching a blockified list of objects to their corresponding compute block(s)
 * and applying a map function.
 *
 * If Spark is enabled:
 *
 * Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the provided
 * map function {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the new RDD.
 *
 * If Spark is disabled:
 *
 * Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 * {@code mapper} is called on the value contained in {@code data} and {@link #localComputeBlock}, and
 * the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 * by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a binary mapper function that takes a compute block together with an object of type {@code V} and
 *               returns a new compute block
 * @param <V> the type of the object to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
@Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
if (sparkContextIsAvailable) {
final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD =
ctx.parallelizePairs(data, numTargetBlocks).partitionBy(new HashPartitioner(numTargetBlocks));
computeRDD = computeRDD.join(newRDD).mapValues(mapper);
} else {
try {
Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
} catch (Exception e) {
throw new RuntimeException("Can not apply the map function to the local compute block: " + e.getMessage());
}
}
}
Example 12: repartitionReadsByName
import org.apache.spark.HashPartitioner; // import the required package/class
static JavaRDD<GATKRead> repartitionReadsByName(final JavaRDD<GATKRead> reads, final int numPartitions) {
//Shuffle reads into partitions by read name hash code
return reads.mapToPair(read -> new Tuple2<>(read.getName(), read))
.partitionBy(new HashPartitioner(numPartitions))
.map(Tuple2::_2);
}
Example 13: main
import org.apache.spark.HashPartitioner; // import the required package/class
public static void main(String[] args) throws Exception {
SparkConf sparkConf = new SparkConf();
//sparkConf.setMaster("local");
sparkConf.setAppName("TestSpark");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
//HBaseConnection connection = new HBaseConnection(Consts.HBASE_MASTER, Consts.ZOOKKEEPER_QUORUM);
Configuration hbaseConfig = initializeHBaseConfig();
long start = System.currentTimeMillis();
//hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_START, "1444194420157");
hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_START, "0");
hbaseConfig.set(TableInputFormat.SCAN_TIMERANGE_END, "1444194520157");
JavaPairRDD<ImmutableBytesWritable, Result> rdd = jsc.newAPIHadoopRDD(hbaseConfig, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
    rdd = rdd.partitionBy(new HashPartitioner(10)); // partitionBy returns a new RDD; keep the result, it does not modify rdd in place
JavaRDD<Map<String,String>> events = rdd.map(new Function<Tuple2<ImmutableBytesWritable, Result>, Map<String,String>>() {
@Override
public Map<String,String> call(Tuple2<ImmutableBytesWritable, Result> rowData) throws Exception {
SentinelEvent ev = new SentinelEvent(new SentinelEventDecoder().fromBytes(rowData._2.value()));
return ev.valueMap;
}
});
    events = events.repartition(10); // repartition also returns a new RDD; keep the result
JavaRDD<Map<String,String>> persist = events.persist(StorageLevel.MEMORY_AND_DISK_2());
long total = persist.count();
System.out.println("Toal " + total + " records read");
try {
//JavaEsSpark.saveToEs(events, "events2/event");
System.out.println(persist.take((int)total));
}
catch(Exception es) {
System.err.println("****** Exception: " + es.toString());
}
long sec = (System.currentTimeMillis() - start)/1000;
System.out.println("Took " + sec + " seconds at the rate of " + total/sec + " EPS");
persist.unpersist();
}
Example 14: main
import org.apache.spark.HashPartitioner; // import the required package/class
public static void main(String[] args) {
long start = System.currentTimeMillis();
SparkConf sparkConf = new SparkConf();
sparkConf.setMaster("local");
sparkConf.setAppName("TestSpark");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
JavaRDD<String> input = sc.parallelize(data);
// represent the input data as a key value
JavaPairRDD<String, Iterable<String>> links = input.mapToPair(
new PairFunction<String, String, String>() {
@Override
public Tuple2<String, String> call(String x)
throws Exception {
String[] array = x.split(" ");
                    return new Tuple2<>(array[0], array[1]);
}
}).groupByKey();
    links = links.partitionBy(new HashPartitioner(5)); // keep the returned RDD; partitionBy does not modify links in place
// initialize ranks
JavaPairRDD<String, Double> ranks = links.mapValues(x -> 1.0);
// Calculates and updates URL ranks continuously using PageRank algorithm.
for (int current = 0; current < 10; current++) {
// Calculates URL contributions to the rank of other URLs.
JavaRDD<Tuple2<Iterable<String>, Double>> values = links.join(ranks).values();
JavaPairRDD<String, Double> contribs = values
.flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
@Override
public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
int urlCount = Iterables.size(s._1);
List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
for (String n : s._1) {
results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
}
return results;
}
});
// Re-calculates URL ranks based on neighbor contributions.
ranks = contribs
.reduceByKey((a, b) -> a + b)
.mapValues(x -> 0.15 + x * 0.85);
//print(ranks);
}
print(ranks);
sc.stop();
System.out.println((System.currentTimeMillis() - start)/1000 + " seconds");
}
Example 15: hashingBalancedPartitionerDoesBalance
import org.apache.spark.HashPartitioner; // import the required package/class
@Test
public void hashingBalancedPartitionerDoesBalance() {
// partitionWeightsByClass = [[1.714, .429, .857], [0.9, 0.6, 1.5]]
List<Double> reds = Arrays.asList(1.714D, 0.429D, .857D);
List<Double> blues = Arrays.asList(0.9D, 0.6D, 1.5D);
List<List<Double>> partitionWeights = Arrays.asList(reds, blues);
HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
List<Tuple2<Integer, String>> l = new ArrayList<>();
for (int i = 0; i < 4; i++) {
l.add(new Tuple2<Integer, String>(0, "red"));
}
for (int i = 0; i < 3; i++) {
l.add(new Tuple2<Integer, String>(0, "blue"));
}
for (int i = 0; i < 1; i++) {
l.add(new Tuple2<Integer, String>(1, "red"));
}
for (int i = 0; i < 2; i++) {
l.add(new Tuple2<Integer, String>(1, "blue"));
}
for (int i = 0; i < 2; i++) {
l.add(new Tuple2<Integer, String>(2, "red"));
}
for (int i = 0; i < 5; i++) {
l.add(new Tuple2<Integer, String>(2, "blue"));
}
// This should give exactly the sought distribution
JavaPairRDD<Integer, String> rdd =
JavaPairRDD.fromJavaRDD(sc.parallelize(l)).partitionBy(new HashPartitioner(3));
// Let's reproduce UIDs
JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId().mapToPair(
new PairFunction<Tuple2<Tuple2<Integer, String>, Long>, Tuple2<Long, Integer>, String>() {
@Override
public Tuple2<Tuple2<Long, Integer>, String> call(
Tuple2<Tuple2<Integer, String>, Long> payLoadNuid) {
Long uid = payLoadNuid._2();
String value = payLoadNuid._1()._2();
Integer elemClass = value.equals("red") ? 0 : 1;
return new Tuple2<Tuple2<Long, Integer>, String>(
new Tuple2<Long, Integer>(uid, elemClass), value);
}
});
List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();
int[][] colorCountsByPartition = new int[3][2];
for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
System.out.println(val);
Integer partition = hbp.getPartition(val._1());
System.out.println(partition);
if (val._2().equals("red"))
colorCountsByPartition[partition][0] += 1;
else
colorCountsByPartition[partition][1] += 1;
}
for (int i = 0; i < 3; i++) {
System.out.println(Arrays.toString(colorCountsByPartition[i]));
}
for (int i = 0; i < 3; i++) {
// avg red per partition : 2.33
assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4);
// avg blue per partition : 3.33
assertTrue(colorCountsByPartition[i][1] >= 2 && colorCountsByPartition[i][1] < 5);
}
}