This article collects typical usage examples of the Scala class org.apache.spark.rdd.ShuffledRDD. If you are wondering what ShuffledRDD is for, how to use it, or are looking for concrete examples, the curated class examples below may help.
The following shows 2 code examples of the ShuffledRDD class, sorted by popularity by default.
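ShuffledRDD is the low-level RDD that Spark builds behind wide transformations such as repartition, groupByKey, and sortByKey: it takes a parent RDD of key-value pairs plus a Partitioner, and exposes hooks (setSerializer, setKeyOrdering, setAggregator, setMapSideCombine) for customizing the shuffle. Before the full examples, here is a minimal, self-contained sketch of constructing one directly with a HashPartitioner; the sample data and the local-mode SparkContext are made up for illustration.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.ShuffledRDD

object ShuffledRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ShuffledRDDExample").setMaster("local[2]"))

    // A small pair RDD: (word, count)
    val pairs = sc.parallelize(Seq(("spark", 1), ("rdd", 2), ("shuffle", 3), ("spark", 4)))

    // Repartition the pairs by key hash into 4 partitions, without map-side combine.
    // The third type parameter is the combiner type; here it equals the value type.
    val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(4))
      .setMapSideCombine(false)

    // Records with the same key now live in the same partition
    shuffled.collect().foreach(println)

    sc.stop()
  }
}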
Example 1: PointsPartitionedByBoxesRDD
// Declare the package and import the required classes
package org.alitouka.spark.dbscan.spatial.rdd
import org.alitouka.spark.dbscan.spatial.{BoxCalculator, PointSortKey, Box, Point}
import org.apache.spark.rdd.{ShuffledRDD, RDD}
import org.alitouka.spark.dbscan._
import org.apache.spark.SparkContext
import org.alitouka.spark.dbscan.util.PointIndexer
// Points shuffled with a BoxPartitioner so that each partition holds the points of one box
private [dbscan] class PointsPartitionedByBoxesRDD (prev: RDD[(PointSortKey, Point)], val boxes: Iterable[Box], val boundingBox: Box)
  extends ShuffledRDD [PointSortKey, Point, Point] (prev, new BoxPartitioner(boxes))

object PointsPartitionedByBoxesRDD {

  def apply (rawData: RawDataSet,
             partitioningSettings: PartitioningSettings = new PartitioningSettings (),
             dbscanSettings: DbscanSettings = new DbscanSettings ())
    : PointsPartitionedByBoxesRDD = {

    val sc = rawData.sparkContext
    val boxCalculator = new BoxCalculator (rawData)

    // Split the data space into density-based boxes and broadcast them to the executors
    val (boxes, boundingBox) = boxCalculator.generateDensityBasedBoxes (partitioningSettings, dbscanSettings)
    val broadcastBoxes = sc.broadcast(boxes)
    val broadcastNumberOfDimensions = sc.broadcast (boxCalculator.numberOfDimensions)

    // Attach a sort key to every point so that the BoxPartitioner can route it to the right partition
    val pointsInBoxes = PointIndexer.addMetadataToPoints(
      rawData,
      broadcastBoxes,
      broadcastNumberOfDimensions,
      dbscanSettings.distanceMeasure)

    PointsPartitionedByBoxesRDD (pointsInBoxes, boxes, boundingBox)
  }

  def apply (pointsInBoxes: RDD[(PointSortKey, Point)], boxes: Iterable[Box], boundingBox: Box): PointsPartitionedByBoxesRDD = {
    new PointsPartitionedByBoxesRDD(pointsInBoxes, boxes, boundingBox)
  }

  private [dbscan] def extractPointIdsAndCoordinates (data: RDD[(PointSortKey, Point)]): RDD[(PointId, PointCoordinates)] = {
    data.map ( x => (x._2.pointId, x._2.coordinates) )
  }
}
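BoxPartitioner, which the class above passes to the ShuffledRDD constructor, is not shown in this example; it belongs to the same dbscan project and, judging from its usage, routes each PointSortKey to the partition of the box its point falls into. Purely as a hypothetical sketch of the Partitioner contract such a class has to satisfy (the box-id key below is an assumption, not the project's actual API):

import org.apache.spark.Partitioner

// Hypothetical sketch only: the real BoxPartitioner keys on PointSortKey and may differ.
// This just illustrates the Partitioner contract that ShuffledRDD relies on.
class BoxIdPartitioner(numBoxes: Int) extends Partitioner {

  // One partition per box
  override def numPartitions: Int = numBoxes

  // Assumes the key is the id of the box a point falls into, numbered 0 until numBoxes
  override def getPartition(key: Any): Int = key match {
    case boxId: Int => boxId % numBoxes
    case other => throw new IllegalArgumentException(s"Unexpected key: $other")
  }
}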
Example 2: TeraSort
// Declare the package and import the required classes
package com.microsoft.spark.perf.core

import scala.collection.mutable.ArrayBuffer

import com.microsoft.spark.perf.Benchmarkable
import com.microsoft.spark.perf.core.report.CoreBenchmarkResult
import com.microsoft.spark.perf.report.{BenchmarkResult, ExecutionMode, Failure}
import com.microsoft.spark.perf.report.ExecutionMode.{ForeachResults, WriteTextFile}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{RDD, ShuffledRDD}

private[core] class TeraSort (
    @transient sparkContext: SparkContext,
    inputPath: String,
    outputPath: String = "",
    inputPartitions: Int,
    outputPartitions: Int,
    override protected val executionMode: ExecutionMode = ExecutionMode.ForeachResults)
  extends Benchmarkable(sparkContext) {

  override val name = "TeraSort"

  // Read the pre-generated binary records and wrap every raw byte array in a RecordWrapper
  private[core] def generateInputRecords: RDD[RecordWrapper] = {
    println(s"Reading from $inputPath")
    val inputData = sparkContext.objectFile[Array[Byte]](inputPath, inputPartitions).cache()
    val dataset = inputData.map(record => new RecordWrapper(record)).cache()
    dataset
  }

  override protected def doBenchmark(
      includeBreakdown: Boolean,
      description: String,
      messages: ArrayBuffer[String]): BenchmarkResult = {
    try {
      val dataset = generateInputRecords

      // Build the shuffle stage by hand: the partitioner routes each record to a reducer,
      // and the key ordering makes every reducer sort its records during the shuffle
      val partitioner = new TeraSortPartitioner(outputPartitions * 2)
      val output = new ShuffledRDD[Array[Byte], Array[Byte], Array[Byte]](dataset, partitioner).cache()
      output.setSerializer(new TeraSortSerializer)
      output.setKeyOrdering(new TeraSortRecordOrdering)

      val executionTime = measureTimeMs {
        executionMode match {
          case ForeachResults =>
            // Materialize the shuffle without writing the sorted output anywhere
            output.foreach(_ => ())
          case WriteTextFile(outputPath) =>
            // This outputPath is bound by the WriteTextFile extractor and shadows the
            // constructor parameter of the same name
            output.map(result => UnsafeUtils.getUnsafeInstance.getLong(result._1, 16)).
              saveAsTextFile(outputPath)
        }
      }
      new CoreBenchmarkResult(name, executionMode.toString, executionTime = Some(executionTime))
    } catch {
      case e: Exception =>
        e.printStackTrace()
        BenchmarkResult(name, executionMode.toString,
          failure = Some(Failure(e.getClass.getName, e.getMessage)))
    }
  }
}
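What makes this example interesting is that it drives the shuffle machinery directly instead of calling sortByKey: TeraSortPartitioner decides which reducer receives each record, and setKeyOrdering makes every reducer sort its records during the shuffle, so with a range-style partitioner the concatenated partitions come out totally ordered. Below is a rough sketch of the same pattern with plain strings; FirstCharRangePartitioner is an illustrative stand-in invented here, not part of the benchmark.

import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.ShuffledRDD

object ShuffleSortSketch {

  // Illustrative stand-in for TeraSortPartitioner: a crude range partitioner that
  // buckets string keys by their first character, preserving alphabetical order across partitions
  class FirstCharRangePartitioner(override val numPartitions: Int) extends Partitioner {
    override def getPartition(key: Any): Int = {
      val c = key.asInstanceOf[String].headOption.getOrElse('a').toLower
      val bucket = ((c - 'a') * numPartitions) / 26
      math.min(math.max(bucket, 0), numPartitions - 1)
    }
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ShuffleSortSketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq("banana", "apple", "cherry", "avocado").map(w => (w, 1)))

    // Partition by key range and sort within each partition during the shuffle,
    // which is essentially what sortByKey does under the hood
    val sorted = new ShuffledRDD[String, Int, Int](pairs, new FirstCharRangePartitioner(3))
      .setKeyOrdering(Ordering.String)

    // Partitions are concatenated in order, so the output is globally sorted by key
    sorted.collect().foreach(println)
    sc.stop()
  }
}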