本文整理汇总了 Scala 中 org.apache.spark.rdd.RDD 类的典型用法代码示例。如果您正苦于以下问题:Scala RDD 类的具体用法?Scala RDD 怎么用?Scala RDD 使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 RDD 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Scala 代码示例。
示例1: Histogram
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
/**
 * Prints, per key, how many lines occur in the first data set but not in the second.
 *
 * Every surviving line is split on ","; field 1 becomes the key and field 0 the value.
 * The per-key record counts are printed as `key===>count`.
 */
object Histogram {

  /**
   * Entry point; `args` is not used. Input paths are hard-coded.
   *
   * A line with fewer than two comma-separated fields fails the job with an
   * `ArrayIndexOutOfBoundsException`.
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("Histogram").setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      val dataset1: RDD[String] = sc.textFile("/home/hadoop/spark/scala/mllib/core/data1")
      val dataset2: RDD[String] = sc.textFile("/home/hadoop/spark/scala/mllib/core/data2")
      val subRDD: RDD[String] = dataset1.subtract(dataset2)
      // Split each line once instead of once per extracted field.
      val keyValueRDD: RDD[(String, String)] = subRDD.map { line =>
        val fields = line.split(",")
        (fields(1), fields(0))
      }
      val hist = keyValueRDD.countByKey()
      hist.foreach { case (k, v) => println(s"$k===>$v") }
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
示例2: RddAggregateByKey
//设置package包名称以及导入依赖的类
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}
/**
 * Computes the per-key maximum and minimum of an integer column and saves both.
 *
 * Input lines are tab-separated; field 1 is the key and field 7 is parsed as an `Int`
 * (presumably a traded volume, judging by the variable names - confirm against the data).
 */
object RddAggregateByKey {

  /**
   * Entry point; `args` is not used. Reads `./stocks` and writes the union of the
   * per-key maxima and minima to `./voulme`.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDD Aggregate").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val stocks = sc.textFile("./stocks")
      // Split each line once instead of once per extracted field.
      val projdata = stocks.map { line =>
        val fields = line.split("\t")
        (fields(1), fields(7).toInt)
      }
      // The zero value must be the identity of the operation. The previous sentinels
      // (0 for max, 100000000 for min) gave wrong answers for keys whose values were all
      // negative or all above 100000000.
      val volMax = projdata.aggregateByKey(Int.MinValue)(math.max(_, _), math.max(_, _))
      val volMin = projdata.aggregateByKey(Int.MaxValue)(math.min(_, _), math.min(_, _))
      val aggRdd = volMax ++ volMin
      aggRdd.saveAsTextFile("./voulme")
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
示例3: StudyRDD
//设置package包名称以及导入依赖的类
package com.study.spark.datasource
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}
/**
 * An RDD of [[Row]]s with no parent dependencies that always exposes two partitions.
 *
 * @param sqlContext supplies the SparkContext this RDD is bound to
 * @param schema     schema handed to the reader created for each partition
 */
class StudyRDD(sqlContext: SQLContext, schema: StructType)
  extends RDD[Row](sqlContext.sparkContext, deps = Nil) {

  /** Produces the rows of one partition by delegating to a `StudyReader`. */
  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[Row] =
    new StudyReader(context, schema, split)

  // NOTE(review): the original two comment lines here were destroyed by an encoding error
  // and cannot be recovered. What the code shows: exactly two partitions are returned,
  // with indices 0 and 1.
  override protected def getPartitions: Array[Partition] =
    Array[Partition](
      new Partition() {
        override def index: Int = 0
      },
      new Partition() {
        override def index: Int = 1
      }
    )
}
示例4: extractTriples
//设置package包名称以及导入依赖的类
package net.sansa_stack.inference.spark.forwardchaining
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import net.sansa_stack.inference.data.RDFTriple
import net.sansa_stack.inference.spark.data.model.RDFGraph
import net.sansa_stack.inference.utils.Profiler
/**
 * Keeps only the triples matching every component that is given.
 *
 * @param triples   the triples to filter
 * @param subject   if defined, only triples whose `s` equals this value are kept
 * @param predicate if defined, only triples whose `p` equals this value are kept
 * @param obj       if defined, only triples whose `o` equals this value are kept
 * @return the filtered RDD; `triples` itself when all three options are empty
 */
def extractTriples(triples: RDD[RDFTriple],
                   subject: Option[String],
                   predicate: Option[String],
                   obj: Option[String]): RDD[RDFTriple] = {
  // Each step either passes the RDD through or adds one filter. Folding avoids the
  // mutable var and the Option.get calls, and the closures capture the plain String
  // rather than the Option.
  val bySubject = subject.fold(triples)(s => triples.filter(triple => triple.s == s))
  val byPredicate = predicate.fold(bySubject)(p => bySubject.filter(triple => triple.p == p))
  obj.fold(byPredicate)(o => byPredicate.filter(triple => triple.o == o))
}
}
示例5: KMeansClusteringApp
//设置package包名称以及导入依赖的类
package org.apress.prospark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
/**
 * Streaming k-means over space-separated records read from a socket.
 *
 * Each micro-batch is split randomly into a test part (weight 0.3) and a training part
 * (everything not in the test part); the model is trained on the latter.
 */
object KMeansClusteringApp {

  /**
   * Entry point.
   *
   * @param args exactly four values: application name, batch interval in seconds,
   *             socket host name and socket port
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      System.err.println(
        "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Array(appName, batchInterval, hostname, port) = args

    val sparkConf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
    val ssc = new StreamingContext(sparkConf, Seconds(batchInterval.toInt))

    // Drop records containing "NaN" and records whose second column is "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(line => !line.contains("NaN"))
      .map(line => line.split(" "))
      .filter(columns => columns(1) != "0")

    // Column 1 becomes the label; the remaining 18 selected columns are the features.
    val orientationStream = substream
      .map { columns =>
        Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44)
          .map(position => columns(position))
          .toArray
      }
      .map(selected => selected.map(_.toDouble))
      .filter { values =>
        values(0) match {
          case 1.0 | 2.0 | 3.0 => true
          case _ => false
        }
      }
      .map(values => LabeledPoint(values(0), Vectors.dense(values.slice(1, values.length))))

    val test = orientationStream.transform(batch => batch.randomSplit(Array(0.3, 0.7))(0))
    val train = orientationStream
      .transformWith(test, (all: RDD[LabeledPoint], heldOut: RDD[LabeledPoint]) => all.subtract(heldOut))
      .cache()

    val model = new StreamingKMeans()
      .setK(3)
      .setDecayFactor(0)
      .setRandomCenters(18, 0.0)

    model.trainOn(train.map(point => point.features))
    // NOTE(review): no output operation is registered on `prediction` in this block, so
    // the predictions are presumably never materialised - confirm whether that is intended.
    val prediction = model.predictOnValues(test.map(point => (point.label, point.features)))

    ssc.start()
    ssc.awaitTermination()
  }
}
示例6: MedianOfMediansCalculator
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
/**
 * Computes the median of the per-line medians of a text file of integers.
 *
 * All positions are 0-based, matching both array indexing and `RDD.zipWithIndex`.
 */
class MedianOfMediansCalculator {

  /**
   * Runs the whole pipeline for one file.
   *
   * @param hdfsFilePath path of the text file to read
   * @param sc           the SparkContext used to read it
   * @return the median of the per-line medians
   * @throws NoSuchElementException if the file contains no line with at least one integer
   */
  def calculateMedianOfMediansForFile(hdfsFilePath: String, sc: SparkContext): Double =
    calculateMedianOfMedians(
      sortAndNumberMedians(calculateMediansPerLine(readFileOfIntegers(hdfsFilePath, sc))))

  /**
   * Reads the file and turns every line into a sorted array of its integers.
   *
   * Any run of non-digit characters is a separator. Empty tokens (for example from a
   * leading separator) are dropped rather than passed to `toInt`, and lines without any
   * integer are skipped so they cannot produce an empty array downstream.
   *
   * NOTE(review): because '-' is a non-digit, a minus sign is treated as a separator and
   * negative numbers lose their sign - confirm the input never contains them.
   */
  def readFileOfIntegers(hdfsFilePath: String, sc: SparkContext): RDD[Array[Int]] =
    sc.textFile(hdfsFilePath)
      .map(line => line.split("\\D+").filter(_.nonEmpty).map(_.toInt).sorted)
      .filter(_.nonEmpty)

  /**
   * Maps every array to its median.
   *
   * @param integerArrayRdd arrays that are already sorted ascending and non-empty
   *                        (an empty array fails with `ArrayIndexOutOfBoundsException`)
   */
  def calculateMediansPerLine(integerArrayRdd: RDD[Array[Int]]): RDD[Double] =
    integerArrayRdd.map { lineInts =>
      val mid = lineInts.length / 2
      if (lineInts.length % 2 == 0)
        // Even length: average the two central elements (mid - 1 and mid). Widen to
        // Double before adding so the sum cannot overflow Int.
        (lineInts(mid - 1).toDouble + lineInts(mid)) / 2.0
      else
        // Odd length: the central element sits at index mid.
        lineInts(mid).toDouble
    }

  /** Sorts the medians ascending and keys each one by its 0-based position. */
  def sortAndNumberMedians(lineMedians: RDD[Double]): RDD[(Long, Double)] =
    lineMedians
      .sortBy(identity)
      .zipWithIndex
      .map { case (median, position) => (position, median) }

  /**
   * Picks the median out of the sorted, 0-based numbered medians.
   *
   * @throws NoSuchElementException if the RDD is empty
   */
  def calculateMedianOfMedians(sortedAndNumberedMedians: RDD[(Long, Double)]): Double = {
    // Count once; every count() is a separate Spark job.
    val count = sortedAndNumberedMedians.count()
    if (count == 0)
      throw new NoSuchElementException("cannot compute the median of an empty set of medians")

    def at(position: Long): Double = sortedAndNumberedMedians.lookup(position).head

    val mid = count / 2
    if (count % 2 == 0) (at(mid - 1) + at(mid)) / 2.0
    else at(mid)
  }
}
示例7: PrecipSource
//设置package包名称以及导入依赖的类
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
/**
 * One record describing a precipitation data source.
 *
 * NOTE(review): the field meanings below are read off the field names only; confirm
 * against the data set's documentation.
 *
 * @param sourceId        numeric identifier of the source
 * @param name            source name
 * @param countryCode     country code of the source
 * @param latitude        latitude, kept as text
 * @param longitude       longitude, kept as text
 * @param elevation       elevation as an integer (unit not visible here)
 * @param elementId       identifier of the measured element
 * @param beginDate       start date, kept as text
 * @param endDate         end date, kept as text
 * @param participantId   numeric identifier of the participant
 * @param participantName participant name
 */
case class PrecipSource(
  sourceId: Int,
  name: String,
  countryCode: String,
  latitude: String,
  longitude: String,
  elevation: Int,
  elementId: String,
  beginDate: String,
  endDate: String,
  participantId: Int,
  participantName: String
)
/**
 * One precipitation measurement, in the field order of the comma-separated source file.
 *
 * @param stationId numeric identifier of the station
 * @param sourceId  numeric identifier of the source
 * @param date      measurement date, kept as text
 * @param amount    measured amount as an integer (unit not visible here)
 * @param quality   integer quality value (meaning not visible here)
 */
case class Precipication(
  stationId: Int,
  sourceId: Int,
  date: String,
  amount: Int,
  quality: Int
)
/** Parsers that turn raw text files into typed Datasets. */
class Mappers() {

  /**
   * Reads a comma-separated precipitation file into a Dataset and prints its first rows.
   *
   * The leading `headerLines` lines are skipped by position. The previous implementation
   * removed them with `subtract`, which also dropped any data row whose text happened to
   * equal a header line and forced a shuffle that lost the file order.
   *
   * @param spark         the active session
   * @param sourceFilPath path of the text file to read
   * @param headerLines   number of leading lines to skip; defaults to 20
   * @return one [[Precipication]] per remaining line
   */
  def precipicationDF(spark: SparkSession,
                      sourceFilPath: String,
                      headerLines: Int = 20): Dataset[Precipication] = {
    import spark.implicits._

    val precipitionDF: Dataset[Precipication] = spark.sparkContext
      .textFile(sourceFilPath)
      // zipWithIndex numbers lines in file order, starting at 0.
      .zipWithIndex()
      .filter { case (_, lineNumber) => lineNumber >= headerLines }
      .map { case (line, _) =>
        val fields = line.split(",").map(_.trim)
        Precipication(
          stationId = fields(0).toInt,
          sourceId = fields(1).toInt,
          date = fields(2),
          amount = fields(3).toInt,
          quality = fields(4).toInt
        )
      }
      .toDS()

    // Kept from the original: prints the first rows untruncated (this triggers a job).
    precipitionDF.show(false)
    precipitionDF
  }
}
示例8: ReadsRDD
//设置package包名称以及导入依赖的类
package org.hammerlab.guacamole.readsets.rdd
import java.io.File
import org.apache.spark.rdd.RDD
import org.hammerlab.guacamole.reads.{MappedRead, PairedRead, Read}
import org.hammerlab.guacamole.readsets.io.Input
/**
 * An RDD of reads together with the input it was loaded from.
 *
 * @param reads the reads
 * @param input the input description; its `path` provides the names used below
 */
case class ReadsRDD(reads: RDD[Read], input: Input) {

  /** File name component of the input path. */
  val basename: String = new File(input.path).getName

  /** `basename` truncated to at most 100 characters; used in RDD names. */
  val shortName: String = basename.take(100)

  /** Every mapped read: mapped reads as-is, plus the mapped first element of paired reads. */
  lazy val mappedReads: RDD[MappedRead] =
    reads.flatMap({
      case r: MappedRead => Some(r)
      case PairedRead(r: MappedRead, _, _) => Some(r)
      case _ => None
    }).setName(s"Mapped reads: $shortName")

  /** Only the paired reads that report themselves as mapped. */
  lazy val mappedPairedReads: RDD[PairedRead[MappedRead]] =
    reads.flatMap({
      // The type argument is erased at runtime; the isMapped guard is what justifies the cast.
      case rp: PairedRead[_] if rp.isMapped => Some(rp.asInstanceOf[PairedRead[MappedRead]])
      case _ => None
    // Distinct name: this RDD previously reused "Mapped reads: ...", so it could not be
    // told apart from mappedReads by name.
    }).setName(s"Mapped paired reads: $shortName")
}
示例9: Checkpoint
//设置package包名称以及导入依赖的类
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}
/**
 * Demonstrates RDD checkpointing: marks a projected pair RDD for checkpointing and then
 * runs a count, which is the action that materialises it.
 */
object Checkpoint {

  /**
   * Entry point; `args` is not used. Reads tab-separated lines from `./stocks` and
   * checkpoints into `./projdata`.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("RDD Aggregate").setMaster("local")
    val context = new SparkContext(sparkConf)
    context.setCheckpointDir("./projdata")

    // Field 1 is the key, field 7 the integer value.
    val projdata = context.textFile("./stocks").map { record =>
      val columns = record.split("\t")
      (columns(1), columns(7).toInt)
    }

    // checkpoint() only marks the RDD; the count below triggers the actual write.
    projdata.checkpoint()
    println(projdata.count())
  }
}
示例10: GroupWith
//设置package包名称以及导入依赖的类
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}
/**
 * Demonstrates `groupWith`: groups three keyed data sets by key and prints the result.
 */
object GroupWith {

  /**
   * Entry point; `args` is not used. Reads `./citi`, `./hdfc` and `./sbi`, each holding
   * tab-separated lines whose field 0 is the key and field 1 an integer value.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("RDD Aggregate").setMaster("local")
    val context = new SparkContext(sparkConf)

    // Shared line parser for the three inputs.
    val toPair: String => (String, Int) = { row =>
      val columns = row.split("\t")
      (columns(0), columns(1).toInt)
    }

    val citiPairRDD = context.textFile("./citi").map(toPair)
    val hdfcPairRDD = context.textFile("./hdfc").map(toPair)
    val sbiPairRDD = context.textFile("./sbi").map(toPair)

    citiPairRDD
      .groupWith(hdfcPairRDD, sbiPairRDD)
      .collect()
      .foreach(grouped => println(grouped))
  }
}
示例11: Item
//设置package包名称以及导入依赖的类
package com.github.vladminzatu.surfer.persist
import com.github.vladminzatu.surfer.Score
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{HttpClientBuilder}
import org.apache.spark.rdd.RDD
import org.json4s.jackson.Serialization.write
/**
 * An item name paired with its score.
 *
 * @param item  the item name
 * @param score the item's score
 */
case class Item(
  item: String,
  score: Double
)
/**
 * Persists scores by POSTing them as a JSON array to a REST endpoint.
 */
class RestPersister extends Persister {

  /** Endpoint that receives the items. */
  val url: String = "http://localhost:8080/items"

  /**
   * Collects all scores to the driver, orders them by descending score value and POSTs
   * them as one JSON array of [[Item]]s.
   *
   * The HTTP response status is not inspected (unchanged from the original).
   *
   * @param scores item name paired with its score
   */
  override def persist(scores: RDD[(String, Score)]): Unit = {
    implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats

    val items = scores.collect()
      .sortWith((a, b) => a._2.value > b._2.value)
      .map { case (name, score) => Item(name, score.value) }
    val payload = write(items)

    val client = HttpClientBuilder.create().build()
    try {
      // Close the response to release its connection; previously neither the response
      // nor the client was ever closed.
      val response = client.execute(postRequest(payload))
      response.close()
    } finally {
      client.close()
    }
  }

  /** Builds the POST request carrying `payload` as a UTF-8 JSON body. */
  private def postRequest(payload: String): HttpPost = {
    val post = new HttpPost(url)
    // StringEntity(String) defaults to text/plain in ISO-8859-1, which corrupts any
    // character outside Latin-1. Declare the body as application/json (UTF-8) instead.
    post.setEntity(new StringEntity(payload, org.apache.http.entity.ContentType.APPLICATION_JSON))
    post
  }
}
示例12: MllibLBFGS
//设置package包名称以及导入依赖的类
package optimizers
import breeze.linalg.{DenseVector, Vector}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater, Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import utils.Functions._
/**
 * Optimizer backed by MLlib's `LogisticRegressionWithLBFGS`.
 *
 * @param data        labelled training points
 * @param loss        loss function, forwarded to the parent `Optimizer`
 * @param regularizer regularizer; selects the MLlib updater and supplies the
 *                    regularization parameter
 * @param params      iteration count, convergence tolerance and number of corrections
 */
class MllibLBFGS(val data: RDD[LabeledPoint],
                 loss: LossFunction,
                 regularizer: Regularizer,
                 params: LBFGSParameters)
  extends Optimizer(loss, regularizer) {

  val opt = new LogisticRegressionWithLBFGS

  // MLlib updater corresponding to the regularizer type. Any other regularizer type
  // fails with a MatchError at construction time.
  val reg: Updater = regularizer match {
    case _: L1Regularizer => new L1Updater
    case _: L2Regularizer => new SquaredL2Updater
    case _: Unregularized => new SimpleUpdater
  }

  // Configure the underlying L-BFGS optimizer once, at construction time.
  opt.optimizer
    .setNumIterations(params.iterations)
    .setConvergenceTol(params.convergenceTol)
    .setNumCorrections(params.numCorrections)
    .setRegParam(regularizer.lambda)
    .setUpdater(reg)

  /** Trains on `data` and returns the model weights as a Breeze dense vector. */
  override def optimize(): Vector[Double] = {
    val weights = opt.run(data).weights.toArray
    DenseVector(weights)
  }
}
示例13: SamplePCA
//设置package包名称以及导入依赖的类
package org.broadinstitute.hail.methods
import org.apache.spark.mllib.linalg.{Matrix, DenseMatrix}
import org.apache.spark.rdd.RDD
import org.broadinstitute.hail.variant.Variant
import org.broadinstitute.hail.variant.VariantDataset
/**
 * PCA over the standardized matrix built from a variant data set, via a truncated SVD.
 *
 * @param k                  number of singular values/vectors requested from the SVD
 * @param computeLoadings    whether to compute U and return the per-variant loadings
 * @param computeEigenvalues whether to return the squared singular values
 */
class SamplePCA(k: Int, computeLoadings: Boolean, computeEigenvalues: Boolean) {

  def name = "SamplePCA"

  /**
   * Runs the decomposition.
   *
   * @param vds the variant data set to decompose
   * @return the scores (V multiplied by diag(s)); the loadings, one array per variant,
   *         when `computeLoadings` is set; and the squared singular values when
   *         `computeEigenvalues` is set
   */
  def apply(vds: VariantDataset): (Matrix, Option[RDD[(Variant, Array[Double])]], Option[Array[Double]]) = {
    val (variants, mat) = ToStandardizedIndexedRowMatrix(vds)
    // Broadcast the variants so each row index can be mapped back to its variant.
    val variantsBc = vds.sparkContext.broadcast(variants)

    val decomposition = mat.computeSVD(k, computeU = computeLoadings)

    val scores = decomposition.V.multiply(DenseMatrix.diag(decomposition.s))

    val loadings =
      if (computeLoadings) {
        Some(decomposition.U.rows.map { row =>
          (variantsBc.value(row.index.toInt), row.vector.toArray)
        })
      } else {
        None
      }

    val eigenvalues =
      if (computeEigenvalues) {
        Some(decomposition.s.toArray.map(singular => singular * singular))
      } else {
        None
      }

    (scores, loadings, eigenvalues)
  }
}
示例14: of
//设置package包名称以及导入依赖的类
package fregata.spark.metrics.classification
import org.apache.spark.rdd.RDD
import fregata.Num
/**
 * Computes a rank-based AUC from (prediction, label) pairs.
 *
 * Records are ranked by descending prediction; records sharing a prediction share the
 * average rank of their group. A label equal to 1 counts as positive.
 *
 * @param rs2 pairs of (predicted score, label)
 * @return the AUC, or 0.5 when the input is empty or holds only one class
 */
def of(rs2: RDD[(Num, Num)]): Double = {
  val rs = rs2.sortByKey(false)
  val total = rs2.count()
  val (m, sum) = rs.zipWithIndex().map {
    case ((predict, label), rank) =>
      val positive = if (label == 1) 1 else 0
      // (rank counted from the bottom, record count, positive count)
      predict -> (total - rank, 1L, positive)
  }.reduceByKey {
    case ((r1, c1, p1), (r2, c2, p2)) => (r1 + r2, c1 + c2, p1 + p2)
  }.map {
    case (_, (rankSum, count, positive)) =>
      // Every positive in a tie group contributes the group's average rank.
      val avg = rankSum.toDouble / count
      (positive, avg * positive)
  }.treeAggregate((0, 0.0))(
    // Aggregating from a zero value, unlike treeReduce, does not throw on an empty RDD,
    // so the degenerate-case guard below is actually reachable.
    (acc, next) => (acc._1 + next._1, acc._2 + next._2),
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )
  val M = m.toDouble
  if (M == 0 || M == total) 0.5
  else {
    val N = total - M
    val diff = sum - (M * (M + 1) / 2)
    diff / (M * N)
  }
}
}
示例15: PDF
//设置package包名称以及导入依赖的类
package org.hammerlab.coverage.two_sample
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.coverage
import org.hammerlab.coverage.histogram.JointHistogram.Depth
import spire.algebra.Monoid
/**
 * PDF keyed by a pair of depths; mixes in `coverage.PDF` and `CanDownSampleRDD`.
 *
 * NOTE(review): presumably one depth per sample, judging by the `two_sample` package
 * name - confirm.
 *
 * @param rdd              values of type `C` keyed by a (depth, depth) pair
 * @param filtersBroadcast broadcast pair of depth sets (how they are applied is not
 *                         visible in this declaration)
 * @param maxDepth1        maximum of the first depth
 * @param maxDepth2        maximum of the second depth
 * @tparam C value type; a `Monoid[C]` must be available
 */
case class PDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])],
maxDepth1: Depth,
maxDepth2: Depth)
extends coverage.PDF[C]
with CanDownSampleRDD[C]
/**
 * CDF keyed by a pair of depths; mixes in `coverage.CDF` and `CanDownSampleRDD`.
 *
 * @param rdd              values of type `C` keyed by a (depth, depth) pair
 * @param filtersBroadcast broadcast pair of depth sets (how they are applied is not
 *                         visible in this declaration)
 * @tparam C value type; a `Monoid[C]` must be available
 */
case class CDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])])
extends coverage.CDF[C]
with CanDownSampleRDD[C]