This article collects typical usage examples of the Scala class org.apache.spark.Partition. If you have been wondering what the Partition class does, how it is used, or where to find working examples of it, the hand-picked class examples below should help.
Thirteen code examples of the Partition class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
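For orientation, org.apache.spark.Partition itself is a tiny contract: it identifies one slice of an RDD through a stable index. A minimal sketch of a custom implementation (the names SimplePartition and idx are illustrative, not taken from the examples below):
import org.apache.spark.Partition

// Minimal custom Partition: Spark only requires a stable index; overriding
// hashCode/equals keeps instances well-behaved when used as keys.
class SimplePartition(idx: Int) extends Partition {
  override def index: Int = idx
  override def hashCode(): Int = idx
  override def equals(other: Any): Boolean = other match {
    case p: SimplePartition => p.index == idx
    case _ => false
  }
}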
Example 1: StudyRDD
// Package declaration and imported dependencies
package com.study.spark.datasource
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}
class StudyRDD(sqlContext: SQLContext, schema: StructType) extends RDD[Row](sqlContext.sparkContext, deps=Nil) {
@DeveloperApi
override def compute(split: Partition, context: TaskContext): Iterator[Row] = new StudyReader(context, schema, split)
// This sample splits the data into two partitions.
// Each executor processes one partition, so with two partitions the work can be spread across (at most) two executors.
override protected def getPartitions: Array[Partition] = {
val arr: Array[Partition] = new Array[Partition](2)
arr.update(0, new Partition() {
override def index: Int = 0
})
arr.update(1, new Partition() {
override def index: Int = 1
})
arr
}
}
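A driver-side sketch of how the RDD above could be exercised (a Spark 1.x-style SQLContext is assumed, and the app name, master, and schema fields are placeholders rather than part of the original example):
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object StudyRDDDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("study-rdd-demo").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    val schema = StructType(Seq(
      StructField("field1", StringType),
      StructField("field2", StringType),
      StructField("field3", StringType)))
    val rdd = new StudyRDD(sqlContext, schema)
    println(rdd.count()) // 2 partitions x 100 rows each = 200
    rdd.take(3).foreach(println)
    sc.stop()
  }
}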
Example 2: StudyReader
// Package declaration and imported dependencies
package com.study.spark.datasource
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
class StudyReader(context: TaskContext, schema: StructType, split: Partition) extends Iterator[Row] {
private[this] var counter: Int = 0
// Make sure the reader's close() is called when the task completes.
if(context != null) {
context.addTaskCompletionListener(context => close())
}
// This sample produces 100 rows per partition.
override def hasNext: Boolean = counter < 100
// Produce one row at a time.
override def next(): Row = {
if(!hasNext) {
throw new NoSuchElementException("End of stream")
}
counter += 1
Row(split.index + " field1 " + counter, "field2 " + counter, "field3: " + counter)
}
// Called when the task completes; a real reader would release its resources here.
def close(): Unit = println("closed")
}
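Because the constructor tolerates a null TaskContext, the reader can also be sanity-checked outside Spark; a small sketch (the object name is illustrative):
import org.apache.spark.Partition
import org.apache.spark.sql.types.StructType

object StudyReaderCheck extends App {
  val partition = new Partition { override def index: Int = 0 }
  val reader = new StudyReader(null, new StructType(), partition)
  // Print the first three synthetic rows produced by the reader.
  reader.take(3).foreach(println)
}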
Example 3: GDBRDD
// Package declaration and imported dependencies
package com.esri.gdb
import org.apache.hadoop.conf.Configuration
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Logging, Partition, SparkContext, TaskContext}
case class GDBRDD(@transient sc: SparkContext, gdbPath: String, gdbName: String, numPartitions: Int) extends RDD[Row](sc, Nil) with Logging {
@DeveloperApi
override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
val part = partition.asInstanceOf[GDBPartition]
val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
val index = GDBIndex(gdbPath, part.hexName, hadoopConf)
val table = GDBTable(gdbPath, part.hexName, hadoopConf)
context.addTaskCompletionListener(context => {
table.close()
index.close()
})
table.rowIterator(index, part.startAtRow, part.numRowsToRead)
}
override protected def getPartitions: Array[Partition] = {
val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
GDBTable.findTable(gdbPath, gdbName, hadoopConf) match {
case Some(catTab) => {
val index = GDBIndex(gdbPath, catTab.hexName, hadoopConf)
try {
val numRows = index.numRows
val numRowsPerPartition = (numRows.toDouble / numPartitions).ceil.toInt
var startAtRow = 0
(0 until numPartitions).map(i => {
val endAtRow = startAtRow + numRowsPerPartition
val numRowsToRead = if (endAtRow <= numRows) numRowsPerPartition else numRows - startAtRow
val gdbPartition = GDBPartition(i, catTab.hexName, startAtRow, numRowsToRead)
startAtRow += numRowsToRead
gdbPartition
}).toArray
} finally {
index.close()
}
}
case _ => {
log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty array of Partitions !")
Array.empty[Partition]
}
}
}
}
private[this] case class GDBPartition(m_index: Int, hexName: String, startAtRow: Int, numRowsToRead: Int) extends Partition {
override def index = m_index
}
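A hypothetical driver snippet for the RDD above (the geodatabase path and table name are placeholders):
import org.apache.spark.{SparkConf, SparkContext}

object GDBRDDDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("gdb-rdd-demo").setMaster("local[2]"))
    // Read the feature table "Cities" from a file geodatabase, split into 4 row-range partitions.
    val rows = GDBRDD(sc, gdbPath = "/data/sample.gdb", gdbName = "Cities", numPartitions = 4)
    println(rows.count())
    sc.stop()
  }
}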
Example 4: MongoRDDPartitioner
// Package declaration and imported dependencies
package nsmc.rdd.partitioner
import nsmc.mongo.{MongoInterval, IntervalGenerator, CollectionConfig}
import org.apache.spark.Partition
private[nsmc]
class MongoRDDPartitioner(val collectionConfig: CollectionConfig) extends nsmc.Logging {
val ig = new IntervalGenerator(collectionConfig.connectorConf.getDestination(),
collectionConfig.databaseName, collectionConfig.collectionName)
def makePartitions(): Array[Partition] = {
val intervals = ig.generateSyntheticIntervals(collectionConfig.connectorConf.splitSize, Seq(("_id", 1)))
val partitions = intervals.zipWithIndex map {
case (interval, index) => {
val p: Partition = new MongoRDDPartition(index, 0, interval)
p
}
}
partitions.to[Array]
}
def close(): Unit = ig.close()
}
Example 5: MongoRDD
// Package declaration and imported dependencies
package nsmc.rdd
import com.mongodb.BasicDBObject
import nsmc.Logging
import nsmc.mongo.{CollectionConfig, MongoConnector}
import nsmc.rdd.partitioner.{MongoRDDPartition, MongoRDDPartitioner}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
import scala.language.existentials
import scala.reflect.ClassTag
class MongoRDD[R] private[nsmc] (@transient sc: SparkContext,
val collectionConfig: CollectionConfig)
(implicit ct : ClassTag[R])
extends RDD[R](sc, Seq.empty) with Logging {
private val proxy = new CollectionProxy(collectionConfig)
// Make sure we inherit logging from the right place: our own Logging trait, not RDD's.
override def log = super[Logging].log
override def logName = super[Logging].logName
override def logInfo(msg: => String) = super[Logging].logInfo(msg)
override def logDebug(msg: => String) = super[Logging].logDebug(msg)
override def logTrace(msg: => String) = super[Logging].logTrace(msg)
override def logWarning(msg: => String) = super[Logging].logWarning(msg)
override def logError(msg: => String) = super[Logging].logError(msg)
override def logInfo(msg: => String, throwable: Throwable) = super[Logging].logInfo(msg, throwable)
override def logDebug(msg: => String, throwable: Throwable) = super[Logging].logDebug(msg, throwable)
override def logTrace(msg: => String, throwable: Throwable) = super[Logging].logTrace(msg, throwable)
override def logWarning(msg: => String, throwable: Throwable) = super[Logging].logWarning(msg, throwable)
override def logError(msg: => String, throwable: Throwable) = super[Logging].logError(msg, throwable)
override def isTraceEnabled() = super[Logging].isTraceEnabled()
override def getPartitions: Array[Partition] = {
proxy.getPartitions
}
override def compute(split: Partition, context: TaskContext): Iterator[R] = {
proxy.getPartitionIterator(split, context, new BasicDBObject(), new BasicDBObject()).asInstanceOf[Iterator[R]]
}
}
Example 6: SQLMongoRDD
// Package declaration and imported dependencies
package nsmc.rdd
import com.mongodb.DBObject
import nsmc.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
import scala.language.existentials
// For use only in creating an RDD to return for SQL integration
class SQLMongoRDD private[nsmc] (@transient sc: SparkContext,
proxy: CollectionProxy,
filter: DBObject,
projection: DBObject)
extends RDD[DBObject](sc, Seq.empty) with Logging {
// Make sure we inherit logging from the right place: our own Logging trait, not RDD's.
override def log = super[Logging].log
override def logName = super[Logging].logName
override def logInfo(msg: => String) = super[Logging].logInfo(msg)
override def logDebug(msg: => String) = super[Logging].logDebug(msg)
override def logTrace(msg: => String) = super[Logging].logTrace(msg)
override def logWarning(msg: => String) = super[Logging].logWarning(msg)
override def logError(msg: => String) = super[Logging].logError(msg)
override def logInfo(msg: => String, throwable: Throwable) = super[Logging].logInfo(msg, throwable)
override def logDebug(msg: => String, throwable: Throwable) = super[Logging].logDebug(msg, throwable)
override def logTrace(msg: => String, throwable: Throwable) = super[Logging].logTrace(msg, throwable)
override def logWarning(msg: => String, throwable: Throwable) = super[Logging].logWarning(msg, throwable)
override def logError(msg: => String, throwable: Throwable) = super[Logging].logError(msg, throwable)
override def isTraceEnabled() = super[Logging].isTraceEnabled()
override def getPartitions: Array[Partition] = {
proxy.getPartitions
}
override def compute(split: Partition, context: TaskContext): Iterator[DBObject] = {
proxy.getPartitionIterator(split, context, filter, projection)
}
}
Example 7: OracleJdbcPartition
// Package declaration and imported dependencies
package util
import org.apache.spark.Partition
import java.sql.ResultSet
import java.sql.Connection
import scala.reflect.ClassTag
import org.apache.spark.SparkContext
import java.sql.PreparedStatement
import org.apache.spark.rdd.RDD
import org.apache.spark.Logging
import org.apache.spark.TaskContext
import org.apache.spark.util
class OracleJdbcPartition(idx: Int, parameters: Map[String, Object]) extends Partition {
override def index = idx
val partitionParameters = parameters
}
abstract class OracleJdbcRdd[T: ClassTag](
sc: SparkContext,
getConnection: () => Connection,
sql: String,
getOracleJdbcPatition: () => Array[Partition],
preparedStatement: (PreparedStatement, OracleJdbcPartition) => PreparedStatement,
mapRow: (ResultSet) => T = OracleJdbcRdd.resultSetToObjectArray _)
extends RDD[T](sc, Nil) with Logging {
def getPartitions: Array[Partition] = {
getOracleJdbcPatition();
}
}
object OracleJdbcRdd {
def resultSetToObjectArray(rs: ResultSet): Array[Object] = {
Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))
}
trait ConnectionFactory extends Serializable {
@throws[Exception]
def getConnection: Connection
}
}
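Since OracleJdbcRdd delegates partition creation to the getOracleJdbcPatition function, the caller has to supply one. A hedged sketch of such a factory follows; the "mod"/"of" parameter keys are purely illustrative (e.g. to back a WHERE MOD(id, ?) = ? predicate) and are not part of the original code:
import org.apache.spark.Partition
import util.OracleJdbcPartition

object OracleJdbcPartitions {
  // Split the query into n partitions; each partition carries the modulus
  // parameters it will later bind into its PreparedStatement.
  def byModulus(n: Int): () => Array[Partition] = () =>
    (0 until n).map { i =>
      new OracleJdbcPartition(i, Map("mod" -> Int.box(i), "of" -> Int.box(n))): Partition
    }.toArray
}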
Example 8: cursorMark
// Package declaration and imported dependencies
package com.lucidworks.spark
import org.apache.solr.client.solrj.SolrQuery
import org.apache.solr.common.params.SolrParams
import org.apache.spark.Partition
trait SolrRDDPartition extends Partition {
def cursorMark: String
def solrShard: SolrShard
def query: SolrQuery
def preferredReplica: SolrReplica // Preferred replica to query
}
case class CloudStreamPartition(
index: Int,
zkhost:String,
collection:String,
params: SolrParams)
extends Partition
case class ShardRDDPartition(
index: Int,
cursorMark: String,
solrShard: SolrShard,
query: SolrQuery,
preferredReplica: SolrReplica)
extends SolrRDDPartition
case class SplitRDDPartition(
index: Int,
cursorMark: String,
solrShard: SolrShard,
query: SolrQuery,
preferredReplica: SolrReplica)
extends SolrRDDPartition
Example 9: SolrPartitioner
// Package declaration and imported dependencies
package com.lucidworks.spark
import java.net.InetAddress
import com.lucidworks.spark.rdd.SolrRDD
import com.lucidworks.spark.util.SolrSupport
import org.apache.solr.client.solrj.SolrQuery
import org.apache.spark.Partition
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
// Is there a need to override {@code Partitioner.scala} and define our own partition ids?
object SolrPartitioner {
def getShardPartitions(shards: List[SolrShard], query: SolrQuery) : Array[Partition] = {
shards.zipWithIndex.map{ case (shard, i) =>
// Choose any of the replicas as the one to query
new ShardRDDPartition(i, "*", shard, query, SolrRDD.randomReplica(shard))}.toArray
}
def getSplitPartitions(
shards: List[SolrShard],
query: SolrQuery,
splitFieldName: String,
splitsPerShard: Int): Array[Partition] = {
var splitPartitions = ArrayBuffer.empty[SplitRDDPartition]
var counter = 0
shards.foreach(shard => {
// Cycle endlessly through the shard's replicas so that successive partitions pick replicas in round-robin order
val replicaContinuousIterator: Iterator[SolrReplica] = Iterator.continually(shard.replicas).flatten
val splits = SolrSupport.splitShards(query, shard, splitFieldName, splitsPerShard)
splits.foreach(split => {
splitPartitions += SplitRDDPartition(counter, "*", shard, split.getSplitQuery, replicaContinuousIterator.next())
counter = counter + 1
})
})
splitPartitions.toArray
}
}
case class SolrShard(shardName: String, replicas: List[SolrReplica])
case class SolrReplica(
replicaNumber: Int,
replicaName: String,
replicaUrl: String,
replicaHostName: String,
locations: Array[InetAddress]) {
override def toString: String =
s"SolrReplica($replicaNumber) $replicaName: url=$replicaUrl, hostName=$replicaHostName, locations=${locations.mkString(",")}"
}
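A driver-side sketch of feeding the partitioner (the replica URL, host, and core names are placeholders):
import java.net.InetAddress
import org.apache.solr.client.solrj.SolrQuery

object SolrPartitionerDemo extends App {
  val replica = SolrReplica(0, "core_node1",
    "http://localhost:8983/solr/collection1_shard1_replica1", "localhost",
    Array(InetAddress.getLoopbackAddress))
  val shard = SolrShard("shard1", List(replica))
  // One ShardRDDPartition per shard, each starting from the initial cursor mark "*".
  val partitions = SolrPartitioner.getShardPartitions(List(shard), new SolrQuery("*:*"))
  partitions.foreach(p => println(p.index))
}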
Example 10: UnencryptedRDD
// Package declaration and imported dependencies
package org.apache.datacommons.protectr.rdds
import com.n1analytics.paillier.{PaillierContext, PaillierPublicKey}
import org.apache.datacommons.protectr.encryptors.EncryptionKeyPair
import org.apache.datacommons.protectr.types.{CSV, FileType}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}
class UnencryptedRDD(parent: RDD[String], fileType: FileType = CSV)
extends RDD[String](parent) {
def encryptHomomorphically(keyPair: EncryptionKeyPair, columnIndex: Int)
: HomomorphicallyEncryptedRDD = {
val publicKey: PaillierPublicKey = keyPair.getPublicKey
val signedContext: PaillierContext = publicKey.createSignedContext
val encryptedRDD = this.map(row => {
val values: Array[String] = fileType.parseRecord(row)
val numericValue: String = values(columnIndex)
values(columnIndex) = signedContext.encrypt(numericValue.toDouble).toString
fileType.join(values)
})
new HomomorphicallyEncryptedRDD(encryptedRDD, keyPair, fileType)
}
@DeveloperApi
override def compute(split: Partition, context: TaskContext): Iterator[String] = {
parent.compute(split, context)
}
override protected def getPartitions: Array[Partition] = parent.partitions
}
Example 11: HomomorphicallyEncryptedRDD
// Package declaration and imported dependencies
package org.apache.datacommons.protectr.rdds
import java.math.BigInteger
import com.n1analytics.paillier.{PaillierPrivateKey, EncryptedNumber}
import org.apache.datacommons.protectr.encryptors.EncryptionKeyPair
import org.apache.datacommons.protectr.types.FileType
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}
class HomomorphicallyEncryptedRDD
(RDD: RDD[String], keyPair: EncryptionKeyPair, fileType: FileType) extends RDD[String](RDD) {
def sum(columnIndex: Int): BigInteger = {
val finalRecord = this.reduce((firstRow, secondRow) => {
val firstRecord: Array[String] = fileType.parseRecord(firstRow)
val secondRecord: Array[String] = fileType.parseRecord(secondRow)
val firstNumber: EncryptedNumber = EncryptedNumber.create(
firstRecord(columnIndex), keyPair.getPrivateKey)
val secondNumber: EncryptedNumber = EncryptedNumber.create(
secondRecord(columnIndex), keyPair.getPrivateKey)
firstRecord(columnIndex) = firstNumber.add(secondNumber).toString
fileType.join(firstRecord)
})
val sum: String = fileType.parseRecord(finalRecord)(columnIndex)
val result: EncryptedNumber = EncryptedNumber.create(sum, keyPair.getPrivateKey)
result.decrypt(keyPair.getPrivateKey).decodeApproximateBigInteger
}
def decrypt(columnIndex: Int): UnencryptedRDD = {
val privateKey: PaillierPrivateKey = keyPair.getPrivateKey
val javaRDD = this.map(row => {
val values: Array[String] = fileType.parseRecord(row)
val encryptedNumber: EncryptedNumber = EncryptedNumber.create(
values(columnIndex), keyPair.getPrivateKey)
val bigInteger: BigInteger = privateKey.decrypt(encryptedNumber).decodeApproximateBigInteger
values(columnIndex) = bigInteger.toString
fileType.join(values)
})
new UnencryptedRDD(javaRDD, fileType)
}
override protected def getPartitions = RDD.partitions
@DeveloperApi
override def compute(split: Partition, context: TaskContext): Iterator[String] = {
RDD.compute(split, context)
}
}
Example 12: MongoRDDPartitioner
// Package declaration and imported dependencies
package nsmc.rdd.partitioner
import nsmc.mongo.{MongoInterval, IntervalGenerator, CollectionConfig}
import org.apache.spark.Partition
private[nsmc]
class MongoRDDPartitioner(val collectionConfig: CollectionConfig) extends nsmc.Logging {
val ig = new IntervalGenerator(collectionConfig.connectorConf.getDestination(),
collectionConfig.databaseName, collectionConfig.collectionName)
def makePartitions(): Array[Partition] = {
val intervals = if (collectionConfig.connectorConf.splitIndexed && collectionConfig.indexedKeys.length > 0) {
logInfo(s"Partitioning collection '${collectionConfig.collectionName}' in database '${collectionConfig.databaseName}' with synthetic partitions")
ig.generateSyntheticIntervals(collectionConfig.connectorConf.splitSize, collectionConfig.indexedKeys)
} else if (collectionConfig.connectorConf.useShardChunks) {
logInfo(s"Partitioning collection '${collectionConfig.collectionName}' in database '${collectionConfig.databaseName}' with shard chunks")
ig.generate(collectionConfig.connectorConf.directToShards)
} else {
logInfo(s"NOT Partitioning collection '${collectionConfig.collectionName}' in database '${collectionConfig.databaseName}' -- producing single partition")
val interval = new MongoInterval(null, null, collectionConfig.connectorConf.getDestination())
Seq(interval)
}
val partitions = intervals.zipWithIndex map {
case (interval, index) => {
val p: Partition = new MongoRDDPartition(index, 0, interval)
p
}
}
partitions.to[Array]
}
def close(): Unit = ig.close()
}
Example 13: TitanRDD
// Package declaration and imported dependencies
package com.goyeau.spark.titan.connector
import com.goyeau.spark.gremlin.connector.Workaround._
import com.thinkaurelius.titan.core.{TitanFactory, TitanGraph}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversal
import scala.collection.JavaConversions._
import scala.reflect.ClassTag
class TitanRDD[E: ClassTag](sc: SparkContext,
traversal: GraphTraversal[_, E],
numSlices: Int = 0) extends RDD[E](sc, Nil) {
private val defaultParallelism = sc.defaultParallelism
private val readConf = GraphConf(traversal.asAdmin.getGraph.get.asInstanceOf[TitanGraph].configuration)
override def compute(split: Partition, context: TaskContext): Iterator[E] = {
val partition = split.asInstanceOf[TitanPartition[E]]
val partitionTraversal = partition.traversal.asAdmin
val graph = TitanFactory.open(readConf.toTitanConf)
partitionTraversal.setGraph(graph)
partitionTraversal.toList().toIterator
}
override protected def getPartitions: Array[Partition] = {
val numElement = cloneTraversal(traversal).count().toList().head
val numPartitions =
if (numSlices > 0 && numElement >= numSlices) numSlices
else if (numElement >= defaultParallelism) defaultParallelism
else numElement.toInt
val partitionSize = numElement / numPartitions
(0 until numPartitions).toArray map { i =>
val from = partitionSize * i
val to = partitionSize * (i + 1)
val partitionTraversal = cloneTraversal(traversal).range(from, to)
TitanPartition(i, partitionTraversal)
}
}
}
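A hypothetical usage sketch for the Titan RDD (it assumes the caller supplies a SparkContext and that a Titan configuration file exists at the given path; the "name" property key is a placeholder):
import com.thinkaurelius.titan.core.TitanFactory
import org.apache.spark.SparkContext

object TitanRDDDemo {
  def run(sc: SparkContext): Unit = {
    val graph = TitanFactory.open("conf/titan-cassandra.properties")
    // Traverse every vertex's "name" property and distribute the traversal over Spark partitions.
    val traversal = graph.traversal().V().values[String]("name")
    val names = new TitanRDD(sc, traversal)
    println(names.count())
  }
}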