This page collects typical usage examples of the Scala class org.apache.spark.sql.Encoder. If you are wondering what the Encoder class is for, how to use it, or what real code that uses it looks like, the curated class examples below should help.
A total of 15 code examples of the Encoder class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Scala code examples.
Example 1: GladLogLikelihoodAggregator
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
import scala.math.log
private[crowd] class GladLogLikelihoodAggregator(params: Broadcast[GladParams])
extends Aggregator[GladPartial, GladLogLikelihoodAggregatorBuffer, Double]{
def zero: GladLogLikelihoodAggregatorBuffer = GladLogLikelihoodAggregatorBuffer(0,-1)
def reduce(b: GladLogLikelihoodAggregatorBuffer, a: GladPartial) : GladLogLikelihoodAggregatorBuffer = {
val alphaVal = params.value.alpha(a.annotator.toInt)
val betaVal = a.beta
val sig = Functions.sigmoid(alphaVal*betaVal)
val p0 = 1-a.est
val p1 = a.est
val k0 = if (a.value == 0) sig else 1-sig
val k1 = if (a.value == 1) sig else 1-sig
GladLogLikelihoodAggregatorBuffer(b.agg + Functions.prodlog(p0,k0)
+ Functions.prodlog(p1,k1), p1)
}
def merge(b1: GladLogLikelihoodAggregatorBuffer, b2: GladLogLikelihoodAggregatorBuffer) : GladLogLikelihoodAggregatorBuffer = {
GladLogLikelihoodAggregatorBuffer(b1.agg + b2.agg, if (b1.classProb == -1) b2.classProb else b1.classProb)
}
def finish(reduction: GladLogLikelihoodAggregatorBuffer) = {
val w0 = params.value.w(0)
val w1 = params.value.w(1)
reduction.agg + Functions.prodlog(1 - reduction.classProb, w0) +
Functions.prodlog(reduction.classProb, w1)
}
def bufferEncoder: Encoder[GladLogLikelihoodAggregatorBuffer] = Encoders.product[GladLogLikelihoodAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladLogLikelihoodAggregatorBuffer(agg: Double, classProb: Double)
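Because GladLogLikelihoodAggregator extends Aggregator, it is applied to a grouped Dataset as a TypedColumn, and the two Encoder methods above tell Spark how to serialize the buffer and the final Double. Below is a minimal usage sketch, not code from the library: the GladPartial field name example, the helper object, and the way the per-group values are summed are assumptions, and the code is placed in the same package because the aggregator is private[crowd].
package com.enriquegrodrigo.spark.crowd.aggregators

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Dataset, SparkSession}
import com.enriquegrodrigo.spark.crowd.types.{GladParams, GladPartial}

object GladLogLikelihoodUsageSketch {
  // Hypothetical helper: total GLAD log-likelihood over all examples.
  // Assumes GladPartial carries an `example` id to group by.
  def totalLogLikelihood(spark: SparkSession,
                         partials: Dataset[GladPartial],
                         params: Broadcast[GladParams]): Double = {
    import spark.implicits._
    partials
      .groupByKey(_.example)
      .agg(new GladLogLikelihoodAggregator(params).toColumn) // Dataset[(exampleId, Double)]
      .map(_._2)                                             // keep only the per-group value
      .reduce(_ + _)
  }
}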
Example 2: dimensionTableName
// Set up the package name and import the required classes
package org.alghimo.spark.dimensionalModelling
import org.apache.spark.sql.{Column, Dataset, Encoder, Encoders, SparkSession}
// NOTE: the enclosing trait declaration was missing from this snippet; the trait name and the
// abstract members (spark, isCurrentColumnName, dimEncoder) below are a plausible reconstruction,
// not necessarily the library's exact definitions.
trait DimensionTableProvider[DIM <: Product with Serializable] {
implicit protected def dimEncoder: Encoder[DIM]
protected def spark: SparkSession
protected def isCurrentColumnName: String
def dimensionTableName: String
def tmpDimensionTableName: String = {
val Array(dbName, tableName) = dimensionTableName.split('.')
s"${dbName}.tmp_${tableName}"
}
def maxPartitionsInDimensionTable: Int = 400
def dimensionTable(refresh: Boolean = false): Dataset[DIM] = {
if (refresh) {
spark.catalog.refreshTable(dimensionTableName)
}
spark.table(dimensionTableName).as[DIM]
}
def currentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"${isCurrentColumnName}")
def notCurrentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"NOT ${isCurrentColumnName}")
def save(ds: Dataset[DIM], useTempTable: Boolean = true): Dataset[DIM] = {
println("Saving dimensions..")
// When overwriting the target table, the data is first materialized in a temporary table,
// since Spark cannot overwrite a table that the plan is still reading from.
val toSave = if (useTempTable) {
ds
.coalesce(maxPartitionsInDimensionTable)
.write
.mode("overwrite")
.saveAsTable(tmpDimensionTableName)
spark.table(tmpDimensionTableName)
} else {
ds
}
toSave
.coalesce(maxPartitionsInDimensionTable)
.write
.mode("overwrite")
.saveAsTable(dimensionTableName)
if (useTempTable) {
spark.sql(s"DROP TABLE ${tmpDimensionTableName} PURGE")
}
dimensionTable(refresh = true)
}
}
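To make the trait above concrete, a subclass only needs to supply the table name, the current-flag column, a SparkSession, and an Encoder for the dimension type. The sketch below is hypothetical: the case class, table name, and column name are invented, and DimensionTableProvider is the trait name reconstructed above rather than the library's confirmed name.
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

case class ProductDim(product_sk: Long, product_id: String, name: String, is_current: Boolean)

class ProductDimensionTable(override protected val spark: SparkSession)
  extends DimensionTableProvider[ProductDim] {
  override def dimensionTableName: String = "dwh.dim_product"
  override protected def isCurrentColumnName: String = "is_current"
  // The product encoder lets spark.table(...).as[ProductDim] map rows to the case class.
  override implicit protected def dimEncoder: Encoder[ProductDim] = Encoders.product[ProductDim]
}

// Usage sketch:
// val dimTable = new ProductDimensionTable(spark)
// val current = dimTable.currentDimensions()      // Dataset[ProductDim] with is_current = true
// dimTable.save(current)                          // overwrite through the temporary table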
Example 3: enrichedWithDimensionEncoder
// Set up the package name and import the required classes
package org.alghimo.spark.dimensionalModelling
import org.apache.spark.sql.{Dataset, Encoder}
trait EnrichedDimensionWithDimensionProvider[ENRICHED_DIM <: (Product with Serializable), DIM <: (Product with Serializable)] {
type EnrichedWithDimension = (ENRICHED_DIM, DIM)
type DimensionWithEnriched = (DIM, ENRICHED_DIM)
type JoinEnrichedWithDimension = Dataset[EnrichedWithDimension]
type JoinDimensionWithEnriched = Dataset[DimensionWithEnriched]
implicit def enrichedWithDimensionEncoder: Encoder[(ENRICHED_DIM, DIM)]
implicit def dimensionWithEnrichedEncoder: Encoder[(DIM, ENRICHED_DIM)]
implicit def enrichedWithDimensionToDimensionWithEnriched(e: EnrichedWithDimension): DimensionWithEnriched = e.swap
implicit def dimensionWithEnrichedToEnrichedWithDimension(d: DimensionWithEnriched): EnrichedWithDimension = d.swap
implicit def joinEnrichedWithDimensionToJoinDimensionWithEnriched(e: JoinEnrichedWithDimension): JoinDimensionWithEnriched = e.map(_.swap)
implicit def joinDimensionWithEnrichedToJoinEnrichedWithDimension(d: JoinDimensionWithEnriched): JoinEnrichedWithDimension = d.map(_.swap)
}
Author: alghimo, Project: spark-dimensional-modelling, Lines of code: 20, Source: EnrichedDimensionWithDimensionProvider.scala
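The two abstract tuple encoders are typically derived from the product encoders of the element types via Encoders.tuple. A hypothetical concrete provider is sketched below, assuming the trait is complete as shown; the case classes and the provider name are invented for illustration.
import org.apache.spark.sql.{Encoder, Encoders}

case class CustomerEnriched(customer_id: Long, name: String, segment: String)
case class CustomerDim(customer_sk: Long, customer_id: Long, name: String, segment: String, is_current: Boolean)

class CustomerDimensionProvider
  extends EnrichedDimensionWithDimensionProvider[CustomerEnriched, CustomerDim] {
  // Encoders.tuple combines two product encoders into an encoder for the pair.
  override implicit def enrichedWithDimensionEncoder: Encoder[(CustomerEnriched, CustomerDim)] =
    Encoders.tuple(Encoders.product[CustomerEnriched], Encoders.product[CustomerDim])
  override implicit def dimensionWithEnrichedEncoder: Encoder[(CustomerDim, CustomerEnriched)] =
    Encoders.tuple(Encoders.product[CustomerDim], Encoders.product[CustomerEnriched])
}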
Example 4: toKeyValueGroupedDataSet
// Set up the package name and import the required classes
package com.datawizards.sparklocal.dataset
import org.apache.spark.sql.{Encoder, KeyValueGroupedDataset, SparkSession}
import scala.reflect.ClassTag
trait KeyValueGroupedDataSetAPI[K, V] {
protected lazy val spark: SparkSession = SparkSession.builder().getOrCreate()
private[sparklocal] def toKeyValueGroupedDataSet(implicit encK: Encoder[K], encT: Encoder[V], encKT: Encoder[(K, V)]): KeyValueGroupedDataset[K, V]
def count(): DataSetAPI[(K, Long)]
def mapGroups[U: ClassTag](f: (K, Iterator[V]) => U)
(implicit enc: Encoder[U]=null): DataSetAPI[U]
def reduceGroups(f: (V, V) => V): DataSetAPI[(K, V)]
def mapValues[W: ClassTag](func: V => W)
(implicit enc: Encoder[W]=null): KeyValueGroupedDataSetAPI[K, W]
def flatMapGroups[U: ClassTag](f: (K, Iterator[V]) => TraversableOnce[U])
(implicit enc: Encoder[U]=null): DataSetAPI[U]
def keys: DataSetAPI[K]
def cogroup[U: ClassTag, R: ClassTag](other: KeyValueGroupedDataSetAPI[K, U])
(f: (K, Iterator[V], Iterator[U]) => TraversableOnce[R])
(implicit
encK: Encoder[K]=null,
encV: Encoder[V]=null,
encU: Encoder[U]=null,
encR: Encoder[R]=null,
encKV: Encoder[(K,V)]=null,
encKU: Encoder[(K,U)]=null
): DataSetAPI[R]
}
Example 5: DataSetAPIScalaLazyImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.`lazy`.dataset
import com.datawizards.sparklocal.dataset.{DataSetAPI, KeyValueGroupedDataSetAPI}
import com.datawizards.sparklocal.impl.scala.dataset.DataSetAPIScalaBase
import com.datawizards.sparklocal.rdd.RDDAPI
import org.apache.spark.sql.Encoder
import scala.collection.{GenIterable, SeqView}
import scala.reflect.ClassTag
object DataSetAPIScalaLazyImpl {
private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaLazyImpl(it.toSeq.seq.view)
}
class DataSetAPIScalaLazyImpl[T: ClassTag](private[sparklocal] val data: SeqView[T, Seq[T]]) extends DataSetAPIScalaBase[T] {
override type InternalCollection = SeqView[T, Seq[T]]
override private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
DataSetAPIScalaLazyImpl.create(it)
override protected def union(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.union(dsScala.data.toSeq))
override protected def intersect(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.intersect(dsScala.data.toSeq))
override protected def diff(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.diff(dsScala.data.toSeq))
override def distinct(): DataSetAPI[T] =
create(data.distinct)
override def groupByKey[K: ClassTag](func: (T) => K)(implicit enc: Encoder[K]): KeyValueGroupedDataSetAPI[K, T] =
new KeyValueGroupedDataSetAPIScalaLazyImpl(data.groupBy(func))
override def rdd(): RDDAPI[T] = RDDAPI(data)
}
Example 6: implicits
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.session
import com.datawizards.sparklocal.accumulator.{AccumulatorV2API, CollectionAccumulatorAPI, DoubleAccumulatorAPI, LongAccumulatorAPI}
import com.datawizards.sparklocal.broadcast.BroadcastAPI
import com.datawizards.sparklocal.impl.scala.accumulator.{CollectionAccumulatorAPIScalaImpl, DoubleAccumulatorAPIScalaImpl, LongAccumulatorAPIScalaImpl}
import com.datawizards.sparklocal.impl.scala.broadcast.BroadcastAPIScalaImpl
import com.datawizards.sparklocal.session.SparkSessionAPI
import org.apache.spark.sql.Encoder
import scala.reflect.ClassTag
trait SparkSessionAPIScalaBase extends SparkSessionAPI {
object implicits {
// The local (non-Spark) implementations never serialize data, so a stub Encoder is enough
// to satisfy the Encoder-bound signatures of the DataSet API.
implicit def enc[T]: Encoder[T] = null
}
override def broadcast[T: ClassTag](value: T): BroadcastAPI[T] =
new BroadcastAPIScalaImpl[T](value)
override def longAccumulator: LongAccumulatorAPI =
new LongAccumulatorAPIScalaImpl()
override def longAccumulator(name: String): LongAccumulatorAPI =
new LongAccumulatorAPIScalaImpl(Some(name))
override def doubleAccumulator: DoubleAccumulatorAPI =
new DoubleAccumulatorAPIScalaImpl()
override def doubleAccumulator(name: String): DoubleAccumulatorAPI =
new DoubleAccumulatorAPIScalaImpl(Some(name))
override def collectionAccumulator[T]: CollectionAccumulatorAPI[T] =
new CollectionAccumulatorAPIScalaImpl[T]()
override def collectionAccumulator[T](name: String): CollectionAccumulatorAPI[T] =
new CollectionAccumulatorAPIScalaImpl[T](Some(name))
override def register(acc: AccumulatorV2API[_, _], name: String): Unit =
{ }
}
Example 7: KeyValueGroupedDataSetAPIScalaParallelLazyImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.parallellazy.dataset
import com.datawizards.sparklocal.dataset.KeyValueGroupedDataSetAPI
import com.datawizards.sparklocal.impl.scala.dataset.{DataSetAPIScalaBase, KeyValueGroupedDataSetAPIScalaBase}
import org.apache.spark.sql.Encoder
import scala.collection.{GenIterable, GenSeq}
import scala.reflect.ClassTag
class KeyValueGroupedDataSetAPIScalaParallelLazyImpl[K: ClassTag, T: ClassTag](private[sparklocal] val data: Map[K, GenSeq[T]]) extends KeyValueGroupedDataSetAPIScalaBase[K, T] {
override type InternalCollection = Map[K, GenSeq[T]]
override private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]=null): DataSetAPIScalaBase[U] =
DataSetAPIScalaParallelLazyImpl.create(it)
override def mapValues[W: ClassTag](func: (T) => W)
(implicit enc: Encoder[W]=null): KeyValueGroupedDataSetAPI[K, W] = {
val mapped = data.mapValues(_.map(func))
new KeyValueGroupedDataSetAPIScalaParallelLazyImpl(mapped)
}
}
Author: piotr-kalanski, Project: spark-local, Lines of code: 23, Source: KeyValueGroupedDataSetAPIScalaParallelLazyImpl.scala
Example 8: DataSetAPIScalaParallelLazyImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.parallellazy.dataset
import com.datawizards.sparklocal.dataset.{DataSetAPI, KeyValueGroupedDataSetAPI}
import com.datawizards.sparklocal.impl.scala.dataset.DataSetAPIScalaBase
import com.datawizards.sparklocal.impl.scala.parallellazy.ParallelLazySeq
import com.datawizards.sparklocal.rdd.RDDAPI
import org.apache.spark.sql.Encoder
import scala.collection.GenIterable
import scala.reflect.ClassTag
object DataSetAPIScalaParallelLazyImpl {
private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaParallelLazyImpl(new ParallelLazySeq(it.toSeq.par))
}
class DataSetAPIScalaParallelLazyImpl[T: ClassTag](private[sparklocal] val data: ParallelLazySeq[T]) extends DataSetAPIScalaBase[T] {
override type InternalCollection = ParallelLazySeq[T]
override private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
DataSetAPIScalaParallelLazyImpl.create(it)
private def create[U: ClassTag](data: ParallelLazySeq[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaParallelLazyImpl(data)
override protected def union(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.union(dsScala.data.toSeq))
override protected def intersect(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.intersect(dsScala.data.toSeq))
override protected def diff(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.diff(dsScala.data.toSeq))
override def distinct(): DataSetAPI[T] =
create(data.distinct)
override def groupByKey[K: ClassTag](func: (T) => K)(implicit enc: Encoder[K]): KeyValueGroupedDataSetAPI[K, T] =
new KeyValueGroupedDataSetAPIScalaParallelLazyImpl(data.groupBy(func))
override def rdd(): RDDAPI[T] = RDDAPI(data)
}
Example 9: DataSetAPIScalaEagerImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.eager.dataset
import com.datawizards.sparklocal.dataset.{DataSetAPI, KeyValueGroupedDataSetAPI}
import com.datawizards.sparklocal.impl.scala.dataset.DataSetAPIScalaBase
import com.datawizards.sparklocal.rdd.RDDAPI
import org.apache.spark.sql.Encoder
import scala.collection.GenIterable
import scala.reflect.ClassTag
object DataSetAPIScalaEagerImpl {
private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaEagerImpl(it.toSeq.seq)
}
class DataSetAPIScalaEagerImpl[T: ClassTag](private[sparklocal] val data: Seq[T]) extends DataSetAPIScalaBase[T] {
override type InternalCollection = Seq[T]
override private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
DataSetAPIScalaEagerImpl.create(it)
override protected def union(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.union(dsScala.data.toSeq))
override protected def intersect(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.intersect(dsScala.data.toSeq))
override protected def diff(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.diff(dsScala.data.toSeq))
override def distinct(): DataSetAPI[T] =
create(data.distinct)
override def groupByKey[K: ClassTag](func: (T) => K)(implicit enc: Encoder[K]): KeyValueGroupedDataSetAPI[K, T] =
new KeyValueGroupedDataSetAPIScalaEagerImpl(data.groupBy(func))
override def rdd(): RDDAPI[T] = RDDAPI(data)
}
Example 10: DataSetAPIScalaParallelImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.scala.parallel.dataset
import com.datawizards.sparklocal.dataset.{DataSetAPI, KeyValueGroupedDataSetAPI}
import com.datawizards.sparklocal.impl.scala.`lazy`.dataset.DataSetAPIScalaLazyImpl
import com.datawizards.sparklocal.impl.scala.dataset.DataSetAPIScalaBase
import com.datawizards.sparklocal.rdd.RDDAPI
import org.apache.spark.sql.Encoder
import scala.collection.GenIterable
import scala.collection.parallel.ParSeq
import scala.reflect.ClassTag
object DataSetAPIScalaParallelImpl {
private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaParallelImpl(it.toSeq.par)
}
class DataSetAPIScalaParallelImpl[T: ClassTag](private[sparklocal] val data: ParSeq[T]) extends DataSetAPIScalaBase[T] {
override type InternalCollection = ParSeq[T]
override private[sparklocal] def create[U: ClassTag](it: GenIterable[U])(implicit enc: Encoder[U]): DataSetAPIScalaBase[U] =
DataSetAPIScalaParallelImpl.create(it)
private def create[U: ClassTag](data: ParSeq[U]): DataSetAPIScalaBase[U] =
new DataSetAPIScalaParallelImpl(data)
override protected def union(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.union(dsScala.data.toSeq))
override protected def intersect(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.intersect(dsScala.data.toSeq))
override protected def diff(data: InternalCollection, dsScala: DataSetAPIScalaBase[T]): DataSetAPIScalaBase[T] =
create(data.diff(dsScala.data.toSeq))
override def distinct(): DataSetAPI[T] =
create(data.distinct)
override def groupByKey[K: ClassTag](func: (T) => K)(implicit enc: Encoder[K]): KeyValueGroupedDataSetAPI[K, T] =
new KeyValueGroupedDataSetAPIScalaParallelImpl(data.groupBy(func))
override def rdd(): RDDAPI[T] = RDDAPI(data)
}
Example 11: KeyValueGroupedDataSetAPISparkImpl
// Set up the package name and import the required classes
package com.datawizards.sparklocal.impl.spark.dataset
import com.datawizards.sparklocal.dataset.{DataSetAPI, KeyValueGroupedDataSetAPI}
import org.apache.spark.sql.{Encoder, KeyValueGroupedDataset}
import scala.reflect.ClassTag
class KeyValueGroupedDataSetAPISparkImpl[K: ClassTag, T: ClassTag](private[dataset] val data: KeyValueGroupedDataset[K, T]) extends KeyValueGroupedDataSetAPI[K, T] {
private def create[U: ClassTag](data: KeyValueGroupedDataset[K,U]) = new KeyValueGroupedDataSetAPISparkImpl(data)
override private[sparklocal] def toKeyValueGroupedDataSet(implicit encK: Encoder[K], encT: Encoder[T], encKT: Encoder[(K, T)]) = data
override def count(): DataSetAPI[(K, Long)] =
DataSetAPI(data.count())
override def mapValues[W: ClassTag](func: (T) => W)
(implicit enc: Encoder[W]=null): KeyValueGroupedDataSetAPI[K, W] =
create(data.mapValues(func))
override def mapGroups[U: ClassTag](f: (K, Iterator[T]) => U)
(implicit enc: Encoder[U]=null): DataSetAPI[U] =
DataSetAPI(data.mapGroups(f))
override def reduceGroups(f: (T, T) => T): DataSetAPI[(K, T)] =
DataSetAPI(data.reduceGroups(f))
override def flatMapGroups[U: ClassTag](f: (K, Iterator[T]) => TraversableOnce[U])
(implicit enc: Encoder[U]=null): DataSetAPI[U] =
DataSetAPI(data.flatMapGroups(f))
override def keys: DataSetAPI[K] =
DataSetAPI(data.keys)
override def cogroup[U: ClassTag, R: ClassTag](other: KeyValueGroupedDataSetAPI[K, U])
(f: (K, Iterator[T], Iterator[U]) => TraversableOnce[R])
(implicit
encK: Encoder[K]=null,
encT: Encoder[T]=null,
encU: Encoder[U]=null,
encR: Encoder[R]=null,
encKT: Encoder[(K,T)]=null,
encKU: Encoder[(K,U)]=null
): DataSetAPI[R] = {
DataSetAPI(data.cogroup(other.toKeyValueGroupedDataSet)(f))
}
}
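The Spark-backed implementation simply delegates to an underlying KeyValueGroupedDataset, with the caller's implicit Encoders resolving the types of each result. A hypothetical usage sketch follows; the object name and sample data are invented, and it assumes the DataSetAPI factory behaves as the snippet above suggests.
import org.apache.spark.sql.SparkSession
import com.datawizards.sparklocal.impl.spark.dataset.KeyValueGroupedDataSetAPISparkImpl

object KeyValueGroupedSparkImplSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Group a small Dataset by key and wrap it in the API from the example above.
    val grouped = spark
      .createDataset(Seq(("a", 1), ("a", 2), ("b", 3)))
      .groupByKey(_._1)
    val api = new KeyValueGroupedDataSetAPISparkImpl(grouped)

    // Each call returns a DataSetAPI backed by a Spark Dataset; the implicit Encoders
    // imported from spark.implicits fill the enc parameters of the API methods.
    val keys = api.keys
    val counts = api.count()
    val sums = api.mapGroups((k, values) => (k, values.map(_._2).sum))

    spark.stop()
  }
}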
Example 12: GladAlphaAggregator
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class GladAlphaAggregator(params: Broadcast[GladParams], learningRate: Double)
extends Aggregator[GladPartial, GladAlphaAggregatorBuffer, Double]{
def zero: GladAlphaAggregatorBuffer = GladAlphaAggregatorBuffer(0,-1)
def reduce(b: GladAlphaAggregatorBuffer, a: GladPartial) : GladAlphaAggregatorBuffer = {
val alpha = params.value.alpha
val al = alpha(a.annotator)
val bet = a.beta
val aest = a.est
val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
val p = if (a.value == 1) a.est else (1-a.est)
val term = (p - sigmoidValue)*bet
GladAlphaAggregatorBuffer(b.agg + term, al)
}
def merge(b1: GladAlphaAggregatorBuffer, b2: GladAlphaAggregatorBuffer) : GladAlphaAggregatorBuffer = {
GladAlphaAggregatorBuffer(b1.agg + b2.agg, if (b1.alpha == -1) b2.alpha else b1.alpha )
}
def finish(reduction: GladAlphaAggregatorBuffer) = {
reduction.alpha + learningRate * reduction.agg
}
def bufferEncoder: Encoder[GladAlphaAggregatorBuffer] = Encoders.product[GladAlphaAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladAlphaAggregatorBuffer(agg: Double, alpha: Double)
Example 13: MulticlassMVAggregator
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.MulticlassAnnotation
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
private[crowd] class MulticlassMVAggregator(nClasses: Int) extends Aggregator[MulticlassAnnotation, MulticlassMVPartial, Int]{
def sumKey(map: Map[Int,Long], pair: (Int,Long)) = {
val key = pair._1
val value = pair._2
val new_value = map.get(key) match {
case Some(x) => x + value
case None => value
}
map.updated(key, new_value)
}
def zero: MulticlassMVPartial = MulticlassMVPartial(Vector.fill(nClasses)(0),0)
def reduce(b: MulticlassMVPartial, a: MulticlassAnnotation) : MulticlassMVPartial = {
MulticlassMVPartial(b.aggVect.updated(a.value, b.aggVect(a.value) + 1), b.count + 1)
}
def merge(b1: MulticlassMVPartial, b2: MulticlassMVPartial) : MulticlassMVPartial = {
MulticlassMVPartial(b1.aggVect.zip(b2.aggVect).map(x => x._1 + x._2), b1.count + b2.count)
}
def finish(reduction: MulticlassMVPartial) = {
reduction.aggVect.indexOf(reduction.aggVect.max)
}
def bufferEncoder: Encoder[MulticlassMVPartial] = Encoders.product[MulticlassMVPartial]
def outputEncoder: Encoder[Int] = Encoders.scalaInt
}
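A majority-voting aggregator like this is normally applied per example over all of that example's annotations. The sketch below is hypothetical: it assumes MulticlassAnnotation has a Long example field to group by, and it sits in the same package because the aggregator is private[crowd].
package com.enriquegrodrigo.spark.crowd.aggregators

import org.apache.spark.sql.{Dataset, SparkSession}
import com.enriquegrodrigo.spark.crowd.types.MulticlassAnnotation

object MulticlassMVUsageSketch {
  // Returns one (exampleId, predictedClass) pair per example.
  def majorityVote(spark: SparkSession,
                   annotations: Dataset[MulticlassAnnotation],
                   nClasses: Int): Dataset[(Long, Int)] = {
    import spark.implicits._
    annotations
      .groupByKey(_.example)
      .agg(new MulticlassMVAggregator(nClasses).toColumn)
  }
}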
Example 14: GladBetaAggregator
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class GladBetaAggregator(params: Broadcast[GladParams], learningRate: Double)
extends Aggregator[GladPartial, GladBetaAggregatorBuffer, Double]{
def zero: GladBetaAggregatorBuffer = GladBetaAggregatorBuffer(0,-1)
def reduce(b: GladBetaAggregatorBuffer, a: GladPartial) : GladBetaAggregatorBuffer = {
val alpha = params.value.alpha
val al = alpha(a.annotator)
val bet = a.beta
val aest = a.est
val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
val p = if (a.value == 1) a.est else (1-a.est)
val term = (p - sigmoidValue)*alpha(a.annotator)
GladBetaAggregatorBuffer(b.agg + term, a.beta)
}
def merge(b1: GladBetaAggregatorBuffer, b2: GladBetaAggregatorBuffer) : GladBetaAggregatorBuffer = {
GladBetaAggregatorBuffer(b1.agg + b2.agg, if (b1.beta == -1) b2.beta else b1.beta)
}
def finish(reduction: GladBetaAggregatorBuffer) = {
reduction.beta + learningRate * reduction.agg
}
def bufferEncoder: Encoder[GladBetaAggregatorBuffer] = Encoders.product[GladBetaAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladBetaAggregatorBuffer(agg: Double, beta: Double)
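GladAlphaAggregator and GladBetaAggregator each implement a single gradient-ascent step: finish returns parameter + learningRate * accumulated_gradient, with beta aggregated per example and alpha per annotator. The sketch below shows one such update pass; it is hypothetical, the GladPartial field names example and annotator are assumptions, and it is placed in the aggregators' package because the classes are private[crowd].
package com.enriquegrodrigo.spark.crowd.aggregators

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Dataset, SparkSession}
import com.enriquegrodrigo.spark.crowd.types.{GladParams, GladPartial}

object GladGradientStepSketch {
  def updateStep(spark: SparkSession,
                 partials: Dataset[GladPartial],
                 params: Broadcast[GladParams],
                 learningRate: Double): Unit = {
    import spark.implicits._
    // New beta per example (one item-difficulty parameter per example).
    val newBetas = partials
      .groupByKey(_.example)
      .agg(new GladBetaAggregator(params, learningRate).toColumn)
    // New alpha per annotator (one reliability parameter per annotator).
    val newAlphas = partials
      .groupByKey(_.annotator)
      .agg(new GladAlphaAggregator(params, learningRate).toColumn)
    // In the full algorithm these would be collected and broadcast as the next GladParams.
    newBetas.show(); newAlphas.show()
  }
}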
Example 15: BinarySoftMVAggregator
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import org.apache.spark.sql.{Encoder, Encoders}
import com.enriquegrodrigo.spark.crowd.types.BinaryAnnotation
import org.apache.spark.sql.expressions.Aggregator
private[crowd] class BinarySoftMVAggregator extends Aggregator[BinaryAnnotation, BinaryMVPartial, Double]{
def zero: BinaryMVPartial = BinaryMVPartial(0,0)
def reduce(b: BinaryMVPartial, a: BinaryAnnotation) : BinaryMVPartial =
BinaryMVPartial(b.aggValue+a.value, b.count + 1)
def merge(b1: BinaryMVPartial, b2: BinaryMVPartial) : BinaryMVPartial =
BinaryMVPartial(b1.aggValue + b2.aggValue, b1.count + b2.count)
def finish(reduction: BinaryMVPartial) = {
val numerator: Double = reduction.aggValue
val denominator: Double = reduction.count
if (denominator == 0)
throw new IllegalArgumentException()
else {
(numerator / denominator)
}
}
def bufferEncoder: Encoder[BinaryMVPartial] = Encoders.product[BinaryMVPartial]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
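This soft majority-vote aggregator returns, per example, the fraction of annotators that labeled it 1. A final hypothetical sketch, assuming BinaryAnnotation has a Long example field and an integer value field, and again placed in the same package because the class is private[crowd]:
package com.enriquegrodrigo.spark.crowd.aggregators

import org.apache.spark.sql.{Dataset, SparkSession}
import com.enriquegrodrigo.spark.crowd.types.BinaryAnnotation

object BinarySoftMVUsageSketch {
  // Returns (exampleId, probabilityOfPositiveClass) per example.
  def softMajorityVote(spark: SparkSession,
                       annotations: Dataset[BinaryAnnotation]): Dataset[(Long, Double)] = {
    import spark.implicits._
    annotations
      .groupByKey(_.example)
      .agg(new BinarySoftMVAggregator().toColumn)
  }
}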