

Scala Dataset Class Code Examples

This article collects typical usage examples of the org.apache.spark.sql.Dataset class in Scala. If you are wondering what the Dataset class does, how to use it, or what real code that uses it looks like, the curated examples below should help.


The following presents 15 code examples of the Dataset class, ordered by popularity by default.
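Before the examples, here is a minimal, self-contained sketch of basic Dataset usage for orientation. Everything in it (the DatasetQuickstart object, the Person case class, the application name) is a placeholder and is not taken from any example below.

//Minimal sketch: create a typed Dataset from a local collection
import org.apache.spark.sql.{Dataset, SparkSession}

// Placeholder case class used only for this sketch
case class Person(name: String, age: Int)

object DatasetQuickstart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("dataset-quickstart").master("local[*]").getOrCreate()
    import spark.implicits._

    // A Dataset is a typed, distributed collection executed by the Spark SQL engine
    val people: Dataset[Person] = Seq(Person("Ann", 31), Person("Bob", 27)).toDS()
    people.filter(_.age > 30).show(false)

    spark.stop()
  }
}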

Example 1: apply

//Package declaration and imported dependencies
package org.dama.datasynth.runtime.spark.operators

import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime

import scala.util.Random


// Enclosing object (name inferred from the source file, EdgeTableOperator.scala)
object EdgeTableOperator {

  def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
    val sparkSession = SparkRuntime.getSparkSession()
    import sparkSession.implicits._
    val generator = SparkRuntime.instantiateStructureGeneratorOperator( node.structure )
    val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]
    val random : Random = new Random()
    val id : Int = random.nextInt()
    val path : String = s"/tmp/${id}"
    val sparkContext = sparkSession.sparkContext
    generator.run(size, sparkContext.hadoopConfiguration,"hdfs://"+path)
    val edgesRDD = sparkContext.textFile(path)
                               .map( s => s.split("\t"))
                               .map( l => (l(0).toLong, l(1).toLong))
                               .zipWithIndex().map( { case ((tail,head), id) =>  (id, tail, head)})
    sparkSession.createDataset(edgesRDD)
  }

} 
Developer: DAMA-UPC, Project: DataSynth, Lines: 28, Source: EdgeTableOperator.scala

Example 2: PrecipSource

//Package declaration and imported dependencies
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}


case class PrecipSource(sourceId: Int,
                        name: String,
                        countryCode: String,
                        latitude: String,
                        longitude: String,
                        elevation: Int,
                        elementId: String,
                        beginDate: String,
                        endDate: String,
                        participantId: Int,
                        participantName: String
                       )

case class Precipication(stationId: Int,
                         sourceId: Int,
                         date: String,
                         amount: Int,
                         quality: Int
                        )

class Mappers() {

  
  def precipicationDF(spark: SparkSession, sourceFilPath: String): Dataset[Precipication] = {
    import spark.implicits._

    var sourceFile: RDD[String] = spark.sparkContext.textFile(sourceFilPath)

    val header = spark.sparkContext.parallelize(sourceFile.take(20))
    sourceFile = sourceFile.subtract(header)
    header.unpersist()

    var precipitionDF: Dataset[Precipication] = sourceFile
      .map(s => s.split(",")
        .map(_.trim()))
      .map(fields => Precipication(
        stationId = fields(0).toInt,
        sourceId = fields(1).toInt,
        date = fields(2),
        amount = fields(3).toInt,
        quality = fields(4).toInt
      ))
      .toDS()

    precipitionDF.show(false)
    precipitionDF
  }

} 
Developer: luxinator, Project: RainyDay, Lines: 54, Source: Mappers.scala

Example 3: DataFrameFunctions

//Package declaration and imported dependencies
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}


class DataFrameFunctions(self: DC[Row]) {

    def join(right: DC[Row]): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right)
      }
      val hashTarget = Seq("join")
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], usingColumn: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, usingColumn)
      }
      val hashTarget = Seq("join", usingColumn)
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

    def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, joinExprs, joinType) // pass the join type through to the underlying join
      }
      val hashTarget = Seq("join", joinType, joinExprs.toString())
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }


} 
Developer: bloomberg, Project: spark-flow, Lines: 36, Source: DataFrameFunctions.scala
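For reference, a minimal sketch of the underlying Dataset/DataFrame join overloads that the wrappers above delegate to. The toy data, column names, and object name are placeholders, not part of the spark-flow project.

//Minimal sketch: the join overloads wrapped by DataFrameFunctions
import org.apache.spark.sql.{DataFrame, SparkSession}

object JoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("join-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val left: DataFrame  = Seq((1, "a"), (2, "b")).toDF("id", "tag")
    val right: DataFrame = Seq((1, 10.0), (3, 30.0)).toDF("id", "score")

    // join using a shared column name (inner join, keeps a single key column)
    left.join(right, "id").show(false)

    // join expression plus an explicit join type
    left.join(right, left("id") === right("id"), "left_outer").show(false)

    spark.stop()
  }
}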

Example 4: MiniPanda

//Package declaration and imported dependencies
package com.highperformancespark.examples.ml

import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas}

import com.holdenkarau.spark.testing._

import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
import org.scalatest.Matchers._
import org.scalatest.FunSuite

case class MiniPanda(happy: Double, fuzzy: Double, old: Double)

class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase {
  val miniPandasList = List(
    MiniPanda(1.0, 1.0, 1.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(0.0, 0.0, 1.0),
    MiniPanda(0.0, 0.0, 0.0))

  test("simple sanity test") {
    val session = spark
    import session.implicits._
    val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList)
    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("fuzzy", "old"))
    assembler.setOutputCol("features")
    val snb = new SimpleNaiveBayes()
    snb.setLabelCol("happy")
    snb.setFeaturesCol("features")
    val pipeline = new Pipeline().setStages(Array(assembler, snb))
    val model = pipeline.fit(ds)
    val test = ds.select("fuzzy", "old")
    val predicted = model.transform(test)
    println(predicted.collect())
  }
} 
Developer: gourimahapatra, Project: high-performance-spark, Lines: 41, Source: SimpleNaiveBayes.scala

Example 5: Partition

//Package declaration and imported dependencies
package br.ufmg.cs.lib.privacy.kanonymity

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._


case class Partition(
    member: Dataset[Row],
    memberCount: Long,
    low: Array[Int],
    high: Array[Int],
    allow: Array[Int]) {

  override def toString: String = {
    s"Partition(memberlen=${memberCount}" +
    s", low=${low.mkString("[", ", ", "]")}" +
    s", high=${high.mkString("[", ",", "]")}" +
    s", allow=${allow.mkString("[", ",", "]")})"
  }
}

object Partition {
  def apply(member: Dataset[Row], memberCount: Long,
      low: Array[Int], high: Array[Int]): Partition = {
    Partition(member, memberCount, low, high, Array.fill(low.length)(1))
  }
} 
Developer: eubr-bigsea, Project: k-anonymity-mondrian, Lines: 28, Source: Partition.scala

Example 6: cogroup

//Package declaration and imported dependencies
package com.qunar.spark.tungsten.base

import org.apache.spark.sql.Dataset
import com.qunar.spark.tungsten.base.CommonEncoders._

import scala.reflect.runtime.universe.TypeTag


// Enclosing object (name inferred from the source file, CoreJoinOperators.scala)
object CoreJoinOperators {

  def cogroup[T: TypeTag, K: TypeTag](leftDataset: Dataset[T], rightDataset: Dataset[T], genJoinKey: T => K): Dataset[(Seq[T], Seq[T])] = {
    // Group leftDataset by the join key and collect each group into a Seq
    val thisKeyValueSet = leftDataset.groupByKey(data => genJoinKey(data))
    val thisCogroupSet = thisKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    // Group rightDataset by the join key and collect each group into a Seq
    val anotherKeyValueSet = rightDataset.groupByKey(data => genJoinKey(data))
    val anotherCogroupSet = anotherKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    val resultDataFrame = thisCogroupSet.join(anotherCogroupSet, thisCogroupSet("_1") === anotherCogroupSet("_1"), "outer")
    resultDataFrame.as[(Seq[T], Seq[T])]
  }

} 
Developer: spark-bypass-common, Project: common-tungsten, Lines: 35, Source: CoreJoinOperators.scala
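A hedged usage sketch of the cogroup helper above. The enclosing object name CoreJoinOperators is inferred from the source file name, the Order case class and data are placeholders, and the implicit encoders for the tuple results are assumed to be supplied by the project's CommonEncoders.

//Usage sketch for the cogroup helper (names and data are placeholders)
import org.apache.spark.sql.{Dataset, SparkSession}

case class Order(userId: Long, amount: Double)

object CogroupUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cogroup-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val left: Dataset[Order]  = Seq(Order(1L, 9.5), Order(1L, 3.0), Order(2L, 7.2)).toDS()
    val right: Dataset[Order] = Seq(Order(2L, 1.1), Order(3L, 4.4)).toDS()

    // Buckets both sides by userId and pairs the per-key groups, outer-join style
    val cogrouped: Dataset[(Seq[Order], Seq[Order])] =
      CoreJoinOperators.cogroup(left, right, (o: Order) => o.userId)
    cogrouped.show(false)

    spark.stop()
  }
}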

Example 7: OpenIE

//Package declaration and imported dependencies
package com.knoldus

import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions.udf

import scala.collection.JavaConverters._


private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
  def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
    this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
}

object StartApplication extends App{

  val spark = SparkSession.builder().appName("spark-nlp-starter").master("local[*]").getOrCreate()
  val sc = spark.sparkContext
  val readPdfFile: Dataset[String] = spark.read.textFile("test")
  readPdfFile.show(false)

  def openie = udf { sentence: String =>
    new Sentence(sentence).openie().asScala.map(q => new OpenIE(q)).toSeq
  }

  val res = readPdfFile.select(openie(readPdfFile("value")))
  res.show(false)
} 
Developer: shiv4nsh, Project: scala-spark-nlp-starter-kit, Lines: 30, Source: StartApplication.scala

Example 8: RowProfiler

//Package declaration and imported dependencies
package io.gzet.profilers.raw

import org.apache.spark.sql.Dataset

case class RowProfiler() {

  def profile(df: Dataset[String]): Dataset[RowReport] = {
    import df.sparkSession.implicits._
    val report = RowReport(df.count().toDouble)
    df.sparkSession.createDataset[RowReport](
      Seq(report)
    )
  }
}

case class RowReport(
                      metricValue: Double
                    ) 
Developer: PacktPublishing, Project: Mastering-Spark-for-Data-Science, Lines: 19, Source: RowProfiler.scala

Example 9: EmptinessProfiler

//Package declaration and imported dependencies
package io.gzet.profilers.field

import io.gzet.profilers.Utils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.Dataset

import scalaz.Scalaz._

case class EmptinessProfiler() {

  def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = {

    import df.sparkSession.implicits._

    val features = Utils.buildColumns(df)

    features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) =>
      (column, isNotEmpty)
    }).count().map({ case ((column, isNotEmpty), count) =>
      (column, Map(isNotEmpty -> count))
    }).groupByKey({ case (column, map) =>
      column
    }).reduceGroups({ (v1, v2) =>
      (v1._1, v1._2 |+| v2._2)
    }).map({ case (col, (_, map)) =>
      val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble
      EmptinessReport(
        col,
        emptiness
      )
    })

  }

}

case class EmptinessReport(
                            field: Int,
                            metricValue: Double
                          ) 
Developer: PacktPublishing, Project: Mastering-Spark-for-Data-Science, Lines: 41, Source: EmptinessProfiler.scala
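The reduceGroups step above merges the per-column count maps with scalaz's |+| (semigroup append), which sums the counts of keys present in both maps. A minimal sketch of that merge, assuming scalaz is on the classpath as in the profiler's own imports:

//Minimal sketch of the scalaz map merge used in reduceGroups
import scalaz.Scalaz._

object SemigroupMergeSketch extends App {
  val left  = Map(true -> 2L)
  val right = Map(true -> 1L, false -> 3L)

  // |+| appends the maps, summing the Long counts for keys present in both
  val merged: Map[Boolean, Long] = left |+| right
  println(merged) // true -> 3, false -> 3
}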

Example 10: CountSentencesByLanguage

//Package declaration and imported dependencies
package biz.meetmatch.modules

import biz.meetmatch.model.{Sentence, SentenceCountByLanguage}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.rogach.scallop.Scallop

object CountSentencesByLanguage extends Module with ParquetExtensions[SentenceCountByLanguage] {
  override val parquetFile = "SentenceCountsByLanguage"

  override def execute(scallopts: Scallop)(implicit sparkSession: SparkSession): Unit = {
    val sentenceDS = DetectLanguage.loadResultsFromParquet

    val sentenceCountByLanguageDS = calc(sentenceDS)

    saveResultsToParquet(sentenceCountByLanguageDS)
  }

  
  def calc(sentenceDS: Dataset[Sentence])(implicit sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    sparkSession.sparkContext.setJobGroup(this.getClass.getName, this.getClass.getName)

    sparkSession.sparkContext.setJobDescription("Count the sentences by language")

    // TASK 2: count how many sentences exist for each detected language and save the results in the SentenceCountByLanguage case class

    // when finished coding:
    // - package, deploy and submit the spark application and verify the results using spark shell or a notebook (see https://github.com/tolomaus/languagedetector section Quick start - usage)
    // - verify the logs of the executed module in the language detector UI

    // solution:
    sentenceDS
      .groupByKey(_.detectedLanguage)
      .count
      .map { case (language, count) => SentenceCountByLanguage(language, count) }
  }

  def loadResultsFromParquet(implicit module: Class[_] = this.getClass, sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    loadResultsFromParquetAsDF(module, sparkSession).as[SentenceCountByLanguage]
  }
} 
Developer: tolomaus, Project: languagedetector, Lines: 43, Source: CountSentencesByLanguage.scala

Example 11: SQLDataProvider

//Package declaration and imported dependencies
package org.ieee.codemeow.geometric.spark.data

import com.vividsolutions.jts.geom.Geometry
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
import org.ieee.codemeow.geometric.Feature
import org.ieee.codemeow.geometric.spark.LayerConfiguration



class SQLDataProvider(_spark: SparkSession, _layer: LayerConfiguration) extends AbstractDataProvider(_spark, _layer){

  val url = layer.kwargs.get("url").get.asInstanceOf[String]
  val dbtables = layer.kwargs.get("dbtables").get.asInstanceOf[Map[String, String]]
  val user = layer.kwargs.get("user").get.asInstanceOf[String]
  val password = layer.kwargs.get("password").get.asInstanceOf[String]
  val zoomConfig = layer.kwargs.get("zooms").get.asInstanceOf[Map[String, String]]

  // load all tables
  dbtables.foreach(tuple => {
    val sparkTableName = tuple._1
    val realTableName = tuple._2

    val mapDataFrame = spark.read.format("jdbc")
      .option("url", url)
      .option("user", user)
      .option("password", password)
      .option("dbtable", realTableName).load

    mapDataFrame.createOrReplaceTempView(sparkTableName)
  })

  override def getFeatures(layerName: String, zoom: Long): Option[Dataset[Feature]] ={
    // Ref http://stackoverflow.com/questions/38664972/why-is-unable-to-find-encoder-for-type-stored-in-a-dataset-when-creating-a-dat
    import spark.implicits._
    // Ref http://stackoverflow.com/questions/36648128/how-to-store-custom-objects-in-dataset
    implicit val featureEncoder = Encoders.kryo[Feature]

    val natureSQL = zoomConfig.get(zoom.toString)
    if(natureSQL.isEmpty){
      return None
    }

    val rawDF = spark.sql(natureSQL.get)
    val featureCollection = rawDF.map(row => {
      val id = row.getAs[Long]("__id__")
      val geom = row.getAs[Geometry]("__geometry__")
      val fields = row.schema.filter(field => {
        !Seq("__id__", "__geometry__").contains(field.name)
      }).map(field => field.name)
      val props = row.getValuesMap[String](fields)
      Feature(id, geom, props)
    })
    Some(featureCollection)
  }
} 
Developer: codemeow5, Project: Vector-Tile-Spark-Process, Lines: 56, Source: SQLDataProvider.scala

Example 12: FortisTargetTablename

//Package declaration and imported dependencies
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.aggregators

import com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.dto.{AggregationRecord, Event, EventBatchEntry}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

trait FortisAggregator {
 protected val KeyspaceName = "fortis"
 protected val CassandraFormat = "org.apache.spark.sql.cassandra"
 protected val AggregateFunctions = "sum(mentioncount) as mentioncountagg, SentimentWeightedAvg(IF(IsNull(avgsentiment), 0, avgsentiment), IF(IsNull(mentioncount), 0, mentioncount)) as avgsentimentagg"
 //protected val IncrementalUpdateMentionsUDF = "SumMentions(a.mentioncountagg, IF(IsNull(b.mentioncount), 0, b.mentioncount)) as mentioncount"
 //protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg, IF(IsNull(b.avgsentiment), 0, b.avgsentiment), IF(IsNull(b.mentioncount), 0, b.mentioncount)) as avgsentiment"
 protected val IncrementalUpdateMentionsUDF = "a.mentioncountagg as mentioncount"
 protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg) as avgsentimentnumerator"
 protected val DataFrameNameFlattenedEvents = "flattenedEventsDF"
 protected val DataFrameNameComputed = "computedDF"

 def FortisTargetTablename: String
 def DfTableNameFlattenedEvents: String
 def DfTableNameComputedAggregates: String

 def FortisTargetTableDataFrame(session:SparkSession): DataFrame
 def flattenEvents(session: SparkSession, eventDS: Dataset[Event]): DataFrame
 def IncrementalUpdate(session:SparkSession, aggregatedDS: DataFrame): DataFrame
 def AggregateEventBatches(session: SparkSession, flattenedEvents: DataFrame): DataFrame
}

abstract class FortisAggregatorBase extends FortisAggregator {
  override def DfTableNameFlattenedEvents: String = s"$DataFrameNameFlattenedEvents$FortisTargetTablename"

  override def DfTableNameComputedAggregates: String = s"$DataFrameNameComputed$FortisTargetTablename"
} 
Developer: CatalystCode, Project: project-fortis-spark, Lines: 32, Source: FortisAggregator.scala

Example 13: Xref

//Package declaration and imported dependencies
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset


object Xref {
  def getXref: Dataset[XrefRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_xref.txt")

    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class.  If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()

      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => XrefRow(p(0), p(1), p(2).toInt))
      //.toDS()
    }

    fileRdd.toDS
  }
}

case class XrefRow (XrefSystem: String,
                    XrefId: String,
                    CIFId: Int) 
Developer: craigjar, Project: nextgendata, Lines: 35, Source: Xref.scala

Example 14: Customer

//Package declaration and imported dependencies
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset


object Customer {
  def getCustomers: Dataset[CustomerRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_customer.txt")

    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class.  If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()

      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => CustomerRow(p(0), p(1), p(2), p(3), p(4), p(5).toInt))
      //.toDS()
    }

    fileRdd.toDS
  }
}

case class CustomerRow (Name: String,
                    Address: String,
                    City:String,
                    PostalCode:String,
                    Phone: String,
                    CIFId: Int) 
Developer: craigjar, Project: nextgendata, Lines: 38, Source: Customer.scala

Example 15: Customer

//Package declaration and imported dependencies
package com.nextgendata.app.target

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Dataset


object Customer {
  def insert(customers: Dataset[CustomerRow]): Unit ={
    FileUtils.deleteDirectory(new File("target/Customer.txt"))
    customers.rdd.saveAsTextFile("target/Customer.txt")
  }
}

object BadCustomer {
  def insert(customers: Dataset[BadCustomerRow]): Unit ={
    FileUtils.deleteDirectory(new File("target/BadCustomer.txt"))
    customers.rdd.saveAsTextFile("target/BadCustomer.txt")
  }
}

case class CustomerRow(email: String, provinceCode: String, provinceName:String, countryName:String, postal: String, CIFId: Int)

case class BadCustomerRow(email: String, postal: String, CIFId: Int) 
Developer: craigjar, Project: nextgendata, Lines: 26, Source: Customer.scala


Note: The org.apache.spark.sql.Dataset class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors; copyright for the source code remains with the original authors, and redistribution or use should follow each project's license. Please do not reproduce this article without permission.