This article collects typical usage examples of the Scala class org.apache.spark.sql.Dataset. If you are wondering what the Dataset class does in Scala, or how to use it in practice, the selected class code examples below may help.
Fifteen code examples of the Dataset class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
Example 1: apply
// Package declaration and imports
package org.dama.datasynth.runtime.spark.operators

import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime

import scala.util.Random

// The enclosing object declaration was omitted from the original excerpt; the name
// below is only a placeholder so the snippet compiles.
object EdgeTableOperator {

  def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
    val sparkSession = SparkRuntime.getSparkSession()
    import sparkSession.implicits._

    val generator = SparkRuntime.instantiateStructureGeneratorOperator(node.structure)
    val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]

    // Generate the edge list into a temporary HDFS file with a random name
    val random: Random = new Random()
    val id: Int = random.nextInt()
    val path: String = s"/tmp/$id"
    val sparkContext = sparkSession.sparkContext
    generator.run(size, sparkContext.hadoopConfiguration, "hdfs://" + path)

    // Read the generated edges back and attach a sequential edge id
    val edgesRDD = sparkContext.textFile(path)
      .map(s => s.split("\t"))
      .map(l => (l(0).toLong, l(1).toLong))
      .zipWithIndex().map { case ((tail, head), id) => (id, tail, head) }

    sparkSession.createDataset(edgesRDD)
  }
}
Example 2: PrecipSource
// Package declaration and imports
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

case class PrecipSource(sourceId: Int,
                        name: String,
                        countryCode: String,
                        latitude: String,
                        longitude: String,
                        elevation: Int,
                        elementId: String,
                        beginDate: String,
                        endDate: String,
                        participantId: Int,
                        participantName: String)

case class Precipication(stationId: Int,
                         sourceId: Int,
                         date: String,
                         amount: Int,
                         quality: Int)

class Mappers() {

  def precipicationDF(spark: SparkSession, sourceFilPath: String): Dataset[Precipication] = {
    import spark.implicits._

    // Drop the 20-line file header by subtracting it from the full RDD
    var sourceFile: RDD[String] = spark.sparkContext.textFile(sourceFilPath)
    val header = spark.sparkContext.parallelize(sourceFile.take(20))
    sourceFile = sourceFile.subtract(header)
    header.unpersist()

    // Parse the comma-separated records into the Precipication case class
    val precipitionDF: Dataset[Precipication] = sourceFile
      .map(s => s.split(",").map(_.trim()))
      .map(fields => Precipication(
        stationId = fields(0).toInt,
        sourceId = fields(1).toInt,
        date = fields(2),
        amount = fields(3).toInt,
        quality = fields(4).toInt
      ))
      .toDS()

    precipitionDF.show(false)
    precipitionDF
  }
}
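For context, here is a minimal sketch of how the Mappers class above could be driven from a standalone application; the SparkSession settings and the data/precipitation.csv path are illustrative assumptions, not part of the original example.

import org.apache.spark.sql.SparkSession

object PrecipitationExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("precipitation-example")
      .master("local[*]")
      .getOrCreate()

    // "data/precipitation.csv" is a hypothetical path; the file is expected to start
    // with a ~20-line header followed by comma-separated precipitation records.
    val precipDS = new Mappers().precipicationDF(spark, "data/precipitation.csv")
    precipDS.filter(_.quality > 0).show(10, truncate = false)

    spark.stop()
  }
}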
Example 3: DataFrameFunctions
// Package declaration and imports
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}

class DataFrameFunctions(self: DC[Row]) {

  def join(right: DC[Row]): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right)
    }
    val hashTarget = Seq("join")
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], usingColumn: String): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right, usingColumn)
    }
    val hashTarget = Seq("join", usingColumn)
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }

  def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

  def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
    val f = (left: Dataset[_], right: Dataset[_]) => {
      left.join(right, joinExprs, joinType)
    }
    val hashTarget = Seq("join", joinType, joinExprs.toString())
    new MultiDatasetTransformDC(self, right, f, hashTarget)
  }
}
Example 4: MiniPanda
// Package declaration and imports
package com.highperformancespark.examples.ml

import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas}
import com.holdenkarau.spark.testing._
import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
import org.scalatest.Matchers._
import org.scalatest.FunSuite

case class MiniPanda(happy: Double, fuzzy: Double, old: Double)

class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase {
  val miniPandasList = List(
    MiniPanda(1.0, 1.0, 1.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(0.0, 0.0, 1.0),
    MiniPanda(0.0, 0.0, 0.0))

  test("simple sanity test") {
    val session = spark
    import session.implicits._
    val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList)

    // Assemble the "fuzzy" and "old" columns into a feature vector and train the
    // custom SimpleNaiveBayes estimator on the "happy" label.
    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("fuzzy", "old"))
    assembler.setOutputCol("features")
    val snb = new SimpleNaiveBayes()
    snb.setLabelCol("happy")
    snb.setFeaturesCol("features")
    val pipeline = new Pipeline().setStages(Array(assembler, snb))
    val model = pipeline.fit(ds)

    val test = ds.select("fuzzy", "old")
    val predicted = model.transform(test)
    predicted.collect().foreach(println)
  }
}
Example 5: Partition
// Package declaration and imports
package br.ufmg.cs.lib.privacy.kanonymity

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._

case class Partition(
    member: Dataset[Row],
    memberCount: Long,
    low: Array[Int],
    high: Array[Int],
    allow: Array[Int]) {

  override def toString: String = {
    s"Partition(memberlen=${memberCount}" +
      s", low=${low.mkString("[", ", ", "]")}" +
      s", high=${high.mkString("[", ", ", "]")}" +
      s", allow=${allow.mkString("[", ", ", "]")})"
  }
}

object Partition {
  def apply(member: Dataset[Row], memberCount: Long,
            low: Array[Int], high: Array[Int]): Partition = {
    Partition(member, memberCount, low, high, Array.fill(low.length)(1))
  }
}
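A minimal sketch of building an initial Partition with the auxiliary constructor above; the SparkSession setup, the input path, and the bounds chosen for low/high are illustrative assumptions.

import br.ufmg.cs.lib.privacy.kanonymity.Partition
import org.apache.spark.sql.SparkSession

object PartitionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val df = spark.read.option("header", "true").csv("data/records.csv") // hypothetical input

    val width = df.columns.length
    // The companion apply fills `allow` with 1s, i.e. every attribute starts as splittable.
    // The low/high bounds (0 and 10) are hypothetical generalization limits.
    val root = Partition(df, df.count(), Array.fill(width)(0), Array.fill(width)(10))
    println(root)

    spark.stop()
  }
}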
Example 6: cogroup
// Package declaration and imports
package com.qunar.spark.tungsten.base

import org.apache.spark.sql.Dataset
import com.qunar.spark.tungsten.base.CommonEncoders._

import scala.reflect.runtime.universe.TypeTag

// The enclosing object declaration was omitted from the original excerpt; the name
// below is only a placeholder so the snippet compiles.
object DatasetCogroups {

  def cogroup[T: TypeTag, K: TypeTag](leftDataset: Dataset[T], rightDataset: Dataset[T],
                                      genJoinKey: T => K): Dataset[(Seq[T], Seq[T])] = {
    // Group leftDataset by the join key and collect each group into a Seq
    val thisKeyValueSet = leftDataset.groupByKey(data => genJoinKey(data))
    val thisCogroupSet = thisKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    // Group rightDataset by the join key and collect each group into a Seq
    val anotherKeyValueSet = rightDataset.groupByKey(data => genJoinKey(data))
    val anotherCogroupSet = anotherKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    // Full outer join the two grouped sets on the join key
    val resultDataFrame = thisCogroupSet.join(anotherCogroupSet,
      thisCogroupSet("_1") === anotherCogroupSet("_1"), "outer")
    resultDataFrame.as[(Seq[T], Seq[T])]
  }
}
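A usage sketch for the cogroup helper above. It assumes the placeholder object name used in the excerpt and that CommonEncoders can derive the encoders needed for the case class involved; it is an illustration, not part of the original library.

import com.qunar.spark.tungsten.base.DatasetCogroups // placeholder name from the excerpt above
import org.apache.spark.sql.SparkSession

case class Click(userId: Long, url: String)

object CogroupExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val left = Seq(Click(1L, "/home"), Click(1L, "/search"), Click(2L, "/home")).toDS()
    val right = Seq(Click(1L, "/cart"), Click(3L, "/home")).toDS()

    // Each output row pairs the left-side and right-side clicks that share a userId.
    val grouped = DatasetCogroups.cogroup(left, right, (c: Click) => c.userId)
    grouped.show(false)

    spark.stop()
  }
}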
Example 7: OpenIE
// Package declaration and imports
package com.knoldus

import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions.udf

import scala.collection.JavaConverters._

private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
  def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
    this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
}

object StartApplication extends App {
  val spark = SparkSession.builder().appName("spark-nlp-starter").master("local[*]").getOrCreate()
  val sc = spark.sparkContext

  val readPdfFile: Dataset[String] = spark.read.textFile("test")
  readPdfFile.show(false)

  def openie = udf { sentence: String =>
    new Sentence(sentence).openie().asScala.map(q => new OpenIE(q)).toSeq
  }

  val res = readPdfFile.select(openie(readPdfFile("value")))
  res.show(false)
}
Example 8: RowProfiler
// Package declaration and imports
package io.gzet.profilers.raw

import org.apache.spark.sql.Dataset

case class RowProfiler() {

  def profile(df: Dataset[String]): Dataset[RowReport] = {
    import df.sparkSession.implicits._
    val report = RowReport(df.count().toDouble)
    df.sparkSession.createDataset[RowReport](
      Seq(report)
    )
  }
}

case class RowReport(
  metricValue: Double
)
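A minimal usage sketch for RowProfiler, assuming a local SparkSession and a small in-memory dataset:

import io.gzet.profilers.raw.RowProfiler
import org.apache.spark.sql.SparkSession

object RowProfilerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val lines = Seq("a,b,c", "d,e,f", "g,h,i").toDS()
    // Produces a single-row Dataset[RowReport] with metricValue = 3.0 (the record count).
    RowProfiler().profile(lines).show()

    spark.stop()
  }
}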
Example 9: EmptinessProfiler
// Package declaration and imports
package io.gzet.profilers.field

import io.gzet.profilers.Utils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.Dataset

import scalaz.Scalaz._

case class EmptinessProfiler() {

  def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = {
    import df.sparkSession.implicits._

    val features = Utils.buildColumns(df)

    // Count, per column, how many values are empty vs. non-empty, then derive
    // the ratio of empty values for each column.
    features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) =>
      (column, isNotEmpty)
    }).count().map({ case ((column, isNotEmpty), count) =>
      (column, Map(isNotEmpty -> count))
    }).groupByKey({ case (column, map) =>
      column
    }).reduceGroups({ (v1, v2) =>
      (v1._1, v1._2 |+| v2._2) // merge the true/false count maps for the same column
    }).map({ case (col, (_, map)) =>
      val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble
      EmptinessReport(
        col,
        emptiness
      )
    })
  }
}

case class EmptinessReport(
  field: Int,
  metricValue: Double
)
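A minimal usage sketch for EmptinessProfiler, assuming a local SparkSession; the profiler expects each record already split into an Array[String] of field values.

import io.gzet.profilers.field.EmptinessProfiler
import org.apache.spark.sql.SparkSession

object EmptinessProfilerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    // split with limit -1 keeps trailing empty fields
    val records = Seq("1,alice,", "2,,NY", "3,bob,LA").map(_.split(",", -1)).toDS()

    // Expect one EmptinessReport per column index with the ratio of empty values.
    EmptinessProfiler().profile(records).show()

    spark.stop()
  }
}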
Example 10: CountSentencesByLanguage
// Package declaration and imports
package biz.meetmatch.modules

import biz.meetmatch.model.{Sentence, SentenceCountByLanguage}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.rogach.scallop.Scallop

object CountSentencesByLanguage extends Module with ParquetExtensions[SentenceCountByLanguage] {
  override val parquetFile = "SentenceCountsByLanguage"

  override def execute(scallopts: Scallop)(implicit sparkSession: SparkSession): Unit = {
    val sentenceDS = DetectLanguage.loadResultsFromParquet
    val sentenceCountByLanguageDS = calc(sentenceDS)
    saveResultsToParquet(sentenceCountByLanguageDS)
  }

  def calc(sentenceDS: Dataset[Sentence])(implicit sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    sparkSession.sparkContext.setJobGroup(this.getClass.getName, this.getClass.getName)
    sparkSession.sparkContext.setJobDescription("Count the sentences by language")

    // TASK 2: count how many sentences exist for each detected language and save the results in the SentenceCountByLanguage case class
    // when finished coding:
    // - package, deploy and submit the spark application and verify the results using spark shell or a notebook (see https://github.com/tolomaus/languagedetector section Quick start - usage)
    // - verify the logs of the executed module in the language detector UI
    // solution:
    sentenceDS
      .groupByKey(_.detectedLanguage)
      .count
      .map { case (language, count) => SentenceCountByLanguage(language, count) }
  }

  def loadResultsFromParquet(implicit module: Class[_] = this.getClass, sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    loadResultsFromParquetAsDF(module, sparkSession).as[SentenceCountByLanguage]
  }
}
Example 11: SQLDataProvider
// Package declaration and imports
package org.ieee.codemeow.geometric.spark.data

import com.vividsolutions.jts.geom.Geometry
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
import org.ieee.codemeow.geometric.Feature
import org.ieee.codemeow.geometric.spark.LayerConfiguration

class SQLDataProvider(_spark: SparkSession, _layer: LayerConfiguration) extends AbstractDataProvider(_spark, _layer) {

  val url = layer.kwargs.get("url").get.asInstanceOf[String]
  val dbtables = layer.kwargs.get("dbtables").get.asInstanceOf[Map[String, String]]
  val user = layer.kwargs.get("user").get.asInstanceOf[String]
  val password = layer.kwargs.get("password").get.asInstanceOf[String]
  val zoomConfig = layer.kwargs.get("zooms").get.asInstanceOf[Map[String, String]]

  // load all tables
  dbtables.foreach(tuple => {
    val sparkTableName = tuple._1
    val realTableName = tuple._2
    val mapDataFrame = spark.read.format("jdbc")
      .option("url", url)
      .option("user", user)
      .option("password", password)
      .option("dbtable", realTableName).load
    mapDataFrame.createOrReplaceTempView(sparkTableName)
  })

  override def getFeatures(layerName: String, zoom: Long): Option[Dataset[Feature]] = {
    // Ref http://stackoverflow.com/questions/38664972/why-is-unable-to-find-encoder-for-type-stored-in-a-dataset-when-creating-a-dat
    import spark.implicits._
    // Ref http://stackoverflow.com/questions/36648128/how-to-store-custom-objects-in-dataset
    implicit val featureEncoder = Encoders.kryo[Feature]

    val natureSQL = zoomConfig.get(zoom.toString)
    if (natureSQL.isEmpty) {
      return None
    }

    val rawDF = spark.sql(natureSQL.get)
    val featureCollection = rawDF.map(row => {
      val id = row.getAs[Long]("__id__")
      val geom = row.getAs[Geometry]("__geometry__")
      val fields = row.schema.filter(field => {
        !Seq("__id__", "__geometry__").contains(field.name)
      }).map(field => field.name)
      val props = row.getValuesMap[String](fields)
      Feature(id, geom, props)
    })
    Some(featureCollection)
  }
}
Example 12: FortisTargetTablename
// Package declaration and imports
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.aggregators

import com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.dto.{AggregationRecord, Event, EventBatchEntry}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

trait FortisAggregator {
  protected val KeyspaceName = "fortis"
  protected val CassandraFormat = "org.apache.spark.sql.cassandra"
  protected val AggregateFunctions = "sum(mentioncount) as mentioncountagg, SentimentWeightedAvg(IF(IsNull(avgsentiment), 0, avgsentiment), IF(IsNull(mentioncount), 0, mentioncount)) as avgsentimentagg"
  //protected val IncrementalUpdateMentionsUDF = "SumMentions(a.mentioncountagg, IF(IsNull(b.mentioncount), 0, b.mentioncount)) as mentioncount"
  //protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg, IF(IsNull(b.avgsentiment), 0, b.avgsentiment), IF(IsNull(b.mentioncount), 0, b.mentioncount)) as avgsentiment"
  protected val IncrementalUpdateMentionsUDF = "a.mentioncountagg as mentioncount"
  protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg) as avgsentimentnumerator"
  protected val DataFrameNameFlattenedEvents = "flattenedEventsDF"
  protected val DataFrameNameComputed = "computedDF"

  def FortisTargetTablename: String
  def DfTableNameFlattenedEvents: String
  def DfTableNameComputedAggregates: String
  def FortisTargetTableDataFrame(session: SparkSession): DataFrame
  def flattenEvents(session: SparkSession, eventDS: Dataset[Event]): DataFrame
  def IncrementalUpdate(session: SparkSession, aggregatedDS: DataFrame): DataFrame
  def AggregateEventBatches(session: SparkSession, flattenedEvents: DataFrame): DataFrame
}

abstract class FortisAggregatorBase extends FortisAggregator {
  override def DfTableNameFlattenedEvents: String = s"$DataFrameNameFlattenedEvents$FortisTargetTablename"
  override def DfTableNameComputedAggregates: String = s"$DataFrameNameComputed$FortisTargetTablename"
}
Example 13: Xref
// Package declaration and imports
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset

object Xref {
  def getXref: Dataset[XrefRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_xref.txt")
    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class. If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()
      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => XrefRow(p(0), p(1), p(2).toInt))
        //.toDS()
    }
    fileRdd.toDS
  }
}

case class XrefRow(XrefSystem: String,
                   XrefId: String,
                   CIFId: Int)
Example 14: Customer
// Package declaration and imports
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset

object Customer {
  def getCustomers: Dataset[CustomerRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_customer.txt")
    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class. If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()
      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => CustomerRow(p(0), p(1), p(2), p(3), p(4), p(5).toInt))
        //.toDS()
    }
    fileRdd.toDS
  }
}

case class CustomerRow(Name: String,
                       Address: String,
                       City: String,
                       PostalCode: String,
                       Phone: String,
                       CIFId: Int)
Example 15: Customer
// Package declaration and imports
package com.nextgendata.app.target

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Dataset

object Customer {
  def insert(customers: Dataset[CustomerRow]): Unit = {
    FileUtils.deleteDirectory(new File("target/Customer.txt"))
    customers.rdd.saveAsTextFile("target/Customer.txt")
  }
}

object BadCustomer {
  def insert(customers: Dataset[BadCustomerRow]): Unit = {
    FileUtils.deleteDirectory(new File("target/BadCustomer.txt"))
    customers.rdd.saveAsTextFile("target/BadCustomer.txt")
  }
}

case class CustomerRow(email: String, provinceCode: String, provinceName: String, countryName: String, postal: String, CIFId: Int)

case class BadCustomerRow(email: String, postal: String, CIFId: Int)
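A minimal sketch of feeding the writers above, assuming the Job wrapper from the earlier examples provides the SQLContext; the CustomerRow values are made-up sample data.

import com.nextgendata.app.target.{Customer, CustomerRow}
import com.nextgendata.framework.Job

object LoadCustomersExample {
  def main(args: Array[String]): Unit = {
    val sqlContext = Job.sqlContext
    import sqlContext.implicits._

    val customers = Seq(
      CustomerRow("a@example.com", "ON", "Ontario", "Canada", "M5V1A1", 1),
      CustomerRow("b@example.com", "BC", "British Columbia", "Canada", "V5K0A1", 2)
    ).toDS()

    // Writes the rows as text files under target/Customer.txt, deleting any previous output first.
    Customer.insert(customers)
  }
}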