This article collects typical usage examples of the Scala class org.apache.spark.sql.SparkSession. If you have been wondering what the SparkSession class is for, how to use it, or what real SparkSession code looks like, the curated examples below should help.
The following presents 15 code examples of the SparkSession class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
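All of the examples follow the same basic pattern: build (or reuse) a SparkSession, use it as the entry point for DataFrames, Datasets and SQL, and stop it when the job is done. Here is a minimal, self-contained sketch of that pattern; the app name, master URL and sample data are placeholders rather than values taken from any example below.

import org.apache.spark.sql.SparkSession

object SparkSessionSketch {
  def main(args: Array[String]): Unit = {
    // Build (or reuse) the single SparkSession of this JVM.
    val spark = SparkSession.builder()
      .appName("SparkSession sketch") // placeholder application name
      .master("local[*]")             // run locally; drop this when submitting to a cluster
      .getOrCreate()

    import spark.implicits._

    // The session is the entry point for DataFrames, Datasets and SQL.
    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    df.createOrReplaceTempView("t")
    spark.sql("SELECT count(*) AS n FROM t").show()

    spark.stop()
  }
}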
Example 1: movies
// Set up the package name and import the required classes
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
object movies {
case class Sentence(sentence: String,label: Double)
def main(args:Array[String]) {
val spark = SparkSession
.builder
.appName("Movies Reviews")
.config("spark.master", "local")
.getOrCreate()
// Prepare training documents from a list of (id, text, label) tuples.
val neg = spark.sparkContext.textFile("file:///data/train/neg/").repartition(4)
.map(w => Sentence(w, 0.0))
val pos = spark.sparkContext.textFile("file:///data/train/pos/").repartition(4)
.map(w => Sentence(w, 1.0))
val test = spark.sparkContext.wholeTextFiles("file:///data/test/").repartition(4)
.map({case(file,sentence) => (file.split("/").last.split("\\.")(0),sentence)})
val training = neg.union(pos)
val trainingDF = spark.createDataFrame(training)
// Name the test columns explicitly so the Tokenizer can read "sentence" and the final select can find "file".
val testDF = spark.createDataFrame(test).toDF("file", "sentence")
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and Naive Bayes
val tokenizer = new Tokenizer()
.setInputCol("sentence")
.setOutputCol("words")
val hashingTF = new HashingTF()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("features")
val nb = new NaiveBayes()
val pipeline = new Pipeline()
.setStages(Array(tokenizer, hashingTF, nb))
// Fit the pipeline to training documents.
val model = pipeline.fit(trainingDF)
// Make predictions on test documents.
model.transform(testDF).repartition(1)
.select("file", "prediction")
.write.format("csv")
.option("header","true")
.option("delimiter","\t")
.save("/tmp/spark-prediction")
spark.stop()
}
}
Example 2: apply
// Set up the package name and import the required classes
package org.dama.datasynth.runtime.spark.operators
import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime
import scala.util.Random
object EdgeTableOperator { // enclosing object assumed here; its declaration is missing from the snippet
def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
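// Run the configured structure generator, write the generated edges to a temporary HDFS path, then reload them as (id, tail, head) triples.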
val sparkSession = SparkRuntime.getSparkSession()
import sparkSession.implicits._
val generator = SparkRuntime.instantiateStructureGeneratorOperator( node.structure )
val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]
val random : Random = new Random()
val id : Int = random.nextInt()
val path : String = s"/tmp/${id}"
val sparkContext = sparkSession.sparkContext
generator.run(size, sparkContext.hadoopConfiguration,"hdfs://"+path)
val edgesRDD = sparkContext.textFile(path)
.map( s => s.split("\t"))
.map( l => (l(0).toLong, l(1).toLong))
.zipWithIndex().map( { case ((tail,head), id) => (id, tail, head)})
sparkSession.createDataset(edgesRDD)
}
}
Example 3: WithCalcLogging
// Set up the package name and import the required classes
package biz.meetmatch.decorators
import biz.meetmatch.logging.BusinessLogger
import org.apache.spark.sql.SparkSession
import org.rogach.scallop.Scallop
import scala.util.{Failure, Success, Try}
object WithCalcLogging {
def apply[B](f: => B)(implicit module: Class[_]): B = apply(module.getName)(f)
def apply[B](scallopts: Scallop, sparkSession: SparkSession)(f: => B)(implicit module: Class[_] = this.getClass): B = apply(module.getName, Some(scallopts), Some(sparkSession))(f)
def apply[B](module: String)(f: => B): B = apply(module, None, None)(f)
def apply[B](module: String, scallopts: Scallop)(f: => B): B = apply(module, Some(scallopts), None)(f)
def apply[B](module: String, scallopts: Scallop, sparkSession: SparkSession)(f: => B): B = apply(module, Some(scallopts), Some(sparkSession))(f)
def apply[B](module: String, scalloptsO: Option[Scallop], sparkSessionO: Option[SparkSession])(f: => B): B = {
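// Log the start of the calculation together with the parsed options and the Spark application id, evaluate f, then log SUCCESS or FAILURE depending on the outcome.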
val businessLogger = new BusinessLogger(module)
val optsString = scalloptsO
.map { scallopts =>
scallopts.opts
.map { opt => opt.name + " = " + scallopts.get(opt.name)(opt.converter.tag).getOrElse("(empty)") }
.mkString(",")
}
.getOrElse("")
val sparkAppId = sparkSessionO.map(_.sparkContext.applicationId).getOrElse("")
businessLogger.calcStarted(optsString, sparkAppId)
val attempt = Try(WithStopwatch(f))
attempt match {
case Success(result) =>
businessLogger.calcStopped("SUCCESS")
result
case Failure(exception) =>
businessLogger.calcStopped("FAILURE")
throw exception
}
}
}
Example 4: PCASampleDemo
// Set up the package name and import the required classes
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
object PCASampleDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.master("local[4]")
.appName("PCAExample")
.getOrCreate()
val data = Array(
Vectors.dense(3.5, 2.0, 5.0, 6.3, 5.60, 2.4),
Vectors.dense(4.40, 0.10, 3.0, 9.0, 7.0, 8.75),
Vectors.dense(3.20, 2.40, 0.0, 6.0, 7.4, 3.34)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
df.show(false)
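// Project the 6-dimensional feature vectors onto their top 4 principal components.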
val pca = new PCA()
.setInputCol("features")
.setOutputCol("pcaFeatures")
.setK(4)
.fit(df)
val result = pca.transform(df).select("pcaFeatures")
result.show(false)
spark.stop()
}
}
Example 5: MafExample
// Set up the package name and import the required classes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
object MafExample {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("MAF Example")
val sc = new SparkContext(conf)
val spark = SparkSession
.builder()
.getOrCreate()
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.option("delimiter", "\t")
.option("delimiter", "\t")
.option("comment", "#")
.load("TCGA.ACC.mutect.abbe72a5-cb39-48e4-8df5-5fd2349f2bb2.somatic.maf")
df.createOrReplaceTempView("mutations")
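// Query the registered MAF table: the 20 most frequently mutated genes, the 20 genes with the most missense mutations, and every FAT4 mutation.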
val topTwenty = spark.sql("SELECT Hugo_Symbol, count(*) FROM mutations GROUP BY Hugo_Symbol ORDER BY count(*) DESC LIMIT 20")
val topTwentyMissense = spark.sql("SELECT Hugo_Symbol, count(*) FROM mutations WHERE Variant_Classification='Missense_Mutation' GROUP BY Hugo_Symbol ORDER BY count(*) DESC LIMIT 20")
val fat4 = spark.sql("SELECT Chromosome, Start_Position, End_Position, Strand, Variant_Classification, Variant_Type, Tumor_Sample_Barcode FROM mutations WHERE Hugo_Symbol='FAT4'")
topTwenty.coalesce(1).write.format("com.databricks.spark.csv").save("results/topTwenty")
topTwentyMissense.coalesce(1).write.format("com.databricks.spark.csv").save("results/topTwentyMissense")
fat4.coalesce(1).write.format("com.databricks.spark.csv").save("results/fat4")
}
}
Example 6: VeChallengeIngest
// Set up the package name and import the required classes
package io.github.adrianulbona.ve
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import twitter4j.{GeoLocation, Place, Status}
object VeChallengeIngest {
case class Location(latitude: Double, longitude: Double)
case class Tweet(time: Long, text: String, user: String, isRetweet: Boolean, country: String, location: Location)
def main(args: Array[String]) {
val spark = SparkSession.builder
.master("local[*]")
.appName("ve-challenge")
.getOrCreate()
import spark.sqlContext.implicits._
val ssc = new StreamingContext(spark.sparkContext, Minutes(2))
val stream = TwitterUtils.createStream(ssc, None, Seq("challenge"))
stream.map(extract).map(normalize).foreachRDD((batch, time) => {
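// For every 2-minute batch: cache it, show the six most frequent countries, and append the batch to a time-partitioned Parquet directory.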
val batchDF: DataFrame = batch.toDF.cache
batchDF.groupBy($"country").count().toDF("country", "count").orderBy($"count".desc).show(6)
batchDF.coalesce(1).write.parquet("tweets/batch=" + time.milliseconds)
batchDF.unpersist()
})
ssc.start()
ssc.awaitTermination()
spark.stop()
}
def extract(status: Status): (Long, String, String, Boolean, Option[Place], Option[GeoLocation]) = {
(status.getCreatedAt.getTime,
status.getText,
status.getUser.getName,
status.isRetweet,
Option(status.getPlace),
Option(status.getGeoLocation))
}
def normalize(extract: (Long, String, String, Boolean, Option[Place], Option[GeoLocation])): Tweet = extract match {
case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), Some(geoLoc: GeoLocation)) =>
Tweet(time, text, user, isRetweet, place.getCountryCode, Location(geoLoc.getLatitude, geoLoc.getLongitude))
case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), None) =>
Tweet(time, text, user, isRetweet, place.getCountryCode, Location(Double.NaN, Double.NaN))
case (time: Long, text: String, user: String, isRetweet: Boolean, None, Some(geoLoc: GeoLocation)) =>
Tweet(time, text, user, isRetweet, "unknown", Location(geoLoc.getLatitude, geoLoc.getLongitude))
case (time: Long, text: String, user: String, isRetweet: Boolean, None, None) =>
Tweet(time, text, user, isRetweet, "unknown", Location(Double.NaN, Double.NaN))
}
}
Example 7: PrintMetrics
// Set up the package name and import the required classes
package com.github.jongwook
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object PrintMetrics extends App {
val (prediction, labels) = RankingDataProvider(MovieLensLoader.load())
val spark = SparkSession.builder().master(new SparkConf().get("spark.master", "local[8]")).getOrCreate()
val metrics = new SparkRankingMetrics(spark.createDataFrame(prediction), spark.createDataFrame(labels), itemCol = "product", predictionCol = "rating")
val ats = Seq(5, 10, 20, 100, Integer.MAX_VALUE)
val toPrint = Map[String, SparkRankingMetrics => Seq[Int] => Seq[Double]](
"Precision" -> { m => k => m.precisionAt(k) },
"Recall" -> { m => k => m.recallAt(k) },
"F1" -> { m => k => m.f1At(k) },
"NDCG" -> { m => k => m.ndcgAt(k) },
"MAP" -> { m => k => m.mapAt(k) },
"MRR" -> { m => k => m.mrrAt(k) }
)
for ((metric, calculator) <- toPrint) {
printf("%12s", metric)
val f = calculator(metrics)
for (x <- f(ats)) {
printf("%12.8f", x)
}
println()
}
}
Example 8:
// Set up the package name and import the required classes
package com.shashank.akkahttp.project
import java.util.concurrent.ConcurrentHashMap
import akka.actor.ActorSystem
import akka.http.scaladsl.server.Directives._
import akka.stream.ActorMaterializer
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
import com.shashank.akkahttp.project.Models.{LoadRequest, ServiceJsonProtoocol}
import scala.collection.JavaConverters._
import spray.json.{DefaultJsonProtocol, JsArray, pimpAny}
import spray.json.DefaultJsonProtocol._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql._
trait RestService {
implicit val system: ActorSystem
implicit val materializer: ActorMaterializer
implicit val sparkSession: SparkSession
val datasetMap = new ConcurrentHashMap[String, Dataset[Row]]()
import ServiceJsonProtoocol._
val route =
pathSingleSlash {
get {
complete {
"welcome to rest service"
}
}
} ~
path("load") {
post {
entity(as[LoadRequest]) {
loadRequest => complete {
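// Read the CSV at the requested path, store the resulting Dataset in the map under a generated id, and return that id to the caller.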
val id = "" + System.nanoTime()
val dataset = sparkSession.read.format("csv")
.option("header", "true")
.load(loadRequest.path)
datasetMap.put(id, dataset)
id
}
}
}
} ~
path("view" / """[\w[0-9]-_]+""".r) { id =>
get {
complete {
val dataset = datasetMap.get(id)
dataset.take(10).map(row => row.toString())
}
}
}
}
Example 9: CountVectorizerDemo
// Set up the package name and import the required classes
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }
object CountVectorizerDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.master("local[*]")
.config("spark.sql.warehouse.dir", "E:/Exp/")
.appName(s"OneVsRestExample")
.getOrCreate()
val df = spark.createDataFrame(
Seq((0, Array("Jason", "David")),
(1, Array("David", "Martin")),
(2, Array("Martin", "Jason")),
(3, Array("Jason", "Daiel")),
(4, Array("Daiel", "Martin")),
(5, Array("Moahmed", "Jason")),
(6, Array("David", "David")),
(7, Array("Jason", "Martin")))).toDF("id", "name")
df.show(false)
// fit a CountVectorizerModel from the corpus
val cvModel: CountVectorizerModel = new CountVectorizer()
.setInputCol("name")
.setOutputCol("features")
.setVocabSize(3)
.setMinDF(2)
.fit(df)
val feature = cvModel.transform(df)
feature.show(false)
spark.stop()
}
}
Author: PacktPublishing, Project: Scala-and-Spark-for-Big-Data-Analytics, Lines of code: 40, Source file: CountVectorizerDemo.scala
Example 10: OneHotEncoderExample
// Set up the package name and import the required classes
package org.sparksamples.regression.bikesharing
import org.apache.spark.sql.SparkSession
object OneHotEncoderExample {
def main(args: Array[String]): Unit = {
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
val spark = SparkSession
.builder()
.appName("Spark SQL basic example").master("local[1]")
.config("spark.some.config.option", "some-value")
.getOrCreate()
// For implicit conversions like converting RDDs to DataFrames
val df = spark.createDataFrame(Seq(
(0, 3),
(1, 2),
(2, 4),
(3, 3),
(4, 3),
(5, 4)
)).toDF("id", "category")
val indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex")
.fit(df)
val indexed = indexer.transform(df)
val encoder = new OneHotEncoder()
.setInputCol("categoryIndex")
.setOutputCol("categoryVec")
val encoded = encoder.transform(indexed)
encoded.select("id", "categoryVec").show()
}
}
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 41, Source file: OneHotEncoderExample.scala
Example 11: DocumentClassificationLibSVM
// Set up the package name and import the required classes
package org.apache.spark.examples.ml
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
object DocumentClassificationLibSVM {
def main(args: Array[String]): Unit = {
val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
val spark = SparkSession
.builder()
.appName("SparkRatingData").config(spConfig)
.getOrCreate()
val data = spark.read.format("libsvm").load("./output/20news-by-date-train-libsvm/part-combined")
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1L)
// Train a NaiveBayes model.
val model = new NaiveBayes()
.fit(trainingData)
val predictions = model.transform(testData)
predictions.show()
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test set accuracy = " + accuracy)
spark.stop()
}
}
Author: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 37, Source file: DocumentClassificationLibSVM.scala
Example 12: TrackApp
// Set up the package name and import the required classes
package com.esri
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{max, min}
object TrackApp extends App {
val spark = SparkSession
.builder()
.appName("Path SOM")
.master("local[*]")
.config("spark.ui.enabled", "false")
.getOrCreate()
import spark.implicits._
try {
val df = spark
.read
.json("Paths")
.as[TrackCells]
.cache()
val qrAgg = df
.flatMap(_.cells)
.distinct()
.agg(min("q").as("qmin"), max("q").alias("qmax"), min("r").alias("rmin"), max("r").alias("rmax"))
.as[QRMinMax]
.head
val qrMin = Cell(qrAgg.qmin, qrAgg.rmin)
val qrMax = Cell(qrAgg.qmax, qrAgg.rmax)
val qrDel = (qrMax - qrMin) + 1
val qrSize = qrDel.size
val trainingArr = df
.rdd
.map(trackCells => trackCells.toBreeze(qrMin, qrDel, qrSize))
.collect()
val rnd = new java.security.SecureRandom()
val somSize = 3
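// Build a 3x3 SOM grid, seed each node with a randomly chosen training vector, then train the map and render it to a figure.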
val nodes = for {
q <- 0 until somSize
r <- 0 until somSize
} yield Node(q, r, trainingArr(rnd.nextInt(trainingArr.length)))
val som = SOM(nodes)
val epochMax = trainingArr.length * 400
implicit val progressBar = TerminalProgressBar(epochMax)
som.train(trainingArr, epochMax, 2.5, initialAlpha = 0.4)
som.saveAsFig("/tmp/fig.png", qrDel)
} finally {
spark.stop()
}
}
Example 13: CarMileageTest
// Set up the package name and import the required classes
package org.dele.misc.bookFastDS
import org.apache.spark.sql.SparkSession
object CarMileageTest extends App {
val spark = SparkSession.builder()
.appName("car mileage")
.master("local[*]")
.config("logConf", "true")
.getOrCreate()
val milData = spark.read.option("header", "true").option("inferSchema", "true").csv("res/data/car-milage.csv")
println(milData.count())
milData.show(5)
milData.printSchema()
milData.describe("mpg", "hp", "torque", "automatic").show()
milData.groupBy("automatic").avg("mpg", "hp", "torque").show()
milData.groupBy().avg("mpg", "hp", "torque").show()
import org.apache.spark.sql.functions._
milData.agg(stddev(milData("mpg")), avg(milData("torque"))).show()
val cor = milData.stat.corr("hp", "weight")
println(f"'hp' to 'weight' correlation: $cor%.4f")
val cov = milData.stat.cov("hp", "weight")
println(f"'hp' to 'weight' covariance: $cov%.4f")
val crosstab = milData.stat.crosstab("automatic", "NoOfSpeed")
crosstab.show()
val crosstab2 = milData.stat.crosstab("hp", "weight")
crosstab2.show()
spark.close()
}
Example 14: GroupByTest
// Set up the package name and import the required classes
package org.dele.misc.bookMasterSpark2.scalaExamples
import org.apache.spark.sql.SparkSession
import scala.util.Random
object GroupByTest {
def main(args:Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("GroupBy test")
.master("local[*]")
.getOrCreate()
val numMappers = 2
val numKVPairs = 1000
val valSize = 1000
val numReducers = numMappers
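// Generate numMappers partitions, each holding numKVPairs pairs of (random key in [0, 500), 1000-byte value), and cache them before grouping by key.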
val pairs1 = spark.sparkContext.parallelize(
0 until numMappers,
numMappers
).flatMap{ p =>
val ranGen = new Random()
val arr = new Array[(Int, Array[Byte])](numKVPairs)
(0 until numKVPairs).foreach{ idx =>
val byteArr = new Array[Byte](valSize)
ranGen.nextBytes(byteArr)
arr(idx) = (math.abs(ranGen.nextInt()) % 500, byteArr)
}
arr
}.cache()
pairs1.count()
val groups = pairs1.groupByKey(numReducers)
val groupCount = groups.count()
println(groupCount)
val keyedGroups = groups.take(groupCount.toInt)
println(keyedGroups.length)
spark.stop()
}
}
Example 15: Job
// Set up the package name and import the required classes
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object Job {
val AppName = "ECAD_JSON_Converter"
val sparkMaster = "local[3]"
// val sparkMaster = "spark://node0.local:7077"
val HDFSDataDir = "hdfs://node0.local:9000/ECAD_Data/"
val HDFSNameNode = "hdfs://node0.local:9000"
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(AppName).setMaster(sparkMaster)
val spark = SparkSession
.builder()
.config(conf)
.getOrCreate()
val sc = spark.sparkContext
val hadoopConf = sc.hadoopConfiguration
hadoopConf.set("fs.defaultFS", HDFSNameNode)
val mapper = new Mappers()
val sourceDF = mapper.genSourceDF(spark, HDFSDataDir + "sources.txt")
val precipDF = mapper.precipicationDF(spark, HDFSDataDir + "RR_SOUID100014.txt")
}
}