This article collects typical usage examples of the Scala class org.apache.spark.sql.hive.HiveContext. If you are wondering what the HiveContext class is for or how to use it in practice, the curated examples below should help.
The following presents 15 code examples of the HiveContext class, sorted by popularity by default.
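Before diving into the examples, here is a minimal, self-contained sketch of the pattern most of them share: wrap a SparkContext in a HiveContext, register a DataFrame as a temporary table, and query it with SQL. The object name, table name, column names, and sample rows below are purely illustrative, not taken from the examples.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object HiveContextQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HiveContextQuickStart").setMaster("local[2]"))
    val hiveContext = new HiveContext(sc)
    import hiveContext.implicits._

    // Build a small DataFrame, expose it as a temporary table, and query it with HiveQL.
    val people = sc.parallelize(Seq(("alice", 30), ("bob", 25))).toDF("name", "age")
    people.registerTempTable("people")
    hiveContext.sql("SELECT name, age FROM people WHERE age > 26").show()

    sc.stop()
  }
}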
Example 1: KuduAccountMartSimpleSums
// Package declaration and imported dependencies
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAccountMartSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Example 2: KuduAppEventSimpleSums
// Package declaration and imported dependencies
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAppEventSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAppEventTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAppEventTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAppEventTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("app_event_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Example 3: KuduToHDFS
// Package declaration and imported dependencies
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduToHDFS {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduTaxiTripTableName> " +
"<hdfsTaxiTripTableName> " +
"<numOfCenters> " +
"<numOfIterations> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduTaxiTripTableName = args(2)
val hdfsTaxiTripTableName = args(3)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduTaxiTripTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("kuduTaxiTripTableName")
hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " +
" AS SELECT * FROM kuduTaxiTripTableName " +
" STORED AS PARQUET")
sc.stop()
}
}
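As a variation on the CTAS approach above, the Kudu-backed DataFrame could also be written directly to Parquet files. The sketch below assumes the same org.kududb.spark.kudu data source is on the classpath; the object name and output path are hypothetical.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object KuduToParquetSketch {
  def main(args: Array[String]): Unit = {
    val Array(kuduMaster, kuduTableName, outputPath) = args
    val sc = new SparkContext(new SparkConf().setAppName("KuduToParquetSketch"))
    val hiveContext = new HiveContext(sc)

    // Load the Kudu table through the data source API and write it out as Parquet files.
    hiveContext.read
      .options(Map("kudu.table" -> kuduTableName, "kudu.master" -> kuduMaster))
      .format("org.kududb.spark.kudu")
      .load()
      .write.parquet(outputPath) // e.g. "/data/taxi_trip_parquet" (hypothetical)

    sc.stop()
  }
}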
Example 4: KuduAppEventSimpleSums
// Package declaration and imported dependencies
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAppEventSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAppEventTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAppEventTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAppEventTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("app_event_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Example 5: KuduToHDFS
// Package declaration and imported dependencies
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduToHDFS {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> " +
"<hdfsAccountMartTableName> " +
"<numOfCenters> " +
"<numOfIterations> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val hdfsAccountMartTableName = args(3)
val numOfCenters = args(4).toInt
val numOfIterations = args(5).toInt
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
hiveContext.sql("CREATE TABLE " + hdfsAccountMartTableName + " AS SELECT * FROM account_mart_tmp")
sc.stop()
}
}
Example 6: KuduAccountMartSimpleSums
// Package declaration and imported dependencies
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAccountMartSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Example 7: Main
// Package declaration and imported dependencies
package com.microsoft.netalyzer.loader
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object Main {
val settings = new Settings()
val conf = new SparkConf()
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
def main(args: Array[String]): Unit = {
sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
sqlContext.setConf("spark.sql.shuffle.partitions", "200")
Utils.initializeDb(settings.cookedData, sqlContext)
Utils.importCsvData(settings.rawData, sqlContext)
}
}
Example 8: Average
// Package declaration and imported dependencies
package nl.techdays.bigdataprocessing.demo03
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.sql.hive.HiveContext
case class Average(dimension: String, average: Double)
object Program {
def main(args: Array[String]) = {
val conf = new SparkConf().setAppName("adl-sample-app")
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._
val measurements = sqlContext.sql("SELECT * FROM measurements")
measurements
.map(x => (x.getAs[String]("dimension"), x.getAs[Double]("value")))
.reduceByKey((left, right) => (left + right) / 2)
.map { case (dimension, average) => Average(dimension,average) }
.toDF()
.write.mode(SaveMode.Append).saveAsTable("averages")
}
}
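Note that reduceByKey((left, right) => (left + right) / 2) computes a running pairwise average, which in general is not the arithmetic mean of all values for a dimension. A sketch of the same aggregation with DataFrame functions, which does compute the true mean, might look like this; it assumes the same measurements table and column names as the example above, and the object name is hypothetical.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.hive.HiveContext

object AverageWithDataFrames {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("adl-sample-app-avg"))
    val sqlContext = new HiveContext(sc)

    // Group by dimension and take the arithmetic mean of the value column.
    sqlContext.sql("SELECT * FROM measurements")
      .groupBy("dimension")
      .agg(avg("value").as("average"))
      .write.mode(SaveMode.Append).saveAsTable("averages")

    sc.stop()
  }
}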
Example 9: LiveConverter
// Package declaration and imported dependencies
package com.paypal.risk.madmen20
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object LiveConverter {
def StringToDouble(s:String): Any = {
try {
return s.toDouble
} catch {
case ex: NumberFormatException => null
case ex: NullPointerException => null
}
}
def convertNvar(m: Map[String, Any]): Map[String, java.lang.Double] = {
m.map {
case (k, v: java.lang.Double) => (k,v)
case (k, v: String) if v.isEmpty || v.toLowerCase == "null" => (k, null)
// case (k, v: String) if v.isEmpty || v.toLowerCase == "null" || v.toUpperCase == "NO_DATA" => (k, null)
// case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, "123321.0".toDouble)
// case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, null)
case (k, v: String) => (k, StringToDouble(v))
case (k, null) => (k, null)
}.asInstanceOf[Map[String, java.lang.Double]]
}
def main (args: Array[String]) {
if(args.length != 1) {
throw new RuntimeException("Expect 1 argument: <month:yyyy-MM|day:yyyy-MM-dd>")
}
var month = args.apply(0)
month = month.replace("/","-")
val sc = new SparkContext(new SparkConf().setAppName("Live Converter").setMaster("yarn-client"))
@transient val hc = new HiveContext(sc)
hc.udf.register("convertNvar", convertNvar _)
val df_orig = hc.read.parquet("/apps/risk/det/madmen20/archive/source=live_with_nvar_type_issue/date=" + month + "*")
val df_new = df_orig.selectExpr("mid", "source", "date", "meta", "convertNvar(nvar) as nvar", "cvar")
// df_new.write.partitionBy("date").format("parquet").mode("error")
// .save("/apps/risk/det/madmen20/bre/source=live_convert_" + month)
df_new.write.format("parquet").mode(saveMode = "overwrite")
.save("/apps/risk/det/madmen20/bre/source=live/" + month)
}
}
Example 10: Example
// Package declaration and imported dependencies
package com.paypal.risk.madmen20.example
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object Example {
def splitArray (args: Array[String]) {
args.map((arg: String) => println("One arg is: " + arg + "!"))
}
def main(args: Array[String]) {
implicit val sc = new SparkContext(new SparkConf().setAppName("Example").setMaster("yarn-client"))
implicit val hc = new HiveContext(sc)
// implicit val sqlc = new SQLContext(sc)
val df = hc.parquetFile("/apps/risk/det/madmen20/bre/source=live")
df.printSchema()
df.take(1).foreach(println)
splitArray(args)
}
}
Example 11: SparkTermCandidatesWeighter
// Package declaration and imported dependencies
package ru.ispras.atr.rank
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig
abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
val termDFName = "Term"
def allFeatures: Seq[FeatureConfig]
def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
val resByFeatures: Seq[Seq[Double]] = allFeatures.map(f => {
//iterate by features first, because it lets us estimate time per feature and (maybe) it is faster due to caching
log.info(s"Initializing feature ${f.id}...")
val featureComputer = f.build(candidates, dataset)
log.info(s"Computing feature ${f.id}...")
featureComputer.compute(candidates)
})
log.info(s"${allFeatures.size} features have been computed")
resByFeatures.transpose
}
def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
val header = StructField(termDFName, StringType) +: featureNames.map(f => StructField(f, DoubleType))
val schema = StructType(header)
val rows = termNames.zip(resByTerms).map(a => Row.fromSeq(a._1 +: a._2))
val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
val df = SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
df
}
def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
val featureValues = convert2FeatureSpace(candidates, dataset)
val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues)
val weightedDF = weight(initDF)
val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id))
val weightColId: String = id //for serialization
val termColId: String = termDFName
val terms = termNamesDF.rdd.map(r => (r.getAs[String](termColId), r.getAs[Double](weightColId))).collect()
terms
}
def weight(df: DataFrame) : DataFrame
}
object SparkConfigs {
val sparkConf = new SparkConf()
.setAppName("ATR Evaluation System")
.setMaster("local[16]")
.set("spark.driver.memory", "1g")
val sc = new SparkContext(sparkConf)
val sqlc = new HiveContext(sc)
}
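The weight method above is left abstract. As an illustration only, here is a self-contained sketch of one possible scoring strategy: summing all feature columns into a single weight column. The object name, column names, and sample data are hypothetical and not part of the original project.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.hive.HiveContext

object SumWeightSketch {
  // Add a "weight" column equal to the sum of every column except "Term".
  def weight(df: DataFrame): DataFrame = {
    val featureCols = df.columns.filter(_ != "Term").map(col)
    df.withColumn("weight", featureCols.reduce(_ + _))
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SumWeightSketch").setMaster("local[2]"))
    val sqlc = new HiveContext(sc)
    val df = sqlc.createDataFrame(Seq(
      ("spark sql", 0.8, 1.2),
      ("hive table", 0.5, 0.7)
    )).toDF("Term", "featureA", "featureB")
    weight(df).show()
    sc.stop()
  }
}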
Example 12: NamedContext
// Package declaration and imported dependencies
package io.hydrosphere.mist.worker
import java.io.File
import io.hydrosphere.mist.api.{CentralLoggingConf, RuntimeJobInfo, SetupConfiguration}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Duration
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
class NamedContext(
val sparkContext: SparkContext,
val namespace: String,
streamingDuration: Duration = Duration(40 * 1000),
loggingConf: Option[CentralLoggingConf] = None
) {
private val jars = mutable.Buffer.empty[String]
def addJar(jarPath: String): Unit = {
val jarAbsolutePath = new File(jarPath).getAbsolutePath
if (!jars.contains(jarAbsolutePath)) {
sparkContext.addJar(jarPath)
jars += jarAbsolutePath
}
}
def setupConfiguration(jobId: String): SetupConfiguration = {
SetupConfiguration(
context = sparkContext,
streamingDuration = streamingDuration,
info = RuntimeJobInfo(jobId, namespace),
loggingConf = loggingConf
)
}
//TODO: can we call that inside python directly using setupConfiguration?
// python support
def sparkConf: SparkConf = sparkContext.getConf
// python support
def javaContext: JavaSparkContext = new JavaSparkContext(sparkContext)
// python support
def sqlContext: SQLContext = new SQLContext(sparkContext)
// python support
def hiveContext: HiveContext = new HiveContext(sparkContext)
def stop(): Unit = {
sparkContext.stop()
}
}
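A hypothetical usage sketch for the class above, assuming NamedContext and its mist dependencies are on the classpath. It wraps an existing SparkContext under a namespace and hands out a Hive context on demand (note that hiveContext is a def, so each call builds a fresh HiveContext).
import org.apache.spark.{SparkConf, SparkContext}

object NamedContextExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("NamedContextExample").setMaster("local[2]"))
    val named = new NamedContext(sc, namespace = "demo")

    // named.addJar("/path/to/extra.jar") // hypothetical jar path
    val hive = named.hiveContext
    hive.sql("SHOW TABLES").show()

    named.stop()
  }
}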
Example 13: HiveContextFactory
// Package declaration and imported dependencies
package spark.jobserver.context
import com.typesafe.config.Config
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import spark.jobserver.{api, ContextLike, SparkHiveJob}
import spark.jobserver.util.SparkJobUtils
class HiveContextFactory extends ScalaContextFactory {
type C = HiveContext with ContextLike
def isValidJob(job: api.SparkJobBase): Boolean = job.isInstanceOf[SparkHiveJob]
def makeContext(sparkConf: SparkConf, config: Config, contextName: String): C = {
contextFactory(sparkConf)
}
protected def contextFactory(conf: SparkConf): C = {
new HiveContext(new SparkContext(conf)) with HiveContextLike
}
}
private[jobserver] trait HiveContextLike extends ContextLike {
def stop() { this.sparkContext.stop() }
}
Example 14: HiveOperationTest
// Package declaration and imported dependencies
package cn.com.warlock.sql
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object HiveOperationTest {
def main(args: Array[String]): Unit = {
if (args.length < 1) {
System.err.println("Usage: <inpath>")
System.exit(1)
}
val inputFile = args(0)
val conf = new SparkConf().setAppName("HiveOperationTest")
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
// create table
sqlContext.sql("CREATE TABLE IF NOT EXISTS weather (date STRING, city STRING, minTem Int, maxTem Int) row format delimited fields terminated by '\t'")
sqlContext.sql(s"LOAD DATA INPATH '${inputFile}' INTO TABLE weather")
// Queries are expressed in HiveQL
sqlContext.sql("select city, avg(minTem) from weather group by city").collect().foreach(println)
// register a custom UDF
sqlContext.udf.register("class", (s: Int) => if (s <= 20) "lower" else "high")
sqlContext.sql("select city, maxTem, class(maxTem) from weather").collect().foreach(println)
sc.stop()
}
}
Example 15: RandomSampling
// Package declaration and imported dependencies
package com.burness.algorithm.preprocess
import breeze.numerics.abs
import com.burness.utils.AbstractParams
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import scopt.OptionParser
class RandomSampling(sc: SparkContext) {
case class Params(samplingRatio: Double =1.0,
inputTableName: String = null,
outputTableName: String = null)
extends AbstractParams[Params]
def parseParams(args: Array[String]): Params = {
val defaultParams = Params()
val parser = new OptionParser[Params]("RandomSampling") {
head("Random Sampling Params parse")
opt[String]("inputTableName")
.text("data input path")
.action((x, c) => c.copy(inputTableName = x))
opt[String]("outputTableName")
.text("data output path")
.action((x, c) => c.copy(outputTableName = x))
opt[Double]("samplingRatio")
.text("random sampling ratio")
.action((x, c) => c.copy(samplingRatio = x))
}
parser.parse(args, defaultParams) match {
case Some(params) =>
params
case None =>
defaultParams
}
}
def run(params: Params): Unit ={
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._
import hiveContext.sql
// sample the input table without replacement; e.g. with a ratio of 0.7, roughly 7 out of every 10 rows are kept
val result = sql(s"select * from ${params.inputTableName}").sample(withReplacement = false, params.samplingRatio)
val r = scala.util.Random
r.setSeed(System.currentTimeMillis())
val tempNum = abs(r.nextInt())
val tempName = "random_"+tempNum.toString+"_sample_table"
result.registerTempTable(tempName)
sql(s"create table ${params.outputTableName} as select * from $tempName")
}
}
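A hypothetical driver for the RandomSampling class above, assuming the class and its com.burness.utils.AbstractParams dependency are on the classpath and that the referenced Hive tables exist. The object name and example argument values are illustrative.
import org.apache.spark.{SparkConf, SparkContext}

object RandomSamplingDriver {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RandomSamplingDriver"))
    val sampler = new RandomSampling(sc)
    // e.g. --inputTableName src_table --outputTableName sampled_table --samplingRatio 0.7
    val params = sampler.parseParams(args)
    sampler.run(params)
    sc.stop()
  }
}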