
Scala HiveContext Class Code Examples

This article collects typical usage examples of the org.apache.spark.sql.hive.HiveContext class in Scala. If you have been wondering what the Scala HiveContext class does, how to use it, or what real-world code that uses it looks like, the curated examples here should help.

A total of 15 code examples of the HiveContext class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
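
Before the individual examples, here is the pattern that nearly all of them share: build a SparkConf and SparkContext, wrap the context in a HiveContext, issue HiveQL through sql(), and stop the context when done. The sketch below is illustrative only; the object name and query are placeholders, and it assumes a Spark 1.x build with the spark-hive module on the classpath.

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch of the shared pattern (Spark 1.x API).
object HiveContextMinimalSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HiveContextMinimalSketch")
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    // Any HiveQL statement can go through sql(); SHOW TABLES is a harmless placeholder.
    hiveContext.sql("SHOW TABLES").collect().foreach(println)

    sc.stop()
  }
}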

Example 1: KuduAccountMartSimpleSums

// Set the package name and import the required classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook, Project: Taxi360, Lines of code: 51, Source file: KuduAccountMartSimpleSums.scala
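
A note on API versions: HiveContext and registerTempTable belong to the Spark 1.x API, which all examples on this page target. For reference only, a rough sketch of the same setup on Spark 2.x would use SparkSession instead; the table name and master address below are placeholders, the spark-hive module is assumed to be on the classpath, and a newer Kudu connector may register a different data-source name than org.kududb.spark.kudu.

import org.apache.spark.sql.SparkSession

// Sketch only: a Spark 2.x equivalent of the HiveContext setup in the example above.
val spark = SparkSession.builder()
  .appName("KuduAccountMartSimpleSums")
  .enableHiveSupport() // provides what new HiveContext(sc) did in Spark 1.x
  .getOrCreate()

val kuduOptions = Map(
  "kudu.table" -> "account_mart",       // placeholder table name
  "kudu.master" -> "kudu-master:7051")  // placeholder master address

// createOrReplaceTempView replaces the deprecated registerTempTable.
spark.read.options(kuduOptions).format("org.kududb.spark.kudu").load()
  .createOrReplaceTempView("account_mart_tmp")

spark.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id")
  .show(100)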

Example 2: KuduAppEventSimpleSums

// Set the package name and import the required classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object KuduAppEventSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAppEventTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAppEventTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAppEventTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("app_event_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook, Project: Taxi360, Lines of code: 52, Source file: KuduAppEventSimpleSums.scala

Example 3: KuduToHDFS

// Set the package name and import the required classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToHDFS {
  def main(args: Array[String]): Unit = {

    if (args.length < 4) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduTaxiTripTableName> " +
        "<hdfsTaxiTripTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduTaxiTripTableName = args(2)
    val hdfsTaxiTripTableName = args(3)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduTaxiTripTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("kuduTaxiTripTableName")

    hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " +
      " AS SELECT * FROM kuduTaxiTripTableName " +
      " STORED AS PARQUET")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook, Project: Taxi360, Lines of code: 50, Source file: KuduToHDFS.scala

Example 4: KuduAppEventSimpleSums

// Set the package name and import the required classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object KuduAppEventSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAppEventTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAppEventTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAppEventTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("app_event_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: tmalaska, Project: AppTrans, Lines of code: 52, Source file: KuduAppEventSimpleSums.scala

Example 5: KuduToHDFS

// Set the package name and import the required classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToHDFS {
  def main(args: Array[String]): Unit = {

    if (args.length < 6) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> " +
        "<hdfsAccountMartTableName> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)
    val hdfsAccountMartTableName = args(3)
    val numOfCenters = args(4).toInt
    val numOfIterations = args(5).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")

    hiveContext.sql("CREATE TABLE " + hdfsAccountMartTableName + " AS SELECT * FROM account_mart_tmp")

    sc.stop()
  }
} 
Developer: tmalaska, Project: AppTrans, Lines of code: 50, Source file: KuduToHDFS.scala

Example 6: KuduAccountMartSimpleSums

// Set the package name and import the required classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: tmalaska, Project: AppTrans, Lines of code: 51, Source file: KuduAccountMartSimpleSums.scala

Example 7: Main

// Set the package name and import the required classes
package com.microsoft.netalyzer.loader

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  val settings = new Settings()
  val conf = new SparkConf()
  val sc = new SparkContext(conf)
  val sqlContext = new HiveContext(sc)

  def main(args: Array[String]): Unit = {
    sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
    sqlContext.setConf("spark.sql.shuffle.partitions", "200")

    Utils.initializeDb(settings.cookedData, sqlContext)
    Utils.importCsvData(settings.rawData, sqlContext)
  }
} 
Developer: bitvector2, Project: netalyzer-loader, Lines of code: 20, Source file: Main.scala

Example 8: Average

// Set the package name and import the required classes
package nl.techdays.bigdataprocessing.demo03

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.sql.hive.HiveContext

case class Average(dimension: String, average: Double)

object Program {
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName("adl-sample-app")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    import sqlContext.implicits._

    val measurements = sqlContext.sql("SELECT * FROM measurements")

    measurements
      .map(x => (x.getAs[String]("dimension"), x.getAs[Double]("value")))
      // Note: pairwise reduceByKey like this yields an order-dependent running average,
      // not the true arithmetic mean of all values for a dimension.
      .reduceByKey((left, right) => (left + right) / 2)
      .map { case (dimension, average) => Average(dimension, average) }
      .toDF()
      .write.mode(SaveMode.Append).saveAsTable("averages")
  }
} 
Developer: wmeints, Project: techdays2016, Lines of code: 27, Source file: Program.scala

Example 9: LiveConverter

// Set the package name and import the required classes
package com.paypal.risk.madmen20

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object LiveConverter {

  def StringToDouble(s:String): Any = {
    try {
      return s.toDouble
    } catch {
      case ex: NumberFormatException => null
      case ex: NullPointerException =>  null
    }
  }

  def convertNvar(m: Map[String, Any]): Map[String, java.lang.Double] = {
    m.map {
      case (k, v: java.lang.Double) => (k,v)
      case (k, v: String) if v.isEmpty || v.toLowerCase == "null" => (k, null)
//      case (k, v: String) if v.isEmpty || v.toLowerCase == "null" || v.toUpperCase == "NO_DATA" => (k, null)
//      case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, "123321.0".toDouble)
//      case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, null)
      case (k, v: String) => (k, StringToDouble(v))
      case (k, null) => (k, null)
    }.asInstanceOf[Map[String, java.lang.Double]]
  }

  def main (args: Array[String]) {
    if(args.length != 1) {
      throw new RuntimeException("Expect 1 argument: <month:yyyy-MM|day:yyyy-MM-dd>")
    }

    var month = args.apply(0)
    month = month.replace("/","-")
    val sc = new SparkContext(new SparkConf().setAppName("Live Converter").setMaster("yarn-client"))
    @transient val hc = new HiveContext(sc)
    hc.udf.register("convertNvar", convertNvar _)

    val df_orig = hc.read.parquet("/apps/risk/det/madmen20/archive/source=live_with_nvar_type_issue/date=" + month + "*")
    val df_new = df_orig.selectExpr("mid", "source", "date", "meta", "convertNvar(nvar) as nvar", "cvar")
//    df_new.write.partitionBy("date").format("parquet").mode("error")
//      .save("/apps/risk/det/madmen20/bre/source=live_convert_" + month)
    df_new.write.format("parquet").mode(saveMode = "overwrite")
      .save("/apps/risk/det/madmen20/bre/source=live/" + month)

  }



} 
Developer: yanlzhang8936, Project: madmen20, Lines of code: 53, Source file: LiveConverter.scala

Example 10: Example

// Set the package name and import the required classes
package com.paypal.risk.madmen20.example

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object Example {
  def splitArray (args: Array[String]) {
    args.map((arg: String) => println("One arg is: " + arg + "!"))
  }

  def main(args: Array[String]) {
    implicit val sc = new SparkContext(new SparkConf().setAppName("Example").setMaster("yarn-client"))
    implicit val hc = new HiveContext(sc)
//    implicit val sqlc = new SQLContext(sc)
    val df = hc.parquetFile("/apps/risk/det/madmen20/bre/source=live")
    df.printSchema()
    df.take(1).foreach(println)
    splitArray(args)

  }
} 
Developer: yanlzhang8936, Project: madmen20, Lines of code: 24, Source file: Example.scala

Example 11: SparkTermCandidatesWeighter

// Set the package name and import the required classes
package ru.ispras.atr.rank

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig


abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
  val termDFName = "Term"

  def allFeatures: Seq[FeatureConfig]

  def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
    val resByFeatures: Seq[Seq[Double]] = allFeatures.map(f => {
      //iterate by features first, because it lets to estimate time per feature and (maybe) it is faster due to caching
      log.info(s"Initializing feature ${f.id}...")
      val featureComputer = f.build(candidates, dataset)
      log.info(s"Computing feature ${f.id}...")
      featureComputer.compute(candidates)
    })
    log.info(s"${allFeatures.size} features have been computed")
    resByFeatures.transpose
    }

  def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
    val header = StructField(termDFName, StringType) +: featureNames.map(f => StructField(f, DoubleType))
    val schema = StructType(header)
    val rows = termNames.zip(resByTerms).map(a => Row.fromSeq(a._1 +: a._2))
    val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
    val df = SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
    df
  }

  def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
    val featureValues = convert2FeatureSpace(candidates, dataset)
    val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues)
    val weightedDF = weight(initDF)
    val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id))
    val weightColId: String = id //for serialization
    val termColId: String = termDFName
    val terms = termNamesDF.rdd.map(r => (r.getAs[String](termColId), r.getAs[Double](weightColId))).collect()
    terms
  }

  def weight(df: DataFrame) : DataFrame
}

object SparkConfigs {
  val sparkConf = new SparkConf()
    .setAppName("ATR Evaluation System")
    .setMaster("local[16]")
    .set("spark.driver.memory", "1g")
  val sc = new SparkContext(sparkConf)
  val sqlc = new HiveContext(sc)
} 
Developer: ispras, Project: atr4s, Lines of code: 61, Source file: SparkTermCandidatesWeighter.scala

Example 12: NamedContext

// Set the package name and import the required classes
package io.hydrosphere.mist.worker

import java.io.File

import io.hydrosphere.mist.api.{CentralLoggingConf, RuntimeJobInfo, SetupConfiguration}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Duration
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

class NamedContext(
  val sparkContext: SparkContext,
  val namespace: String,
  streamingDuration: Duration = Duration(40 * 1000),
  loggingConf: Option[CentralLoggingConf] = None
) {

  private val jars = mutable.Buffer.empty[String]

  def addJar(jarPath: String): Unit = {
    val jarAbsolutePath = new File(jarPath).getAbsolutePath
    if (!jars.contains(jarAbsolutePath)) {
      sparkContext.addJar(jarPath)
      jars += jarAbsolutePath
    }
  }

  def setupConfiguration(jobId: String): SetupConfiguration = {
    SetupConfiguration(
      context = sparkContext,
      streamingDuration = streamingDuration,
      info = RuntimeJobInfo(jobId, namespace),
      loggingConf = loggingConf
    )
  }

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sparkContext.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sparkContext)

  // python support
  def sqlContext: SQLContext = new SQLContext(sparkContext)

  // python support
  def hiveContext: HiveContext = new HiveContext(sparkContext)

  def stop(): Unit = {
    sparkContext.stop()
  }

} 
Developer: Hydrospheredata, Project: mist, Lines of code: 58, Source file: NamedContext.scala
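
A short usage sketch for the class above; the master URL, namespace, jar path, and query are placeholders, and it assumes NamedContext is on the classpath. Note that the hiveContext accessor builds a fresh HiveContext over the shared SparkContext on every call.

import io.hydrosphere.mist.worker.NamedContext
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical driver code exercising NamedContext.
val sc = new SparkContext(new SparkConf().setAppName("mist-worker-sketch").setMaster("local[2]"))
val named = new NamedContext(sc, "example-namespace")

val hive = named.hiveContext               // new HiveContext wrapping the shared SparkContext
hive.sql("SHOW TABLES").collect().foreach(println)

named.addJar("/tmp/extra-udfs.jar")        // placeholder path; deduplicated by absolute path
named.stop()                               // stops the underlying SparkContext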

Example 13: HiveContextFactory

// Set the package name and import the required classes
package spark.jobserver.context

import com.typesafe.config.Config
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import spark.jobserver.{api, ContextLike, SparkHiveJob}
import spark.jobserver.util.SparkJobUtils

class HiveContextFactory extends ScalaContextFactory {
  type C = HiveContext with ContextLike

  def isValidJob(job: api.SparkJobBase): Boolean = job.isInstanceOf[SparkHiveJob]

  def makeContext(sparkConf: SparkConf, config: Config,  contextName: String): C = {
    contextFactory(sparkConf)
  }

  protected def contextFactory(conf: SparkConf): C = {
    new HiveContext(new SparkContext(conf)) with HiveContextLike
  }
}

private[jobserver] trait HiveContextLike extends ContextLike {
  def stop() { this.sparkContext.stop() }
} 
Developer: TruenoDB, Project: trueno-compute-server, Lines of code: 26, Source file: HiveContextFactory.scala

Example 14: HiveOperationTest

// Set the package name and import the required classes
package cn.com.warlock.sql

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object HiveOperationTest {
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: <inpath>")
      System.exit(1)
    }

    val inputFile = args(0)

    val conf = new SparkConf().setAppName("HiveOperationTest")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    // create table
    sqlContext.sql("CREATE TABLE IF NOT EXISTS weather (date STRING, city STRING, minTem Int, maxTem Int) row format delimited fields terminated by '\t'")
    sqlContext.sql(s"LOAD DATA INPATH '${inputFile}' INTO TABLE weather")

    // Queries are expressed in HiveQL
    sqlContext.sql("select city, avg(minTem) from weather group by city").collect().foreach(println)

    // register a UDF
    sqlContext.udf.register("class", (s: Int) => if (s <= 20) "lower" else "high")

    sqlContext.sql("select city, maxTem, class(maxTem) from weather").collect().foreach(println)

    sc.stop()
  }
} 
Developer: warlock-china, Project: spark-meepo, Lines of code: 34, Source file: HiveOperationTest.scala

Example 15: RandomSampling

// Set the package name and import the required classes
package com.burness.algorithm.preprocess

import breeze.numerics.abs
import com.burness.utils.AbstractParams
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import scopt.OptionParser


class RandomSampling(sc: SparkContext) {



  case class Params(samplingRatio: Double = 1.0,
                    inputTableName: String = null,
                    outputTableName: String = null)
    extends AbstractParams[Params]


  def parseParams(args: Array[String]): Params = {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("RandomSampling") {
      head("Random Sampling Params parse")
      opt[String]("inputTableName")
        .text("data input path")
        .action((x, c) => c.copy(inputTableName = x))
      opt[String]("outputTableName")
        .text("data output path")
        .action((x, c) => c.copy(outputTableName = x))
      opt[Double]("samplingRatio")
        .text("random sampling ratio")
        .action((x, c) => c.copy(samplingRatio = x))
    }

      parser.parse(args, defaultParams) match {
        case Some(params) =>
          params
        case None =>
          defaultParams
      }

    }
  def run(params: Params): Unit ={
    val hiveContext = new HiveContext(sc)
    import hiveContext.implicits._
    import hiveContext.sql
    // Sample the input table without replacement at the configured sampling ratio
    val result = sql(s"select * from ${params.inputTableName}").sample(withReplacement = false, params.samplingRatio)
    val r = scala.util.Random
    r.setSeed(System.currentTimeMillis())
    val tempNum = abs(r.nextInt())
    val tempName = "random_"+tempNum.toString+"_sample_table"
    result.registerTempTable(tempName)
    sql(s"create table ${params.outputTableName} as select * from $tempName")

  }


} 
Developer: spark-mler, Project: algorithmEngine, Lines of code: 60, Source file: RandomSampling.scala
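
A brief usage sketch for the class above; the app name, table names, and sampling ratio are placeholders. parseParams consumes scopt-style --key value arguments, and run executes the sampling and writes the result table through Hive.

import com.burness.algorithm.preprocess.RandomSampling
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical driver code exercising RandomSampling.
val sc = new SparkContext(new SparkConf().setAppName("random-sampling-sketch"))
val sampler = new RandomSampling(sc)

// scopt maps each opt[...]("name") definition to a --name <value> command-line flag.
val params = sampler.parseParams(Array(
  "--inputTableName", "events_raw",        // placeholder Hive table
  "--outputTableName", "events_sampled",   // placeholder Hive table
  "--samplingRatio", "0.1"))

sampler.run(params)
sc.stop()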


Note: The org.apache.spark.sql.hive.HiveContext class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors. Refer to each project's license before distributing or reusing the code; do not republish without permission.