当前位置: 首页>>代码示例>>Scala>>正文


Scala Row类代码示例

本文整理汇总了Scala中org.apache.spark.sql.Row的典型用法代码示例。如果您正苦于以下问题:Scala Row类的具体用法?Scala Row怎么用?Scala Row使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了Row类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: SimpleApp

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

import org.apache.spark.ml.clustering.LDA
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StructField, StructType}


object SimpleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Simple Application").set("spark.ui.enabled", "false")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Loads data
    val rowRDD = sc.textFile("/tmp/lda_data.txt").filter(_.nonEmpty)
      .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
    val schema = StructType(Array(StructField("name", new VectorUDT, false)))
    val dataset = sqlContext.createDataFrame(rowRDD, schema)
    dataset.show()

    val lda = new LDA()
      .setK(10)
      .setMaxIter(10)
      .setFeaturesCol("name")
    val model = lda.fit(dataset)
    val transformed = model.transform(dataset)

    val ll = model.logLikelihood(dataset)
    val lp = model.logPerplexity(dataset)

    // describeTopics
    val topics = model.describeTopics(3)

    // Shows the result
    topics.show(false)
    transformed.show(false)
  }
} 
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:41,代码来源:SimpleApp.scala

示例2: LRCV

//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater

import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{ StringIndexerModel, VectorAssembler }
import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }

class LRCV(sc: SparkContext) {

  implicit val sqlContext = new SQLContext(sc)

  val lr = new LogisticRegression().setMaxIter(10).setFeaturesCol("scaledFeatures")

  val paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.1, 0.01))
    .build()

  val assembler = new VectorAssembler()
    .setInputCols(Array("gender", "age", "weight", "height", "indexedJob"))
    .setOutputCol("features")

  val pipeline = new Pipeline()
    .setStages(Array(assembler, standardScaler("features"), lr))

  val cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(10)

  def train(df: DataFrame): (StringIndexerModel, CrossValidatorModel, Matrix) = {

    // need to index strings on all data to not missing the job fields.
    // other alternative can be manually assign values for each job like gender.
    val indexerModel = stringIndexer("job").fit(df)
    val indexed = indexerModel.transform(df)

    val splits = indexed.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1)

    val cvModel = cv.fit(training)

    val predictionAndLabels = cvModel
      .transform(test)
      .select("label", "prediction").map {
        case Row(label: Double, prediction: Double) ?
          (prediction, label)
      }

    printBinaryMetrics(predictionAndLabels)

    (indexerModel, cvModel, confusionMatrix(predictionAndLabels))

  }

} 
开发者ID:ferhtaydn,项目名称:canceRater,代码行数:62,代码来源:LRCV.scala

示例3: RealEstateData

//设置package包名称以及导入依赖的类
package fr.grislain

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.{ SQLContext, DataFrame, Row }
import org.apache.spark.sql.types._

object RealEstateData {
  println("Starting real_estate_price")
  val conf = new SparkConf().setAppName("real_estate_price").setMaster("local")
  val context = new SparkContext(conf)
  val sqlContext = new SQLContext(context)

  def dataFrame: DataFrame = {
    val input = context.textFile("../data/insee_notaires.csv")
    sqlContext.createDataFrame(input mapPartitions { _.drop(1) } map {
      line =>
        Row.fromSeq(line.split(",").view.zipWithIndex filter { e => e._2 > 0 } flatMap {
          e =>
            e match {
              case (t, 1) => Seq(t.take(4).toInt, t.drop(5).toInt)
              case (p, _) => Seq(p.toDouble)
            }
        })
    },
      StructType(StructField("year", IntegerType) ::
        StructField("quarter", IntegerType) ::
        StructField("75001", DoubleType) ::
        StructField("75002", DoubleType) ::
        StructField("75003", DoubleType) ::
        StructField("75004", DoubleType) ::
        StructField("75005", DoubleType) ::
        StructField("75006", DoubleType) ::
        StructField("75007", DoubleType) ::
        StructField("75008", DoubleType) ::
        StructField("75009", DoubleType) ::
        StructField("75010", DoubleType) ::
        StructField("75011", DoubleType) ::
        StructField("75012", DoubleType) ::
        StructField("75013", DoubleType) ::
        StructField("75014", DoubleType) ::
        StructField("75015", DoubleType) ::
        StructField("75016", DoubleType) ::
        StructField("75017", DoubleType) ::
        StructField("75018", DoubleType) ::
        StructField("75019", DoubleType) ::
        StructField("75020", DoubleType) :: Nil))
  }
} 
开发者ID:ngrislain,项目名称:french_real_estate,代码行数:50,代码来源:RealEstateData.scala

示例4: StudyRDD

//设置package包名称以及导入依赖的类
package com.study.spark.datasource

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}


class StudyRDD(sqlContext: SQLContext, schema: StructType) extends RDD[Row](sqlContext.sparkContext, deps=Nil) {
  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[Row] = new StudyReader(context, schema, split)

  // ??? ?? ????? 2?? ???? ??? ????.
  // ? Executor? ???? ??? ????. ???? ???? 2? ??? ???, ??? ??? ? ?? Executor? ?? 2???.
  override protected def getPartitions: Array[Partition] = {
    val arr: Array[Partition] = new Array[Partition](2)
    arr.update(0, new Partition() {
      override def index: Int = 0
    })
    arr.update(1, new Partition() {
      override def index: Int = 1
    })
    arr
  }
} 
开发者ID:hackpupu,项目名称:LML,代码行数:27,代码来源:StudyRDD.scala

示例5: DataFrameFunctions

//设置package包名称以及导入依赖的类
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}


class DataFrameFunctions(self: DC[Row]) {

    def join(right: DC[Row]): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right)
      }
      val hashTarget = Seq("join")
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], usingColumn: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, usingColumn)
      }
      val hashTarget = Seq("join", usingColumn)
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

    def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, joinExprs)
      }
      val hashTarget = Seq("join", joinType, joinExprs.toString())
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }


} 
开发者ID:bloomberg,项目名称:spark-flow,代码行数:36,代码来源:DataFrameFunctions.scala

示例6: RatePredictor

//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater

import akka.actor.ActorSystem
import com.ferhtaydn.models.PatientInfo
import org.apache.spark.ml.feature.StringIndexerModel
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.mllib.linalg.{ Matrix, Vector }
import org.apache.spark.sql.{ Row, SQLContext }

import scala.concurrent.{ ExecutionContextExecutor, Future }

class RatePredictor(system: ActorSystem, sqlContext: SQLContext,
    indexModel: StringIndexerModel, cvModel: CrossValidatorModel,
    confusionMatrix: String) {

  private val decimalFormatter = new java.text.DecimalFormat("##.##")
  private val blockingDispatcher: ExecutionContextExecutor = system.dispatchers.lookup("ml.predictor.dispatcher")

  def confusionMatrixString: Future[String] = {
    Future {
      confusionMatrix
    }(blockingDispatcher)
  }

  def predict(patientInfo: PatientInfo): Future[Either[String, Double]] = {

    Future {

      val df = sqlContext.createDataFrame(Seq(patientInfo.toRecord))
      val indexedJobDF = indexModel.transform(df)

      val result = cvModel
        .transform(indexedJobDF)
        .select("prediction", "probability").map {
          case Row(prediction: Double, probability: Vector) ?
            (probability, prediction)
        }

      result.collect().headOption match {
        case Some((prob, _)) ? Right(decimalFormatter.format(prob(1)).toDouble)
        case None            ? Left(s"No result can be predicted for the patient")
      }

    }(blockingDispatcher)
  }

} 
开发者ID:ferhtaydn,项目名称:canceRater,代码行数:48,代码来源:RatePredictor.scala

示例7: TikaMetadataRelation

//设置package包名称以及导入依赖的类
package com.jasonfeist.spark.tika

import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StructType}
import org.slf4j.LoggerFactory


class TikaMetadataRelation protected[tika] (path: String,
                                            userSchema: StructType,
                                            metadataExtractor: MetadataExtractor,
                                            fieldDataExtractor: FieldDataExtractor)
                          (@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with Serializable {

  val logger = LoggerFactory.getLogger(classOf[TikaMetadataRelation])

  override def schema: StructType = this.userSchema

  override def buildScan(): RDD[Row] = {

    val rdd = sqlContext
      .sparkContext.binaryFiles(path)
    rdd.map(extractFunc(_))
  }

  def extractFunc(
                    file: (String, PortableDataStream)
                  ) : Row  =
  {
    val extractedData = metadataExtractor.extract(file)
    val rowArray = new Array[Any](schema.fields.length)
    var index = 0
    while (index < schema.fields.length) {
      val field = schema(index)
      val fieldData = fieldDataExtractor.matchedField(field.name,
        field.dataType, extractedData._1, file._1, extractedData._2,
        extractedData._3)
      rowArray(index) = fieldData
      index = index + 1
    }
    Row.fromSeq(rowArray)
  }
} 
开发者ID:jasonfeist,项目名称:tika-spark-datasource,代码行数:47,代码来源:TikaMetadataRelation.scala

示例8: GenerateScalingData

//设置package包名称以及导入依赖的类
package com.highperformancespark.examples.tools

import com.highperformancespark.examples.dataframe.RawPanda

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.linalg.Vector

object GenerateScalingData {
  
  def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int):
      RDD[RawPanda] = {
    val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000,  size = rows)
      .map(_.toInt.toString)
    val valuesRDD = RandomRDDs.normalVectorRDD(
      sc, numRows = rows, numCols = numCols)
    zipRDD.zip(valuesRDD).map{case (z, v) =>
      RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)
    }
  }
  // end::MAGIC_PANDA[]
} 
开发者ID:gourimahapatra,项目名称:high-performance-spark,代码行数:25,代码来源:GenerateScalingData.scala

示例9: StudyRelation

//设置package包名称以及导入依赖的类
package com.study.spark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}


class StudyRelation(parameters: Map[String, String])(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType = {
    // ??? ?? ?????, ?? ??? ???? ????. ???? ?????? ???? ??????, ???? ?? ??? ????
    val fields: Array[StructField] = new Array[StructField](3)
    fields.update(0, new StructField("field1", StringType))
    fields.update(1, new StructField("field2", StringType))
    fields.update(2, new StructField("field2", StringType))
    new StructType(fields.asInstanceOf[Array[StructField]])
  }

  // RDD[Row]? ???? StudyRDD? ???.
  override def buildScan(): RDD[Row] = new StudyRDD(sqlContext, schema)
} 
开发者ID:hackpupu,项目名称:LML,代码行数:24,代码来源:StudyRelation.scala

示例10: StudyReader

//设置package包名称以及导入依赖的类
package com.study.spark.datasource

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType


class StudyReader(context: TaskContext, schema: StructType, split: Partition) extends Iterator[Row] {
  private[this] var counter: Int = 0

  // Task? ???? ???? close? ????? ??.
  if(context != null) {
    context.addTaskCompletionListener(context => close())
  }

  // 100?? Row? ??? ??
  override def hasNext: Boolean = {
    if(counter < 100) {
      true
    } else {
      false
    }
  }

  // 1?? Row? ????.
  override def next(): Row = {
    if(!hasNext) {
      throw new NoSuchElementException("End of stream")
    }
    counter += 1
    Row(split.index + " field1 " + counter, "field2 " + counter, "field3: " + counter)
  }

  // close?? ? ??? ??? ??? close??.
  def close() = println("closed")
} 
开发者ID:hackpupu,项目名称:LML,代码行数:37,代码来源:StudyReader.scala

示例11: StoreFormat

//设置package包名称以及导入依赖的类
package com.sasaki.utils

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.rdd.RDD

import com.sasaki.discretization._

object StoreFormat {

	def rdd2DF(rdd : RDD[Row], sqlContext : SQLContext) = {
    		val schema = StructType(
    			StructField("role", StringType, nullable = false) ::
    			StructField("mark", StringType, nullable = false) ::
			StructField("seqs", ArrayType(StringType), nullable = false) :: 
			Nil)
		sqlContext.createDataFrame(rdd, schema)		
	}

	def saveAsJSON(rdd : RDD[Row], 
			path : String, sqlContext : SQLContext) = {
		val df = rdd2DF(rdd, sqlContext)
		val saveOptions = Map("header" -> "false", "path" -> path)
    		df.write.format("json").mode(SaveMode.Ignore).options(saveOptions).save
	}
	
} 
开发者ID:sasakigao,项目名称:log-discrete,代码行数:28,代码来源:StoreFormat.scala

示例12: sample

//设置package包名称以及导入依赖的类
package com.rishabh.spark.datasource.s3


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}


case class sample(id: Integer)

class S3Relation(accesskey: String, secretkey: String, fileType: String, bucket: String, path:
String, write: Boolean)
                (@transient
                 val sqlContext: SQLContext) extends BaseRelation with TableScan {

  import sqlContext.implicits._

  val dummyData = Seq(sample(1))
  var df = sqlContext.sparkContext.parallelize(dummyData, 4).toDF()
  val s3Path = "s3a://" + bucket + path

  val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
  hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
  hadoopConf.set("fs.s3a.access.key", accesskey)
  hadoopConf.set("fs.s3a.secret.key", secretkey)

  override def schema: StructType = {
    fileType match {
      case "json" =>
        df = sqlContext.read.json(s3Path)
      case "csv" =>
        df = sqlContext.read.format("com.databricks.spark.csv").load(s3Path)
      case "parquet" =>
        df = sqlContext.read.parquet(s3Path)
    }
    df.schema
  }

  override def buildScan(): RDD[Row] = {
    df.rdd
  }
} 
开发者ID:rishabhbhardwaj,项目名称:spark-datasource-s3,代码行数:44,代码来源:S3Relation.scala

示例13: ScorePredictor

//设置package包名称以及导入依赖的类
package org.wikimedia.research.recommendation.job.translation

import java.io.File

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}

import scala.collection.parallel.mutable.ParArray

object ScorePredictor {
  val log: Logger = LogManager.getLogger(ScorePredictor.getClass)

  def predictScores(spark: SparkSession,
                    modelsInputDir: File,
                    predictionsOutputDir: Option[File],
                    sites: ParArray[String],
                    featureData: DataFrame): Unit = {
    log.info("Scoring items")

    val predictions: Array[DataFrame] = sites.map(target => {
      try {
        log.info("Scoring for " + target)
        log.info("Getting work data for " + target)
        val workData: DataFrame = Utils.getWorkData(spark, featureData, target, exists = false)
        log.info("Loading model for " + target)
        val model = RandomForestRegressionModel.load(
          new File(modelsInputDir, target).getAbsolutePath)
        log.info("Scoring data for " + target)
        val predictions = model
          .setPredictionCol(target)
          .transform(workData)
          .select("id", target)

        predictions
      } catch {
        case unknown: Throwable =>
          log.error("Score for " + target + " failed", unknown)
          val schema = StructType(Seq(
            StructField("id", StringType, nullable = false),
            StructField(target, DoubleType, nullable = true)))
          spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
      }
    }).toArray

    val predictedScores = predictions.reduce((left, right) => left.join(right, Seq("id"), "outer"))

    log.info("Saving predictions")
    predictionsOutputDir.foreach(f = o =>
      predictedScores.coalesce(1)
        .write
        .mode(SaveMode.ErrorIfExists)
        .option("header", value = true)
        .option("compression", "bzip2")
        .csv(new File(o, "allPredictions").getAbsolutePath))
  }
} 
开发者ID:schana,项目名称:recommendation-translation,代码行数:59,代码来源:ScorePredictor.scala

示例14: prepareData

//设置package包名称以及导入依赖的类
package net.koseburak.recommendation.util

import net.koseburak.recommendation.constant.Field.PlaylistField
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

trait DataUtils extends Serializable{
  def prepareData(path: String)(implicit spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.read.text(path)
      .map(parseRow)
      .filter(_.length > 1)
      .map(Tuple1.apply)
      .toDF(PlaylistField)
      .cache
  }

  private def parseRow(row: Row) = row.mkString.split(" ").toList
}

object DataUtils extends DataUtils 
开发者ID:burakkose,项目名称:word2vec-playlist-generation,代码行数:21,代码来源:DataUtils.scala

示例15: ManyToManyNormalJoin

//设置package包名称以及导入依赖的类
package com.malaska.spark.training.manytomany

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable

object ManyToManyNormalJoin {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .getOrCreate()

    val jsonDf = sparkSession.read.json(jsonPath)

    val nGramWordCount = jsonDf.rdd.flatMap(r => {
      val actions = r.getAs[mutable.WrappedArray[Row]]("actions")

      val resultList = new mutable.MutableList[((Long, Long), Int)]

      actions.foreach(a => {
        val aValue = a.getAs[Long]("action")
        actions.foreach(b => {
          val bValue = b.getAs[Long]("action")
          if (aValue < bValue) {
            resultList.+=(((aValue, bValue), 1))
          }
        })
      })
      resultList.toSeq
    }).reduceByKey(_ + _)

    nGramWordCount.collect().foreach(r => {
      println(" - " + r)
    })
  }
} 
开发者ID:TedBear42,项目名称:spark_training,代码行数:46,代码来源:ManyToManyNormalJoin.scala


注:本文中的org.apache.spark.sql.Row类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。