This article collects typical usage examples of the Scala class org.apache.spark.mllib.linalg.Vector. If you are unsure what the Vector class does or how to use it, the curated class examples below may help.
Fifteen code examples of the Vector class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code samples.
Example 1: Predict
// Set up the package name and import the required classes
package com.databricks.apps.twitterClassifier
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Predict extends App {
import SparkSetup._
val options = PredictOptions.parse(args)
val ssc = new StreamingContext(sc, Seconds(options.intervalInSecs))
Predictor.doIt(options, sc, ssc)
}
object Predictor {
def doIt(options: PredictOptions, sc: SparkContext, ssc: StreamingContext) {
println("Initializing the the KMeans model...")
val model: KMeansModel = new KMeansModel(sc.objectFile[Vector](options.modelDirectory.getCanonicalPath).collect)
println("Materializing Twitter stream...")
TwitterUtils.createStream(ssc, maybeTwitterAuth)
.map(_.getText)
.foreachRDD { rdd =>
rdd.filter(t => model.predict(featurize(t)) == options.clusterNumber)
.foreach(print) // register DStream as an output stream and materialize it
}
println("Initialization complete, starting streaming computation.")
ssc.start()
ssc.awaitTermination()
}
}
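Predict reloads cluster centers that a separate training job has written with saveAsObjectFile. Below is a minimal training sketch; the TrainSketch object, its parameters, and the reuse of featurize from Example 6 are assumptions for illustration, not part of the original project.
package com.databricks.apps.twitterClassifier
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
// Hypothetical training driver: featurize raw tweet text, fit KMeans, and persist the
// cluster centers so Predict can reload them via sc.objectFile[Vector].
object TrainSketch {
  def fitAndSave(sc: SparkContext, tweets: RDD[String], numClusters: Int, numIterations: Int, modelDir: String): Unit = {
    val vectors: RDD[Vector] = tweets.map(featurize).cache() // featurize comes from the twitterClassifier package object (Example 6)
    val model = KMeans.train(vectors, numClusters, numIterations)
    sc.parallelize(model.clusterCenters).saveAsObjectFile(modelDir) // matches sc.objectFile[Vector] in Predict
  }
}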
Example 2: PCAClustering
// Set up the package name and import the required classes
package graph
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable
class PCAClustering {
def matrixToRDD(sc:SparkContext, m: Matrix): RDD[Vector] = {
val columns = m.toArray.grouped(m.numRows)
val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
val vectors = rows.map(row => new DenseVector(row.toArray))
sc.parallelize(vectors)
}
def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int,sc:SparkContext ): Graph[Int, Any] = {
val numNode = inputGraph.numVertices.toInt
val mapping = new mutable.HashMap[Long,Int]()
val revMapping = new mutable.HashMap[Int, Long]()
val verticeIds = inputGraph.vertices.map( u => u._1 ).collect()
for(i<-0 to numNode - 1) {
mapping.put(verticeIds.apply(i), i)
revMapping.put(i, verticeIds.apply(i))
}
// reindex the vertices from 0 to numNode - 1
val nVertices = inputGraph.vertices.map( u=> (mapping.apply(u._1).toLong, u._2))
val nEdges = inputGraph.edges.map(e=> Edge(mapping.apply(e.srcId).toLong, mapping.apply(e.dstId).toLong, e.attr))
val ngraph = Graph(nVertices, nEdges)
val output = ngraph.collectNeighborIds(EdgeDirection.Out)
val spvec = output.mapValues(r => Vectors.sparse( numNode, r.map(e=>e.toInt) , r.map(e=> 1.0/r.length )))
val rows = spvec.map(v=>v._2)
val order = spvec.map(v=>v._1)
val mat = new RowMatrix(rows)
val pc = mat.computePrincipalComponents(eigsNum)
val pcRDD = matrixToRDD(sc, pc)
val clusters = KMeans.train(pcRDD, clusterNum, 100)
val clusterArray = pcRDD.map(p=> clusters.predict(p) ).collect()
val assignedClusters = order.map( o => (o, clusterArray.apply(o.toInt)))
val origVerextRDD = assignedClusters.map{case (vid, value)=> (revMapping.apply(vid.toInt), value)}
Graph(origVerextRDD, inputGraph.edges)
}
}
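A hypothetical driver for this class (the demo object, the two-triangle graph, and the parameter values are assumptions) might look like the following sketch:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import graph.PCAClustering
// Cluster the vertices of a tiny graph made of two disjoint triangles into two groups,
// using the top 2 principal components of the normalized adjacency structure.
object PCAClusteringDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PCAClusteringDemo").setMaster("local[2]"))
    val vertices: RDD[(Long, Any)] = sc.parallelize((1L to 6L).map(id => (id, 1: Any)))
    val edges: RDD[Edge[Any]] = sc.parallelize(Seq(
      Edge(1L, 2L, 1: Any), Edge(2L, 3L, 1: Any), Edge(3L, 1L, 1: Any),
      Edge(4L, 5L, 1: Any), Edge(5L, 6L, 1: Any), Edge(6L, 4L, 1: Any)))
    val graph: Graph[Any, Any] = Graph(vertices, edges)
    val clustered = new PCAClustering().run(graph, 2, 2, sc) // clusterNum = 2, eigsNum = 2
    clustered.vertices.collect().foreach(println) // (originalVertexId, clusterId) pairs
    sc.stop()
  }
}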
Example 3: RatePredictor
// Set up the package name and import the required classes
package com.ferhtaydn.rater
import akka.actor.ActorSystem
import com.ferhtaydn.models.PatientInfo
import org.apache.spark.ml.feature.StringIndexerModel
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.mllib.linalg.{ Matrix, Vector }
import org.apache.spark.sql.{ Row, SQLContext }
import scala.concurrent.{ ExecutionContextExecutor, Future }
class RatePredictor(system: ActorSystem, sqlContext: SQLContext,
indexModel: StringIndexerModel, cvModel: CrossValidatorModel,
confusionMatrix: String) {
private val decimalFormatter = new java.text.DecimalFormat("##.##")
private val blockingDispatcher: ExecutionContextExecutor = system.dispatchers.lookup("ml.predictor.dispatcher")
def confusionMatrixString: Future[String] = {
Future {
confusionMatrix
}(blockingDispatcher)
}
def predict(patientInfo: PatientInfo): Future[Either[String, Double]] = {
Future {
val df = sqlContext.createDataFrame(Seq(patientInfo.toRecord))
val indexedJobDF = indexModel.transform(df)
val result = cvModel
.transform(indexedJobDF)
.select("prediction", "probability").map {
case Row(prediction: Double, probability: Vector) =>
(probability, prediction)
}
result.collect().headOption match {
case Some((prob, _)) => Right(decimalFormatter.format(prob(1)).toDouble)
case None => Left(s"No result can be predicted for the patient")
}
}(blockingDispatcher)
}
}
Example 4: SimpleApp
// Set up the package name and import the required classes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.{Vector, Vectors}
object SimpleApp {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Simple Application")
val sc = new SparkContext(conf)
val data = Array(1,2,3)
val distData = sc.parallelize(data)
val vectorData = distData.map(x => Vectors.dense(x))
val summary = Statistics.colStats(vectorData)
println("mean is: %s".format(summary.mean))
println("max is: %s".format(summary.max))
println("min is: %s".format(summary.min))
//find correlation
// student, exam1, exam2, exam3
val examScores = sc.parallelize(Array("111, 60, 65, 73", "222, 98,95,88", "333, 56,67,62"))
val vectorRdd = examScores.map((line: String) => line.split(",").drop(1).map((ele: String) => ele.toDouble)).map(Vectors.dense)
val corrMatrix = Statistics.corr(vectorRdd)
}
}
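Statistics.corr uses Pearson correlation by default; a method name such as "spearman" can also be passed. A short follow-on sketch, to be placed inside main above and reusing vectorRdd:
// Print the default Pearson matrix and compare it with Spearman rank correlation.
val pearsonMatrix = Statistics.corr(vectorRdd)
val spearmanMatrix = Statistics.corr(vectorRdd, "spearman")
println("Pearson correlation:\n" + pearsonMatrix)
println("Spearman correlation:\n" + spearmanMatrix)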
Example 5: GenerateScalingData
// Set up the package name and import the required classes
package com.highperformancespark.examples.tools
import com.highperformancespark.examples.dataframe.RawPanda
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.linalg.Vector
object GenerateScalingData {
def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int):
RDD[RawPanda] = {
val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows)
.map(_.toInt.toString)
val valuesRDD = RandomRDDs.normalVectorRDD(
sc, numRows = rows, numCols = numCols)
zipRDD.zip(valuesRDD).map{case (z, v) =>
RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)
}
}
// end::MAGIC_PANDA[]
}
Example 6: SparkSetup
// Set up the package name and import the required classes
package com.databricks.apps
package twitterClassifier {
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
object SparkSetup {
val spark = SparkSession
.builder
.appName(getClass.getSimpleName.replace("$", ""))
.getOrCreate()
val sqlContext = spark.sqlContext
val sc: SparkContext = spark.sparkContext
// Suppress "WARN BlockManager: Block input-0-1478266015800 replicated to only 0 peer(s) instead of 1 peers" messages
sc.setLogLevel("ERROR")
}
}
package object twitterClassifier {
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.feature.HashingTF
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder
val numFeatures = 1000
val tf = new HashingTF(numFeatures)
def maybeTwitterAuth: Some[OAuthAuthorization] = Some(new OAuthAuthorization(new ConfigurationBuilder().build))
def featurize(s: String): Vector = tf.transform(s.sliding(2).toSeq)
}
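featurize splits a string into character bigrams and hashes them into a 1000-dimensional term-frequency vector. A quick spark-shell style illustration (the input string is arbitrary):
import com.databricks.apps.twitterClassifier._
// "spark" yields the bigrams "sp", "pa", "ar", "rk", each hashed to an index below numFeatures.
val v = featurize("spark")
println(v.size) // 1000
println(v)      // sparse vector with counts at the hashed bigram indices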
Example 7: ZFLSH
// Set up the package name and import the required classes
package AccurateML.lsh
import breeze.linalg.DenseMatrix
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import AccurateML.blas.ZFBLAS
class ZFLSH(
n: Int,
m: Int) {
val normal01 = breeze.stats.distributions.Gaussian(0, 1)
val nmat = DenseMatrix.rand(m, n, normal01)
def hashVector(vector: linalg.Vector): String = {
val r = new Array[Int](n)
for (i <- 0 until n) {
val mc = nmat(::, (i))
val ans = ZFBLAS.dot(vector, Vectors.dense(mc.toArray))
if (ans > 0)
r(i) = 1
}
r.mkString("")
}
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("test lsh")
val sc = new SparkContext(conf)
val numBits = 4
val numFeatures = 1000
val lsh = new ZFLSH(numBits, numFeatures)
val data: RDD[Vector] = sc.objectFile("") // e.g. the first element of data is dfirst = Vector(1.0, 2.0, ..., 1000.0)
val mapData: RDD[(String, Vector)] = data.map(vec => (lsh.hashVector(vec), vec))
// e.g. the first element of mapData is mdfirst = ("1010", Vector(1.0, 2.0, ..., 1000.0)),
// where "1010" is the sketch of dfirst; instances with the same sketch belong to the same cluster
}
}
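Once each vector is tagged with its sketch, near neighbours can be bucketed by grouping on the sketch string. The following continuation of main is a sketch that replaces the empty objectFile path with synthetic Gaussian data (the bucket counts are illustrative only):
import org.apache.spark.mllib.random.RandomRDDs
// Vectors that hash to the same 4-bit sketch land in the same bucket.
val randomData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numRows = 1000, numCols = numFeatures)
val buckets = randomData.map(vec => (lsh.hashVector(vec), vec)).groupByKey()
buckets.mapValues(_.size).collect().foreach { case (sketch, count) =>
  println(s"sketch $sketch -> $count vectors")
}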
Example 8: BMRMSuite
// Set up the package name and import the required classes
package org.apache.spark.mllib.optimization.bmrm
import org.scalatest.FunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import scala.util.Random
object BMRMSuite {
def generateSubInput(nPoint: Int, dim: Int,seed: Int):(Array[Vector], Array[Double], Vector) = {
val rnd = new Random(seed)
val label = Array.fill[Double](nPoint)(rnd.nextInt(5)+1.0)
val testData = Array.fill[Vector](nPoint)(Vectors.dense(Array.fill(dim)(rnd.nextInt(10)+1.0)))
val initWeights = Vectors.dense(Array.fill(dim)(rnd.nextInt(10)+1.0))
(testData, label, initWeights)
}
}
class BMRMSuite extends FunSuite with MLlibTestSparkContext {
test("Test the loss and gradient of first iteration") {
val subGrad = new NdcgSubGradient()
val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 100, 45)
val (gradient, loss) = subGrad.compute(testData, label, initWeights)
println(gradient)
println(loss)
}
test("Test the update of the weights of first iteration") {
val subGrad = new NdcgSubGradient()
val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 1000, 45)
val (gradient, loss) = subGrad.compute(testData, label, initWeights)
val subUpdater = new DaiFletcherUpdater()
val (newWeights, objval) = subUpdater.compute(initWeights, gradient, loss, 1.0)
println(initWeights)
println(loss)
println(newWeights)
println(objval)
}
test("Test the BMRM optimization") {
val subGrad = new NdcgSubGradient()
val subUpdater = new DaiFletcherUpdater()
val bmrm = new BMRM(subGrad, subUpdater)
val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 10, 45)
println(initWeights)
val newWeights = bmrm.optimize(testData, label, initWeights)
println(newWeights)
}
}
Example 9: FeaturesParser
// Set up the package name and import the required classes
package com.evrenio.ml
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
object FeaturesParser{
def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble))
val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble))
vectors
}
def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble))
val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
labeledPoints
}
}
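A typical call site could look like the sketch below; the demo object, the SparkContext setup, and the file paths are assumptions for illustration.
import org.apache.spark.{SparkConf, SparkContext}
import com.evrenio.ml.FeaturesParser
object FeaturesParserDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FeaturesParserDemo"))
    val featureVectors = FeaturesParser.parseFeatures(sc.textFile("data/features.csv"))        // rows like "1.2,0.5,3.4"
    val labeledPoints = FeaturesParser.parseFeaturesWithLabel(sc.textFile("data/labeled.csv")) // rows like "1.0,1.2,0.5,3.4" (label first)
    println(s"${featureVectors.count()} feature vectors, ${labeledPoints.count()} labeled points")
    sc.stop()
  }
}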
Example 10: SparkSVDExampleOne
// Set up the package name and import the required classes
package linalg.svd
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}
object SparkSVDExampleOne {
def main(args: Array[String]) {
val denseData = Seq(
Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
)
val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
val sc = new SparkContext(spConfig)
val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))
// Compute all 7 singular values and the corresponding singular vectors.
val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
val U: RowMatrix = svd.U // The U factor is a RowMatrix.
val s: Vector = svd.s // The singular values are stored in a local dense vector.
val V: Matrix = svd.V // The V factor is a local dense matrix.
println("U:" + U)
println("s:" + s)
println("V:" + V)
sc.stop()
}
}
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 30, Source file: SparkSVDExampleOne.scala
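The factors can also be used for dimensionality reduction: projecting the rows onto the first k right singular vectors gives a rank-k representation. A follow-on sketch (k is chosen arbitrarily; these lines would go inside main above, before sc.stop()):
import org.apache.spark.mllib.linalg.Matrices
// Take the first k columns of the column-major V and project the rows of mat onto them.
val k = 3
val Vk = Matrices.dense(V.numRows, k, V.toArray.take(V.numRows * k))
val reduced: RowMatrix = mat.multiply(Vk) // 4 rows x 3 columns
reduced.rows.collect().foreach(println)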
Example 11: Util
// Set up the package name and import the required classes
package org.sparksamples
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
object Util {
val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
val DATA_PATH= "../../../data/ml-100k"
val PATH_MOVIES = DATA_PATH + "/u.item"
def reduceDimension2(x: Vector): String = {
val l = x.toArray.length
val l_2 = l / 2
var x_ = 0.0
var y_ = 0.0
// sum of the first half of the coordinates
for (i <- 0 until l_2) {
x_ += x(i)
}
// sum of the second half (starts at l_2 so no coordinate is skipped)
for (i <- l_2 until l) {
y_ += x(i)
}
x_ + "," + y_
}
def getMovieDataDF(spark : SparkSession) : DataFrame = {
//1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
// |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
val customSchema = StructType(Array(
StructField("id", StringType, true),
StructField("name", StringType, true),
StructField("date", StringType, true),
StructField("url", StringType, true)));
val movieDf = spark.read.format("com.databricks.spark.csv")
.option("delimiter", "|").schema(customSchema)
.load(PATH_MOVIES)
return movieDf
}
}
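reduceDimension2 collapses a vector into two coordinates by summing its first and second halves, which is convenient for quick 2-D plots of high-dimensional factors. A small illustration (input values are arbitrary):
import org.apache.spark.mllib.linalg.Vectors
import org.sparksamples.Util
// An 8-dimensional vector becomes "sum of first half,sum of second half".
val v = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
println(Util.reduceDimension2(v)) // prints 10.0,26.0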
Example 12: SyntheticFeatures
// Set up the package name and import the required classes
package org.template.classification
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
object SyntheticFeatures {
// Synthesize averages for feature combinations.
// Three features become seven:
// f1, f2, f3, (f1+f2)/2, (f2+f3)/2, (f3+f1)/2, (f1+f2+f3)/3
// These are averages of each pair and all three.
def transform(vector:Vector):Vector = {
val realFeatures: Array[Double] = vector.toArray
var synthFeatures: Array[Double] = Array()
var sumRealFeatures:Double = 0
var i = 0
var v:Double = 0
v = realFeatures.apply(0)
sumRealFeatures = sumRealFeatures + v
synthFeatures = synthFeatures :+ (v + realFeatures.apply(1))/2
v = realFeatures.apply(1)
sumRealFeatures = sumRealFeatures + v
synthFeatures = synthFeatures :+ (v + realFeatures.apply(2))/2
v = realFeatures.apply(2)
sumRealFeatures = sumRealFeatures + v
synthFeatures = synthFeatures :+ (v + realFeatures.apply(0))/2
// average of all features
synthFeatures = synthFeatures :+ sumRealFeatures/3
Vectors.dense(realFeatures ++ synthFeatures)
}
}
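A quick illustration of transform (input values are arbitrary):
import org.apache.spark.mllib.linalg.Vectors
import org.template.classification.SyntheticFeatures
// Three features become seven: the originals, the three pairwise averages, and the overall average.
val expanded = SyntheticFeatures.transform(Vectors.dense(2.0, 4.0, 6.0))
println(expanded) // [2.0,4.0,6.0,3.0,5.0,4.0,4.0]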
Example 13: Utils
// Set up the package name and import the required classes
package mapper.utils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{ Vector, DenseVector, SparseVector }
import breeze.linalg.{ DenseVector => BDV, SparseVector => BSV, Vector => BV }
import org.apache.spark.mllib.linalg.distributed.{ IndexedRowMatrix, IndexedRow, BlockMatrix }
object Utils {
def toBlockMatrix(x: RDD[Vector], rowsPerBlock: Int = 1024, colsPerBlock: Int = 1024): BlockMatrix = {
new IndexedRowMatrix(
x.zipWithIndex().map({ xi => IndexedRow(xi._2, xi._1) })
).toBlockMatrix(rowsPerBlock, colsPerBlock)
}
def toBreeze(v: Vector): BV[Double] = v match {
case DenseVector(values) => new BDV[Double](values)
case SparseVector(size, indices, values) => new BSV[Double](indices, values, size)
}
def toSpark(bv: BV[Double]): Vector = bv match {
case v: BDV[Double] => new DenseVector(v.toArray)
case v: BSV[Double] => new SparseVector(v.length, v.index, v.data)
}
def cartesian[A](xs: Traversable[Traversable[A]]): Seq[Seq[A]] =
xs.foldLeft(Seq(Seq.empty[A])) { (x, y) => for (a <- x; b <- y) yield a :+ b }
}
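A short illustration of the conversion helpers and the cartesian product (values are arbitrary):
import org.apache.spark.mllib.linalg.Vectors
import mapper.utils.Utils
val sparkVec = Vectors.dense(1.0, 2.0, 3.0)
val breezeVec = Utils.toBreeze(sparkVec)   // Breeze dense vector (1.0, 2.0, 3.0)
println(Utils.toSpark(breezeVec))          // [1.0,2.0,3.0]
println(Utils.cartesian(Seq(Seq(1, 2), Seq(3, 4)))) // all combinations: (1,3), (1,4), (2,3), (2,4)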
Example 14: MapperSpec
// Set up the package name and import the required classes
package com.github.log0ymxm.mapper
import org.scalatest._
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.{ SparkSession, Row }
import org.apache.spark.mllib.linalg.distributed.{ CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors }
class MapperSpec extends FunSuite with SharedSparkContext {
test("simple mapper on noisy circle") {
val spark = SparkSession.builder().getOrCreate()
val fileLoc = getClass.getClassLoader.getResource("circles.csv").getPath()
val circle = spark.read
.option("header", false)
.option("inferSchema", true)
.csv(fileLoc)
assert(circle.count == 400)
val indexedRDD = circle.rdd.zipWithIndex.map {
case (Row(x: Double, y: Double), i) =>
val v: Vector = new DenseVector(Array(x, y))
IndexedRow(i, v)
}
val matrix = new IndexedRowMatrix(indexedRDD)
val similarities = matrix.toCoordinateMatrix
.transpose()
.toIndexedRowMatrix()
.columnSimilarities()
val distances = new CoordinateMatrix(
similarities
.entries
.map((entry) => new MatrixEntry(entry.i, entry.j, 1 - entry.value))
)
val filtration = new IndexedRowMatrix(indexedRDD.map({ row =>
IndexedRow(row.index, new DenseVector(Array(
Vectors.norm(row.vector, 2)
)))
}))
//Mapper.writeAsJson(graph, "mapper-vis/circle-graph.json")
val graph = Mapper.mapper(sc, distances, filtration, 100, 2.0)
assert(graph.vertices.count == 160)
assert(graph.edges.count == 327)
}
}
Example 15: FeaturesParser
// Set up the package name and import the required classes
package com.micvog.ml
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
object FeaturesParser{
def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble))
val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble))
vectors
}
def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble))
val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
labeledPoints
}
}