This article collects typical usage examples of the org.apache.spark.mllib.linalg.DenseVector class in Scala. If you are wondering how the DenseVector class is used in practice, the curated examples below may help.
Eleven code examples of the DenseVector class are shown below, sorted by popularity by default.
Example 1: PCAClustering
// Package declaration and imported dependencies
package graph
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable
class PCAClustering {
def matrixToRDD(sc:SparkContext, m: Matrix): RDD[Vector] = {
val columns = m.toArray.grouped(m.numRows)
val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
val vectors = rows.map(row => new DenseVector(row.toArray))
sc.parallelize(vectors)
}
def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int,sc:SparkContext ): Graph[Int, Any] = {
val numNode = inputGraph.numVertices.toInt
val mapping = new mutable.HashMap[Long,Int]()
val revMapping = new mutable.HashMap[Int, Long]()
val verticeIds = inputGraph.vertices.map( u => u._1 ).collect()
for(i<-0 to numNode - 1) {
mapping.put(verticeIds.apply(i), i)
revMapping.put(i, verticeIds.apply(i))
}
// reindex the vertices from 0 to the number of nodes
val nVertices = inputGraph.vertices.map( u=> (mapping.apply(u._1).toLong, u._2))
val nEdges = inputGraph.edges.map(e=> Edge(mapping.apply(e.srcId).toLong, mapping.apply(e.dstId).toLong, e.attr))
val ngraph = Graph(nVertices, nEdges)
val output = ngraph.collectNeighborIds(EdgeDirection.Out)
val spvec = output.mapValues(r => Vectors.sparse( numNode, r.map(e=>e.toInt) , r.map(e=> 1.0/r.length )))
val rows = spvec.map(v=>v._2)
val order = spvec.map(v=>v._1)
val mat = new RowMatrix(rows)
val pc = mat.computePrincipalComponents(eigsNum)
val pcRDD = matrixToRDD(sc, pc)
val clusters = KMeans.train(pcRDD, clusterNum, 100)
val clusterArray = pcRDD.map(p=> clusters.predict(p) ).collect()
val assignedClusters = order.map( o => (o, clusterArray.apply(o.toInt)))
val origVertexRDD = assignedClusters.map{case (vid, value)=> (revMapping.apply(vid.toInt), value)}
Graph(origVertexRDD, inputGraph.edges)
}
}
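A brief usage sketch of this class, assuming a SparkContext named sc and a GraphX graph socialGraph: Graph[Any, Any] are already available; the cluster and eigenvector counts are illustrative:
val clustering = new PCAClustering()
// clusterNum = 4, eigsNum = 3: partition the vertices into 4 clusters using the top 3 principal components
val clustered: Graph[Int, Any] = clustering.run(socialGraph, 4, 3, sc)
clustered.vertices.take(5).foreach { case (vertexId, clusterId) => println(s"vertex $vertexId -> cluster $clusterId") }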
Example 2: RGB
// Package declaration and imported dependencies
package example
import java.awt.Color
import io.flatmap.ml.som.GaussianSelfOrganizingMap
import io.flatmap.ml.util.Plot
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.sql.SparkSession
object RGB {
def main(args: Array[String]): Unit = {
implicit val sparkSession =
SparkSession
.builder
.appName("rgb-clustering")
.getOrCreate()
val rgb = sparkSession.sparkContext
.textFile("data/rgb.csv")
.map(_.split(",").map(_.toDouble / 255.0))
.map(new DenseVector(_))
val (som, params) =
GaussianSelfOrganizingMap(24, 24, sigma = 0.5, learningRate = 0.3)
.initialize(rgb)
.train(rgb, 20)
Plot.errors(params.errors.reverse)
Plot.som(f"Trained SOM (error=${params.errors.head}%1.4f)", som.codeBook, "trained_som.png") {
case red :: green :: blue :: Nil =>
new Color((red*255.0).toInt, (green*255.0).toInt, (blue*255.0).toInt).getRGB.toDouble
case _ => Color.white.getRGB.toDouble
}
}
}
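For reference, a minimal sketch of the per-line parsing step above, assuming a sample row "255,0,128" in data/rgb.csv (the sample value is hypothetical):
val line = "255,0,128"
val vector = new DenseVector(line.split(",").map(_.toDouble / 255.0))
// vector holds [1.0, 0.0, ~0.502], i.e. the RGB channels scaled into [0, 1]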
Example 3: SelfOrganizingMapSpec
// Package declaration and imported dependencies
package io.flatmap.ml.som
import breeze.numerics.closeTo
import breeze.linalg.DenseMatrix
import io.flatmap.ml.som.SelfOrganizingMap.Shape
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.scalatest._
import util.{FakeDecayFunction, FakeMetrics, FakeNeighborhoodKernel, TestSparkContext}
class SelfOrganizingMapSpec extends FlatSpec with Matchers with BeforeAndAfterEach with TestSparkContext {
def SOM(width: Int, height: Int) =
new SelfOrganizingMap with FakeNeighborhoodKernel with FakeDecayFunction with FakeMetrics {
override val shape: Shape = (width, height)
override val learningRate: Double = 0.1
override val sigma: Double = 0.2
}
"instantiation" should "create a SOM with codebook of zeros" in {
val som = SOM(6, 6)
som.codeBook should === (DenseMatrix.fill[Array[Double]](6, 6)(Array.emptyDoubleArray))
}
"initialize" should "copy random data points from RDD into codebook" in {
val data = RandomRDDs.normalVectorRDD(sparkSession.sparkContext, numRows = 512L, numCols = 3)
val som = SOM(6, 6)
som.initialize(data).codeBook should !== (DenseMatrix.fill[Array[Double]](6, 6)(Array.emptyDoubleArray))
}
"winner" should "return best matching unit (BMU)" in {
val som = SOM(6, 6)
som.codeBook.keysIterator.foreach { case (x, y) => som.codeBook(x, y) = Array(0.2, 0.2, 0.2) }
som.codeBook(3, 3) = Array(0.3, 0.3, 0.3)
som.winner(new DenseVector(Array(2.0, 2.0, 2.0)), som.codeBook) should equal ((3, 3))
som.winner(new DenseVector(Array(0.26, 0.26, 0.26)), som.codeBook) should equal ((3, 3))
}
"winner" should "return last best matching unit (BMU) index in case of multiple BMUs" in {
val som = SOM(6, 6)
som.codeBook.keysIterator.foreach { case (x, y) => som.codeBook(x, y) = Array(0.2, 0.2, 0.2) }
som.codeBook(3, 3) = Array(0.3, 0.3, 0.3)
som.winner(new DenseVector(Array(0.25, 0.25, 0.25)), som.codeBook) should equal ((5, 5))
}
"classify" should "return the best matching unit along with Euclidean distance" in {
val som = SOM(6, 6)
som.codeBook.keysIterator.foreach { case (x, y) => som.codeBook(x, y) = Array(0.2, 0.2, 0.2) }
som.codeBook(3, 3) = Array(0.3, 0.3, 0.3)
val (bmu, distance) = som.classify(new DenseVector(Array(0.26, 0.26, 0.26)))
bmu should === ((3, 3))
assert(closeTo(distance, 0.06, relDiff = 1e-2))
}
}
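For intuition, a hedged sketch of the nearest-codebook-vector (BMU) search that the winner tests above exercise; the library's actual implementation may differ, and findWinner is a name introduced here only for illustration:
import breeze.linalg.DenseMatrix
def findWinner(input: Array[Double], codeBook: DenseMatrix[Array[Double]]): (Int, Int) =
  codeBook.keysIterator.minBy { case (x, y) =>
    val w = codeBook(x, y)
    // squared Euclidean distance between the input and this codebook vector
    w.zip(input).map { case (wi, xi) => (wi - xi) * (wi - xi) }.sum
  }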
Example 4: GaussianSelfOrganizingMapSpec
// Package declaration and imported dependencies
package io.flatmap.ml.som
import java.awt.Color
import breeze.linalg.DenseMatrix
import breeze.numerics.closeTo
import io.flatmap.ml.util.Plot
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.scalatest._
import util.TestSparkContext
class GaussianSelfOrganizingMapSpec extends FlatSpec with Matchers with BeforeAndAfterEach with TestSparkContext {
"train" should "return a fitted SOM instance" in {
val path = getClass.getResource("/rgb.csv").getPath
val rgb = sparkSession.sparkContext
.textFile(path)
.map(_.split(",").map(_.toDouble / 255.0))
.map(new DenseVector(_))
val som = GaussianSelfOrganizingMap(6, 6, _sigma = 0.5, _learningRate = 0.3).initialize(rgb)
val initialCodeBook = som.codeBook.copy
val codeBookVectorToRGB: List[Double] => Double = {
case red :: green :: blue :: Nil =>
new Color((red*255.0).toInt, (green*255.0).toInt, (blue*255.0).toInt).getRGB.toDouble
case _ => Color.white.getRGB.toDouble
}
Plot.som("Initial SOM", som.codeBook, "initial_som.png")(codeBookVectorToRGB)
val (newSom, params) = som.train(rgb, 20)
Plot.som(f"Trained SOM (error=${params.errors.head}%1.4f)", newSom.codeBook, "trained_som.png")(codeBookVectorToRGB)
Plot.errors(params.errors.reverse)
newSom.codeBook should not equal initialCodeBook
assert(closeTo(params.errors.head, 0.15, relDiff = 1e-2))
}
}
Example 5: Predict
// Package declaration and imported dependencies
package TelCom
import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector}
import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel}
object Predict {
val sizeOfSample:Int = 22
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("logic")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
var fileRoot = "H:\\????_???????_contest_data"
val data = sc.textFile(fileRoot + "/toPredict.txt")
val Model1 = LogisticRegressionModel.load(sc, fileRoot + "/model")
val Model2 = LogisticRegressionModel.load(sc, fileRoot + "/model")
// score each record with both models and keep only the abnormal ones
val feature1 = data.map(x => x.split("\t")).map(x =>toPredict(x,Model1,Model2)).filter(x => x != "Normal")
print(feature1)
feature1.randomSplit(Array(1, 0), seed = 11L)(0).repartition(1).saveAsTextFile(fileRoot + "/out")
}
def toPredict(x:Array[String],model1:LogisticRegressionModel,model2: LogisticRegressionModel): String =
{
var z:Array[Double] = new Array[Double](sizeOfSample - 3)
for(i <- 2 to x.size-1)
z(i-2) = x(i).toDouble
val v:Vector = Vectors.dense(z)
val result1 = model1.predict(v)
val result2 = model2.predict(v)
var returnResult:String = null
if (result1 > 0.95){
returnResult = x(0).toString + "\t" + x(1).toString + "\t" + "1"}
else if (result2 > 1){
returnResult = x(0).toString + "\t" + x(1).toString + "\t" + "2"}
else
{returnResult = "Normal"
}
return returnResult
}
}
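A small sketch of how a single tab-separated record is turned into an mllib feature vector inside toPredict, assuming the first two fields are identifiers and the remaining fields are numeric features (the sample record is hypothetical):
val record = "user123\t201601\t0.5\t1.2\t3.4"
val fields = record.split("\t")
val features: Vector = Vectors.dense(fields.drop(2).map(_.toDouble))
// model1.predict(features) then returns the class predicted by the logistic regression model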
Example 6: PipelineClassifier
// Package declaration and imported dependencies
import org.apache.spark.ml.PipelineModel
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
class PipelineClassifier(val pipeline: PipelineModel) extends UnifiedClassifier with Serializable {
override def predict[T](data: DataFrame): RDD[(T, Double)] = {
val singletonDF = ModelHelpers.addMetadata(data)
val predictions = pipeline.transform(singletonDF)
predictions.map(row => {
val firstClass = row.getAs[DenseVector](DataFrameColumns.RAW_PREDICTION)(1)
val zeroClass = row.getAs[DenseVector](DataFrameColumns.RAW_PREDICTION)(0)
val prob = firstClass.toDouble / (firstClass.toDouble + zeroClass.toDouble)
(row.getAs[T](DataFrameColumns.KEY), prob)
})
}
}
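A hedged usage sketch, assuming a fitted PipelineModel has been persisted and that the helper objects referenced above (ModelHelpers, DataFrameColumns) are on the classpath; the path, key type, and inputDF are illustrative:
val pipelineModel = PipelineModel.load("/models/churn-pipeline")
val classifier = new PipelineClassifier(pipelineModel)
// inputDF must contain the key column named by DataFrameColumns.KEY
val scored: RDD[(String, Double)] = classifier.predict[String](inputDF)
scored.take(5).foreach { case (key, probability) => println(s"$key -> $probability") }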
Example 7: Utils
// Package declaration and imported dependencies
package mapper.utils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{ Vector, DenseVector, SparseVector }
import breeze.linalg.{ DenseVector => BDV, SparseVector => BSV, Vector => BV }
import org.apache.spark.mllib.linalg.distributed.{ IndexedRowMatrix, IndexedRow, BlockMatrix }
object Utils {
def toBlockMatrix(x: RDD[Vector], rowsPerBlock: Int = 1024, colsPerBlock: Int = 1024): BlockMatrix = {
new IndexedRowMatrix(
x.zipWithIndex().map({ xi => IndexedRow(xi._2, xi._1) })
).toBlockMatrix(rowsPerBlock, colsPerBlock)
}
def toBreeze(v: Vector): BV[Double] = v match {
case DenseVector(values) => new BDV[Double](values)
case SparseVector(size, indices, values) => new BSV[Double](indices, values, size)
}
def toSpark(bv: BV[Double]): Vector = bv match {
case v: BDV[Double] => new DenseVector(v.toArray)
case v: BSV[Double] => new SparseVector(v.length, v.index, v.data)
}
def cartesian[A](xs: Traversable[Traversable[A]]): Seq[Seq[A]] =
xs.foldLeft(Seq(Seq.empty[A])) { (x, y) => for (a <- x; b <- y) yield a :+ b }
}
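A small usage sketch of the Breeze/Spark conversions above; the values are illustrative:
val sparkVec: Vector = new DenseVector(Array(1.0, 2.0, 3.0))
val breezeVec: BV[Double] = Utils.toBreeze(sparkVec)
// a round trip through Breeze and back yields an equivalent mllib vector
val roundTrip: Vector = Utils.toSpark(breezeVec)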
Example 8: MapperSpec
// Package declaration and imported dependencies
package com.github.log0ymxm.mapper
import org.scalatest._
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.{ SparkSession, Row }
import org.apache.spark.mllib.linalg.distributed.{ CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors }
class MapperSpec extends FunSuite with SharedSparkContext {
test("simple mapper on noisy circle") {
val spark = SparkSession.builder().getOrCreate()
val fileLoc = getClass.getClassLoader.getResource("circles.csv").getPath()
val circle = spark.read
.option("header", false)
.option("inferSchema", true)
.csv(fileLoc)
assert(circle.count == 400)
val indexedRDD = circle.rdd.zipWithIndex.map {
case (Row(x: Double, y: Double), i) =>
val v: Vector = new DenseVector(Array(x, y))
IndexedRow(i, v)
}
val matrix = new IndexedRowMatrix(indexedRDD)
val similarities = matrix.toCoordinateMatrix
.transpose()
.toIndexedRowMatrix()
.columnSimilarities()
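// columnSimilarities yields cosine similarities, so 1 - similarity below serves as a distance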
val distances = new CoordinateMatrix(
similarities
.entries
.map((entry) => new MatrixEntry(entry.i, entry.j, 1 - entry.value))
)
val filtration = new IndexedRowMatrix(indexedRDD.map({ row =>
IndexedRow(row.index, new DenseVector(Array(
Vectors.norm(row.vector, 2)
)))
}))
//Mapper.writeAsJson(graph, "mapper-vis/circle-graph.json")
val graph = Mapper.mapper(sc, distances, filtration, 100, 2.0)
assert(graph.vertices.count == 160)
assert(graph.edges.count == 327)
}
}
Example 9: CoverSpec
// Package declaration and imported dependencies
package com.github.log0ymxm.mapper
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.linalg.distributed.{ IndexedRow, IndexedRowMatrix }
import org.scalatest._
import com.holdenkarau.spark.testing.SharedSparkContext
class CoverSpec extends FunSuite with SharedSparkContext {
test("cover") {
val rdd = sc.parallelize((0 to 10).toSeq)
val filtration = new IndexedRowMatrix(
rdd.map({ x =>
new IndexedRow(x, new DenseVector(Array(x * 2, scala.math.sin(x))))
})
)
val cover = new Cover(filtration, 4, 0.5)
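// 4 intervals per filter dimension over the 2 filter columns gives 4 * 4 = 16 overlapping cover segments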
assert(cover.numCoverSegments == 16)
assert(cover.filterRanges(0) == NumericBoundary(0.0, 20.0))
assert(cover.filterRanges(1).lower >= -1.0)
assert(cover.filterRanges(1).upper <= 1.0)
assert(cover.coverAssignment(new DenseVector(Array(8.33, 0.5))) == List(CoverSegmentKey(6), CoverSegmentKey(7)))
}
}
Example 10: LabelPropagationClassifierTest
// Package declaration and imported dependencies
package cz.cvut.fit.palicand.vocloud.ssl.ml
import com.holdenkarau.spark.testing._
import cz.cvut.fit.palicand.vocloud.ssl.ml.classification.LabelPropagationClassifier
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.{DenseVector, VectorUDT, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}
import org.scalatest.{Matchers, FlatSpec}
class LabelPropagationClassifierTest extends FlatSpec with SharedSparkContext with Matchers {
behavior of "LabelPropagationTest"
it should "train" in {
val sqlContext = new SQLContext(sc)
val rdd: RDD[Row] = sc.parallelize(Row(0L, 0.0, Vectors.dense(0.0, 1.0)) :: Row(1L, 1.0, Vectors.dense(1.0, 0.0)) :: Row(2L, 2.0, Vectors.dense(0.0, 0.0)) :: Nil)
val df = sqlContext.createDataFrame(rdd, StructType(List(StructField("rowNo", LongType), StructField("label", DoubleType), StructField("features", new VectorUDT))))
val clf = new LabelPropagationClassifier()
clf.setKNeighbours(2)
clf.setLabelCol("label")
clf.setFeaturesCol("features")
val model = clf.fit(df)
model.labelWeights.toIndexedRowMatrix().rows.collect() should be(createIndexedRow(0, 1.0, 0.0) ::
createIndexedRow(1, 0.0, 1.0) :: createIndexedRow(2, 1.0, 0) :: Nil)
}
def createIndexedRow(i: Int, vals: Double*): IndexedRow = {
new IndexedRow(i, new DenseVector(vals.toArray))
}
}
Example 11: DataGenerator
// Package declaration and imported dependencies
package com.bistel.wordcount.logisticRegression
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
class DataGenerator(numTasks: Int)(implicit sc: SparkContext) {
final val SIGMA = 2.0
private def f(mean: Double): Double =
mean + SIGMA * (Random.nextDouble - 0.5)
def apply(half: Int, mu: Double): Array[LabeledPoint] = {
val trainObs =
ArrayBuffer.fill(half)(Array[Double](f(1.0),f(1.0),f(1.0))) ++
ArrayBuffer.fill(half)(Array[Double](f(mu),f(mu),f(mu)))
val labels = ArrayBuffer.fill(half)(0.0) ++
ArrayBuffer.fill(half)(1.0)
labels.zip(trainObs).map{ case (y, ar) =>
LabeledPoint(y, new DenseVector(ar)) }.toArray
}
}
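A brief usage sketch, assuming a local SparkContext; the argument values are illustrative:
implicit val sc: SparkContext = new SparkContext("local[*]", "data-generator")
val generator = new DataGenerator(numTasks = 4)
// 100 points with features near 1.0 (label 0.0) and 100 points with features near mu = 4.0 (label 1.0)
val points: Array[LabeledPoint] = generator(half = 100, mu = 4.0)
val training = sc.parallelize(points)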