This article collects typical usage examples of the Scala class org.apache.spark.mllib.linalg.distributed.RowMatrix. If you are wondering what the RowMatrix class does, or how to use it in Scala, the curated class code examples below may help.
Four code examples of the RowMatrix class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala examples.
Example 1: PCAClustering
// Set the package name and import the required classes
package graph
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable
class PCAClustering {
  def matrixToRDD(sc: SparkContext, m: Matrix): RDD[Vector] = {
    val columns = m.toArray.grouped(m.numRows)
    val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
    val vectors = rows.map(row => new DenseVector(row.toArray))
    sc.parallelize(vectors)
  }
  def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int, sc: SparkContext): Graph[Int, Any] = {
    val numNode = inputGraph.numVertices.toInt
    val mapping = new mutable.HashMap[Long, Int]()
    val revMapping = new mutable.HashMap[Int, Long]()
    val verticeIds = inputGraph.vertices.map(u => u._1).collect()
    for (i <- 0 until numNode) {
      mapping.put(verticeIds.apply(i), i)
      revMapping.put(i, verticeIds.apply(i))
    }

    // Reindex the vertices from 0 to numNode - 1.
    val nVertices = inputGraph.vertices.map(u => (mapping.apply(u._1).toLong, u._2))
    val nEdges = inputGraph.edges.map(e => Edge(mapping.apply(e.srcId).toLong, mapping.apply(e.dstId).toLong, e.attr))
    val ngraph = Graph(nVertices, nEdges)

    // Row-normalized adjacency: one sparse vector of out-neighbour weights per vertex.
    val output = ngraph.collectNeighborIds(EdgeDirection.Out)
    val spvec = output.mapValues(r => Vectors.sparse(numNode, r.map(e => e.toInt), r.map(e => 1.0 / r.length)))
    val rows = spvec.map(v => v._2)
    val order = spvec.map(v => v._1)

    // Top eigsNum principal components (one row per vertex), clustered with k-means.
    val mat = new RowMatrix(rows)
    val pc = mat.computePrincipalComponents(eigsNum)
    val pcRDD = matrixToRDD(sc, pc)
    val clusters = KMeans.train(pcRDD, clusterNum, 100)

    // Assign each reindexed vertex to its cluster, then map back to the original vertex ids.
    val clusterArray = pcRDD.map(p => clusters.predict(p)).collect()
    val assignedClusters = order.map(o => (o, clusterArray.apply(o.toInt)))
    val origVertexRDD = assignedClusters.map { case (vid, value) => (revMapping.apply(vid.toInt), value) }
    Graph(origVertexRDD, inputGraph.edges)
  }
}
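The example above does not include a driver. The following is a minimal, hypothetical sketch of how PCAClustering.run might be invoked; the toy 4-node cycle graph, the cluster count and the eigenvector count are assumptions, not part of the original example.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import graph.PCAClustering

object PCAClusteringDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("PCAClusteringDemo"))
    // Toy directed 4-node cycle; vertex and edge attributes are irrelevant to the clustering.
    val vertices = sc.parallelize(Seq[(Long, Any)]((1L, "a"), (2L, "b"), (3L, "c"), (4L, "d")))
    val edges = sc.parallelize(Seq[Edge[Any]](Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(3L, 4L, 1.0), Edge(4L, 1L, 1.0)))
    val inputGraph: Graph[Any, Any] = Graph(vertices, edges)
    // Cluster the 4 vertices into 2 groups using the top 2 principal components.
    val clustered = new PCAClustering().run(inputGraph, 2, 2, sc)
    clustered.vertices.collect().foreach { case (id, cluster) => println("vertex " + id + " -> cluster " + cluster) }
    sc.stop()
  }
}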
Example 2: Utils
// Set the package name and import the required classes
package com.github.aadamson.spark_glove
import org.apache.spark.{SparkConf, SparkContext};
import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix, Matrices, DenseMatrix};
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, BlockMatrix, RowMatrix, MatrixEntry, IndexedRow, IndexedRowMatrix};
import org.apache.spark.rdd.RDD;
object Utils {

  // An RDD of ((row, col), value) entries.
  type CoordinateRDD[T] = RDD[((Long, Long), T)];

  // Implicitly convert a coordinate RDD into a distributed CoordinateMatrix.
  implicit def CoordinateRDD2CoordinateMatrix(a: CoordinateRDD[Float]): CoordinateMatrix = {
    val entries: RDD[MatrixEntry] = a.map { case ((i, j), value) => new MatrixEntry(i, j, value) };
    val mat: CoordinateMatrix = new CoordinateMatrix(entries);
    return mat;
  }

  // Build an IndexedRowMatrix whose numRows rows are all copies of v.
  def broadcastVector(v: Vector, numRows: Int, sc: SparkContext): IndexedRowMatrix = {
    val rows: RDD[IndexedRow] = sc.parallelize(0 to numRows-1).map(i => new IndexedRow(i, v));
    val mat: IndexedRowMatrix = new IndexedRowMatrix(rows);
    return mat;
  }
  // Element-wise product of two block matrices or of two vectors.
  def elementwiseProduct[T](a: T, b: T): T = (a, b) match {
    case (x: BlockMatrix, y: BlockMatrix) => {
      val aIRM = x.toIndexedRowMatrix();
      val bIRM = y.toIndexedRowMatrix();
      // Zipping assumes both matrices yield their rows in the same order and partitioning.
      val rows = aIRM.rows.zip(bIRM.rows).map {
        case (aRow: IndexedRow, bRow: IndexedRow) => new IndexedRow(aRow.index, elementwiseProduct(aRow.vector, bRow.vector));
      }
      return (new IndexedRowMatrix(rows)).toBlockMatrix().asInstanceOf[T];
    }
    case (x: Vector, y: Vector) => {
      // Multiply corresponding entries of the two vectors.
      val values = Array(x.toArray, y.toArray);
      return Vectors.dense(values.transpose.map(_.product)).asInstanceOf[T];
    }
  }
}
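As a hedged usage sketch (the SparkContext setup and the tiny coordinate matrix below are assumptions, not code from the spark_glove project): the implicit conversion turns a coordinate RDD into a CoordinateMatrix, and elementwiseProduct multiplies two block matrices entry by entry.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix}
import com.github.aadamson.spark_glove.Utils._

object UtilsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("UtilsDemo"))
    // ((row, col), value) entries of a 2 x 2 diagonal matrix.
    val coords: CoordinateRDD[Float] = sc.parallelize(Seq(((0L, 0L), 1.0f), ((1L, 1L), 2.0f)))
    val coordMat: CoordinateMatrix = coords // picked up by the implicit CoordinateRDD2CoordinateMatrix
    val a: BlockMatrix = coordMat.toBlockMatrix()
    val squared = elementwiseProduct(a, a) // multiplying a matrix by itself squares each entry
    squared.toLocalMatrix().toArray.foreach(println)
    sc.stop()
  }
}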
Example 3: SparkSVDExampleOne
// Set the package name and import the required classes
package linalg.svd
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}
object SparkSVDExampleOne {
  def main(args: Array[String]) {
    // A small 4 x 7 dense matrix, one Vector per row.
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
      Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
      Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
      Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
    )
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
    val sc = new SparkContext(spConfig)
    val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))

    // Request the top 7 singular values and corresponding singular vectors (a 4 x 7 matrix has at most 4).
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
    val U: RowMatrix = svd.U // The U factor is a RowMatrix.
    val s: Vector = svd.s // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V // The V factor is a local dense matrix.
    println("U:" + U)
    println("s:" + s)
    println("V:" + V)
    sc.stop()
  }
}
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines of code: 30, Source file: SparkSVDExampleOne.scala
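A hedged follow-up (not part of the book's code) that could be inserted before sc.stop(): rebuild the input from the factors, since A ≈ U * diag(s) * V^T, as a quick sanity check of the decomposition.

import org.apache.spark.mllib.linalg.Matrices

// Materialize V^T as a plain column-major dense matrix (RowMatrix.multiply expects a local dense matrix).
val Vt = Matrices.dense(V.numCols, V.numRows, V.transpose.toArray)
val reconstructed: RowMatrix = U.multiply(Matrices.diag(s)).multiply(Vt)
reconstructed.rows.collect().foreach(println) // each row should be close to the corresponding row of denseData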
Example 4:
// Set the package name and import the required classes
// Compute the min/max range of each feature in model_input_active_t (used for min-max scaling below).
val result = h.sql("select max(visit_times) from model_input_active_t") // maximum of visit_times
val max_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_times) from model_input_active_t") // minimum of visit_times
val min_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_times = if ((max_visit_times - min_visit_times) == 0) 1 else (max_visit_times - min_visit_times)
val result = h.sql("select max(last_online_time) from model_input_active_t") // maximum of last_online_time
val max_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(last_online_time) from model_input_active_t") // minimum of last_online_time
val min_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_last_online_time = if ((max_last_online_time - min_last_online_time) == 0) 1 else (max_last_online_time - min_last_online_time)
val result = h.sql("select max(pay_times) from model_input_active_t") // maximum of pay_times
val max_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(pay_times) from model_input_active_t") // minimum of pay_times
val min_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_pay_times = if ((max_pay_times - min_pay_times) == 0) 1 else (max_pay_times - min_pay_times)
val result = h.sql("select max(comment_times) from model_input_active_t") // maximum of comment_times
val max_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(comment_times) from model_input_active_t") // minimum of comment_times
val min_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_comment_times = if ((max_comment_times - min_comment_times) == 0) 1 else (max_comment_times - min_comment_times)
val result = h.sql("select max(stay_time) from model_input_active_t") // maximum of stay_time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = h.sql("select min(stay_time) from model_input_active_t") // minimum of stay_time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = if ((max_stay_time - min_stay_time) == 0) 1 else (max_stay_time - min_stay_time)
val result = h.sql("select max(visit_day_times) from model_input_active_t") // maximum of visit_day_times
val max_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_day_times) from model_input_active_t") // minimum of visit_day_times
val min_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_day_times = if ((max_visit_day_times - min_visit_day_times) == 0) 1 else (max_visit_day_times - min_visit_day_times)
// Feature weights: visit_times 0.2, visit_targetpage_percen 0.1, last_online_time 0.1, pay_times 0.2, comment_times 0.2, stay_time 0.1, visit_day_times 0.1
val normalization= h.sql("select t1.cookie , ((t1.visit_times- "+min_visit_times+")*0.2/"+region_visit_times+") as visit_times, t1.visit_targetpage_percen*0.1, ((t1.last_online_time- "+min_last_online_time+")*0.1/"+region_last_online_time+") as last_online_time, ((t1.pay_times- "+min_pay_times+")*0.2/"+region_pay_times+") as pay_times, ((t1.comment_times- "+min_comment_times+")*0.2/"+region_comment_times+") as comment_times, ((t1.stay_time- "+min_stay_time+")*0.1/"+region_stay_time+") as stay_time, ((t1.visit_day_times- "+min_visit_day_times+")*0.1/"+region_visit_day_times+") as visit_day_times from model_input_active_t t1")
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// There is no direct API to turn a DataFrame into mllib Vectors, so convert the DataFrame to an RDD and build each row with Vectors.dense.
val data = normalization.rdd.map(line => Vectors.dense(
  line.get(1).toString.toDouble, line.get(2).toString.toDouble, line.get(3).toString.toDouble,
  line.get(4).toString.toDouble, line.get(5).toString.toDouble, line.get(6).toString.toDouble,
  line.get(7).toString.toDouble))
val rm = new RowMatrix(data)
val pc = rm.computePrincipalComponents(1)
val mx = rm.multiply(pc) // project each normalized row onto the first principal component
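A hedged continuation of the script (assumed, not part of the original job; computePrincipalComponentsAndExplainedVariance needs Spark 1.6 or later): report how much variance the single retained component captures and look at a few projected scores.

val (pcs, explainedVariance) = rm.computePrincipalComponentsAndExplainedVariance(1)
println("variance explained by the first principal component: " + explainedVariance)
mx.rows.take(5).foreach(println) // first few one-dimensional projected scores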