本文整理汇总了Scala中org.apache.spark.mllib.linalg.Matrix类的典型用法代码示例。如果您正苦于以下问题：Scala Matrix类的具体用法？Scala Matrix怎么用？Scala Matrix使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Matrix类的9个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: LRCV
//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{ StringIndexerModel, VectorAssembler }
import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
/**
 * Cross-validated logistic-regression trainer.
 *
 * The pipeline assembles the "gender", "age", "weight", "height" and "indexedJob" columns into a
 * feature vector, applies `standardScaler("features")`, and fits a logistic regression on the
 * "scaledFeatures" column. Cross-validation uses 10 folds over regParam values 0.1 and 0.01.
 *
 * NOTE(review): `standardScaler`, `stringIndexer`, `printBinaryMetrics` and `confusionMatrix` are
 * not defined in this class — presumably they live in the enclosing package object; confirm.
 *
 * @param sc Spark context used to create the implicit [[SQLContext]]
 */
class LRCV(sc: SparkContext) {

  implicit val sqlContext: SQLContext = new SQLContext(sc)

  val lr = new LogisticRegression().setMaxIter(10).setFeaturesCol("scaledFeatures")

  val paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.1, 0.01))
    .build()

  val assembler = new VectorAssembler()
    .setInputCols(Array("gender", "age", "weight", "height", "indexedJob"))
    .setOutputCol("features")

  val pipeline = new Pipeline()
    .setStages(Array(assembler, standardScaler("features"), lr))

  val cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(10)

  /**
   * Indexes the "job" column, fits the cross-validated pipeline on a random 80% split and
   * evaluates it on the remaining 20%.
   *
   * @param df input data; must contain the "job" column plus the columns used by the pipeline
   * @return the fitted string indexer, the fitted cross-validator model, and the result of
   *         `confusionMatrix` applied to the (prediction, label) pairs of the test split
   */
  def train(df: DataFrame): (StringIndexerModel, CrossValidatorModel, Matrix) = {

    // Fit the indexer on the full data set so that no job value is missing from the index.
    // An alternative would be to assign the job codes manually, as is done for gender.
    val indexerModel = stringIndexer("job").fit(df)
    val indexed = indexerModel.transform(df)

    val splits = indexed.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1)

    // The cached training split is only needed while fitting; release it even if fitting fails.
    val cvModel = try cv.fit(training) finally training.unpersist()

    val predictionAndLabels = cvModel
      .transform(test)
      .select("label", "prediction")
      .map {
        case Row(label: Double, prediction: Double) => (prediction, label)
      }

    printBinaryMetrics(predictionAndLabels)

    (indexerModel, cvModel, confusionMatrix(predictionAndLabels))
  }
}
示例2: SamplePCA
//设置package包名称以及导入依赖的类
package org.broadinstitute.hail.methods
import org.apache.spark.mllib.linalg.{Matrix, DenseMatrix}
import org.apache.spark.rdd.RDD
import org.broadinstitute.hail.variant.Variant
import org.broadinstitute.hail.variant.VariantDataset
/**
 * Sample-level PCA computed from the SVD of the matrix returned by
 * `ToStandardizedIndexedRowMatrix`.
 *
 * @param k                  number of singular values/vectors requested from `computeSVD`
 * @param computeLoadings    when true, U is computed and returned keyed by variant
 * @param computeEigenvalues when true, the squared singular values are returned
 */
class SamplePCA(k: Int, computeLoadings: Boolean, computeEigenvalues: Boolean) {

  def name = "SamplePCA"

  /**
   * @param vds dataset to decompose
   * @return (V * diag(s), optional per-variant rows of U, optional squared singular values)
   */
  def apply(vds: VariantDataset): (Matrix, Option[RDD[(Variant, Array[Double])]], Option[Array[Double]]) = {
    val (variants, standardized) = ToStandardizedIndexedRowMatrix(vds)
    val broadcastVariants = vds.sparkContext.broadcast(variants)

    // U is only materialised when loadings were requested.
    val decomposition = standardized.computeSVD(k, computeU = computeLoadings)
    val singularValues = decomposition.s

    val scores = decomposition.V.multiply(DenseMatrix.diag(singularValues))

    // decomposition.U is only dereferenced inside the guarded branch.
    val loadings =
      if (!computeLoadings) None
      else Some(decomposition.U.rows.map { indexedRow =>
        (broadcastVariants.value(indexedRow.index.toInt), indexedRow.vector.toArray)
      })

    val eigenvalues =
      if (!computeEigenvalues) None
      else Some(singularValues.toArray.map(sigma => sigma * sigma))

    (scores, loadings, eigenvalues)
  }
}
示例3: PCAClustering
//设置package包名称以及导入依赖的类
package graph
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable
/** Clusters the vertices of a graph by running k-means on principal components of its out-neighbour matrix. */
class PCAClustering {

  /**
   * Converts a local matrix into an RDD with one dense vector per matrix row.
   *
   * @param sc Spark context used to parallelize the rows
   * @param m  local matrix; `toArray` is read as column-major
   */
  def matrixToRDD(sc: SparkContext, m: Matrix): RDD[Vector] = {
    val columns = m.toArray.grouped(m.numRows)
    val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
    val vectors: Seq[Vector] = rows.map(row => Vectors.dense(row.toArray))
    sc.parallelize(vectors)
  }

  /**
   * Assigns every vertex of `inputGraph` to one of `clusterNum` clusters.
   *
   * @param inputGraph graph to cluster
   * @param clusterNum number of k-means clusters
   * @param eigsNum    number of principal components to compute
   * @param sc         Spark context
   * @return a graph with the original edges whose vertex attribute is the cluster index
   */
  def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int, sc: SparkContext): Graph[Int, Any] = {
    val numNode = inputGraph.numVertices.toInt

    // Re-index the vertices from 0 until the number of nodes.
    val verticeIds = inputGraph.vertices.map(_._1).collect()
    val mapping: Map[Long, Int] = verticeIds.zipWithIndex.toMap
    val revMapping: Map[Int, Long] = mapping.map(_.swap)

    val nVertices = inputGraph.vertices.map(u => (mapping(u._1).toLong, u._2))
    val nEdges = inputGraph.edges.map(e => Edge(mapping(e.srcId).toLong, mapping(e.dstId).toLong, e.attr))
    val ngraph = Graph(nVertices, nEdges)

    val output = ngraph.collectNeighborIds(EdgeDirection.Out)

    // Each out-neighbour contributes 1 / (number of collected neighbour ids). Sparse-vector indices
    // must be strictly increasing, but the collected ids are in no particular order and may repeat
    // (parallel edges), so repeated ids are merged and the (index, value) overload, which sorts the
    // entries, is used.
    val spvec = output.mapValues { neighbours =>
      val weight = 1.0 / neighbours.length
      val entries: Seq[(Int, Double)] = neighbours
        .groupBy(identity)
        .map { case (id, occurrences) => (id.toInt, occurrences.length * weight) }
        .toSeq
      Vectors.sparse(numNode, entries)
    }

    val rows = spvec.map(_._2)
    val order = spvec.map(_._1)

    val mat = new RowMatrix(rows)
    val pc = mat.computePrincipalComponents(eigsNum)
    val pcRDD = matrixToRDD(sc, pc)

    val clusters = KMeans.train(pcRDD, clusterNum, 100)
    // NOTE(review): clusterArray is indexed by row of `pc`; this assumes row i of `pc` corresponds
    // to re-indexed vertex i — confirm.
    val clusterArray = pcRDD.map(p => clusters.predict(p)).collect()

    val assignedClusters = order.map(o => (o, clusterArray(o.toInt)))
    val origVerextRDD = assignedClusters.map { case (vid, value) => (revMapping(vid.toInt), value) }
    Graph(origVerextRDD, inputGraph.edges)
  }
}
示例4: RatePredictor
//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater
import akka.actor.ActorSystem
import com.ferhtaydn.models.PatientInfo
import org.apache.spark.ml.feature.StringIndexerModel
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.mllib.linalg.{ Matrix, Vector }
import org.apache.spark.sql.{ Row, SQLContext }
import scala.concurrent.{ ExecutionContextExecutor, Future }
/**
 * Serves predictions from a fitted cross-validator model on a dedicated dispatcher.
 *
 * @param system          actor system providing the "ml.predictor.dispatcher" dispatcher
 * @param sqlContext      SQL context used to build the single-row input DataFrame
 * @param indexModel      fitted string indexer applied before prediction
 * @param cvModel         fitted cross-validator model
 * @param confusionMatrix pre-rendered confusion matrix text
 */
class RatePredictor(system: ActorSystem, sqlContext: SQLContext,
    indexModel: StringIndexerModel, cvModel: CrossValidatorModel,
    confusionMatrix: String) {

  private val blockingDispatcher: ExecutionContextExecutor =
    system.dispatchers.lookup("ml.predictor.dispatcher")

  // java.text.DecimalFormat is not safe for concurrent use and predictions run in concurrent
  // Futures, so a fresh formatter is created per call. Locale.ROOT pins '.' as the decimal
  // separator so the formatted text can always be parsed back with `.toDouble`.
  private def newDecimalFormatter: java.text.DecimalFormat =
    new java.text.DecimalFormat(
      "##.##",
      java.text.DecimalFormatSymbols.getInstance(java.util.Locale.ROOT))

  /** Returns the confusion matrix text, completed on the predictor dispatcher. */
  def confusionMatrixString: Future[String] = {
    Future {
      confusionMatrix
    }(blockingDispatcher)
  }

  /**
   * Predicts for a single patient.
   *
   * @param patientInfo patient data, converted with `toRecord` into a one-row DataFrame
   * @return `Right` with element 1 of the "probability" vector, formatted with pattern "##.##",
   *         or `Left` with a message when the model produced no row
   */
  def predict(patientInfo: PatientInfo): Future[Either[String, Double]] = {
    Future {
      val df = sqlContext.createDataFrame(Seq(patientInfo.toRecord))
      val indexedJobDF = indexModel.transform(df)
      val result = cvModel
        .transform(indexedJobDF)
        .select("prediction", "probability")
        .map {
          case Row(prediction: Double, probability: Vector) => (probability, prediction)
        }
      result.collect().headOption match {
        case Some((prob, _)) => Right(newDecimalFormatter.format(prob(1)).toDouble)
        case None => Left("No result can be predicted for the patient")
      }
    }(blockingDispatcher)
  }
}
示例5: SparkSVDExampleOne
//设置package包名称以及导入依赖的类
package linalg.svd
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}
/** Runs a singular value decomposition on a small 4 x 7 dense matrix in local mode and prints the factors. */
object SparkSVDExampleOne {

  def main(args: Array[String]): Unit = {
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
      Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
      Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
      Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
    )

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
    val sc = new SparkContext(spConfig)
    // Stop the context even when the decomposition fails.
    try {
      val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))

      // Request up to 7 singular values (one per column) and the corresponding singular vectors.
      val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
      val U: RowMatrix = svd.U // The U factor is a RowMatrix.
      val s: Vector = svd.s // The singular values are stored in a local dense vector.
      val V: Matrix = svd.V // The V factor is a local dense matrix.
      println("U:" + U)
      println("s:" + s)
      println("V:" + V)
    } finally {
      sc.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:30,代码来源:SparkSVDExampleOne.scala
示例6: CorrelationMatrix
//设置package包名称以及导入依赖的类
package com.paypal.risk.smunf.math.stats
import org.apache.spark.mllib.linalg.Matrix
import scala.collection.mutable
import java.lang.Double.isNaN
/**
 * Column-major view over a flat array of pairwise values.
 *
 * @param values  matrix entries; entry (i, j) is read from `values(i + numRows * j)`
 * @param numRows number of rows
 * @param numCols number of columns
 */
class CorrelationMatrix(val values: Array[Double], val numRows: Int, val numCols: Int) {

  /** Returns the entry at row `i`, column `j`. */
  def apply(i: Int, j: Int): Double = values(index(i, j))

  private def index(i: Int, j: Int): Int = i + numRows * j

  /** All (row, col) pairs strictly above the diagonal, in row-then-column order. */
  private def upperTriangle: Seq[(Int, Int)] =
    for {
      row <- 0 until numRows
      col <- row + 1 until numCols
    } yield (row, col)

  /**
   * Renders every above-diagonal pair as `'rowName, colName' : value`, one per line, sorted by
   * value in descending order.
   *
   * Pairs are kept in a sequence rather than a map keyed by the rendered name, so pairs whose
   * headers are missing or duplicated are all reported instead of overwriting one another.
   *
   * @param headers display name per row/column index; missing indices render as an empty name
   */
  def toString(headers: Map[Int, String]): String = {
    val entries = upperTriangle.map { case (row, col) =>
      val name = s"""${headers.getOrElse(row, "")}, ${headers.getOrElse(col, "")}"""
      (name, values(index(row, col)))
    }
    entries
      .sortBy(_._2)
      .reverse
      .map { case (name, value) => s"'$name' : $value" }
      .mkString("\n")
  }

  /** Returns (row, col, |value|) for every above-diagonal pair. */
  def toSimilarity: Seq[(Long, Long, Double)] =
    upperTriangle.map { case (row, col) =>
      (row.toLong, col.toLong, math.abs(values(index(row, col))))
    }
}
object CorrelationMatrix {

  /**
   * Builds a [[CorrelationMatrix]] from an MLlib matrix, storing -2.0 in place of every NaN entry.
   *
   * @param matrix source matrix; its `toArray`, `numRows` and `numCols` are used as-is
   */
  def apply(matrix: Matrix): CorrelationMatrix = {
    val sanitized = matrix.toArray.map {
      case entry if isNaN(entry) => -2.0
      case entry => entry
    }
    new CorrelationMatrix(sanitized, matrix.numRows, matrix.numCols)
  }
}
示例7:
//设置package包名称以及导入依赖的类
// Min-max scales the activity columns of model_input_active_t, applies a fixed weight to each,
// and projects the weighted features onto their first principal component.
// NOTE(review): `h` is not defined in this snippet — presumably a HiveContext/SQLContext; confirm.

// Reads one aggregate cell as a Double. A null aggregate yields 0.0, which is what unboxing null
// through asInstanceOf[Int] / asInstanceOf[Float] produces.
def cellToDouble(cell: Any): Double = cell match {
  case null => 0.0
  case n: java.lang.Number => n.doubleValue()
  case other => other.toString.toDouble
}

// Fetches (min, max) of a column in a single query.
def columnBounds(column: String): (Double, Double) = {
  val row = h.sql(s"select min($column), max($column) from model_input_active_t").collect()(0)
  (cellToDouble(row.get(0)), cellToDouble(row.get(1)))
}

// Width of the value range; 1 when min == max so the scaling below never divides by zero.
def columnRange(min: Double, max: Double): Double = if ((max - min) == 0) 1.0 else max - min

val (min_visit_times, max_visit_times) = columnBounds("visit_times")
val region_visit_times = columnRange(min_visit_times, max_visit_times)

val (min_last_online_time, max_last_online_time) = columnBounds("last_online_time")
val region_last_online_time = columnRange(min_last_online_time, max_last_online_time)

val (min_pay_times, max_pay_times) = columnBounds("pay_times")
val region_pay_times = columnRange(min_pay_times, max_pay_times)

val (min_comment_times, max_comment_times) = columnBounds("comment_times")
val region_comment_times = columnRange(min_comment_times, max_comment_times)

val (min_stay_time, max_stay_time) = columnBounds("stay_time")
val region_stay_time = columnRange(min_stay_time, max_stay_time)

val (min_visit_day_times, max_visit_day_times) = columnBounds("visit_day_times")
val region_visit_day_times = columnRange(min_visit_day_times, max_visit_day_times)

// SQL expression for one min-max scaled, weighted column.
def weighted(column: String, weight: String, min: Double, range: Double): String =
  s"((t1.$column- $min)*$weight/$range) as $column"

// Weights: visit_times 0.2, visit_targetpage_percen 0.1, last_online_time 0.1, pay_times 0.2,
// comment_times 0.2, stay_time 0.1, visit_day_times 0.1.
val selectList = Seq(
  "t1.cookie ",
  weighted("visit_times", "0.2", min_visit_times, region_visit_times),
  "t1.visit_targetpage_percen*0.1",
  weighted("last_online_time", "0.1", min_last_online_time, region_last_online_time),
  weighted("pay_times", "0.2", min_pay_times, region_pay_times),
  weighted("comment_times", "0.2", min_comment_times, region_comment_times),
  weighted("stay_time", "0.1", min_stay_time, region_stay_time),
  weighted("visit_day_times", "0.1", min_visit_day_times, region_visit_day_times)
)
val normalization = h.sql(s"select ${selectList.mkString(", ")} from model_input_active_t t1")

import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Turn each row into a dense vector of its seven weighted feature columns (indices 1 to 7;
// index 0 is the cookie), going through the DataFrame's underlying RDD.
val data = normalization.rdd.map { line =>
  Vectors.dense((1 to 7).map(i => line.get(i).toString.toDouble).toArray)
}
val rm = new RowMatrix(data)
val pc = rm.computePrincipalComponents(1)
// Project every row onto the first principal component.
val mx = rm.multiply(pc)
示例8: CsvWriter
//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.linalg.Matrix
/** Writes MLlib matrices to disk as comma-separated text. */
object CsvWriter {

  /**
   * Writes `matrix` to `filename`, one matrix row per line, with values separated by commas and
   * each line terminated by "\n".
   *
   * @param matrix   matrix to write
   * @param filename path of the file to create or overwrite
   */
  def writeMatrixToFile(matrix: Matrix, filename: String): Unit = {
    import java.io._

    val rows: List[Array[Double]] = matrix
      .transpose // Transpose since .toArray is column major
      .toArray
      .grouped(matrix.numCols)
      .toList

    val writer = new PrintWriter(new File(filename))
    // Close the file even when writing fails, so the handle is never leaked.
    try {
      rows.foreach(row => writer.write(row.mkString(",") + "\n"))
    } finally {
      writer.close()
    }
  }
}
示例9: ChiSqLearning
//设置package包名称以及导入依赖的类
package org.apache.spark.examples.mllib
import org.apache.spark.mllib.linalg.{ Matrix, Matrices, Vectors }
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{
SparkConf,
SparkContext
}
/** Prints the results of `Statistics.chiSqTest` for one vector and three matrices. */
object ChiSqLearning {

  def main(args: Array[String]): Unit = {
    // Chi-squared test on a single vector of values.
    val observed = Vectors.dense(1, 2, 3, 4, 5)
    val observedResult = Statistics.chiSqTest(observed)
    println(observed)
    println(observedResult)

    // Chi-squared test on a 3 x 2 matrix.
    printChiSqTest(Matrices.dense(3, 2, Array(1, 3, 5, 2, 4, 6)))

    println("-------------------------------")
    // Two further 2 x 2 matrices.
    Seq(
      Matrices.dense(2, 2, Array(19.0, 34, 24, 10.0)),
      Matrices.dense(2, 2, Array(26.0, 36, 7, 2.0))
    ).foreach(printChiSqTest)
  }

  /** Prints a separator line, then the matrix, then the result of `Statistics.chiSqTest` on it. */
  def printChiSqTest(matrix: Matrix): Unit = {
    println("-------------------------------")
    val testResult = Statistics.chiSqTest(matrix)
    println(matrix)
    println(testResult)
  }
}