This page collects typical usage examples of the Scala class org.apache.spark.ml.feature.CountVectorizerModel. If you are wondering what the CountVectorizerModel class does, how to use it, or want working examples, the curated snippets below should help.
Five code examples of the CountVectorizerModel class are shown below, sorted by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Scala examples.
Example 1: CountVectorizerDemo
// Set the package name and import the required classes
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/") // local warehouse dir for this demo
      .appName("CountVectorizerDemo")
      .getOrCreate()

    val df = spark.createDataFrame(Seq(
      (0, Array("Jason", "David")),
      (1, Array("David", "Martin")),
      (2, Array("Martin", "Jason")),
      (3, Array("Jason", "Daiel")),
      (4, Array("Daiel", "Martin")),
      (5, Array("Moahmed", "Jason")),
      (6, Array("David", "David")),
      (7, Array("Jason", "Martin")))).toDF("id", "name")
    df.show(false)

    // Fit a CountVectorizerModel from the corpus.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
}
Author: PacktPublishing · Project: Scala-and-Spark-for-Big-Data-Analytics · Source: CountVectorizerDemo.scala
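Continuing from the example above (a sketch; the exact ordering shown is an assumption, since Spark ranks vocabulary terms by corpus-wide term frequency): with setVocabSize(3) and setMinDF(2), only the three most frequent names that appear in at least two rows survive, and the fitted model can be inspected like this:

// Inspect the learned vocabulary; with vocabSize = 3 and minDF = 2 the
// expected survivors are "Jason", "Martin" and "David" (order may vary).
println(cvModel.vocabulary.mkString(", "))
// Each row of `feature` holds a sparse count vector over that vocabulary,
// e.g. (3,[0,1],[1.0,1.0]) means terms 0 and 1 each occurred once.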
Example 2: CountVectorizer
// Set the package name and import the required classes
package com.lhcg.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

object CountVectorizer {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CountVectorizer")
    // .setMaster("local[2]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // Fit a CountVectorizerModel from the corpus.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()

    sc.stop()
  }
}
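Note that the a-priori model cvm above is constructed but never applied. A short follow-up (a sketch reusing the df and cvm defined in the example) shows that it transforms exactly like the fitted model, just with a fixed vocabulary and no fitting step:

// Transform with the a-priori vocabulary ["a", "b", "c"]; indices follow
// the array order, so a = 0, b = 1, c = 2.
cvm.transform(df).select("features").show(false)
// Row 1 ("a","b","b","c","a") becomes (3,[0,1,2],[2.0,2.0,1.0]).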
Example 3: LocalCountVectorizerModel
// Set the package name and import the required classes
package io.hydrosphere.mist.api.ml.preprocessors

import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.linalg.Vectors
import scala.collection.mutable

class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel) extends LocalTransformer[CountVectorizerModel] {
  override def transform(localData: LocalData): LocalData = {
    val dict = sparkTransformer.vocabulary.zipWithIndex.toMap
    val minTf = sparkTransformer.getMinTF
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newCol = column.data.map { data =>
          // Count in-vocabulary terms; tokenCount tracks all tokens,
          // including out-of-vocabulary ones.
          val termCounts = mutable.HashMap.empty[Int, Double]
          var tokenCount = 0L
          val arr = data.asInstanceOf[List[String]]
          arr.foreach { token =>
            dict.get(token).foreach { index =>
              val storedValue = termCounts.getOrElseUpdate(index, 0.0)
              termCounts.update(index, storedValue + 1.0)
            }
            tokenCount += 1
          }
          // minTF >= 1.0 is an absolute count; below 1.0 it is a fraction
          // of the document's length.
          val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf
          val eCounts = if (sparkTransformer.getBinary) {
            termCounts.filter(_._2 >= eTF).map(_._1 -> 1.0).toSeq
          } else {
            termCounts.filter(_._2 >= eTF).toSeq
          }
          Vectors.sparse(dict.size, eCounts.toList)
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol))
      case None => localData
    }
  }
}

object LocalCountVectorizerModel extends LocalModel[CountVectorizerModel] {
  override def load(metadata: Metadata, data: Map[String, Any]): CountVectorizerModel = {
    val vocabulary = data("vocabulary").asInstanceOf[List[String]].toArray
    val inst = new CountVectorizerModel(metadata.uid, vocabulary)
    inst
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)
      .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean])
      .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble)
      .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble)
      .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue())
  }

  override implicit def getTransformer(transformer: CountVectorizerModel): LocalTransformer[CountVectorizerModel] =
    new LocalCountVectorizerModel(transformer)
}
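To make the minTF handling above concrete, here is a small self-contained sketch of the same counting and thresholding logic, with hypothetical values and none of the Mist-specific types:

import scala.collection.mutable

object MinTfSketch extends App {
  val vocabulary = Array("a", "b", "c")  // hypothetical vocabulary
  val dict = vocabulary.zipWithIndex.toMap
  val tokens = List("a", "a", "b", "d")  // "d" is out-of-vocabulary
  val minTf = 0.5                        // fractional minTF: a per-document ratio

  val termCounts = mutable.HashMap.empty[Int, Double]
  tokens.foreach { t =>
    dict.get(t).foreach { i =>
      termCounts(i) = termCounts.getOrElse(i, 0.0) + 1.0
    }
  }

  // Below 1.0 minTF is a fraction of the document length:
  // threshold = 4 tokens * 0.5 = 2.0 occurrences.
  val eTF = if (minTf >= 1.0) minTf else tokens.size * minTf
  val kept = termCounts.filter(_._2 >= eTF).toSeq
  println(kept)  // only index 0 ("a", 2.0 occurrences) survives
}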
Example 4: AmazonReviewsIT
// Set the package name and import the required classes
package com.github.leifker.spark.sentiment

import com.github.leifker.spark.test.{ITest, ITestContext}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.FlatSpec
import org.scalatest.tagobjects.Slow

class AmazonReviewsIT extends FlatSpec {
  val amazonReviews = AmazonReviews(ITestContext.localConfig, ITestContext.amazonReviewsKeyspace, "IntegrationTest")
  val oneStarReviews = amazonReviews.oneStarElectronics
    .sample(false, 0.2)
    .cache()
  val fiveStarReviews = amazonReviews.fiveStarElectronics
    .sample(false, 0.2)
    .cache()
  val sampleReviews: Dataset[Row] = amazonReviews.oneStarElectronics.sample(false, 0.007)
    .union(amazonReviews.fiveStarElectronics.sample(false, 0.007))

  "Spark" should "be able to process text reviews of sample rows" taggedAs(ITest, Slow) in {
    val tokenizer = new ReviewTokenizer()
    sampleReviews.foreach(row => tokenizer.transform(row.getAs[String]("text")))
  }

  it should "be able to get a sample of at least 1000 rows" taggedAs(ITest, Slow) in {
    assert(sampleReviews.count() >= 1000)
  }

  it should "be able to tokenize" taggedAs(ITest, Slow) in {
    val tokenizer = new ReviewTokenizer().setInputCol("text").setOutputCol("words")
    val tokenized = tokenizer.transform(oneStarReviews)
    assert(tokenized.select("words", "score").take(1000).length == 1000)
  }

  it should "vectorize" taggedAs(ITest, Slow) in {
    val tokenizer = new ReviewTokenizer().setInputCol("text").setOutputCol("words")
    val tokenized = tokenizer.transform(oneStarReviews.limit(1000))
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(500)
      .setMinDF(10)
      .fit(tokenized)
    cvModel.transform(tokenized).select("features").show()
  }
}
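ReviewTokenizer is project-specific; assuming it is a standard Spark ML PipelineStage, the tokenize and vectorize steps from the tests could equally be wired as a single Pipeline. A sketch (not part of the original test, reusing its imports and oneStarReviews):

import org.apache.spark.ml.Pipeline

// One fit() call now covers both stages.
val pipeline = new Pipeline().setStages(Array(
  new ReviewTokenizer().setInputCol("text").setOutputCol("words"),
  new CountVectorizer()
    .setInputCol("words")
    .setOutputCol("features")
    .setVocabSize(500)
    .setMinDF(10)
))
val pipelineModel = pipeline.fit(oneStarReviews.limit(1000))
pipelineModel.transform(oneStarReviews.limit(1000)).select("features").show()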
Example 5: LocalCountVectorizerModel (the same transformer as Example 3, ported to the io.hydrosphere.spark_ml_serving package)
// Set the package name and import the required classes
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving._
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.linalg.Vectors
import scala.collection.mutable

class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel) extends LocalTransformer[CountVectorizerModel] {
  override def transform(localData: LocalData): LocalData = {
    val dict = sparkTransformer.vocabulary.zipWithIndex.toMap
    val minTf = sparkTransformer.getMinTF
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newCol = column.data.map { data =>
          val termCounts = mutable.HashMap.empty[Int, Double]
          var tokenCount = 0L
          val arr = data.asInstanceOf[List[String]]
          arr.foreach { token =>
            dict.get(token).foreach { index =>
              val storedValue = termCounts.getOrElseUpdate(index, 0.0)
              termCounts.update(index, storedValue + 1.0)
            }
            tokenCount += 1
          }
          val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf
          val eCounts = if (sparkTransformer.getBinary) {
            termCounts.filter(_._2 >= eTF).map(_._1 -> 1.0).toSeq
          } else {
            termCounts.filter(_._2 >= eTF).toSeq
          }
          Vectors.sparse(dict.size, eCounts.toList)
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol))
      case None => localData
    }
  }
}

object LocalCountVectorizerModel extends LocalModel[CountVectorizerModel] {
  override def load(metadata: Metadata, data: Map[String, Any]): CountVectorizerModel = {
    val vocabulary = data("vocabulary").asInstanceOf[List[String]].toArray
    val inst = new CountVectorizerModel(metadata.uid, vocabulary)
    inst
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)
      .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean])
      .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble)
      .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble)
      .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue())
  }

  override implicit def getTransformer(transformer: CountVectorizerModel): LocalTransformer[CountVectorizerModel] =
    new LocalCountVectorizerModel(transformer)
}
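The getBinary branch in transform() maps every surviving count to 1.0. A tiny self-contained sketch with hypothetical counts shows the difference between the two branches:

object BinarySketch extends App {
  // Hypothetical term counts that already passed the eTF threshold.
  val counts = Map(0 -> 2.0, 2 -> 3.0)
  val binary = counts.map { case (i, _) => i -> 1.0 }.toSeq  // Seq((0,1.0), (2,1.0))
  val raw    = counts.toSeq                                  // Seq((0,2.0), (2,3.0))
  println(binary)
  println(raw)
  // Either sequence feeds Vectors.sparse(vocabSize, ...) as in transform() above.
}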