本文整理汇总了Scala中org.apache.spark.ml.feature.CountVectorizer类的典型用法代码示例。如果您正苦于以下问题：Scala CountVectorizer类的具体用法？Scala CountVectorizer怎么用？Scala CountVectorizer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了CountVectorizer类的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: preprocess
//设置package包名称以及导入依赖的类
package functions
import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame
def preprocess(data: DataFrame): Pipeline = {
val spark = data.sparkSession
val params = new PreprocessParams
val indexModel = new StringIndexer()
.setHandleInvalid(params.handleInvalid)
.setInputCol("label")
.setOutputCol("indexedLabel")
.fit(data)
val cleaner = new Cleaner()
.setFanJian(params.fanjian)
.setQuanBan(params.quanban)
.setMinLineLen(params.minLineLen)
.setInputCol("content")
.setOutputCol("cleand")
val segmenter = new Segmenter()
.isAddNature(params.addNature)
.isDelEn(params.delEn)
.isDelNum(params.delNum)
.isNatureFilter(params.natureFilter)
.setMinTermLen(params.minTermLen)
.setMinTermNum(params.minTermNum)
.setSegType(params.segmentType)
.setInputCol(cleaner.getOutputCol)
.setOutputCol("segmented")
val stopwords = spark.sparkContext.textFile(params.stopwordFilePath).collect()
val remover = new StopWordsRemover()
.setStopWords(stopwords)
.setInputCol(segmenter.getOutputCol)
.setOutputCol("removed")
val vectorizer = new CountVectorizer()
.setMinTF(params.minTF)
.setVocabSize(params.vocabSize)
.setInputCol(remover.getOutputCol)
.setOutputCol("vectorized")
val idf = new IDF()
.setMinDocFreq(params.minDocFreq)
.setInputCol(vectorizer.getOutputCol)
.setOutputCol("features")
val stages = Array(cleaner, indexModel, segmenter, remover, vectorizer, idf)
new Pipeline().setStages(stages)
}
}
示例2: CountVectorizerDemo
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }
/**
 * Demo that fits a `CountVectorizer` on a small in-memory DataFrame of name
 * pairs and prints both the input and the vectorized output.
 */
object CountVectorizerDemo {

  /**
   * Creates a local SparkSession, runs the demo and always stops the session.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the app name "OneVsRestExample" looks copy-pasted from another
    // example; it is kept unchanged here - confirm whether it should be renamed.
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("OneVsRestExample")
      .getOrCreate()

    // Stop the session even when building, fitting or showing the data fails.
    try {
      val df = spark.createDataFrame(
        Seq(
          (0, Array("Jason", "David")),
          (1, Array("David", "Martin")),
          (2, Array("Martin", "Jason")),
          (3, Array("Jason", "Daiel")),
          (4, Array("Daiel", "Martin")),
          (5, Array("Moahmed", "Jason")),
          (6, Array("David", "David")),
          (7, Array("Jason", "Martin")))).toDF("id", "name")
      df.show(false)

      // fit a CountVectorizerModel from the corpus
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("name")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      val feature = cvModel.transform(df)
      feature.show(false)
    } finally {
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:40,代码来源:CountVectorizerDemo.scala
示例3: Model
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkConf
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._
/**
 * Trains a text classifier made of a tokenizer, a count vectorizer and a
 * logistic regression, saves it, and reports accuracy on a held-out split.
 */
class Model {
  // Converts the string "label" column read from the CSV into a Double.
  private val toDouble = udf[Double, String](_.toDouble)

  /**
   * Fits the pipeline on 80% of the input, saves the fitted model and prints the
   * accuracy measured on the remaining 20%.
   *
   * @param inputPath    path of a '|'-delimited CSV with a header row; the columns
   *                     "label" and "text" are read
   * @param outputFolder location the fitted pipeline is written to (overwritten)
   * @return the fitted pipeline model
   */
  def train(inputPath: String, outputFolder: String): PipelineModel = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("SentimentAnalysis")
    val ss = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()
    ss.sparkContext.setLogLevel("error")

    val rawDf = ss
      .read
      .option("header", true)
      .option("delimiter", "|")
      .csv(inputPath)
    val df = rawDf.withColumn("label", toDouble(rawDf("label")))
    val Array(train, test) = df.randomSplit(Array(0.8, 0.2))

    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val vectorizer = new CountVectorizer()
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(30)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, vectorizer, lr))

    val model = pipeline.fit(train)
    model.write.overwrite().save(outputFolder)

    val result = model
      .transform(test)
      .select("prediction", "label")
      .collect()
    // Count the matching rows directly instead of mutating a counter in a loop.
    val totalCorrect = result.count { case Row(prediction, label) => prediction == label }
    // Floating-point division: an empty test split yields NaN rather than throwing.
    val accuracy = totalCorrect.toDouble / result.length
    println(s"Accuracy: $accuracy")
    model
  }
}
/**
 * Application entry point: trains a `Model` and keeps the fitted result in `model`.
 */
object Model extends App {
  // NOTE(review): both paths are empty strings - presumably placeholders to be
  // replaced with real locations before running; confirm.
  private val inputPath = ""
  private val outputFolder = ""

  val model = new Model().train(inputPath, outputFolder)
}
示例4: CountVectorizer
//设置package包名称以及导入依赖的类
package com.lhcg.ml
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
/**
 * Example that fits a `CountVectorizerModel` on a two-row corpus and prints the
 * resulting "features" column.
 */
object CountVectorizer {

  /**
   * Creates the SparkContext, runs the example and always stops the context.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CountVectorizer")
    // .setMaster("local[2]")
    val spark = new SparkContext(conf)

    // Stop the context even when the example fails part-way through.
    try {
      val sqlContext = new SQLContext(spark)
      val df = sqlContext.createDataFrame(Seq(
        (0, Array("a", "b", "c")),
        (1, Array("a", "b", "b", "c", "a"))
      )).toDF("id", "words")

      // fit a CountVectorizerModel from the corpus
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("words")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      // alternatively, define CountVectorizerModel with a-priori vocabulary
      // (shown for illustration only; this model is not used below)
      val cvm = new CountVectorizerModel(Array("a", "b", "c"))
        .setInputCol("words")
        .setOutputCol("features")

      cvModel.transform(df).select("features").show()
    } finally {
      spark.stop()
    }
  }
}
示例5: CountVectorizerTest
//设置package包名称以及导入依赖的类
package com.github.leifker.spark.sentiment
import com.github.leifker.cassandra.config.CassandraConfig
import com.github.leifker.spark.CassandraSparkContext
import com.github.leifker.spark.config.CassandraSparkConfig
import com.github.leifker.spark.sentiment.test.UnitTest
import org.apache.spark.ml.feature.CountVectorizer
import org.scalatest.FlatSpec
/**
 * Checks which terms end up in the fitted `CountVectorizer` vocabulary when a
 * minimum document frequency or a minimum term frequency is configured.
 */
class CountVectorizerTest extends FlatSpec {
  val conf = CassandraSparkConfig(new CassandraConfig(), "local[4]")
  val context = new CassandraSparkContext(conf, "CountVectorizerTest")
  val sqlContext = context.session.sqlContext

  // Four tiny documents: "bad" and "good" each occur in two of them, while
  // "horrible" and "excellent" each occur twice within a single document.
  val df = sqlContext.createDataFrame(Seq(
    (0, Array("bad", "horrible", "horrible")),
    (1, Array("good", "excellent", "excellent")),
    (2, Array("good")),
    (3, Array("bad"))
  )).toDF("id", "words")

  /** A fresh vectorizer wired to the "words" -> "features" columns. */
  private def newVectorizer: CountVectorizer =
    new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")

  "CountVectorizer" should "apply min document frequency" taggedAs(UnitTest) in {
    val fitted = newVectorizer.setMinDF(2).fit(df)
    assert(fitted.vocabulary.toSet == Set("bad", "good"))
  }

  it should "apply min term frequency" taggedAs(UnitTest) in {
    val fitted = newVectorizer.setMinTF(2).fit(df)
    assert(fitted.vocabulary.toSet == Set("excellent", "horrible", "bad", "good"))
  }
}
示例6: AmazonReviewsIT
//设置package包名称以及导入依赖的类
package com.github.leifker.spark.sentiment
import com.github.leifker.spark.test.{ITest, ITestContext}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.FlatSpec
import org.scalatest.tagobjects.Slow
/**
 * Integration tests that sample one-star and five-star electronics reviews and
 * run them through tokenization and count vectorization.
 */
class AmazonReviewsIT extends FlatSpec {
  val amazonReviews = AmazonReviews(ITestContext.localConfig, ITestContext.amazonReviewsKeyspace, "IntegrationTest")

  // 20% samples (without replacement) of each rating bucket, cached for reuse.
  val oneStarReviews = amazonReviews.oneStarElectronics
    .sample(withReplacement = false, fraction = 0.2)
    .cache()
  val fiveStarReviews = amazonReviews.fiveStarElectronics
    .sample(withReplacement = false, fraction = 0.2)
    .cache()

  // Small mixed sample: 0.7% of the one-star plus 0.7% of the five-star reviews.
  val sampleReviews: Dataset[Row] = amazonReviews.oneStarElectronics
    .sample(withReplacement = false, fraction = 0.007)
    .union(amazonReviews.fiveStarElectronics.sample(withReplacement = false, fraction = 0.007))

  /** A tokenizer wired to the "text" -> "words" columns. */
  private def columnTokenizer =
    new ReviewTokenizer().setInputCol("text").setOutputCol("words")

  "Spark" should "be able to process text reviews of sample rows" taggedAs(ITest, Slow) in {
    val reviewTokenizer = new ReviewTokenizer()
    sampleReviews.foreach(row => reviewTokenizer.transform(row.getAs[String]("text")))
  }

  // NOTE(review): the test name mentions 500 but the assertion requires at least
  // 1000 rows - confirm which threshold is intended.
  it should "be able get at least a 500 sample" taggedAs(ITest, Slow) in {
    assert(sampleReviews.count() >= 1000)
  }

  it should "be able to tokenize" taggedAs(ITest, Slow) in {
    val withWords = columnTokenizer.transform(oneStarReviews)
    val firstRows = withWords.select("words", "score").take(1000)
    assert(firstRows.length == 1000)
  }

  it should "vectorize" taggedAs(ITest, Slow) in {
    val withWords = columnTokenizer.transform(oneStarReviews.limit(1000))
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(500)
      .setMinDF(10)
      .fit(withWords)
    cvModel.transform(withWords).select("features").show()
  }
}