当前位置: 首页>>代码示例>>Scala>>正文


Scala CountVectorizer类代码示例

本文整理汇总了Scala中org.apache.spark.ml.feature.CountVectorizer的典型用法代码示例。如果您正苦于以下问题:Scala CountVectorizer类的具体用法?Scala CountVectorizer怎么用?Scala CountVectorizer使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了CountVectorizer类的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: preprocess

//设置package包名称以及导入依赖的类
package functions

import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame


  /**
   * Assembles the text preprocessing pipeline: clean -> index labels -> segment ->
   * drop stop words -> count-vectorize -> IDF.
   *
   * All tunables are read from a fresh [[PreprocessParams]]. The label indexer is
   * fitted against `data` right here, and the stop-word file is read eagerly, so
   * calling this method already triggers Spark work.
   *
   * @param data input rows; the stages read the "label" and "content" columns
   * @return an unfitted [[Pipeline]] whose final output column is "features"
   */
  def preprocess(data: DataFrame): Pipeline = {
    val session = data.sparkSession
    val conf = new PreprocessParams

    // Fitted eagerly, so the pipeline carries a ready-made label-indexing model.
    val labelIndexer = new StringIndexer()
      .setHandleInvalid(conf.handleInvalid)
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    // NOTE(review): Cleaner is project code; the option names suggest
    // traditional/simplified and full/half-width normalisation -- confirm.
    val cleanStage = new Cleaner()
      .setFanJian(conf.fanjian)
      .setQuanBan(conf.quanban)
      .setMinLineLen(conf.minLineLen)
      .setInputCol("content")
      .setOutputCol("cleand")

    // Each later stage consumes the output column of the stage before it.
    val segmentStage = new Segmenter()
      .isAddNature(conf.addNature)
      .isDelEn(conf.delEn)
      .isDelNum(conf.delNum)
      .isNatureFilter(conf.natureFilter)
      .setMinTermLen(conf.minTermLen)
      .setMinTermNum(conf.minTermNum)
      .setSegType(conf.segmentType)
      .setInputCol(cleanStage.getOutputCol)
      .setOutputCol("segmented")

    // The stop-word list is pulled to the driver, one entry per line of the file.
    val stopwordList = session.sparkContext.textFile(conf.stopwordFilePath).collect()
    val stopwordStage = new StopWordsRemover()
      .setStopWords(stopwordList)
      .setInputCol(segmentStage.getOutputCol)
      .setOutputCol("removed")

    val countStage = new CountVectorizer()
      .setMinTF(conf.minTF)
      .setVocabSize(conf.vocabSize)
      .setInputCol(stopwordStage.getOutputCol)
      .setOutputCol("vectorized")

    val idfStage = new IDF()
      .setMinDocFreq(conf.minDocFreq)
      .setInputCol(countStage.getOutputCol)
      .setOutputCol("features")

    val orderedStages =
      Array(cleanStage, labelIndexer, segmentStage, stopwordStage, countStage, idfStage)
    new Pipeline().setStages(orderedStages)
  }
} 
开发者ID:yhao2014,项目名称:CkoocNLP,代码行数:60,代码来源:Preprocessor.scala

示例2: CountVectorizerDemo

//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }

/**
 * Demo: fits a [[CountVectorizer]] on a tiny in-memory corpus of name pairs and
 * prints the input DataFrame followed by the vectorized one.
 */
object CountVectorizerDemo {
  /**
   * Runs the demo on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      // Was the copy-pasted name "OneVsRestExample"; label the job after this demo.
      .appName("CountVectorizerDemo")
      .getOrCreate()

    // try/finally guarantees the session is released even if a step below throws.
    try {
      val df = spark.createDataFrame(
        Seq((0, Array("Jason", "David")),
          (1, Array("David", "Martin")),
          (2, Array("Martin", "Jason")),
          (3, Array("Jason", "Daiel")),
          (4, Array("Daiel", "Martin")),
          (5, Array("Moahmed", "Jason")),
          (6, Array("David", "David")),
          (7, Array("Jason", "Martin")))).toDF("id", "name")

      df.show(false)

      // Fit a CountVectorizerModel from the corpus: vocabulary capped at 3 terms,
      // minDF set to 2.
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("name")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      val feature = cvModel.transform(df)
      feature.show(false)
    } finally {
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:40,代码来源:CountVectorizerDemo.scala

示例3: Model

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkConf
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._

/**
 * Trains a tokenizer -> count-vectorizer -> logistic-regression pipeline on a
 * '|'-delimited CSV file that has a header row with "text" and "label" columns.
 */
class Model {

  // Parses the string "label" column into a Double; throws on non-numeric input.
  private val toDouble = udf[Double, String](_.toDouble)

  /**
   * Fits the pipeline on a random 80% split, saves it, and prints the accuracy
   * measured on the remaining 20%.
   *
   * @param inputPath    location of the '|'-delimited CSV training data
   * @param outputFolder where the fitted pipeline is written (overwritten if present)
   * @return the fitted pipeline model
   */
  def train(inputPath: String, outputFolder: String): PipelineModel = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("SentimentAnalysis")
    val ss = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()
    ss.sparkContext.setLogLevel("error")
    val rawDf = ss
      .read
      .option("header", true)
      .option("delimiter", "|")
      .csv(inputPath)
    val df = rawDf.withColumn("label", toDouble(rawDf("label")))
    // Named trainSet/testSet so the locals do not shadow this method's own name.
    val Array(trainSet, testSet) = df.randomSplit(Array(0.8, 0.2))
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val vectorizer = new CountVectorizer()
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(30)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, vectorizer, lr))

    val model = pipeline.fit(trainSet)
    model.write.overwrite().save(outputFolder)

    val predictions = model
      .transform(testSet)
      .select("prediction", "label")
      .collect()

    // Count matches directly instead of mutating an accumulator from foreach.
    val correct = predictions.count { case Row(prediction, label) => prediction == label }
    // A tiny input can leave the test split empty; avoid printing NaN from 0.0 / 0.
    if (predictions.isEmpty) {
      println("Accuracy: n/a (test split is empty)")
    } else {
      val accuracy = correct.toDouble / predictions.length
      println(s"Accuracy: $accuracy")
    }

    model
  }

}

/**
 * Runnable entry point: trains a [[Model]] and exposes the result as `model`.
 * NOTE(review): both paths are empty strings here -- presumably placeholders
 * to be replaced with real locations before running.
 */
object Model extends App {
  private val trainer = new Model()
  val model = trainer.train("", "")
}
开发者ID:a-panchenko,项目名称:scala-sentiment-analysis,代码行数:61,代码来源:Model.scala

示例4: CountVectorizer

//设置package包名称以及导入依赖的类
package com.lhcg.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}


/**
 * Demo: fits a CountVectorizerModel on a two-document corpus and prints the
 * resulting "features" column.
 *
 * Note that this object shares its simple name with the imported Spark class;
 * `new CountVectorizer()` below refers to the Spark class.
 */
object CountVectorizer {
  /**
   * Runs the demo.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // No master is set in code -- presumably supplied via spark-submit.
    // For a local run, add: .setMaster("local[2]")
    val conf = new SparkConf().setAppName("CountVectorizer")
    val sc = new SparkContext(conf)

    // try/finally guarantees the context is stopped even if a step below throws.
    try {
      val sqlContext = new SQLContext(sc)

      val df = sqlContext.createDataFrame(Seq(
        (0, Array("a", "b", "c")),
        (1, Array("a", "b", "b", "c", "a"))
      )).toDF("id", "words")

      // fit a CountVectorizerModel from the corpus
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("words")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      // alternatively, define CountVectorizerModel with a-priori vocabulary
      // (kept for illustration only; it is not used for the output below)
      val cvm = new CountVectorizerModel(Array("a", "b", "c"))
        .setInputCol("words")
        .setOutputCol("features")

      cvModel.transform(df).select("features").show()
    } finally {
      sc.stop()
    }
  }
}
开发者ID:lhcg,项目名称:lovespark,代码行数:36,代码来源:CountVectorizer.scala

示例5: CountVectorizerTest

//设置package包名称以及导入依赖的类
package com.github.leifker.spark.sentiment

import com.github.leifker.cassandra.config.CassandraConfig
import com.github.leifker.spark.CassandraSparkContext
import com.github.leifker.spark.config.CassandraSparkConfig
import com.github.leifker.spark.sentiment.test.UnitTest
import org.apache.spark.ml.feature.CountVectorizer
import org.scalatest.FlatSpec


/**
 * Unit tests for the minDF and minTF parameters of Spark's [[CountVectorizer]],
 * run against a four-document in-memory corpus.
 */
class CountVectorizerTest extends FlatSpec {

  val conf = CassandraSparkConfig(new CassandraConfig(), "local[4]")
  val context = new CassandraSparkContext(conf, "CountVectorizerTest")
  val sqlContext = context.session.sqlContext

  // "bad" and "good" each occur in two documents; "horrible" and "excellent"
  // each occur twice, but within a single document.
  val df = sqlContext.createDataFrame(Seq(
    (0, Array("bad", "horrible", "horrible")),
    (1, Array("good", "excellent", "excellent")),
    (2, Array("good")),
    (3, Array("bad"))
  )).toDF("id", "words")

  "CountVectorizer" should "apply min document frequency" taggedAs(UnitTest) in {
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinDF(2)

    assert(countVectorizer.fit(df).vocabulary.toSet == Set("bad", "good"))
  }

  it should "apply min term frequency" taggedAs(UnitTest) in {
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinTF(2)

    val model = countVectorizer.fit(df)

    // minTF does not influence fitting, so the vocabulary still holds every term.
    assert(model.vocabulary.toSet == Set("excellent", "horrible", "bad", "good"))

    // minTF takes effect in transform: per document, only terms occurring at
    // least twice survive. Map each document id to the terms left in its vector.
    val keptTerms: Map[Int, Set[String]] = model
      .transform(df)
      .select("id", "features")
      .collect()
      .map { row =>
        val features = row.getAs[org.apache.spark.ml.linalg.Vector]("features")
        row.getAs[Int]("id") -> features.toSparse.indices.map(model.vocabulary(_)).toSet
      }
      .toMap

    assert(keptTerms == Map(
      0 -> Set("horrible"),
      1 -> Set("excellent"),
      2 -> Set.empty[String],
      3 -> Set.empty[String]
    ))
  }
}
开发者ID:leifker,项目名称:geo-sentiment,代码行数:42,代码来源:CountVectorizerTest.scala

示例6: AmazonReviewsIT

//设置package包名称以及导入依赖的类
package com.github.leifker.spark.sentiment

import com.github.leifker.spark.test.{ITest, ITestContext}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.FlatSpec
import org.scalatest.tagobjects.Slow


/**
 * Integration tests that tokenize and count-vectorize samples of the Amazon
 * electronics reviews loaded through the project's `AmazonReviews` helper.
 */
class AmazonReviewsIT extends FlatSpec {
  val amazonReviews = AmazonReviews(ITestContext.localConfig, ITestContext.amazonReviewsKeyspace, "IntegrationTest")

  // 20% samples (without replacement) of each rating bucket, cached for reuse.
  val oneStarReviews = amazonReviews.oneStarElectronics
    .sample(false, 0.2)
    .cache()
  val fiveStarReviews = amazonReviews.fiveStarElectronics
    .sample(false, 0.2)
    .cache()

  // Much smaller mixed sample of one-star and five-star reviews; not cached.
  val sampleReviews: Dataset[Row] = amazonReviews.oneStarElectronics.sample(false, 0.007)
    .union(amazonReviews.fiveStarElectronics.sample(false, 0.007))

  /** Builds a tokenizer that reads the "text" column and writes "words". */
  private def textToWordsTokenizer() =
    new ReviewTokenizer().setInputCol("text").setOutputCol("words")

  "Spark" should "be able to process text reviews of sample rows" taggedAs(ITest, Slow) in {
    // Kept as a local so the closure below captures only the tokenizer itself.
    val rowTokenizer = new ReviewTokenizer()
    sampleReviews.foreach(review => rowTokenizer.transform(review.getAs[String]("text")))
  }

  // NOTE(review): the title says 500 but the assertion requires 1000 -- confirm
  // which threshold is intended.
  it should "be able get at least a 500 sample" taggedAs(ITest, Slow) in {
    val sampleSize = sampleReviews.count()
    assert(sampleSize >= 1000)
  }

  it should "be able to tokenize" taggedAs(ITest, Slow) in {
    val withWords = textToWordsTokenizer().transform(oneStarReviews)
    val firstThousand = withWords.select("words", "score").take(1000)
    assert(firstThousand.length == 1000)
  }

  it should "vectorize" taggedAs(ITest, Slow) in {
    val withWords = textToWordsTokenizer().transform(oneStarReviews.limit(1000))
    val vectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(500)
      .setMinDF(10)
    val cvModel: CountVectorizerModel = vectorizer.fit(withWords)

    cvModel.transform(withWords).select("features").show()
  }
}
开发者ID:leifker,项目名称:geo-sentiment,代码行数:49,代码来源:AmazonReviewsIT.scala


注:本文中的org.apache.spark.ml.feature.CountVectorizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。