本文整理汇总了Scala中org.apache.spark.ml.feature.StopWordsRemover类的典型用法代码示例。如果您正苦于以下问题：Scala StopWordsRemover类的具体用法？Scala StopWordsRemover怎么用？Scala StopWordsRemover使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StopWordsRemover类的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: preprocess
//设置package包名称以及导入依赖的类
package functions
import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame
/**
 * Assembles the text-preprocessing pipeline for `data`.
 *
 * The label indexer is fitted on `data` right away; every other stage is only
 * configured here. The returned [[Pipeline]] itself is not fitted.
 *
 * Stage order: cleaner, label indexer, segmenter, stop-word remover,
 * count vectorizer, IDF. Each text stage reads the output column of the
 * stage before it.
 *
 * @param data input frame; the stages read its "label" and "content" columns
 * @return an unfitted pipeline holding the six configured stages
 */
def preprocess(data: DataFrame): Pipeline = {
  val session = data.sparkSession
  val conf = new PreprocessParams

  // Fitted eagerly so the pipeline carries a ready-made label index model.
  val labelIndexer = new StringIndexer()
    .setHandleInvalid(conf.handleInvalid)
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data)

  val textCleaner = new Cleaner()
    .setFanJian(conf.fanjian)
    .setQuanBan(conf.quanban)
    .setMinLineLen(conf.minLineLen)
    .setInputCol("content")
    .setOutputCol("cleand")

  val wordSegmenter = new Segmenter()
    .isAddNature(conf.addNature)
    .isDelEn(conf.delEn)
    .isDelNum(conf.delNum)
    .isNatureFilter(conf.natureFilter)
    .setMinTermLen(conf.minTermLen)
    .setMinTermNum(conf.minTermNum)
    .setSegType(conf.segmentType)
    .setInputCol(textCleaner.getOutputCol)
    .setOutputCol("segmented")

  // The stop-word list is read through Spark and collected to the driver.
  val stopWordList = session.sparkContext.textFile(conf.stopwordFilePath).collect()
  val stopWordFilter = new StopWordsRemover()
    .setStopWords(stopWordList)
    .setInputCol(wordSegmenter.getOutputCol)
    .setOutputCol("removed")

  val termCounter = new CountVectorizer()
    .setMinTF(conf.minTF)
    .setVocabSize(conf.vocabSize)
    .setInputCol(stopWordFilter.getOutputCol)
    .setOutputCol("vectorized")

  val idfWeighting = new IDF()
    .setMinDocFreq(conf.minDocFreq)
    .setInputCol(termCounter.getOutputCol)
    .setOutputCol("features")

  new Pipeline().setStages(
    Array(textCleaner, labelIndexer, wordSegmenter, stopWordFilter, termCounter, idfWeighting)
  )
}
}
示例2: CooccurrenceTokenizer
//设置package包名称以及导入依赖的类
package com.indix.ml2npy.text
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover}
/**
 * A [[RegexTokenizer]] that emits token co-occurrence pairs instead of tokens.
 *
 * For each input string the parent tokenizer is applied, duplicate tokens and
 * default English stop words are dropped, and every unordered pair of the
 * remaining tokens is emitted once as `"<first>_<second>"`.
 */
class CooccurrenceTokenizer extends RegexTokenizer {

  /**
   * Builds the per-string transform function.
   *
   * The stop-word set and the parent's tokenizing function are created once
   * here rather than once per input string, so the stop-word resource is not
   * reloaded for every row.
   *
   * @return a function from raw text to the sequence of co-occurrence keys
   */
  protected override def createTransformFunc: (String) => Seq[String] = {
    val stopWordSet: Set[String] = StopWordsRemover.loadDefaultStopWords("english").toSet
    val tokenize: String => Seq[String] = super.createTransformFunc

    (input: String) => {
      val distinctTokens: Array[String] = tokenize(input).toSet.toArray
      val kept: Array[String] = distinctTokens.filterNot(stopWordSet.contains)

      // Every pair (i, j) with i < j is visited exactly once.
      // NOTE(review): pair order follows the de-duplicated set's iteration order,
      // not lexical order, so "a_b" vs "b_a" is not canonicalised — confirm intent.
      val pairs = for {
        i <- kept.indices
        j <- (i + 1) until kept.length
      } yield s"${kept(i)}_${kept(j)}"

      pairs
    }
  }
}
示例3: LocalStopWordsRemover
//设置package包名称以及导入依赖的类
package io.hydrosphere.spark_ml_serving.preprocessors
import io.hydrosphere.spark_ml_serving._
import org.apache.spark.ml.feature.StopWordsRemover
/**
 * Local (non-Spark) execution of a configured [[StopWordsRemover]].
 *
 * @param sparkTransformer the Spark transformer whose input/output columns,
 *                         stop words and case-sensitivity flag are used
 */
class LocalStopWordsRemover(override val sparkTransformer: StopWordsRemover) extends LocalTransformer[StopWordsRemover] {

  /**
   * Filters stop words out of every row of the transformer's input column and
   * stores the result under the transformer's output column.
   *
   * @param localData the data set to transform
   * @return `localData` with the output column added, or `localData` unchanged
   *         when the input column is not present
   */
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        // Null-safe lower-casing: null tokens are passed through untouched.
        val toLower = (s: String) => if (s != null) s.toLowerCase else s

        // Choose the predicate once, and back it with a Set so each token lookup
        // is a hash probe instead of a linear scan over the stop-word array.
        val keep: String => Boolean =
          if (sparkTransformer.getCaseSensitive) {
            val stopWords: Set[String] = sparkTransformer.getStopWords.toSet
            (token: String) => !stopWords.contains(token)
          } else {
            val lowerStopWords: Set[String] = sparkTransformer.getStopWords.map(toLower).toSet
            (token: String) => !lowerStopWords.contains(toLower(token))
          }

        // Each row is cast to List[String], exactly as before.
        val newData = column.data.map(row => row.asInstanceOf[List[String]].filter(keep))
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}
/** Companion that rebuilds a [[StopWordsRemover]] from metadata and wraps it for local use. */
object LocalStopWordsRemover extends LocalModel[StopWordsRemover] {

  /**
   * Recreates a [[StopWordsRemover]] from the entries of `metadata.paramMap`.
   *
   * @param metadata supplies the uid and the "inputCol", "outputCol",
   *                 "caseSensitive" and "stopWords" parameters
   * @param data     not used by this loader
   * @return the configured transformer
   */
  override def load(metadata: Metadata, data: Map[String, Any]): StopWordsRemover = {
    val remover = new StopWordsRemover(metadata.uid)
    remover.setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
    remover.setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    remover.setCaseSensitive(metadata.paramMap("caseSensitive").asInstanceOf[Boolean])
    remover.setStopWords(metadata.paramMap("stopWords").asInstanceOf[List[String]].toArray)
    remover
  }

  /** Implicit wrapper from a Spark [[StopWordsRemover]] to its local transformer. */
  override implicit def getTransformer(transformer: StopWordsRemover): LocalTransformer[StopWordsRemover] = {
    new LocalStopWordsRemover(transformer)
  }
}
示例4: StopWordsRemoverExample
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover
/**
 * Demo: tokenizes a few sentences with a [[RegexTokenizer]] and then removes
 * stop words with a default-configured [[StopWordsRemover]].
 */
object StopWordsRemoverExample {

  /**
   * Runs the demo on a local Spark session and prints the "id" and "filtered"
   * columns.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      // Previously "OneVsRestExample", a copy-paste leftover from another example.
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      // Split on runs of non-word characters; gaps = true means the pattern
      // matches the separators, not the tokens.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)
      val regexTokenized = regexTokenizer.transform(sentence)

      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered")
      val newDF = remover.transform(regexTokenized)
      newDF.select("id", "filtered").show(false)
    } finally {
      // Release the session even if the demo fails part-way through.
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:41,代码来源:StopWordsRemoverExample.scala
示例5: LocalStopWordsRemover
//设置package包名称以及导入依赖的类
package io.hydrosphere.mist.api.ml.preprocessors
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.feature.StopWordsRemover
/**
 * Local (non-Spark) execution of a configured [[StopWordsRemover]].
 *
 * @param sparkTransformer the Spark transformer whose input/output columns,
 *                         stop words and case-sensitivity flag are used
 */
class LocalStopWordsRemover(override val sparkTransformer: StopWordsRemover) extends LocalTransformer[StopWordsRemover] {

  /**
   * Filters stop words out of every row of the transformer's input column and
   * stores the result under the transformer's output column.
   *
   * @param localData the data set to transform
   * @return `localData` with the output column added, or `localData` unchanged
   *         when the input column is not present
   */
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        // Null-safe lower-casing: null tokens are passed through untouched.
        val toLower = (s: String) => if (s != null) s.toLowerCase else s

        // Choose the predicate once, and back it with a Set so each token lookup
        // is a hash probe instead of a linear scan over the stop-word array.
        val keep: String => Boolean =
          if (sparkTransformer.getCaseSensitive) {
            val stopWords: Set[String] = sparkTransformer.getStopWords.toSet
            (token: String) => !stopWords.contains(token)
          } else {
            val lowerStopWords: Set[String] = sparkTransformer.getStopWords.map(toLower).toSet
            (token: String) => !lowerStopWords.contains(toLower(token))
          }

        // Each row is cast to List[String], exactly as before.
        val newData = column.data.map(row => row.asInstanceOf[List[String]].filter(keep))
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}
/** Companion that rebuilds a [[StopWordsRemover]] from metadata and wraps it for local use. */
object LocalStopWordsRemover extends LocalModel[StopWordsRemover] {

  /**
   * Recreates a [[StopWordsRemover]] from the entries of `metadata.paramMap`.
   *
   * @param metadata supplies the uid and the "inputCol", "outputCol",
   *                 "caseSensitive" and "stopWords" parameters
   * @param data     not used by this loader
   * @return the configured transformer
   */
  override def load(metadata: Metadata, data: Map[String, Any]): StopWordsRemover = {
    val remover = new StopWordsRemover(metadata.uid)
    remover.setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
    remover.setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    remover.setCaseSensitive(metadata.paramMap("caseSensitive").asInstanceOf[Boolean])
    remover.setStopWords(metadata.paramMap("stopWords").asInstanceOf[List[String]].toArray)
    remover
  }

  /** Implicit wrapper from a Spark [[StopWordsRemover]] to its local transformer. */
  override implicit def getTransformer(transformer: StopWordsRemover): LocalTransformer[StopWordsRemover] = {
    new LocalStopWordsRemover(transformer)
  }
}
示例6: StopWordsRemoverJob
//设置package包名称以及导入依赖的类
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.sql.SparkSession
/**
 * Mist ML job that trains a one-stage [[StopWordsRemover]] pipeline and serves
 * it locally.
 */
object StopWordsRemoverJob extends MLMistJob {

  /** Gets or creates a Spark session configured from the job context. */
  def session: SparkSession = {
    val builder = SparkSession.builder()
    builder
      .appName(context.appName)
      .config(context.getConf)
      .getOrCreate()
  }

  /**
   * Fits a pipeline containing a single stop-word remover on a two-row sample
   * frame and saves it, overwriting anything already at `savePath`.
   *
   * @param savePath where the fitted pipeline model is written
   * @return always an empty map
   */
  def train(savePath: String): Map[String, Any] = {
    val samples = Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )
    val trainingFrame = session.createDataFrame(samples).toDF("id", "raw")

    val stopWordFilter = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")

    val fitted = new Pipeline().setStages(Array(stopWordFilter)).fit(trainingFrame)
    fitted.write.overwrite().save(savePath)

    Map.empty[String, Any]
  }

  /**
   * Loads the saved pipeline and applies it to `features` without a Spark job.
   *
   * @param modelPath location of a previously saved pipeline model
   * @param features  token lists, supplied as the "raw" column
   * @return a map whose "result" entry holds the "raw" and "filtered" columns
   *         as a list of maps
   */
  def serve(modelPath: String, features: List[List[String]]): Map[String, Any] = {
    import LocalPipelineModel._

    val loadedPipeline = PipelineLoader.load(modelPath)
    val input = LocalData(LocalDataColumn("raw", features))
    val output: LocalData = loadedPipeline.transform(input)

    Map("result" -> output.select("raw", "filtered").toMapList)
  }
}