本文整理汇总了 Scala 中 org.apache.spark.ml.feature.IDF 类的典型用法代码示例。如果您正苦于以下问题:Scala IDF 类的具体用法?Scala IDF 怎么用?Scala IDF 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 IDF 类的 5 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Scala 代码示例。
示例1: preprocess
//设置package包名称以及导入依赖的类
package functions
import config.paramconf.PreprocessParams
import functions.clean.Cleaner
import functions.segment.Segmenter
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{CountVectorizer, IDF, StopWordsRemover, StringIndexer}
import org.apache.spark.sql.DataFrame
/**
 * Assembles the text preprocessing pipeline: cleaning, label indexing,
 * segmentation, stop-word removal, count vectorization and IDF weighting.
 *
 * The label indexer is fitted on `data` while the pipeline is being built;
 * all other stages are returned unfitted. The stop-word list is read from
 * the path configured in [[PreprocessParams]] and collected to the driver.
 *
 * @param data input frame; the stages read its "label" and "content" columns
 * @return a [[Pipeline]] whose last stage writes the "features" column
 */
def preprocess(data: DataFrame): Pipeline = {
  val session = data.sparkSession
  val settings = new PreprocessParams

  // Fitted eagerly, so the pipeline carries a ready-made label indexing model.
  val labelIndexer = new StringIndexer()
    .setHandleInvalid(settings.handleInvalid)
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data)

  // The output column name "cleand" is kept exactly as the original spelled it.
  val textCleaner = new Cleaner()
    .setFanJian(settings.fanjian)
    .setQuanBan(settings.quanban)
    .setMinLineLen(settings.minLineLen)
    .setInputCol("content")
    .setOutputCol("cleand")

  val wordSegmenter = new Segmenter()
    .isAddNature(settings.addNature)
    .isDelEn(settings.delEn)
    .isDelNum(settings.delNum)
    .isNatureFilter(settings.natureFilter)
    .setMinTermLen(settings.minTermLen)
    .setMinTermNum(settings.minTermNum)
    .setSegType(settings.segmentType)
    .setInputCol(textCleaner.getOutputCol)
    .setOutputCol("segmented")

  // Load the stop-word file and bring its lines to the driver.
  val stopwordLines = session.sparkContext.textFile(settings.stopwordFilePath)
  val stopwordList = stopwordLines.collect()

  val stopwordFilter = new StopWordsRemover()
    .setStopWords(stopwordList)
    .setInputCol(wordSegmenter.getOutputCol)
    .setOutputCol("removed")

  val termCounter = new CountVectorizer()
    .setMinTF(settings.minTF)
    .setVocabSize(settings.vocabSize)
    .setInputCol(stopwordFilter.getOutputCol)
    .setOutputCol("vectorized")

  val idfWeighter = new IDF()
    .setMinDocFreq(settings.minDocFreq)
    .setInputCol(termCounter.getOutputCol)
    .setOutputCol("features")

  // Stage order is significant: each stage consumes the previous stage's output column.
  new Pipeline().setStages(
    Array(textCleaner, labelIndexer, wordSegmenter, stopwordFilter, termCounter, idfWeighter)
  )
}
}
示例2: HashingTF
//设置package包名称以及导入依赖的类
package com.lhcg.ml
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SQLContext
/**
 * TF-IDF demo: tokenizes three sample sentences, hashes the tokens into
 * 20-feature term-frequency vectors, rescales them with a fitted IDF model
 * and prints the resulting rows.
 */
object HashingTF {
  /**
   * Runs the demo job.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HashingTF")
    // .setMaster("local[2]")
    val spark = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(spark)

      val sentenceData = sqlContext.createDataFrame(Seq(
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
      )).toDF("label", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      val hashingTF = new HashingTF()
        .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
      val featurizedData = hashingTF.transform(wordsData)

      // IDF is an estimator: it has to be fitted before it can rescale the vectors.
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      rescaledData.select("features", "label").take(3).foreach(println)
    } finally {
      // Always release the context, even when one of the stages above throws.
      spark.stop()
    }
  }
}
示例3: MllibDemo
//设置package包名称以及导入依赖的类
package com.wallace.spark.sparkmllibdemo
import com.wallace.common.LogSupport
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Local TF-IDF demo: builds a local SparkSession, tokenizes three sample
 * sentences, hashes them into 20-feature term-frequency vectors, applies a
 * fitted IDF model, prints the result rows and stops the session.
 */
object MllibDemo extends App with LogSupport {
  // Warehouse directory lives under the current working directory.
  val warehouseLocation = System.getProperty("user.dir") + "/spark-warehouse"

  val spark = {
    val sessionBuilder = SparkSession
      .builder()
      .master("local[*]")
      .appName("RddConvertToDataFrame")
    sessionBuilder
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .getOrCreate()
  }
  val sc = spark.sparkContext

  val sentenceData = {
    val labelledSentences = Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )
    spark.createDataFrame(labelledSentences).toDF("label", "sentence")
  }

  val tokenizer = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("words")
  val wordsData = tokenizer.transform(sentenceData)

  val hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(20)
  val featurizedData = hashingTF.transform(wordsData)

  // The IDF estimator is fitted on the term-frequency vectors, then used to rescale them.
  val idf = new IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
  val idfModel = idf.fit(featurizedData)
  val rescaledData = idfModel.transform(featurizedData)

  for (row <- rescaledData.select("features", "label").take(3)) {
    println(row)
  }

  spark.stop()
}
示例4: TFIDFJob
//设置package包名称以及导入依赖的类
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.mllib.linalg.{Vector => LVector}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
/**
 * Mist ML job with two entry points: `train` fits a Tokenizer -> HashingTF -> IDF
 * pipeline and saves it, `serve` loads a saved pipeline and applies it to
 * sentences supplied by the caller.
 */
object TFIDFJob extends MLMistJob {

  /** Gets or creates a SparkSession configured from the job context. */
  def session: SparkSession = {
    val sessionBuilder = SparkSession.builder().appName(context.appName)
    sessionBuilder.config(context.getConf).getOrCreate()
  }

  /**
   * Fits the TF-IDF pipeline on three built-in sample sentences and writes
   * the fitted model to `savePath`, overwriting anything already there.
   *
   * @param savePath destination the fitted pipeline model is saved to
   * @return an empty map
   */
  def train(savePath: String): Map[String, Any] = {
    val samples = Seq(
      (0, "Provectus rocks!"),
      (0, "Machine learning for masses!"),
      (1, "BigData is a hot topick right now")
    )
    val trainingFrame = session.createDataFrame(samples).toDF("label", "sentence")

    val tokenizer = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(20)
    val idf = new IDF()
      .setInputCol("rawFeatures")
      .setOutputCol("features")

    val fitted = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, idf))
      .fit(trainingFrame)
    fitted.write.overwrite().save(savePath)

    Map[String, Any]()
  }

  /**
   * Loads the pipeline stored at `modelPath`, applies it to `sentences` and
   * returns each sentence together with its feature vector as an array.
   *
   * @param modelPath location of a previously saved pipeline model
   * @param sentences raw sentences to transform
   * @return a map with a single "result" entry holding one row map per sentence
   */
  def serve(modelPath: String, sentences: List[String]): Map[String, Any] = {
    import LocalPipelineModel._

    val loaded = PipelineLoader.load(modelPath)
    val input = LocalData(LocalDataColumn("sentence", sentences))
    val transformed = loaded.transform(input)

    val rows = transformed.select("sentence", "features").toMapList
    // Replace each vector with a plain array in the response.
    val response = rows.map { row =>
      row + ("features" -> row("features").asInstanceOf[LVector].toArray)
    }

    Map("result" -> response)
  }
}
示例5: ets
//设置package包名称以及导入依赖的类
package sparkml
/**
* Created by I311352 on 3/27/2017.
*/
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
/** Empty placeholder class; it declares no members. */
class ets {
}
/**
 * TF-IDF demo application: tokenizes three sample sentences, hashes the
 * tokens into 20-feature term-frequency vectors, rescales them with a fitted
 * IDF model, shows the result and then stops the Spark session.
 */
object tfidf extends App {
  val spark = SparkSession.builder().appName("TIDFExample").master("local[2]").getOrCreate()

  val sentenceData = spark.createDataFrame(Seq(
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
  )).toDF("label", "sentence")

  val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
  val wordsData = tokenizer.transform(sentenceData)

  val hashingTF = new HashingTF()
    .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
  val featurizedData = hashingTF.transform(wordsData)
  // alternatively, CountVectorizer can also be used to get term frequency vectors

  // IDF is an estimator: fit it on the term-frequency vectors, then rescale them.
  val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
  val idfModel = idf.fit(featurizedData)
  val rescaledData = idfModel.transform(featurizedData)

  rescaledData.select("label", "features").show()

  // Release the session's resources once the demo has finished.
  spark.stop()
}