本文整理汇总了 Scala 中 org.apache.spark.ml.feature.Tokenizer 类的典型用法代码示例。如果您正苦于以下问题：Scala Tokenizer 类的具体用法？Scala Tokenizer 怎么用？Scala Tokenizer 使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 Tokenizer 类的 10 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Scala 代码示例。
示例1: movies
//设置package包名称以及导入依赖的类
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Trains a Naive Bayes classifier on labelled movie reviews and writes the
 * predictions for the test reviews to a tab-separated CSV.
 *
 * The ML pipeline has three stages: Tokenizer -> HashingTF -> NaiveBayes.
 */
object movies {

  /** One training record: the raw review text and its label (0.0 for `neg`, 1.0 for `pos`). */
  case class Sentence(sentence: String, label: Double)

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("Movies Reviews")
      .config("spark.master", "local")
      .getOrCreate()

    try {
      // Training data: every line read from neg/ is labelled 0.0, every line from pos/ is labelled 1.0.
      val neg = spark.sparkContext.textFile("file:///data/train/neg/").repartition(4)
        .map(w => Sentence(w, 0.0))
      val pos = spark.sparkContext.textFile("file:///data/train/pos/").repartition(4)
        .map(w => Sentence(w, 1.0))

      // Test data: one record per file, keyed by the file name up to its first dot.
      val test = spark.sparkContext.wholeTextFiles("file:///data/test/").repartition(4)
        .map { case (file, sentence) => (file.split("/").last.split("\\.")(0), sentence) }

      val trainingDF = spark.createDataFrame(neg.union(pos))
      // A DataFrame built from tuples gets the default column names `_1`/`_2`. Name the columns
      // explicitly so the tokenizer finds its "sentence" input and `select("file", ...)` resolves.
      val testDF = spark.createDataFrame(test).toDF("file", "sentence")

      // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and Naive Bayes.
      val tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
      val hashingTF = new HashingTF()
        .setInputCol(tokenizer.getOutputCol)
        .setOutputCol("features")
      val nb = new NaiveBayes()
      val pipeline = new Pipeline()
        .setStages(Array(tokenizer, hashingTF, nb))

      // Fit the pipeline to the training documents.
      val model = pipeline.fit(trainingDF)

      // Score the test documents and write (file, prediction) from a single partition.
      model.transform(testDF).repartition(1)
        .select("file", "prediction")
        .write.format("csv")
        .option("header", "true")
        .option("delimiter", "\t")
        .save("/tmp/spark-prediction")
    } finally {
      // Release the session even when training or writing fails.
      spark.stop()
    }
  }
}
示例2: MLClassification
//设置package包名称以及导入依赖的类
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Mist ML job that trains a small text classifier (Tokenizer -> HashingTF ->
 * LogisticRegression), saves it under the path "regression", and serves
 * predictions from the saved pipeline through the local (non-Spark) model API.
 */
object MLClassification extends MLMistJob {

  /** Gets or creates a Spark session using the app name and configuration from the job context. */
  def session: SparkSession = {
    val builder = SparkSession.builder()
    builder
      .appName(context.appName)
      .config(context.getConf)
      .getOrCreate()
  }

  /**
   * Fits the pipeline on four hard-coded labelled documents and overwrites the
   * model stored at "regression".
   *
   * @return an empty result map
   */
  def train(): Map[String, Any] = {
    val labelledDocs = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )
    val trainingSet = session.createDataFrame(labelledDocs).toDF("id", "text", "label")

    val toWords = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val toFeatures = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(toWords.getOutputCol)
      .setOutputCol("features")
    val classifier = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)

    val fitted = new Pipeline()
      .setStages(Array(toWords, toFeatures, classifier))
      .fit(trainingSet)
    fitted.write.overwrite().save("regression")

    Map.empty[String, Any]
  }

  /**
   * Loads the pipeline saved by [[train]] and scores the given texts.
   *
   * @param text documents to classify
   * @return a map whose "result" entry holds the selected "text" and "prediction" columns as a list of maps
   */
  def serve(text: List[String]): Map[String, Any] = {
    import LocalPipelineModel._

    val loaded = PipelineLoader.load("regression")
    val input = LocalData(LocalDataColumn("text", text))
    val scored: LocalData = loaded.transform(input)

    Map("result" -> scored.select("text", "prediction").toMapList)
  }
}
示例3: LocalTokenizer
//设置package包名称以及导入依赖的类
package io.hydrosphere.spark_ml_serving.preprocessors
import io.hydrosphere.spark_ml_serving._
import org.apache.spark.ml.feature.Tokenizer
/**
 * Applies a Spark [[Tokenizer]] to in-memory `LocalData`.
 *
 * @param sparkTransformer the configured tokenizer whose input and output column names are used
 */
class LocalTokenizer(override val sparkTransformer: Tokenizer) extends LocalTransformer[Tokenizer] {

  /**
   * Tokenizes the transformer's input column and adds the result as the output column.
   *
   * @param localData data to transform
   * @return `localData` with the output column added, or `localData` unchanged when the
   *         input column is absent
   */
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        // The tokenizing function is obtained reflectively (presumably because
        // `createTransformFunc` is not directly accessible from this package).
        val method = classOf[Tokenizer].getMethod("createTransformFunc")
        // Resolve the function once instead of re-invoking the reflective call for every row.
        val tokenize = method.invoke(sparkTransformer).asInstanceOf[String => Seq[String]]
        val newData = column.data.map(s => tokenize(s.asInstanceOf[String]))
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}
/** Loader and implicit wrapper for running a Spark [[Tokenizer]] locally. */
object LocalTokenizer extends LocalModel[Tokenizer] {

  /**
   * Rebuilds a [[Tokenizer]] from saved metadata.
   *
   * @param metadata supplies the uid and the "inputCol" / "outputCol" entries of the param map
   * @param data     model data (not read here)
   * @return a tokenizer configured with the saved column names
   */
  override def load(metadata: Metadata, data: Map[String, Any]): Tokenizer = {
    val restored = new Tokenizer(metadata.uid)
    val inputCol = metadata.paramMap("inputCol").asInstanceOf[String]
    restored.setInputCol(inputCol)
    val outputCol = metadata.paramMap("outputCol").asInstanceOf[String]
    restored.setOutputCol(outputCol)
  }

  /** Wraps a [[Tokenizer]] in its local transformer. */
  override implicit def getTransformer(transformer: Tokenizer): LocalTransformer[Tokenizer] =
    new LocalTokenizer(transformer)
}
示例4: StopWordsRemoverExample
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover
/**
 * Tokenizes four sample sentences with a [[RegexTokenizer]] and then removes stop
 * words from the tokens with a [[StopWordsRemover]], printing the result.
 */
object StopWordsRemoverExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      // The app name was previously the copy-pasted "OneVsRestExample".
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      // With gaps = true the pattern describes the separators, so runs of non-word characters split tokens.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)
      val regexTokenized = regexTokenizer.transform(sentence)

      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered")
      val newDF = remover.transform(regexTokenized)

      // show(false) prints the columns without truncating long values.
      newDF.select("id", "filtered").show(false)
    } finally {
      // Release the local session even when a transformation fails.
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:41,代码来源:StopWordsRemoverExample.scala
示例5: TockenizerExample
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
/**
 * Compares the plain [[Tokenizer]] with a [[RegexTokenizer]] on four sample
 * sentences, printing the tokens and the token count for each.
 */
object TockenizerExample {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      // The app name was previously the copy-pasted "OneVsRestExample".
      .appName("TockenizerExample")
      .getOrCreate()

    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      // With gaps = true the pattern describes the separators, so runs of non-word characters split tokens.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      // UDF that returns the number of tokens in a "words" array.
      val countTokens = udf { (words: Seq[String]) => words.length }

      // Both tokenizers write to "words", but each is applied to the original frame separately.
      val tokenized = tokenizer.transform(sentence)
      tokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words")))
        .show(false)

      val regexTokenized = regexTokenizer.transform(sentence)
      regexTokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words")))
        .show(false)
    } finally {
      // Release the local session even when a transformation fails.
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:43,代码来源:TockenizerExample.scala
示例6: HashingTF
//设置package包名称以及导入依赖的类
package com.lhcg.ml
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SQLContext
/**
 * TF-IDF example: tokenizes three sentences, hashes the words into 20-dimensional
 * term-frequency vectors, rescales them with IDF, and prints the result.
 *
 * This object shares its simple name with the imported Spark `HashingTF` class;
 * `new HashingTF()` below refers to that class, as this object declares no type of that name.
 */
object HashingTF {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("HashingTF")
    //      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    try {
      // NOTE(review): on Spark 2.x, SparkSession is the recommended entry point; confirm the target version.
      val sqlContext = new SQLContext(sc)

      val sentenceData = sqlContext.createDataFrame(Seq(
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
      )).toDF("label", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      val wordsData = tokenizer.transform(sentenceData)

      val hashingTF = new HashingTF()
        .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
      val featurizedData = hashingTF.transform(wordsData)

      // IDF is an estimator: fit it on the term-frequency vectors, then rescale them.
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      rescaledData.select("features", "label").take(3).foreach(println)
    } finally {
      // The context was previously never stopped; release it even on failure.
      sc.stop()
    }
  }
}
示例7: MllibDemo
//设置package包名称以及导入依赖的类
package com.wallace.spark.sparkmllibdemo
import com.wallace.common.LogSupport
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * TF-IDF demo run as an `App`: tokenizes three sentences, hashes the words into
 * 20-dimensional term-frequency vectors, rescales them with IDF, prints the
 * first three rows, and stops the session.
 */
object MllibDemo extends App with LogSupport {

  // Warehouse directory placed under the current working directory.
  val warehouseLocation = s"${System.getProperty("user.dir")}/spark-warehouse"

  val spark = SparkSession
    .builder()
    .master("local[*]")
    .appName("RddConvertToDataFrame")
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .getOrCreate()
  val sc = spark.sparkContext

  // Input rows as (label, sentence).
  val sentenceData = {
    val rows = Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )
    spark.createDataFrame(rows).toDF("label", "sentence")
  }

  // sentence -> words
  val tokenizer = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("words")
  val wordsData = tokenizer.transform(sentenceData)

  // words -> rawFeatures (hashed term frequencies, 20 buckets)
  val hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(20)
  val featurizedData = hashingTF.transform(wordsData)

  // rawFeatures -> features (IDF-rescaled)
  val idf = new IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
  val idfModel = idf.fit(featurizedData)
  val rescaledData = idfModel.transform(featurizedData)

  for (row <- rescaledData.select("features", "label").take(3)) println(row)

  spark.stop()
}
示例8: LocalTokenizer
//设置package包名称以及导入依赖的类
package io.hydrosphere.mist.api.ml.preprocessors
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.feature.Tokenizer
/**
 * Applies a Spark [[Tokenizer]] to in-memory `LocalData`.
 *
 * @param sparkTransformer the configured tokenizer whose input and output column names are used
 */
class LocalTokenizer(override val sparkTransformer: Tokenizer) extends LocalTransformer[Tokenizer] {

  /**
   * Tokenizes the transformer's input column and adds the result as the output column.
   *
   * @param localData data to transform
   * @return `localData` with the output column added, or `localData` unchanged when the
   *         input column is absent
   */
  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        // The tokenizing function is obtained reflectively (presumably because
        // `createTransformFunc` is not directly accessible from this package).
        val method = classOf[Tokenizer].getMethod("createTransformFunc")
        // Resolve the function once instead of re-invoking the reflective call for every row.
        val tokenize = method.invoke(sparkTransformer).asInstanceOf[String => Seq[String]]
        val newData = column.data.map(s => tokenize(s.asInstanceOf[String]))
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}
/** Loader and implicit wrapper for running a Spark [[Tokenizer]] locally. */
object LocalTokenizer extends LocalModel[Tokenizer] {

  /**
   * Rebuilds a [[Tokenizer]] from saved metadata.
   *
   * @param metadata supplies the uid and the "inputCol" / "outputCol" entries of the param map
   * @param data     model data (not read here)
   * @return a tokenizer configured with the saved column names
   */
  override def load(metadata: Metadata, data: Map[String, Any]): Tokenizer = {
    val restored = new Tokenizer(metadata.uid)
    val inputCol = metadata.paramMap("inputCol").asInstanceOf[String]
    restored.setInputCol(inputCol)
    val outputCol = metadata.paramMap("outputCol").asInstanceOf[String]
    restored.setOutputCol(outputCol)
  }

  /** Wraps a [[Tokenizer]] in its local transformer. */
  override implicit def getTransformer(transformer: Tokenizer): LocalTransformer[Tokenizer] =
    new LocalTokenizer(transformer)
}
示例9: TFIDFJob
//设置package包名称以及导入依赖的类
import io.hydrosphere.mist.api._
import io.hydrosphere.mist.api.ml._
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.mllib.linalg.{Vector => LVector}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
/**
 * Mist ML job that fits a TF-IDF pipeline (Tokenizer -> HashingTF -> IDF), saves
 * it to a caller-supplied path, and serves feature vectors from the saved
 * pipeline through the local (non-Spark) model API.
 */
object TFIDFJob extends MLMistJob {

  /** Gets or creates a Spark session using the app name and configuration from the job context. */
  def session: SparkSession = {
    val builder = SparkSession.builder()
    builder
      .appName(context.appName)
      .config(context.getConf)
      .getOrCreate()
  }

  /**
   * Fits the pipeline on three hard-coded sentences and overwrites the model at `savePath`.
   *
   * @param savePath where the fitted pipeline model is written
   * @return an empty result map
   */
  def train(savePath: String): Map[String, Any] = {
    val samples = Seq(
      (0, "Provectus rocks!"),
      (0, "Machine learning for masses!"),
      (1, "BigData is a hot topick right now")
    )
    val corpus = session.createDataFrame(samples).toDF("label", "sentence")

    val toWords = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
    val toTermFreq = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(20)
    val rescale = new IDF()
      .setInputCol("rawFeatures")
      .setOutputCol("features")

    val fitted = new Pipeline()
      .setStages(Array(toWords, toTermFreq, rescale))
      .fit(corpus)
    fitted.write.overwrite().save(savePath)

    Map.empty[String, Any]
  }

  /**
   * Loads the pipeline from `modelPath` and computes features for the given sentences.
   *
   * @param modelPath path of a saved pipeline model
   * @param sentences sentences to transform
   * @return a map whose "result" entry lists, per sentence, the "sentence" and its
   *         "features" converted to an array
   */
  def serve(modelPath: String, sentences: List[String]): Map[String, Any] = {
    import LocalPipelineModel._

    val loaded = PipelineLoader.load(modelPath)
    val input = LocalData(LocalDataColumn("sentence", sentences))
    val transformed = loaded.transform(input)

    // Replace each row's vector with its array form.
    val rows = transformed.select("sentence", "features").toMapList.map { row =>
      val asArray = row("features").asInstanceOf[LVector].toArray
      row + ("features" -> asArray)
    }

    Map("result" -> rows)
  }
}
示例10: ets
//设置package包名称以及导入依赖的类
package sparkml
/**
* Created by I311352 on 3/27/2017.
*/
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
/** Empty placeholder class; it declares no members. */
class ets
/**
 * TF-IDF example run as an `App`: tokenizes three labelled sentences, hashes the
 * words into 20-dimensional term-frequency vectors, rescales them with IDF,
 * shows the result, and stops the session.
 */
object tfidf extends App {
  val spark = SparkSession.builder().appName("TIDFExample").master("local[2]").getOrCreate()

  val sentenceData = spark.createDataFrame(Seq(
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
  )).toDF("label", "sentence")

  val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
  val wordsData = tokenizer.transform(sentenceData)

  val hashingTF = new HashingTF()
    .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
  val featurizedData = hashingTF.transform(wordsData)
  // alternatively, CountVectorizer can also be used to get term frequency vectors

  // IDF is an estimator: fit it on the term-frequency vectors, then rescale them.
  val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
  val idfModel = idf.fit(featurizedData)
  val rescaledData = idfModel.transform(featurizedData)
  rescaledData.select("label", "features").show()

  // The session was previously left running; stop it once the output has been shown.
  spark.stop()
}