本文整理汇总了Scala中org.apache.spark.ml.feature.RegexTokenizer类的典型用法代码示例。如果您正苦于以下问题:Scala RegexTokenizer类的具体用法?Scala RegexTokenizer怎么用?Scala RegexTokenizer使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了RegexTokenizer类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: CooccurrenceTokenizer
//设置package包名称以及导入依赖的类
package com.indix.ml2npy.text
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover}
/**
 * A [[RegexTokenizer]] that emits token co-occurrence pairs instead of single tokens.
 *
 * For each input string the parent tokenizer's output is de-duplicated, the default
 * English stop words are removed, and every unordered pair of the remaining tokens is
 * emitted exactly once as `"<a>_<b>"`, where `a` precedes `b` in the de-duplicated
 * token array.
 */
class CooccurrenceTokenizer extends RegexTokenizer {

  /**
   * Builds the per-row transform function.
   *
   * @return a function mapping an input string to its co-occurrence pair strings
   */
  protected override def createTransformFunc: (String) => Seq[String] = {
    // Resolved once per transform function instead of once per input row:
    // loading the stop-word list and building the parent function are row-invariant.
    val stopWordSet: Set[String] = StopWordsRemover.loadDefaultStopWords("english").toSet
    val tokenize: String => Seq[String] = super.createTransformFunc

    (input: String) => {
      // Going through a Set drops duplicate tokens so each pair is produced once.
      val tokens: Array[String] = tokenize(input).toSet.toArray
      val filteredTokens: Array[String] = tokens.filterNot(stopWordSet.contains)

      // Enumerate index pairs with i < j directly; this yields the same pairs, in the
      // same order, as filtering the full zipWithIndex cross product on j > i.
      for {
        i <- filteredTokens.indices
        j <- (i + 1) until filteredTokens.length
      } yield s"${filteredTokens(i)}_${filteredTokens(j)}"
    }
  }
}
示例2: StopWordsRemoverExample
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover
/**
 * Example program: splits a few sample sentences into words with a [[RegexTokenizer]],
 * removes stop words with a [[StopWordsRemover]], and prints the remaining tokens.
 */
object StopWordsRemoverExample {

  /**
   * Runs the example on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the warehouse directory is a hard-coded, machine-specific path.
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("StopWordsRemoverExample") // was a copy-pasted "OneVsRestExample"
      .getOrCreate()

    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      // With gaps = true the pattern describes the separators, so the text is split
      // on runs of non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)
      val regexTokenized = regexTokenizer.transform(sentence)

      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered")
      val newDF = remover.transform(regexTokenized)

      newDF.select("id", "filtered").show(false)
    } finally {
      // Always release the session, even if a transformation fails.
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:41,代码来源:StopWordsRemoverExample.scala
示例3: TockenizerExample
//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
/**
 * Example program: tokenizes a few sample sentences with both the plain [[Tokenizer]]
 * and a [[RegexTokenizer]], and prints each sentence with its tokens and token count.
 */
object TockenizerExample {

  /**
   * Runs the example on a local Spark session.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the warehouse directory is a hard-coded, machine-specific path.
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("TockenizerExample") // was a copy-pasted "OneVsRestExample"
      .getOrCreate()

    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

      // With gaps = true the pattern describes the separators, so the text is split
      // on runs of non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      // Number of tokens in a "words" array column.
      val countTokens = udf { (words: Seq[String]) => words.length }

      val tokenized = tokenizer.transform(sentence)
      tokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words")))
        .show(false)

      val regexTokenized = regexTokenizer.transform(sentence)
      regexTokenized.select("sentence", "words")
        .withColumn("tokens", countTokens(col("words")))
        .show(false)
    } finally {
      // Always release the session, even if a transformation fails.
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:43,代码来源:TockenizerExample.scala