Scala RegexTokenizer类代码示例

本文整理汇总了Scala中org.apache.spark.ml.feature.RegexTokenizer类的典型用法代码示例。如果您正苦于以下问题：Scala RegexTokenizer类的具体用法？Scala RegexTokenizer怎么用？Scala RegexTokenizer使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了RegexTokenizer类的3个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: CooccurrenceTokenizer

//设置package包名称以及导入依赖的类
package com.indix.ml2npy.text

import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover}


/**
 * A [[RegexTokenizer]] that emits co-occurrence pairs instead of single tokens.
 *
 * For each input string the parent tokenizer is applied, duplicate tokens and
 * English stop words are dropped, and every unordered pair of the remaining
 * tokens is emitted once as `"<first>_<second>"`.
 *
 * The two tokens of a pair are always written in lexicographic order, so the
 * same pair yields the same key regardless of where the tokens appeared in
 * the input.
 */
class CooccurrenceTokenizer extends RegexTokenizer {
  /**
   * Builds the per-row transform function.
   *
   * @return a function mapping an input string to the co-occurrence keys of
   *         its distinct, non-stop-word tokens; empty when fewer than two
   *         such tokens remain
   */
  protected override def createTransformFunc: (String) => Seq[String] = {
    // Loop-invariant work: load the stop-word list and build the parent's
    // tokenizing function once per transform function, not once per row.
    val stopWordSet = StopWordsRemover.loadDefaultStopWords("english").toSet
    val tokenize = super.createTransformFunc

    input => {
      // Sorting the de-duplicated tokens makes the pair key canonical:
      // for i < j, terms(i) always precedes terms(j) lexicographically.
      val terms = tokenize(input)
        .distinct
        .filterNot(stopWordSet.contains)
        .sorted
        .toIndexedSeq

      for {
        i <- terms.indices
        j <- (i + 1) until terms.length
      } yield s"${terms(i)}_${terms(j)}"
    }
  }
}

开发者ID:indix，项目名称:ml2npy，代码行数:21，代码来源:CooccurrenceTokenizer.scala

示例2: StopWordsRemoverExample

//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

/**
 * Example program: tokenizes a few sample sentences with a [[RegexTokenizer]]
 * and then filters the resulting tokens through a [[StopWordsRemover]],
 * printing the `id` and `filtered` columns.
 */
object StopWordsRemoverExample {
  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // Always release the session, even if a transformation fails.
    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      // With gaps = true the pattern describes the separators between tokens,
      // i.e. the sentence is split on runs of non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      val regexTokenized = regexTokenizer.transform(sentence)

      // The remover is left at its default configuration apart from the columns.
      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered")

      val newDF = remover.transform(regexTokenized)
      newDF.select("id", "filtered").show(false)
    } finally {
      spark.stop()
    }
  }
}

开发者ID:PacktPublishing，项目名称:Scala-and-Spark-for-Big-Data-Analytics，代码行数:41，代码来源:StopWordsRemoverExample.scala

示例3: TockenizerExample

//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

/**
 * Example program: runs the same sample sentences through a plain
 * [[Tokenizer]] and a [[RegexTokenizer]] and prints, for each, the sentence,
 * its tokens and the number of tokens.
 */
object TockenizerExample {
  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("TockenizerExample")
      .getOrCreate()

    // Always release the session, even if a transformation fails.
    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

      // With gaps = true the pattern describes the separators between tokens,
      // i.e. the sentence is split on runs of non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      val countTokens = udf { (words: Seq[String]) => words.length }

      // Prints sentence, words and the token count for a tokenized frame.
      def showWithTokenCount(tokenized: org.apache.spark.sql.DataFrame): Unit =
        tokenized.select("sentence", "words")
          .withColumn("tokens", countTokens(col("words")))
          .show(false)

      showWithTokenCount(tokenizer.transform(sentence))
      showWithTokenCount(regexTokenizer.transform(sentence))
    } finally {
      spark.stop()
    }
  }
}

开发者ID:PacktPublishing，项目名称:Scala-and-Spark-for-Big-Data-Analytics，代码行数:43，代码来源:TockenizerExample.scala

注：本文中的org.apache.spark.ml.feature.RegexTokenizer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。