Scala HashingTF类代码示例

本文整理汇总了Scala中org.apache.spark.mllib.feature.HashingTF类的典型用法代码示例。如果您正苦于以下问题：Scala HashingTF类的具体用法？Scala HashingTF怎么用？Scala HashingTF使用的例子？那么, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了HashingTF类的5个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: Classifier

//设置package包名称以及导入依赖的类
package edu.neu.coe.scala.spark.spam

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext


object Classifier extends App {
  
  val conf = new SparkConf().setAppName("spam")
  val sc = new SparkContext(conf)
  val spam = sc.textFile("spam.txt")
  val norm = sc.textFile("normal.txt")

  val tf = new HashingTF(10000)
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normFeatures = norm.map(email => tf.transform(email.split(" ")))
  
  val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
  val negExamples = normFeatures.map(f => LabeledPoint(0, f))
  val trainingData = posExamples.union(negExamples)
  trainingData.cache()
  
  val model = new LogisticRegressionWithSGD().run(trainingData)
  
  val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
  val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))
  
  println(s"Prediction for positive test example: ${model.predict(posTest)}")
  println(s"Prediction for negative test example: ${model.predict(negTest)}")
}

开发者ID:menezesl，项目名称:Scala-repo，代码行数:34，代码来源:Classifier.scala

示例2: SparkSetup

//设置package包名称以及导入依赖的类
package com.databricks.apps

package twitterClassifier {
  import org.apache.spark.SparkContext
  import org.apache.spark.sql.SparkSession

  object SparkSetup {
    val spark = SparkSession
      .builder
      .appName(getClass.getSimpleName.replace("$", ""))
      .getOrCreate()

    val sqlContext = spark.sqlContext

    val sc: SparkContext = spark.sparkContext
    // Suppress "WARN BlockManager: Block input-0-1478266015800 replicated to only 0 peer(s) instead of 1 peers" messages
    sc.setLogLevel("ERROR")
  }
}

package object twitterClassifier {
  import org.apache.spark.mllib.linalg.Vector
  import org.apache.spark.mllib.feature.HashingTF
  import twitter4j.auth.OAuthAuthorization
  import twitter4j.conf.ConfigurationBuilder

  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  def maybeTwitterAuth: Some[OAuthAuthorization] = Some(new OAuthAuthorization(new ConfigurationBuilder().build))

  
  def featurize(s: String): Vector = tf.transform(s.sliding(2).toSeq)
}

开发者ID:krish121，项目名称:Spark-reference-applications，代码行数:35，代码来源:package.scala

示例3: TfIdfSample

//设置package包名称以及导入依赖的类
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF

object TfIdfSample{
  def main(args: Array[String]) {
    //TODO replace with path specific to your machine
    val file = "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6//README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)
    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for(tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()
    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for(tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}

开发者ID:PacktPublishing，项目名称:Machine-Learning-with-Spark-Second-Edition，代码行数:30，代码来源:TfIdfSample.scala

示例4: TfIdfSample

//设置package包名称以及导入依赖的类
package org.sparksamples.featureext

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
import org.sparksamples.Util

object TfIdfSample{
  def main(args: Array[String]) {
    //TODO replace with path specific to your machine
    val file = Util.SPARK_HOME + "/README.md"
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
    print("Documents Size:" + documents.count)
    val hashingTF = new HashingTF()
    val tf = hashingTF.transform(documents)
    for(tf_ <- tf) {
      println(s"$tf_")
    }
    tf.cache()
    val idf = new IDF().fit(tf)
    val tfidf = idf.transform(tf)
    println("tfidf size : " + tfidf.count)
    for(tfidf_ <- tfidf) {
      println(s"$tfidf_")
    }
  }
}

开发者ID:PacktPublishing，项目名称:Machine-Learning-with-Spark-Second-Edition，代码行数:31，代码来源:TfIdfSample.scala

示例5: EmailSpam

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

object EmailSpam extends App {
  val conf = new SparkConf().setAppName("email-spam").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val spam = sc.textFile("./enron1/spam/0052.2003-12-20.GP.spam.txt", 4)
  val normal = sc.textFile("./enron1/ham/0022.1999-12-16.farmer.ham.txt", 4)
  val tf = new HashingTF(numFeatures = 10000)
  val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
  val normalFeatures = normal.map(email => tf.transform(email.split(" ")))
  val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features))
  val negativeExamples = normalFeatures.map(features => LabeledPoint(0, features))
  val trainingData = positiveExamples.union(negativeExamples)
  trainingData.cache()
  val model = new LogisticRegressionWithSGD().run(trainingData)
  //Test on a positive example (spam) and a negative one (normal).
  val posTest = tf.transform(
    "insurance plan which change your life ...".split(" "))
  val negTest = tf.transform(
    "hi sorry yaar i forget tell you i cant come today".split(" "))
  println("Prediction for positive test example: " + model.predict(posTest))
  println("Prediction for negative test example: " + model.predict(negTest))

}

开发者ID:phalodi，项目名称:Email_Spam_Spark，代码行数:29，代码来源:EmailSpam.scala

注：本文中的org.apache.spark.mllib.feature.HashingTF类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。