本文整理汇总了 Scala 中 org.apache.spark.mllib.feature.HashingTF 类的典型用法代码示例。如果您正苦于以下问题:Scala HashingTF 类的具体用法?Scala HashingTF 怎么用?Scala HashingTF 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 HashingTF 类的 5 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Scala 代码示例。
示例1: Classifier
//设置package包名称以及导入依赖的类
package edu.neu.coe.scala.spark.spam
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Spam classifier example: hashes the words of each e-mail line into a
 * term-frequency vector, trains a logistic-regression model on spam vs. normal
 * mail, and prints the prediction for one spam-like and one normal sample.
 *
 * The driver is written with an explicit `main` method rather than
 * `extends App`. With `App`, the values captured by the RDD closures below are
 * fields of the object that are populated through delayed initialisation, and
 * the Spark documentation warns that such applications may not work correctly.
 */
object Classifier {

  /**
   * Runs the training and prediction pipeline.
   *
   * The Spark master is not set in code, so it has to be supplied by the
   * launcher (for example `spark-submit --master ...`).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("spam")
    val sc = new SparkContext(conf)
    try {
      // One e-mail per line in each input file.
      val spam = sc.textFile("spam.txt")
      val norm = sc.textFile("normal.txt")

      // Map every word into one of 10000 hashed feature buckets.
      val tf = new HashingTF(10000)
      val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
      val normFeatures = norm.map(email => tf.transform(email.split(" ")))

      // Label 1 = spam, label 0 = normal mail.
      val posExamples = spamFeatures.map(f => LabeledPoint(1, f))
      val negExamples = normFeatures.map(f => LabeledPoint(0, f))
      val trainingData = posExamples.union(negExamples)
      // Cached before training, which reuses the same RDD.
      trainingData.cache()

      // NOTE(review): LogisticRegressionWithSGD is deprecated in newer Spark
      // releases; confirm the targeted Spark version before upgrading.
      val model = new LogisticRegressionWithSGD().run(trainingData)

      val posTest = tf.transform("Subject: Cheap Stuff From: <omg.fu> O M G GET cheap stuff by sending money to Robin Hillyard".split(" "))
      // NOTE(review): the address inside this sample looks like an
      // e-mail-obfuscation placeholder; left untouched - confirm against the original project.
      val negTest = tf.transform("Subject: Spark From: Robin Hillyard<[email protected]> Hi Adam, I started studying Spark the other day".split(" "))
      println(s"Prediction for positive test example: ${model.predict(posTest)}")
      println(s"Prediction for negative test example: ${model.predict(negTest)}")
    } finally {
      // Always release the context, even when training fails.
      sc.stop()
    }
  }
}
示例2: SparkSetup
//设置package包名称以及导入依赖的类
package com.databricks.apps
package twitterClassifier {
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
/**
 * Holds the Spark handles used by the Twitter classifier: a `SparkSession`,
 * its `SQLContext` and its `SparkContext`. Initialising this object also
 * lowers the context's log level to ERROR.
 */
object SparkSetup {

  /** Session whose application name is this object's simple class name with every "$" removed. */
  val spark: SparkSession = {
    val applicationName = getClass.getSimpleName.replace("$", "")
    SparkSession.builder.appName(applicationName).getOrCreate()
  }

  /** SQL context obtained from [[spark]]. */
  val sqlContext = spark.sqlContext

  /** Spark context obtained from [[spark]]. */
  val sc: SparkContext = spark.sparkContext

  // Suppress "WARN BlockManager: Block input-0-1478266015800 replicated to only 0 peer(s) instead of 1 peers" messages
  sc.setLogLevel("ERROR")
}
}
/**
 * Package-level helpers for the Twitter classifier: the shared hashing
 * term-frequency transformer, Twitter authorisation, and tweet featurisation.
 */
package object twitterClassifier {
  import org.apache.spark.mllib.feature.HashingTF
  import org.apache.spark.mllib.linalg.Vector
  import twitter4j.auth.OAuthAuthorization
  import twitter4j.conf.ConfigurationBuilder

  /** Number of feature buckets passed to the hashing transformer. */
  val numFeatures: Int = 1000

  /** Transformer shared by every call to [[featurize]]. */
  val tf: HashingTF = new HashingTF(numFeatures)

  /**
   * Builds an OAuth authorisation from a freshly built twitter4j configuration.
   *
   * NOTE(review): no credentials are set here, so they presumably come from
   * twitter4j's own configuration sources - confirm.
   *
   * @return the authorisation, always wrapped in `Some`
   */
  def maybeTwitterAuth: Some[OAuthAuthorization] = {
    val configuration = new ConfigurationBuilder().build
    Some(new OAuthAuthorization(configuration))
  }

  /**
   * Turns a string into a hashed term-frequency vector.
   *
   * @param s the text to featurise
   * @return the vector produced by [[tf]] from the overlapping two-character windows of `s`
   */
  def featurize(s: String): Vector = {
    val characterPairs = s.sliding(2).toSeq
    tf.transform(characterPairs)
  }
}
示例3: TfIdfSample
//设置package包名称以及导入依赖的类
package org.sparksamples.featureext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
/**
 * TF-IDF example: splits every line of a text file into words, hashes the words
 * into term-frequency vectors, fits an IDF model on them and prints both the
 * raw TF vectors and the TF-IDF weighted vectors.
 */
object TfIdfSample {

  /**
   * Runs the TF-IDF pipeline on a local Spark master.
   *
   * @param args optional; `args(0)` is the path of the text file to process.
   *             When absent, the original hard-coded README path is used.
   */
  def main(args: Array[String]): Unit = {
    // TODO replace the fallback with a path specific to your machine
    val file = args.headOption.getOrElse("/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6//README.md")
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    try {
      // One document per input line, tokenised on single spaces.
      val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
      // println (not print) so the count is not fused with the first vector line.
      println("Documents Size:" + documents.count())

      val hashingTF = new HashingTF()
      val tf = hashingTF.transform(documents)
      // Mark for caching before the first action: tf is traversed by the print
      // below and again by IDF.fit and IDF.transform.
      tf.cache()
      tf.foreach(vector => println(s"$vector"))

      val idf = new IDF().fit(tf)
      val tfidf = idf.transform(tf)
      println("tfidf size : " + tfidf.count())
      tfidf.foreach(vector => println(s"$vector"))
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:30,代码来源:TfIdfSample.scala
示例4: TfIdfSample
//设置package包名称以及导入依赖的类
package org.sparksamples.featureext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.feature.IDF
import org.sparksamples.Util
/**
 * TF-IDF example: splits every line of a text file into words, hashes the words
 * into term-frequency vectors, fits an IDF model on them and prints both the
 * raw TF vectors and the TF-IDF weighted vectors.
 */
object TfIdfSample {

  /**
   * Runs the TF-IDF pipeline on a local Spark master.
   *
   * @param args optional; `args(0)` is the path of the text file to process.
   *             When absent, `README.md` under `Util.SPARK_HOME` is used.
   */
  def main(args: Array[String]): Unit = {
    // Fallback input: the README under the project-configured Spark home.
    val file = args.headOption.getOrElse(Util.SPARK_HOME + "/README.md")
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    try {
      // One document per input line, tokenised on single spaces.
      val documents: RDD[Seq[String]] = sc.textFile(file).map(_.split(" ").toSeq)
      // println (not print) so the count is not fused with the first vector line.
      println("Documents Size:" + documents.count())

      val hashingTF = new HashingTF()
      val tf = hashingTF.transform(documents)
      // Mark for caching before the first action: tf is traversed by the print
      // below and again by IDF.fit and IDF.transform.
      tf.cache()
      tf.foreach(vector => println(s"$vector"))

      val idf = new IDF().fit(tf)
      val tfidf = idf.transform(tf)
      println("tfidf size : " + tfidf.count())
      tfidf.foreach(vector => println(s"$vector"))
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:31,代码来源:TfIdfSample.scala
示例5: EmailSpam
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
/**
 * E-mail spam example: hashes the words of one spam and one ham message file
 * into term-frequency vectors, trains a logistic-regression model on them and
 * prints the prediction for one spam-like and one normal sentence.
 *
 * The driver is written with an explicit `main` method rather than
 * `extends App`. With `App`, the values captured by the RDD closures below are
 * fields of the object that are populated through delayed initialisation, and
 * the Spark documentation warns that such applications may not work correctly.
 */
object EmailSpam {

  /**
   * Runs the training and prediction pipeline on a local master using all cores.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("email-spam").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      // Each file is read line by line, with a minimum of 4 partitions.
      val spam = sc.textFile("./enron1/spam/0052.2003-12-20.GP.spam.txt", 4)
      val normal = sc.textFile("./enron1/ham/0022.1999-12-16.farmer.ham.txt", 4)

      // Map every word into one of 10000 hashed feature buckets.
      val tf = new HashingTF(numFeatures = 10000)
      val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
      val normalFeatures = normal.map(email => tf.transform(email.split(" ")))

      // Label 1 = spam, label 0 = normal mail.
      val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features))
      val negativeExamples = normalFeatures.map(features => LabeledPoint(0, features))
      val trainingData = positiveExamples.union(negativeExamples)
      // Cached before training, which reuses the same RDD.
      trainingData.cache()

      // NOTE(review): LogisticRegressionWithSGD is deprecated in newer Spark
      // releases; confirm the targeted Spark version before upgrading.
      val model = new LogisticRegressionWithSGD().run(trainingData)

      // Test on a positive example (spam) and a negative one (normal).
      val posTest = tf.transform(
        "insurance plan which change your life ...".split(" "))
      val negTest = tf.transform(
        "hi sorry yaar i forget tell you i cant come today".split(" "))
      println("Prediction for positive test example: " + model.predict(posTest))
      println("Prediction for negative test example: " + model.predict(negTest))
    } finally {
      // Always release the context, even when training fails.
      sc.stop()
    }
  }
}