This article collects typical usage examples of the Python method pyspark.mllib.feature.HashingTF. If you have been wondering what feature.HashingTF does, how to call it, or want to see it used in real code, the curated samples below should help. You can also read further about the module it belongs to, pyspark.mllib.feature.
The sections below show 9 code examples of feature.HashingTF, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
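Before the examples, here is a minimal sketch of the typical HashingTF workflow. It is not taken from the examples below; the sample documents and the numFeatures value are illustrative, and an existing SparkContext `sc` is assumed.

from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([
    "spark hashing tf example".split(" "),
    "another short document".split(" "),
])
hashingTF = HashingTF(numFeatures=1 << 18)   # number of hash buckets
tf = hashingTF.transform(docs)               # RDD of sparse term-frequency vectors
tf.cache()                                   # IDF fits on and then transforms the same RDD
tfidf = IDF(minDocFreq=1).fit(tf).transform(tf)
print(tfidf.take(2))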
Example 1: parseTextRDDToIndex
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint

def parseTextRDDToIndex(self, data, label=True):
    # Lines look like "<label> term term ..." when label=True, otherwise just terms.
    if label:
        labels = data.map(lambda line: float(line.split(" ", 1)[0]))
        documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda line: line.split(" "))
    tf = HashingTF().transform(documents)
    tf.cache()  # IDF.fit and transform both traverse the TF vectors
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    index = idfIgnore.transform(tf)
    if label:
        return labels.zip(index).map(lambda line: LabeledPoint(line[0], line[1]))
    else:
        return index
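A hypothetical driver for the method above; the `parser` instance, the SparkContext `sc`, and the file path are assumptions for illustration only.

data = sc.textFile("labeled_docs.txt")            # each line: "<label> term term ..."
training = parser.parseTextRDDToIndex(data, label=True)
print(training.first())                           # a LabeledPoint wrapping a TF-IDF vector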
Example 2: produce_tfidf
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def produce_tfidf(x):
    # x is an RDD of token lists; returns an RDD of TF-IDF vectors.
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

# Load in reviews
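A minimal hypothetical call for the function above, assuming a SparkContext `sc`; the review file path and the whitespace tokenization are illustrative only.

reviews = sc.textFile("reviews.txt").map(lambda line: line.split(" "))
tfidf = produce_tfidf(reviews)
print(tfidf.first())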
Example 3: textToIndex
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF

def textToIndex(self, text):
    # Hash a single whitespace-separated string into a term-frequency vector.
    return HashingTF().transform(text.split(" "))
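Hypothetical usage of the helper above; the `indexer` instance is an assumption.

vec = indexer.textToIndex("spark hashing tf example")
print(vec)   # a SparseVector of hashed term frequencies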
Example 4: test_binary_term_freqs
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import Vectors

def test_binary_term_freqs(self):
    # With binary=True every present term maps to 1.0 regardless of how often it occurs.
    hashingTF = HashingTF(100).setBinary(True)
    doc = "a a b c c c".split(" ")
    n = hashingTF.numFeatures
    output = hashingTF.transform(doc).toArray()
    expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                  hashingTF.indexOf("b"): 1.0,
                                  hashingTF.indexOf("c"): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example 5: avg_spam
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example actually uses the DataFrame-based HashingTF from pyspark.ml.feature.
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import functions as F

def avg_spam(juez, tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(tweets)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""
    predictionsAndLabelsDF = juez.transform(featurizedData).groupBy("user_id").agg(
        F.avg('predicted_label').alias("avg_spam"))
    return predictionsAndLabelsDF
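A hypothetical call: `juez` is a fitted spam model (for example the one returned by entrenar_spam in Example 6), `sql_context` is an existing SQLContext, and `tweets` is a DataFrame with "user_id" and "text" columns; the sample rows are made up.

tweets = sql_context.createDataFrame(
    [("u1", "buy cheap followers now"), ("u1", "lunch with friends")],
    ["user_id", "text"])
scores = avg_spam(juez, tweets)
scores.show()   # one row per user_id with the mean predicted_label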
Example 6: entrenar_spam
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example also uses the DataFrame-based HashingTF from pyspark.ml.feature.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F

def entrenar_spam(sc, sql_context, dir_spam, dir_no_spam, num_trees=20, max_depth=8):
    # Build a labeled training set: spam tweets get label 1.0, non-spam 0.0.
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)
    spam = sql_context.read.json(input_spam).select("text").withColumn("label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn("label", F.lit(0.0))
    training_data = spam.unionAll(no_spam)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(training_data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""
    seed = 1800009193  # originally the Python 2 long literal 1800009193L
    (split_20_df, split_80_df) = featurizedData.randomSplit([20.0, 80.0], seed)
    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()
    rf = RandomForestClassifier().setLabelCol("label") \
        .setPredictionCol("predicted_label") \
        .setFeaturesCol("rawFeatures") \
        .setSeed(100088121) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)
    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])
    reg_eval = MulticlassClassificationEvaluator(predictionCol="predicted_label", labelCol="label",
                                                 metricName="accuracy")
    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval, numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    modelo = crossval.fit(training_set_df).bestModel
    predictions_and_labels_df = modelo.transform(test_set_df)
    accuracy = reg_eval.evaluate(predictions_and_labels_df)
    return modelo, accuracy
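A hypothetical invocation, assuming an existing SparkContext `sc`, an SQLContext `sql_context`, and JSON tweet dumps at the two illustrative paths.

modelo, accuracy = entrenar_spam(sc, sql_context, "data/spam/", "data/no_spam/",
                                 num_trees=20, max_depth=8)
print("held-out accuracy: %.3f" % accuracy)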
Example 7: get_tfidf_features
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def get_tfidf_features(txt_rdd):
    # txt_rdd is an RDD of token lists.
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
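A small hypothetical call, assuming a SparkContext `sc`; the sample documents are made up.

txt_rdd = sc.parallelize([["hello", "world"], ["hello", "spark"]])
features = get_tfidf_features(txt_rdd)
print(features.collect())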
Example 8: extractKeywords_Train
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def extractKeywords_Train(self):
    # Skip the first field of each line (e.g. a label or id) and hash the remaining terms.
    documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    tfidfIgnore.saveAsTextFile("AAA")
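A hypothetical driver, assuming the defining class stores a SparkContext as self.sc and the training file path as self.trainingfile; the class name and constructor are inventions for illustration.

extractor = KeywordExtractor(sc, "train.txt")   # class name and signature are assumptions
extractor.extractKeywords_Train()               # writes the TF-IDF vectors to the "AAA" directory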
Example 9: getRecommendation
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example builds its TF-IDF matrix with scikit-learn rather than Spark.
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def getRecommendation(self, user_id):
    user_news, candidates_news, candidates_newsid = self.getUserReadNews(user_id)
    all_news = user_news + candidates_news
    # Convert the documents into a term-count matrix.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_news)
    # The vocabulary itself is not needed here:
    # word = vectorizer.get_feature_names()
    transformer = TfidfTransformer()
    # Turn the count matrix X into TF-IDF weights; tfidf[i][j] is the weight of term j in document i.
    tfidf = transformer.fit_transform(X).toarray()
    # print tfidf.toarray()
    recommend_num = 10
    # Integer division: the value is used as a slice bound below (the original Python 2 code used "/").
    recommend_per_news = recommend_num // len(user_news)
    recommend_list = []
    user_news_len = len(user_news)
    candidates_news_len = len(candidates_news)
    for i in range(user_news_len):
        news_candidate_sim = []
        for j in range(candidates_news_len):
            # Cosine similarity between a read news item and a candidate item.
            sim = 1 - spatial.distance.cosine(tfidf[i], tfidf[j + user_news_len])
            news_candidate_sim.append(sim)
        # Indices of the most similar candidates for this read item.
        k_max_index = (-np.array(news_candidate_sim)).argsort()[:recommend_per_news]
        recommend_list.extend(k_max_index)
    recommend_news_id = [candidates_newsid[i] for i in recommend_list]
    return recommend_news_id
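A hypothetical call, assuming the surrounding recommender class implements getUserReadNews(user_id) returning (user_news, candidates_news, candidates_newsid); the `recommender` instance and the user id are assumptions.

recommended_ids = recommender.getRecommendation(user_id=42)
print(recommended_ids)   # up to 10 candidate news ids, grouped by the user's read items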
# def getKeywords(self):
#
# news = sc.parallelize(self.getUserReadNews())
# x = news.collect()
# hashing = HashingTF()
#
# news_tf = hashing.transform(news)
# idfIgnore = IDF(minDocFreq=2).fit(news_tf)
# result = idfIgnore.transform(news_tf)