This article collects typical usage examples of the Python method pyspark.mllib.feature.HashingTF. If you have been wondering what feature.HashingTF does, how to call it, or want to see it used in real code, the curated samples below should help. You can also read further about the module it belongs to, pyspark.mllib.feature.
The sections below show 9 code examples of feature.HashingTF, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
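Before the examples, here is a minimal sketch of the typical HashingTF workflow. It is not taken from the examples below; the sample documents and the numFeatures value are illustrative, and an existing SparkContext `sc` is assumed.

from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([
    "spark hashing tf example".split(" "),
    "another short document".split(" "),
])
hashingTF = HashingTF(numFeatures=1 << 18)   # number of hash buckets
tf = hashingTF.transform(docs)               # RDD of sparse term-frequency vectors
tf.cache()                                   # IDF fits on and then transforms the same RDD
tfidf = IDF(minDocFreq=1).fit(tf).transform(tf)
print(tfidf.take(2))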
Example 1: parseTextRDDToIndex
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint

def parseTextRDDToIndex(self, data, label=True):
    # Lines look like "<label> term term ..." when label=True, otherwise just terms.
    if label:
        labels = data.map(lambda line: float(line.split(" ", 1)[0]))
        documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda line: line.split(" "))
    tf = HashingTF().transform(documents)
    tf.cache()  # IDF.fit and transform both traverse the TF vectors
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    index = idfIgnore.transform(tf)
    if label:
        return labels.zip(index).map(lambda line: LabeledPoint(line[0], line[1]))
    else:
        return index
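A hypothetical driver for the method above; the `parser` instance, the SparkContext `sc`, and the file path are assumptions for illustration only.

data = sc.textFile("labeled_docs.txt")            # each line: "<label> term term ..."
training = parser.parseTextRDDToIndex(data, label=True)
print(training.first())                           # a LabeledPoint wrapping a TF-IDF vector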
Example 2: produce_tfidf
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def produce_tfidf(x):
    # x is an RDD of token lists; returns an RDD of TF-IDF vectors.
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

# Load in reviews
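A minimal hypothetical call for the function above, assuming a SparkContext `sc`; the review file path and the whitespace tokenization are illustrative only.

reviews = sc.textFile("reviews.txt").map(lambda line: line.split(" "))
tfidf = produce_tfidf(reviews)
print(tfidf.first())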
Example 3: textToIndex
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF

def textToIndex(self, text):
    # Hash a single whitespace-separated string into a term-frequency vector.
    return HashingTF().transform(text.split(" "))
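Hypothetical usage of the helper above; the `indexer` instance is an assumption.

vec = indexer.textToIndex("spark hashing tf example")
print(vec)   # a SparseVector of hashed term frequencies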
Example 4: test_binary_term_freqs
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import Vectors

def test_binary_term_freqs(self):
    # With binary=True every present term maps to 1.0 regardless of how often it occurs.
    hashingTF = HashingTF(100).setBinary(True)
    doc = "a a b c c c".split(" ")
    n = hashingTF.numFeatures
    output = hashingTF.transform(doc).toArray()
    expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                  hashingTF.indexOf("b"): 1.0,
                                  hashingTF.indexOf("c"): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) + ", got " + str(output[i]))
Example 5: avg_spam
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example actually uses the DataFrame-based HashingTF from pyspark.ml.feature.
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import functions as F

def avg_spam(juez, tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(tweets)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""
    predictionsAndLabelsDF = juez.transform(featurizedData).groupBy("user_id").agg(
        F.avg('predicted_label').alias("avg_spam"))
    return predictionsAndLabelsDF
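A hypothetical call: `juez` is a fitted spam model (for example the one returned by entrenar_spam in Example 6), `sql_context` is an existing SQLContext, and `tweets` is a DataFrame with "user_id" and "text" columns; the sample rows are made up.

tweets = sql_context.createDataFrame(
    [("u1", "buy cheap followers now"), ("u1", "lunch with friends")],
    ["user_id", "text"])
scores = avg_spam(juez, tweets)
scores.show()   # one row per user_id with the mean predicted_label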
Example 6: entrenar_spam
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example also uses the DataFrame-based HashingTF from pyspark.ml.feature.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F

def entrenar_spam(sc, sql_context, dir_spam, dir_no_spam, num_trees=20, max_depth=8):
    # Build a labeled training set: spam tweets get label 1.0, non-spam 0.0.
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)
    spam = sql_context.read.json(input_spam).select("text").withColumn("label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn("label", F.lit(0.0))
    training_data = spam.unionAll(no_spam)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(training_data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""
    seed = 1800009193  # originally the Python 2 long literal 1800009193L
    (split_20_df, split_80_df) = featurizedData.randomSplit([20.0, 80.0], seed)
    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()
    rf = RandomForestClassifier().setLabelCol("label") \
        .setPredictionCol("predicted_label") \
        .setFeaturesCol("rawFeatures") \
        .setSeed(100088121) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)
    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])
    reg_eval = MulticlassClassificationEvaluator(predictionCol="predicted_label", labelCol="label",
                                                 metricName="accuracy")
    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval, numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    modelo = crossval.fit(training_set_df).bestModel
    predictions_and_labels_df = modelo.transform(test_set_df)
    accuracy = reg_eval.evaluate(predictions_and_labels_df)
    return modelo, accuracy
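A hypothetical invocation, assuming an existing SparkContext `sc`, an SQLContext `sql_context`, and JSON tweet dumps at the two illustrative paths.

modelo, accuracy = entrenar_spam(sc, sql_context, "data/spam/", "data/no_spam/",
                                 num_trees=20, max_depth=8)
print("held-out accuracy: %.3f" % accuracy)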
Example 7: get_tfidf_features
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def get_tfidf_features(txt_rdd):
    # txt_rdd is an RDD of token lists.
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
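A small hypothetical call, assuming a SparkContext `sc`; the sample documents are made up.

txt_rdd = sc.parallelize([["hello", "world"], ["hello", "spark"]])
features = get_tfidf_features(txt_rdd)
print(features.collect())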
Example 8: extractKeywords_Train
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
from pyspark.mllib.feature import HashingTF, IDF

def extractKeywords_Train(self):
    # Skip the first field of each line (e.g. a label or id) and hash the remaining terms.
    documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    tfidfIgnore.saveAsTextFile("AAA")
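A hypothetical driver, assuming the defining class stores a SparkContext as self.sc and the training file path as self.trainingfile; the class name and constructor are inventions for illustration.

extractor = KeywordExtractor(sc, "train.txt")   # class name and signature are assumptions
extractor.extractKeywords_Train()               # writes the TF-IDF vectors to the "AAA" directory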
Example 9: getRecommendation
# Required import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import HashingTF [as alias]
# Note: this example builds its TF-IDF matrix with scikit-learn rather than Spark.
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def getRecommendation(self, user_id):
    user_news, candidates_news, candidates_newsid = self.getUserReadNews(user_id)
    all_news = user_news + candidates_news
    # Convert the documents into a term-count matrix.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_news)
    # The vocabulary itself is not needed here:
    # word = vectorizer.get_feature_names()
    transformer = TfidfTransformer()
    # Turn the count matrix X into TF-IDF weights; tfidf[i][j] is the weight of term j in document i.
    tfidf = transformer.fit_transform(X).toarray()
    # print tfidf.toarray()
    recommend_num = 10
    # Integer division: the value is used as a slice bound below (the original Python 2 code used "/").
    recommend_per_news = recommend_num // len(user_news)
    recommend_list = []
    user_news_len = len(user_news)
    candidates_news_len = len(candidates_news)
    for i in range(user_news_len):
        news_candidate_sim = []
        for j in range(candidates_news_len):
            # Cosine similarity between a read news item and a candidate item.
            sim = 1 - spatial.distance.cosine(tfidf[i], tfidf[j + user_news_len])
            news_candidate_sim.append(sim)
        # Indices of the most similar candidates for this read item.
        k_max_index = (-np.array(news_candidate_sim)).argsort()[:recommend_per_news]
        recommend_list.extend(k_max_index)
    recommend_news_id = [candidates_newsid[i] for i in recommend_list]
    return recommend_news_id
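A hypothetical call, assuming the surrounding recommender class implements getUserReadNews(user_id) returning (user_news, candidates_news, candidates_newsid); the `recommender` instance and the user id are assumptions.

recommended_ids = recommender.getRecommendation(user_id=42)
print(recommended_ids)   # up to 10 candidate news ids, grouped by the user's read items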
# def getKeywords(self):
#
# news = sc.parallelize(self.getUserReadNews())
# x = news.collect()
# hashing = HashingTF()
#
# news_tf = hashing.transform(news)
# idfIgnore = IDF(minDocFreq=2).fit(news_tf)
# result = idfIgnore.transform(news_tf)