當前位置: 首頁>>代碼示例>>Python>>正文


Python feature.IDF屬性代碼示例

本文整理匯總了Python中pyspark.mllib.feature.IDF屬性的典型用法代碼示例。如果您正苦於以下問題:Python feature.IDF屬性的具體用法?Python feature.IDF怎麽用?Python feature.IDF使用的例子?那麽, 這裏精選的屬性代碼示例或許可以為您提供幫助。您也可以進一步了解該屬性所在pyspark.mllib.feature的用法示例。


在下文中一共展示了feature.IDF屬性的7個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。

示例1: parseTextRDDToIndex

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def parseTextRDDToIndex(self, data, label=True):
    """Turn an RDD of whitespace-separated text lines into TF-IDF vectors.

    When ``label`` is True each line is assumed to start with a numeric
    label followed by the document tokens, and an RDD of LabeledPoint is
    returned; otherwise the whole line is tokenized and the raw TF-IDF
    RDD is returned.
    """
    if label:
        labels = data.map(lambda row: float(row.split(" ", 1)[0]))
        documents = data.map(lambda row: row.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda row: row.split(" "))

    # Hash tokens to term-frequency vectors; cache since IDF makes two passes.
    term_freq = HashingTF().transform(documents)
    term_freq.cache()

    # Ignore terms appearing in fewer than 2 documents.
    tfidf = IDF(minDocFreq=2).fit(term_freq).transform(term_freq)

    if not label:
        return tfidf
    return labels.zip(tfidf).map(lambda pair: LabeledPoint(pair[0], pair[1]))
開發者ID:openstack,項目名稱:meteos,代碼行數:20,代碼來源:meteos-script-1.6.0.py

示例2: produce_tfidf

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def produce_tfidf(x):
    """Return TF-IDF vectors for an RDD of tokenized documents ``x``.

    Terms occurring in fewer than 5 documents are ignored by the IDF model.
    """
    hashed = HashingTF().transform(x)
    model = IDF(minDocFreq=5).fit(hashed)
    return model.transform(hashed)

# Load in reviews 
開發者ID:lcdm-uiuc,項目名稱:cs199-sp17,代碼行數:9,代碼來源:bayes_tfidf.py

示例3: test_idf_model

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def test_idf_model(self):
    """Fitting IDF on four 11-dimensional vectors yields 11 idf weights."""
    vectors = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
    ]
    rdd = self.sc.parallelize(vectors, 2)
    fitted = IDF().fit(rdd)
    self.assertEqual(len(fitted.idf()), 11)
開發者ID:alec-heif,項目名稱:MIT-Thesis,代碼行數:12,代碼來源:tests.py

示例4: get_tfidf_features

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def get_tfidf_features(txt_rdd):
    """Compute TF-IDF feature vectors for an RDD of tokenized texts."""
    # Term frequencies via feature hashing; cache because IDF fit and
    # transform each iterate over the RDD.
    tf = HashingTF().transform(txt_rdd)
    tf.cache()
    return IDF().fit(tf).transform(tf)
開發者ID:hanhanwu,項目名稱:Hanhan_Play_With_Social_Media,代碼行數:10,代碼來源:reddit_tfidf_LDA.py

示例5: classify_tweet

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def classify_tweet(tf):
    """Fit an IDF model on the term-frequency RDD and return its TF-IDF vectors."""
    model = IDF().fit(tf)
    return model.transform(tf)
開發者ID:xuwenyihust,項目名稱:Twitter-Hashtag-Tracking,代碼行數:4,代碼來源:analysis.py

示例6: extractKeywords_Train

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def extractKeywords_Train(self):
    """Build TF-IDF vectors from the training file and save them as text.

    Each training line's first token is dropped (presumably a label or id
    — TODO confirm against the file format); the rest are the document
    tokens.
    """
    tokenized = self.sc.textFile(self.trainingfile).map(
        lambda line: line.split(" ")[1:]
    )

    # Hash tokens to term-frequency vectors; cache for the two IDF passes.
    tf = HashingTF().transform(tokenized)
    tf.cache()

    # Skip terms seen in fewer than 2 documents, then persist the result.
    tfidf = IDF(minDocFreq=2).fit(tf).transform(tf)
    tfidf.saveAsTextFile("AAA")
開發者ID:Labyrinth108,項目名稱:Content-Based-News-Recommendation-System-in-Spark,代碼行數:13,代碼來源:engine.py

示例7: getRecommendation

# 需要導入模塊: from pyspark.mllib import feature [as 別名]
# 或者: from pyspark.mllib.feature import IDF [as 別名]
def getRecommendation(self, user_id):
    """Recommend candidate news ids for ``user_id``.

    Builds TF-IDF vectors over the user's read news plus the candidate
    news, then for each read item picks the most cosine-similar
    candidates, returning their ids.
    """
    user_news, candidates_news, candidates_newsid = self.getUserReadNews(user_id)
    all_news = user_news + candidates_news

    # Bag-of-words counts over every document (read + candidate).
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_news)

    # tfidf[i][j] is the TF-IDF weight of term j in document i.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X).toarray()

    recommend_num = 10
    # BUG FIX: use floor division — plain "/" yields a float in Python 3,
    # and a float slice bound in argsort()[:k] raises TypeError.
    recommend_per_news = recommend_num // len(user_news)
    recommend_list = []
    user_news_len = len(user_news)
    candidates_news_len = len(candidates_news)

    # For each news item the user read, rank candidates by cosine
    # similarity and keep the top recommend_per_news indices.
    for i in range(user_news_len):
        news_candidate_sim = []
        for j in range(candidates_news_len):
            # Similarity = 1 - cosine distance; candidate rows start
            # at offset user_news_len in the tfidf matrix.
            sim = 1 - spatial.distance.cosine(tfidf[i], tfidf[j + user_news_len])
            news_candidate_sim.append(sim)
        k_max_index = (-np.array(news_candidate_sim)).argsort()[:recommend_per_news]
        recommend_list.extend(k_max_index)

    recommend_news_id = [candidates_newsid[i] for i in recommend_list]
    return recommend_news_id

    # def getKeywords(self):
    #
    #     news = sc.parallelize(self.getUserReadNews())
    #     x = news.collect()
    #     hashing = HashingTF()
    #
    #     news_tf = hashing.transform(news)
    #     idfIgnore = IDF(minDocFreq=2).fit(news_tf)
    #     result = idfIgnore.transform(news_tf) 
開發者ID:Labyrinth108,項目名稱:Content-Based-News-Recommendation-System-in-Spark,代碼行數:46,代碼來源:engine.py


注:本文中的pyspark.mllib.feature.IDF屬性示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。