This article collects typical usage examples of the Python attribute pyspark.mllib.feature.IDF. If you are wondering what feature.IDF is for, how to use it, or where to find worked examples, the curated snippets below may help. You can also read further about the containing module, pyspark.mllib.feature.
The following presents 7 code examples of feature.IDF, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
Example 1: parseTextRDDToIndex
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def parseTextRDDToIndex(self, data, label=True):
    # Split each line into an optional leading label and a list of tokens.
    if label:
        labels = data.map(lambda line: float(line.split(" ", 1)[0]))
        documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda line: line.split(" "))

    # Hash tokens into term-frequency vectors; cache because the RDD is
    # traversed twice (once by fit, once by transform).
    tf = HashingTF().transform(documents)
    tf.cache()

    # Ignore terms that appear in fewer than two documents.
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    index = idfIgnore.transform(tf)

    if label:
        return labels.zip(index).map(lambda line: LabeledPoint(line[0], line[1]))
    else:
        return index
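For context, a minimal driver for Example 1 might look like the sketch below. It assumes an active SparkContext sc, an instance parser of the surrounding class, and a hypothetical input file whose lines look like "<label> <word> <word> ...".

# A sketch, not part of the original example; path and `parser` are hypothetical.
raw = sc.textFile("labeled_docs.txt")
labeled_points = parser.parseTextRDDToIndex(raw, label=True)
print(labeled_points.take(2))  # LabeledPoint(label, tf-idf SparseVector) records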
Example 2: produce_tfidf
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def produce_tfidf(x):
    # Hash tokenized documents into term-frequency vectors, then re-weight
    # by IDF, ignoring terms that appear in fewer than five documents.
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf

# Load in reviews
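A small usage sketch (assuming an active SparkContext sc). Note that with minDocFreq=5, any term appearing in fewer than five documents receives a zero IDF weight, so a toy corpus this small produces all-zero vectors:

# A sketch, not part of the original example.
docs = sc.parallelize([
    ["spark", "mllib", "idf"],
    ["spark", "tfidf", "example"],
])
print(produce_tfidf(docs).collect())  # all zeros here: no term reaches minDocFreq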
Example 3: test_idf_model
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def test_idf_model(self):
    data = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
    ]
    model = IDF().fit(self.sc.parallelize(data, 2))
    idf = model.idf()
    self.assertEqual(len(idf), 11)
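The expected length 11 is simply the dimensionality of the input vectors. MLlib computes each IDF component as log((m + 1) / (d(t) + 1)), where m is the number of documents and d(t) is the number of documents containing term t. A quick sketch checking that formula against the fitted model (reusing data and idf from the test above):

# A sketch, assuming `data` and `idf` from the test body are in scope.
import math
m = len(data)                            # 4 documents
d_t = sum(1 for v in data if v[0] > 0)   # term 0 occurs in all 4 documents
assert abs(idf[0] - math.log((m + 1) / (d_t + 1))) < 1e-9  # log(5/5) == 0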
Example 4: get_tfidf_features
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def get_tfidf_features(txt_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt_rdd)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
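Calling it end to end might look like the sketch below (assuming an active SparkContext sc and a hypothetical corpus file); in pyspark.mllib, HashingTF defaults to 2**20 features:

# A sketch, not part of the original example; the path is hypothetical.
txt_rdd = sc.textFile("corpus.txt").map(lambda line: line.split(" "))
features = get_tfidf_features(txt_rdd)
print(features.first())  # SparseVector of tf-idf weights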
Example 5: classify_tweet
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def classify_tweet(tf):
    return IDF().fit(tf).transform(tf)
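Note that Example 5 refits an IDF model on every call, so the weights are derived from whatever batch is passed in. When scoring against a fixed training corpus, it is usually preferable to fit once and reuse the model, as in this sketch (train_tf is an assumed term-frequency RDD):

# A sketch, assuming a precomputed term-frequency RDD `train_tf`.
idf_model = IDF().fit(train_tf)  # fit once on the training corpus

def transform_tweets(tf):
    return idf_model.transform(tf)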
Example 6: extractKeywords_Train
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def extractKeywords_Train(self):
    documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)
    tfidfIgnore.saveAsTextFile("AAA")
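saveAsTextFile writes a directory of part-* files rather than a single file; the saved vectors can be read back as strings with sc.textFile, as in this sketch (reusing the path from the example and assuming an active SparkContext sc):

# A sketch, not part of the original example.
saved = sc.textFile("AAA")
print(saved.first())  # string form of a SparseVector: "(1048576,[...],[...])"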
Example 7: getRecommendation
# Required module import: from pyspark.mllib import feature [as alias]
# Or: from pyspark.mllib.feature import IDF [as alias]
def getRecommendation(self, user_id):
    # Uses scikit-learn's CountVectorizer/TfidfTransformer plus scipy.spatial
    # and numpy (np), rather than pyspark.mllib.
    user_news, candidates_news, candidates_newsid = self.getUserReadNews(user_id)
    all_news = user_news + candidates_news

    # Turn the documents into a term-count matrix.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_news)
    # word = vectorizer.get_feature_names()  # vocabulary behind the columns

    # Re-weight the counts in X as tf-idf values; tfidf[i][j] is the
    # tf-idf weight of term j in document i.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X).toarray()
    # print(tfidf)

    recommend_num = 10
    # Integer division: the result is used as a slice bound below.
    recommend_per_news = recommend_num // len(user_news)
    recommend_list = []
    user_news_len = len(user_news)
    candidates_news_len = len(candidates_news)

    # For each news item the user has read, pick the most similar
    # candidates by cosine similarity.
    for i in range(user_news_len):
        news_candidate_sim = []
        for j in range(candidates_news_len):
            sim = 1 - spatial.distance.cosine(tfidf[i], tfidf[j + user_news_len])
            news_candidate_sim.append(sim)
        k_max_index = (-np.array(news_candidate_sim)).argsort()[:recommend_per_news]
        recommend_list.extend(k_max_index)

    recommend_news_id = [candidates_newsid[i] for i in recommend_list]
    return recommend_news_id
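Unlike the other examples, Example 7 computes tf-idf with scikit-learn and measures cosine similarity via scipy; only the commented-out getKeywords draft below touches pyspark.mllib. A hypothetical call:

# A sketch; `recommender` is an assumed instance of the surrounding class.
recommended_ids = recommender.getRecommendation(user_id=42)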
# def getKeywords(self):
#     news = sc.parallelize(self.getUserReadNews())
#     x = news.collect()
#     hashing = HashingTF()
#
#     news_tf = hashing.transform(news)
#     idfIgnore = IDF(minDocFreq=2).fit(news_tf)
#     result = idfIgnore.transform(news_tf)