This page collects typical usage examples of the Python method pyspark.mllib.recommendation.ALS.trainImplicit: what the method does and how to call it. You can also read further about its containing class, pyspark.mllib.recommendation.ALS.
The following presents 15 code examples of ALS.trainImplicit, ordered by popularity.
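Before the individual examples, here is a minimal, self-contained sketch of the call itself (the interaction data and hyperparameter values below are illustrative placeholders, not taken from any example on this page):

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating

sc = SparkContext(appName="als-implicit-sketch")
# For implicit feedback the third field is an interaction strength (e.g. a click or view count), not an explicit rating
interactions = sc.parallelize([Rating(1, 10, 3.0), Rating(1, 20, 1.0), Rating(2, 10, 5.0)])
model = ALS.trainImplicit(interactions, rank=10, iterations=10, lambda_=0.01, alpha=0.01)
print(model.predict(2, 20))  # a preference score, not a rating on the original scale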
Example 1: _recommend
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def _recommend(self, train_ratings, users):
    import pandas as pd
    from pyspark.mllib.recommendation import ALS, Rating
    # Prepare the user/item mappings as integer codes, since Spark's ALS implementation only works with integer IDs
    train_ratings['user'] = train_ratings['user'].astype('category')
    train_ratings['item'] = train_ratings['item'].astype('category')
    user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
    self.user_cat = user_cat
    self.item_cat = item_cat
    self.train_ratings = train_ratings
    # Train the model
    self.ratings = self.sc.parallelize(
        Rating(u, i, rating)
        for u, i, rating in zip(user_cat.codes, item_cat.codes, train_ratings.rating))
    if self.implicit:
        model = ALS.trainImplicit(self.ratings, **self.spark_args)
    else:
        model = ALS.train(self.ratings, **self.spark_args)
    # Get predictions from the model
    self.ratings_to_predict = self.sc.parallelize(
        (user, item) for user in users for item in item_cat.codes.unique())
    self.predictions = model.predictAll(self.ratings_to_predict).collect()
    # Present the recommendations as a DataFrame, mapping integer codes back to the original labels
    self.predictions = [(user_cat.categories[p.user], item_cat.categories[p.product], p.rating)
                        for p in self.predictions]
    self.predictions_df = pd.DataFrame(self.predictions, columns=['user', 'item', 'rating'])
    return self.predictions_df
Example 2: main
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS, Rating

def main(argv):
    conf = SparkConf().setAppName("recommendation")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences
    # User hash lookup, stored into Cassandra (`keyspace` is defined at module level)
    user_hash = rawDF.map(lambda row: (row[0], hashFunction(row[0])))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table="userhash", keyspace=keyspace).save(mode="append")
    # Product hash lookup, stored into Cassandra
    product_hash = rawDF.map(lambda row: (row[1], hashFunction(row[1])))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct, ["product", "hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table="producthash", keyspace=keyspace).save(mode="append")
    # Ratings for training.
    # ALS requires integer IDs, so each string is mapped to a Java-style hash and stored
    # as a Rating object for the algorithm to consume.
    ratings = rawDF.map(lambda row: Rating(hashFunction(row[0]), hashFunction(row[1]), float(row[2])))
    model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")
    sc.stop()
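hashFunction is not shown in the snippet; the comment above says it reproduces a Java hash of a string, i.e. something like Java's String.hashCode, which yields the 32-bit integer IDs that MLlib's ALS requires. A hypothetical reconstruction:

def hashFunction(s):
    # Hypothetical: Java String.hashCode semantics with signed 32-bit overflow
    h = 0
    for ch in s:
        h = (31 * h + ord(ch)) & 0xFFFFFFFF
    return h - 0x100000000 if h >= 0x80000000 else h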
Example 3: evaluate
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
import math

def evaluate(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10
    # Grid search over the explicit-feedback variant
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model = ALS.train(ratings, rank, num_iterations, lam)
            user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print(predictions.take(3))
            rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
            print(rates_and_preds.take(3))
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
            print("(rank: %d, lambda: %f) Root Mean Squared Error = %f" % (rank, lam, rmse))
    # Grid search over the implicit-feedback variant
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
                print(rates_and_preds.take(3))
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
                print("(rank: %d, lambda: %f, alpha: %f, implicit) Root Mean Squared Error = %f" % (rank, lam, alpha, rmse))
Example 4: main
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def main(sc):
    seed = 5
    iterations = 10
    regularization_parameter = 0.1
    rank = 4
    data = sc.textFile("file:///Expedia/data/train1.csv")
    ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    new_data = sc.textFile("file:///Expedia/data/new_set.csv")
    new_ratings = new_data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    new_ratings_for_predict_RDD = new_ratings.map(lambda x: (x[0], x[1])).cache()
    complete_data = ratings.union(new_ratings).cache()
    new_ratings_model = ALS.trainImplicit(complete_data, rank, seed=seed,
                                          iterations=iterations, lambda_=regularization_parameter)
    # predictAll expects an RDD of (user, product) pairs, so a call like
    # new_ratings_model.predictAll(0, '83') does not work and needs more investigation
    predictions = new_ratings_model.predictAll(new_ratings_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2])).collect()
    # each prediction is ((user, item), rating); keep the two highest-rated pairs
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:2]
    return recommendations
Example 5: train
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
    """
    Train a matrix-factorization model with the given parameters;
    a non-None alpha selects the implicit-feedback variant.
    """
    if alpha:
        model = ALS.trainImplicit(self.train_data, rank, iterations, lambda_, blocks, alpha)
    else:
        model = ALS.train(self.train_data, rank, iterations, lambda_)
    return model
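A usage sketch for this wrapper (`recommender` is a hypothetical instance of the surrounding class and the values are illustrative). Note that the dispatch tests truthiness, so alpha=0.0 would silently fall back to the explicit ALS.train:

explicit_model = recommender.train(rank=10, iterations=10, lambda_=0.1)               # ALS.train
implicit_model = recommender.train(rank=10, iterations=10, lambda_=0.1, alpha=40.0)   # ALS.trainImplicit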
Example 6: training_models
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
from operator import add

def training_models(self, rank=5, seed=32, iterations=20, alpha=0.01, reg=0.01):
    '''ALS training parameters:
    rank - number of latent factors
    iterations - number of ALS iterations (default: 5)
    lambda_ - regularization parameter (default: 0.01)
    alpha - constant used in computing confidence (default: 0.01)
    seed - random seed for the initial matrix factorization model (default: None)
    '''
    print(self.training.take(5), self.test.take(5))
    weights = [.8, .2]
    # split the training data into training and validation sets
    trainData_RDD, valData_RDD = self.training.randomSplit(weights, seed)
    trainData_RDD.cache()
    valData_RDD.cache()
    print(trainData_RDD.count(), valData_RDD.count())
    X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).cache()
    sum_ratings_val = valData_RDD.map(lambda x: x.rating).sum()
    product_nums_for_users = X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).map(lambda x: x[1]).collect()
    print('num of users', X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).count())
    rank_lists = Rank_list(product_nums_for_users)
    print(rank_lists)
    model = ALS.trainImplicit(trainData_RDD, rank, iterations=iterations,
                              lambda_=reg, blocks=-1, alpha=alpha, nonnegative=False, seed=seed)
    # predicted results for the validation set
    predictions_RDD = model.predictAll(X_val_RDD).map(lambda x: ((x[0], x[1]), x[2]))
    ratings_and_preds_RDD = valData_RDD.map(lambda x: ((x[0], x[1]), x[2])).join(predictions_RDD)
    print()
    print('model training converged')
    print()
    MPR = self.percentage_ranking(ratings_and_preds_RDD, rank_lists, sum_ratings_val)
    print('Rank %s, reg %s, alpha %s, AvgRank = %s' % (rank, reg, alpha, MPR))
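Rank_list and percentage_ranking are project helpers not shown here. The printed MPR suggests the mean percentage ranking metric used to evaluate implicit-feedback models (lower is better; 50% is what random ordering yields). A sketch of that metric under those assumptions, ignoring the precomputed rank_lists and recomputing per-user percentile ranks directly:

def mean_percentage_ranking(ratings_and_preds_RDD, sum_ratings_val):
    """Hypothetical MPR: sum(r_ui * rank_ui) / sum(r_ui), where rank_ui is the
    percentile position of item i in user u's list sorted by predicted score."""
    def per_user(user_and_pairs):
        _, pairs = user_and_pairs
        ranked = sorted(pairs, key=lambda t: t[1], reverse=True)  # best prediction first
        n = max(len(ranked) - 1, 1)
        return [actual * (idx / float(n)) for idx, (actual, _) in enumerate(ranked)]
    weighted_ranks = (ratings_and_preds_RDD
                      .map(lambda kv: (kv[0][0], (kv[1][0], kv[1][1])))  # user -> (actual, predicted)
                      .groupByKey()
                      .flatMap(per_user)
                      .sum())
    return weighted_ranks / sum_ratings_val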
Example 7: main
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
import itertools
import math

def main(sc):
    # load the training file
    train_1 = sc.textFile("file:///Expedia/data/train_1.csv")
    training_RDD = train_1.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    # load the validation file (note: this snippet reads the same file as the training set)
    train_2 = sc.textFile("file:///Expedia/data/train_1.csv")
    validation_RDD = train_2.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1])).cache()
    train_RDD = training_RDD.map(lambda x: (x[0], x[1])).cache()
    # Train models on the training set and keep the one with the best RMSE on the validation set
    seed = 5
    iterations = 10
    regularization_parameters = [0.1, 0.5, 1.0]
    ranks = [4, 8]
    errors = []
    min_error = float('inf')
    best_rank = -1
    best_lambda = -1.0
    for rank, reg_param in itertools.product(ranks, regularization_parameters):
        # train an implicit model on the training set
        model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                                  lambda_=reg_param)
        # predict on the validation set
        predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        # compute the root mean squared error on the validation predictions
        rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
        errors.append(rmse)
        print('For rank %s the RMSE is %s' % (rank, rmse))
        if rmse < min_error:
            min_error = rmse
            best_rank = rank
            best_lambda = reg_param
    print("The best model was trained with rank = %d and lambda = %.1f, " % (best_rank, best_lambda)
          + "and numIter = %d, and its RMSE on the validation set is %f." % (iterations, min_error))
Example 8: train_als
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def train_als(self):
    self.ratings = self.df.select("user_id", "repo_id") \
        .map(lambda x: Rating(x[0], x[1], 1.0))
    rank = 10
    numIterations = 20
    model = ALS.trainImplicit(self.ratings, rank, numIterations, alpha=0.01)
    testdata = self.ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = self.ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    print("Mean Squared Error = " + str(MSE))
    model.save(self.sc, "ALS_model")
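The model saved above can be restored in a later session with MatrixFactorizationModel.load:

from pyspark.mllib.recommendation import MatrixFactorizationModel

model = MatrixFactorizationModel.load(sc, "ALS_model")
print(model.recommendProducts(42, 5))  # top-5 repos for user 42 (illustrative user id)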
Example 9: label
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
import numpy as np
from pyspark.mllib.regression import LabeledPoint

def label(self, rank=50, numIterations=10, alpha=0.01):
    """
    INPUT:
    - rank: number of topics (latent factors)
    - numIterations: number of iterations for matrix factorization
    - alpha: confidence-scaling constant for implicit feedback
    OUTPUT:
    - training data for naive Bayes as (label, feature) tuples
    """
    # alpha must be passed by keyword; the fourth positional argument of trainImplicit is lambda_
    als_model = ALS.trainImplicit(self.tfidf_rating, rank, numIterations, alpha=alpha)
    # label each document with its strongest latent factor
    index_label = als_model.userFeatures().map(lambda x: (x[0], np.argmax(x[1])))
    index_feature = self.tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))
    index_label_feature = index_label.join(index_feature)
    label_feature = index_label_feature.map(lambda x: x[1])
    self.train_data = label_feature.map(lambda x: LabeledPoint(x[0], x[1]))
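The resulting self.train_data feeds directly into MLlib's naive Bayes trainer; a sketch of the downstream step this method prepares for (`labeler` is a hypothetical instance of the surrounding class, after label() has run):

from pyspark.mllib.classification import NaiveBayes

# train a topic classifier on the (argmax-topic, tf-idf) pairs built above
nb_model = NaiveBayes.train(labeler.train_data, lambda_=1.0)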
Example 10: prepare_model
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def prepare_model(sc, filename, user_id, ratings_train):
    if filename is None and os.path.exists(config.MSD_MODEL):
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        # train a new model
        print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
        rank, lambda_val = (
            evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE))
        rank, lambda_val = int(rank), float(lambda_val)
        model = ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                                  lambda_val, nonnegative=True)
    return model
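evaluate.load_best_params is project code not shown here; since the caller casts the results to int and float, it plausibly returns the tuned (rank, lambda) pair as strings, e.g. read back from a file written by a prior grid search. A guess at its shape, not the actual implementation:

def load_best_params(params_file):
    # Hypothetical: a single "rank,lambda" line persisted by an earlier evaluation run
    with open(params_file) as f:
        rank, lambda_val = f.read().strip().split(',')
    return rank, lambda_val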
Example 11: cross_validation
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
import itertools
from math import sqrt
from operator import add

def cross_validation(training, validation, test, candidates, id_title_map, ranks, lambdas, numIters, alphas):
    # train models and evaluate them on the validation set
    # (IMPLICIT is a module-level flag selecting the implicit-feedback variant)
    result_dict = {}
    result_template = "rank:%d iters:%d lambda: %f"
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()
    if not IMPLICIT:
        alphas = [1.0]
    for rank, lmbda, numIter, alpha in itertools.product(ranks, lambdas, numIters, alphas):
        if IMPLICIT:
            model = ALS.trainImplicit(training, rank, iterations=numIter, lambda_=lmbda, alpha=alpha, nonnegative=True)
        else:
            model = ALS.train(training, rank, iterations=numIter, lambda_=lmbda, nonnegative=True)
        validationRmse = 0.0  # computeRmse(model, validation, numValidation)
        print("RMSE (validation) = %f for the model trained with " % validationRmse
              + "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha))
        qe_results = qualitative_evaluation(model, candidates, id_title_map)
        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse
    testRmse = 0.0  # computeRmse(bestModel, test, numTest)
    # evaluate the best model on the test set
    print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda)
          + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse))
    result_dict['BEST Model on Test:' + result_template % (bestRank, bestNumIter, bestLambda)] = testRmse
    # compare the best model with a naive baseline that always returns the mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print("The best model improves the baseline by %.2f%%." % improvement)
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
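computeRmse is stubbed out above (every candidate therefore scores 0.0, so the first model always wins the comparison). A conventional implementation, sketched under the assumption that the evaluation RDD holds (user, product, rating) triples:

from math import sqrt

def computeRmse(model, data, n):
    """RMSE of the model's predictions against an RDD of (user, product, rating)."""
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = (predictions.map(lambda p: ((p[0], p[1]), p[2]))
                             .join(data.map(lambda x: ((x[0], x[1]), x[2])))
                             .values())
    return sqrt(predictionsAndRatings.map(lambda pr: (pr[0] - pr[1]) ** 2).reduce(lambda a, b: a + b) / float(n))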
Example 12: main
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
from pyspark import SparkConf, SparkContext

def main(cores, prefs):
    """
    ALS algorithm to recommend subreddits to a user based on user-defined preferences.
    args:
        cores (int): number of cores for the Spark job
        prefs (list[str]): subreddit names; capitalization is significant
    """
    scfg = SparkConf()
    scfg.set("spark.cores.max", cores)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)
    try:
        # prep data: each line is a python-literal (user, subreddit, count) tuple
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits to a synthetic user id
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)
        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01, seed=5)
        # candidate subreddits for prediction (everything not already preferred)
        my_prefs_ids = set(javahash(x) for x in prefs)
        all_subreddit_ids = parsed_counts.map(lambda t: (javahash(t[1]), t[1])).distinct().cache()
        candidates = all_subreddit_ids.map(lambda t: t[0]).filter(lambda r: r not in my_prefs_ids)
        predictions = model.predictAll(candidates.map(lambda x: (999, x))).cache()
        final = (predictions.map(lambda p: (p[1], p[2]))
                 .join(all_subreddit_ids)
                 .map(lambda t: t[1])  # (score, name)
                 .sortByKey(False))
        output = list(final.take(30))
        sc.stop()
        return output
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise
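tup_to_rating and javahash come from the surrounding project. javahash presumably plays the same role as hashFunction in Example 2 (a Java String.hashCode emulation producing 32-bit integer IDs), and tup_to_rating would then be a thin wrapper (hypothetical reconstruction):

from pyspark.mllib.recommendation import Rating

def tup_to_rating(tup):
    # Hypothetical: map a (user, subreddit, count) triple to an ALS Rating
    user, subreddit, count = tup
    return Rating(javahash(user), javahash(subreddit), float(count))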
Example 13: execute_recommendation
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def execute_recommendation():
    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    # load training data and train
    train_file_name = get_training_file_name()
    train_data = sc.textFile(train_file_name)
    ratings = train_data.map(lambda l: l.split(',')) \
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    rank = 10
    number_iteration = 10
    model = ALS.trainImplicit(ratings, rank, number_iteration)
    # load test data and predict
    test_file_name = get_testing_file_name()
    test_data = sc.textFile(test_file_name)
    test_ranking = test_data.map(lambda l: l.split(',')) \
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    testdata = test_ranking.map(lambda p: (p[0], p[1]))
    count_rdd = testdata.count()
    if count_rdd > 0:
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        result_file = get_rdd_output()
        predictions.saveAsTextFile(result_file)
        count_rdd = predictions.count()
        print("after prediction: count_rdd=", count_rdd)
    else:
        print("Error: empty testdata")
    sc.stop()
Example 14: home
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
from django.http import HttpResponse
from pyspark import SparkConf, SparkContext

def home(request):
    prefs = ["IAmA", "funny", "nfl"]
    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)
    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)
        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)
        # candidate prefs for prediction
        my_prefs_ids = set(javahash(x) for x in prefs)
        all_subreddit_ids = parsed_counts.map(lambda t: (javahash(t[1]), t[1])).distinct().cache()
        candidates = all_subreddit_ids.map(lambda t: t[0]).filter(lambda r: r not in my_prefs_ids)
        predictions = model.predictAll(candidates.map(lambda x: (999, x))).cache()
        final = (predictions.map(lambda p: (p[1], p[2]))
                 .join(all_subreddit_ids)
                 .map(lambda t: t[1])
                 .sortByKey(False))
        output = list(final.take(30))
        sc.stop()
        # respond with the computed recommendations
        return HttpResponse(str(output))
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise
Example 15: calc_cf_mllib
# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLlib to determine the predicted ratings.
    Args:
        y_training_data: RDD of (userId, itemId, actualRating) used to train the RecSys algorithm
        num_partitions: number of partitions for the user-item cartesian product
    Returns:
        predicted: predicted ratings as an RDD of (userId, itemId, predictedRating)
    """
    # Predicted values can fall anywhere; because the content-based algorithms are normalized,
    # the output here should likely be normalized as well
    max_rating = y_training_data.map(lambda r: r[2]).max()
    min_rating = y_training_data.map(lambda r: r[2]).min()
    if max_rating == min_rating:
        min_rating = 0
    # MLlib has two training methods, train() and trainImplicit(); implicit data is assumed to lie in [0, 1]
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)
    # predict all (user, item) pairs
    item_ids = y_training_data.map(lambda r: r[1]).distinct()
    user_ids = y_training_data.map(lambda r: r[0]).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)
    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))
    norm_predictions = predicted.map(lambda p: (p[0], p[1], rechelp.squish_preds(p[2], min_rating, max_rating)))
    return norm_predictions
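rechelp.squish_preds is a project helper; its name and the surrounding normalization logic suggest it squashes raw ALS scores into the [min_rating, max_rating] range. A minimal clamp, stated as an assumption rather than the helper's actual behavior:

def squish_preds(pred, min_rating, max_rating):
    # Hypothetical: clamp a raw ALS score into the observed rating range
    return max(min_rating, min(max_rating, pred))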