本文整理汇总了Python中pyspark.mllib.recommendation.ALS.train方法的典型用法代码示例。如果您正苦于以下问题：Python ALS.train方法的具体用法？Python ALS.train怎么用？Python ALS.train使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.recommendation.ALS的用法示例。
在下文中一共展示了ALS.train方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: alq_spark
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def alq_spark(A, k, sc, **kwargs):
    """Factorize a sign matrix with Spark's ALS and return both factors.

    Args:
        A: sign matrix (csr_matrix).
        k: number of clusters; handed to ``ALS.train`` as ``rank``.
        sc: the Spark context used to parallelize the matrix entries.
        **kwargs: parameters for ``ALS.train`` except for ``ratings``, see
            https://spark.apache.org/docs/1.5.1/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS.train

    Returns:
        X: np.ndarray (n x k), stacked from the model's user features.
        Y: np.ndarray (k x n), the transposed stack of product features.
    """
    ratings_rdd = sc.parallelize(indexed_entries(A))
    model = ALS.train(ratings_rdd, rank=k, **kwargs)

    # Collect the (id, feature_vector) pairs in ascending id order so the
    # rows of each factor follow that same order.
    user_rows = model.userFeatures().sortByKey(ascending=True).collect()
    product_rows = model.productFeatures().sortByKey(ascending=True).collect()

    # zip(*rows) separates ids (position 0) from vectors (position 1);
    # only the vectors are kept.
    user_vectors = tuple(zip(*user_rows))[1]
    product_vectors = tuple(zip(*product_rows))[1]

    X = np.array(user_vectors)
    Y = np.array(product_vectors).T
    return X, Y
示例2: _recommend
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def _recommend(self, train_ratings, users):
    """Train a Spark ALS model and predict ratings for the given users.

    Args:
        train_ratings: DataFrame with ``user``, ``item`` and ``rating``
            columns. NOTE: its ``user`` and ``item`` columns are converted
            to categorical dtype in place, and the frame is stored on
            ``self.train_ratings``.
        users: iterable of users to predict for. They are paired directly
            with integer item codes and later used to index
            ``user_cat.categories``, so they are presumably integer user
            codes already -- TODO confirm against callers.

    Returns:
        DataFrame with columns ``user``, ``item`` and ``rating`` holding the
        predictions mapped back to the original user/item labels. The same
        frame is stored on ``self.predictions_df``.
    """
    from pyspark.mllib.recommendation import ALS, Rating

    # Preparing the user/item mapping as integers, since Spark's ALS
    # implementation only works with integer values.
    train_ratings['user'] = train_ratings['user'].astype('category')
    train_ratings['item'] = train_ratings['item'].astype('category')
    user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
    self.user_cat = user_cat
    self.item_cat = item_cat
    self.train_ratings = train_ratings

    # Training the model
    self.ratings = self.sc.parallelize(
        Rating(u, i, rating)
        for u, i, rating in zip(user_cat.codes, item_cat.codes, train_ratings.rating)
    )
    if self.implicit:
        model = ALS.trainImplicit(self.ratings, **self.spark_args)
    else:
        model = ALS.train(self.ratings, **self.spark_args)

    # Getting predictions from the model. The distinct item codes are
    # computed once here rather than once per user inside the generator.
    item_codes = item_cat.codes.unique()
    self.ratings_to_predict = self.sc.parallelize(
        (user, item) for user in users for item in item_codes
    )
    self.predictions = model.predictAll(self.ratings_to_predict).collect()

    # Presenting the recommendations as a DataFrame, translating the integer
    # codes back to the original labels.
    self.predictions = [
        (user_cat.categories[p.user], item_cat.categories[p.product], p.rating)
        for p in self.predictions
    ]
    self.predictions_df = pd.DataFrame(self.predictions, columns=['user', 'item', 'rating'])
    return self.predictions_df
示例3: main
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def main():
    """Sweep ALS rank and regularization and plot the test RMSE of each run.

    Reads the training and testing files named by the module-level
    ``training_inputs`` / ``testing_inputs`` through the module-level ``sc``,
    trains one model per (regularization, rank) pair, and draws one RMSE
    curve per regularization value with matplotlib.
    """
    training_ratings = sc.textFile(training_inputs).map(get_tuple).cache()
    testing_ratings = sc.textFile(testing_inputs).map(get_tuple).cache()

    # Index the tuple instead of using `lambda (uid, mid, rating): ...`:
    # tuple parameters are a syntax error on Python 3 (PEP 3113).
    testing_all = testing_ratings.map(lambda t: (t[0], t[1])).cache()

    # Ground truth keyed by (user, product); identical for every model, so it
    # is defined once outside the loops.
    ratings = testing_ratings.map(to_Rating)
    actual = ratings.map(lambda r: ((r[0], r[1]), r[2]))

    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    reg_params = [0.1, 0.01]
    x_positions = range(len(ranks))

    for reg_param in reg_params:
        rmses = []
        for rank in ranks:
            model = ALS.train(training_ratings, rank=rank, lambda_=reg_param, seed=10)
            predictions = model.predictAll(testing_all).map(lambda r: ((r[0], r[1]), r[2]))
            rates_and_preds = actual.join(predictions)
            mse = rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
            rmses.append(math.sqrt(mse))
        # One curve per regularization value; ranks are evenly spaced on x.
        plt.plot(x_positions, rmses, label=str(reg_param))

    plt.xticks(x_positions, ranks, size='small')
    plt.legend()
    plt.show()
示例4: build_ALS_model
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def build_ALS_model(ratings):
    """Return an ALS recommendation model trained on ``ratings``.

    Uses a fixed rank of 10 and 20 iterations of Alternating Least Squares.
    """
    return ALS.train(ratings, rank=10, iterations=20)
示例5: train
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def train(self):
    """Train the model with new data and write to file.

    Builds an RDD of (user index, course index, interest) triples from
    ``m.UserCourse.objects``, reports the errors computed by
    ``self._report_error``, trains ALS with the ``_PARAMS`` settings, saves
    the model under ``c.RECOMMENDATION_DIR`` (replacing any existing saved
    model directory) and finally reports metrics.
    """
    user_lookup, course_lookup = self.__prepare_data()

    def has_interest(user_course):
        # Only reviews that actually carry an interest score are usable.
        return user_course.course_review.interest is not None

    def to_triple(user_course):
        # (user_id, course_id, rating) triple expected by the ML algorithm,
        # with both ids translated through the lookup tables.
        return (user_lookup[str(user_course.user_id)],
                course_lookup[user_course.course_id],
                float(user_course.course_review.interest))

    log.info('Loading ratings data')
    raw_rdd = self.sc.parallelize(m.UserCourse.objects)
    self.ratings_RDD = raw_rdd.filter(has_interest).map(to_triple).cache()
    training_error, test_error = self._report_error(self.ratings_RDD)

    log.info('Training model')
    model = ALS.train(self.ratings_RDD,
                      _PARAMS['rank'],
                      _PARAMS['num_iterations'],
                      _PARAMS['reg_param'])
    log.info('Model trained!')

    base_dir = os.path.dirname(__file__)
    model_path = os.path.join(base_dir, '%s/trained_model' % c.RECOMMENDATION_DIR)
    # Remove a previously saved model directory before saving the new one.
    if os.path.isdir(model_path):
        rmtree(model_path)
    model.save(self.sc, model_path)

    self._report_metrics(num_courses=self.ratings_RDD.count(),
                         training_error=training_error,
                         test_error=test_error)
示例6: model_param_sweep
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def model_param_sweep(train, test):
    """Grid-search ALS rank and regularization, printing the quality of each.

    For every (rank, regParam) pair a non-negative ALS model is trained on
    ``train`` and scored on ``test`` by RMSE and by a "correct
    classification" rate. A prediction counts as correct when it is within 1
    of the true rating, or when both the true and predicted ratings are
    below 6.

    Args:
        train: object exposing ``.rdd`` whose rows carry user, product and
            rating at positions 0, 1 and 2.
        test: object with the same layout, used for evaluation.

    Returns:
        None. Results are printed, ending with the best (rank, regParam)
        pair by classification rate.
    """
    # model params
    iterations = 20
    regularization_param_list = np.linspace(0.05, 0.2, 5)
    rank_list = [4, 6, 8]

    # These RDD definitions do not depend on the hyper-parameters, so they
    # are built once instead of on every loop iteration.
    train_ratings = train.rdd.map(lambda x: (x[0], x[1], x[2]))
    test_pairs = test.rdd.map(lambda r: (r[0], r[1]))
    actual = test.rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

    max_class_rate = 0
    best_params = -1  # becomes a (rank, regParam) tuple once a model scores > 0

    for rank in rank_list:
        for reg in regularization_param_list:
            model = ALS.train(train_ratings, rank=rank, nonnegative=True,
                              iterations=iterations, lambda_=reg)
            predictions = model.predictAll(test_pairs).map(
                lambda x: ((int(x[0]), int(x[1])), float(x[2])))
            rates_and_preds = actual.join(predictions)

            correct_count = rates_and_preds.filter(
                lambda r: (abs(r[1][0] - r[1][1]) < 1) or (r[1][0] < 6 and r[1][1] < 6)
            ).count()
            total_count = rates_and_preds.count()
            class_rate = correct_count * 1. / total_count
            error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())

            # print(...) with a single pre-formatted string behaves the same
            # on Python 2 and Python 3.
            print('For rank=%s, regParam=%s the RMSE is %s with a correct classification rate of %0.3f' % (rank, reg, error, class_rate))
            if class_rate > max_class_rate:
                max_class_rate = class_rate
                best_params = (rank, reg)

    print('The best model was trained with (rank, regParam): %s and had class rate %0.3f' % (str(best_params), max_class_rate))
示例7: fit_final_model
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def fit_final_model(train):
    """Train the final non-negative ALS model with fixed hyper-parameters.

    Args:
        train: object exposing ``.rdd`` whose rows carry user, product and
            rating at positions 0, 1 and 2.

    Returns:
        The model returned by ``ALS.train`` (rank 6, 20 iterations,
        regularization 0.0875).
    """
    ratings = train.rdd.map(lambda row: (row[0], row[1], row[2]))
    return ALS.train(
        ratings,
        rank=6,
        iterations=20,
        lambda_=0.0875,
        nonnegative=True,
    )
示例8: train
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def train(self, rank, iterations=10, lambda_=0.01, seed=0, **kwargs):
    """
    Train the model.

    Parameters
    ----------
    rank : int
        The number of factors in the underlying model. Generally, larger
        numbers of factors lead to better models, but increase the memory
        required. A rank in the range of 10 to 200 is usually reasonable.
    iterations : int, optional
        The number of iterations to perform. With each iteration, the model
        improves. ALS typically converges quickly, so a value of 10 is
        recommended.
    lambda_ : float, optional
        Controls regularization, which controls overfitting. Higher values
        apply more regularization. The appropriate value depends on the
        problem and needs to be tuned by train/test techniques, which
        measure overfitting.
    seed : int, optional
        Passed to ``ALS.train`` as ``seed``.
    **kwargs
        Any further keyword arguments are passed to ``ALS.train`` unchanged.

    Returns
    -------
    out : MatrixFactorizationModel
        Built from the trained ALS model together with ``self.ratings`` and
        the user, item and rating column names. This can be used to make
        predictions on how a user would rate an item.
    """
    ratings_rdd = self._prepare_ratings().to_rdd()
    als_model = ALS.train(ratings_rdd, rank, iterations=iterations,
                          lambda_=lambda_, seed=seed, **kwargs)
    return MatrixFactorizationModel(
        als_model,
        self.ratings,
        self.user_col,
        self.item_col,
        self.rating_col,
    )
示例9: __train_model
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def __train_model(self, ratings_RDD):
    """Train and return an ALS model for ``ratings_RDD``.

    Rank, seed, iteration count and regularization parameter are taken from
    this instance.
    """
    return ALS.train(
        ratings_RDD,
        self.rank,
        iterations=self.iterations,
        lambda_=self.regularization_parameter,
        seed=self.seed,
    )
示例10: __train_model
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def __train_model(self):
    """Train the ALS model on ``self.ratings_RDD`` and keep it on ``self.model``.

    Rank, seed, iteration count and regularization parameter are taken from
    this instance; progress is logged before and after training.
    """
    als_options = {
        'seed': self.seed,
        'iterations': self.iterations,
        'lambda_': self.regularization_parameter,
    }
    logger.info("Training the ALS model...")
    self.model = ALS.train(self.ratings_RDD, self.rank, **als_options)
    logger.info("ALS model built!")
示例11: als
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def als(data):
    """Pick the best ALS regularization on a random split and save predictions.

    Splits ``data`` 80/20 into train/test, trains one non-negative ALS model
    (rank 7, 10 iterations) per lambda in 0.1..1.0, and keeps the model with
    the lowest test MSE. The MSE-vs-lambda plot is written to
    ``cross_validation_p.png`` and the best model's test predictions are
    saved as text under ``prediction``.

    Args:
        data: RDD whose records carry user, product and rating at positions
            0, 1 and 2.

    Returns:
        Tuple ``(bestModel, error)``: the model with the lowest test MSE and
        that MSE.
    """
    train, test = data.randomSplit(weights=[0.8, 0.2])
    X_train = train.map(lambda r: Rating(r[0], r[1], r[2]))
    y = test.map(lambda r: ((r[0], r[1]), r[2]))
    X_test = test.map(lambda r: (r[0], r[1]))
    rank = 7
    X_train.cache()
    X_test.cache()
    lambdas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    numIterations = 10
    nonnegative = True
    bestModel = None
    error = float('Inf')
    errors = []

    # Use ALS to predict play time for test users and choose the best
    # parameter for lambda.
    for lmbda in lambdas:
        model = ALS.train(X_train, rank, numIterations, lmbda, nonnegative=nonnegative)
        y_hat = model.predictAll(X_test).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = y.join(y_hat)
        MSE = ratesAndPreds.map(lambda r: ((r[1][0]) - (r[1][1])) ** 2).mean()
        errors.append(MSE)
        if MSE < error:
            bestModel = model
            error = MSE

    # Plot mean square error v.s. lambda
    plt.plot(lambdas, errors, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.ylabel(r'$MSE$')
    plt.title(r'MSE v.s. $\lambda$')
    plt.savefig('cross_validation_p.png')

    # Make the prediction with the best model. The original used `model`,
    # i.e. whichever lambda happened to be tried last, not the selected one.
    # If no MSE ever compared below infinity (e.g. all NaN), bestModel is
    # still None, so fall back to the last model as the original did.
    final_model = bestModel if bestModel is not None else model
    y_hat = final_model.predictAll(X_test).map(lambda r: (r[0], r[1], r[2]))
    y_hat.map(toCVSLine).saveAsTextFile('prediction')
    return bestModel, error
示例12: evaluate
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def evaluate(sc, raw_user_movies, raw_hot_movies):
    """Print the training-set RMSE of explicit and implicit ALS models.

    Trains explicit ALS for every (rank, lambda) pair and implicit ALS for
    every (rank, lambda, alpha) triple, predicts the ratings the model was
    trained on, and prints the resulting root mean squared error.

    Args:
        sc: Spark context (not used by this function).
        raw_user_movies: RDD of comma-separated lines whose first field is
            the user id.
        raw_hot_movies: raw movie data passed to ``build_movies``.

    Returns:
        None. Results are printed.
    """
    # The result is not used below; the call is kept in case build_movies
    # has side effects -- verify and drop if it does not.
    build_movies(raw_hot_movies)

    user_id_to_int = (raw_user_movies
                      .map(lambda line: line.split(',')[0])
                      .distinct()
                      .zipWithUniqueId()
                      .collectAsMap())
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10

    # Both views of the ratings are the same for every model, so they are
    # defined once instead of inside each loop iteration.
    user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
    keyed_ratings = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2]))

    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model = ALS.train(ratings, rank, num_iterations, lam)
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print(predictions.take(3))
            rates_and_preds = keyed_ratings.join(predictions)
            print(rates_and_preds.take(3))
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
            # The value is the square root of the MSE, so it is labelled RMSE
            # (the original message said "Mean Squared Error").
            print("(rank:%d, lambda: %f,) Root Mean Squared Error = %f" % (rank, lam, rmse))

    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = keyed_ratings.join(predictions)
                print(rates_and_preds.take(3))
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
                print("(rank:%d, lambda: %f, alpha: %f, implicit ) Root Mean Squared Error = %f" % (rank, lam, alpha, rmse))
示例13: main
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def main():
    """Train an ALS recommender and print its MSE on the training data.

    Creates a local SparkContext, parses ``./data/ratings.dat`` with
    ``parse_rating``, trains ALS (rank 10, 20 iterations), predicts every
    rating it was trained on and prints the mean squared error. The context
    is stopped before returning, including when an error is raised.
    """
    # Set up environment
    sc = SparkContext("local[*]", "RecSys")
    try:
        # Load and parse the data
        ratings = sc.textFile("./data/ratings.dat").map(parse_rating)

        # Build the recommendation model using Alternating Least Squares
        rank = 10
        iterations = 20
        model = ALS.train(ratings, rank, iterations)

        # Evaluate the model on training data
        testdata = ratings.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
        print("Mean Squared Error = " + str(MSE))
    finally:
        # Release the context this function created so it does not leak.
        sc.stop()
示例14: find_best_model
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def find_best_model(data):
    """Grid-search ALS hyper-parameters and record the best in module globals.

    Splits ``data`` 60/20/20 (seed 6) into training, validation and test
    parts, trains one model per (rank, lambda, iterations) combination on the
    training part and scores it by RMSE on the validation part. The test part
    is not used here.

    Args:
        data: RDD whose records carry user, product and rating at positions
            0, 1 and 2.

    Returns:
        None. The winning settings are written to the globals ``bestRank``,
        ``bestLambda`` and ``bestNumIter``, and progress is printed.
    """
    global bestRank
    global bestLambda
    global bestNumIter
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    ranks = [8, 12]
    lambdas = [0.1, 10.0]
    numIters = [10, 20]
    min_error = float('inf')
    training, validation, _ = data.randomSplit([0.6, 0.2, 0.2], 6)

    # Everything below is independent of the hyper-parameters, so it is set
    # up once rather than on every grid iteration.
    # NOTE(review): confirm that assigning this class attribute has any
    # effect on pyspark.mllib's ALS.train; kept from the original.
    ALS.checkpointInterval = 2
    # NOTE(review): every field, including the rating, is cast to int for
    # training, while validation keeps ratings as float -- confirm this is
    # intended.
    training_data = training.map(lambda xs: [int(x) for x in xs])
    validation_data = validation.map(lambda p: (int(p[0]), int(p[1])))
    validation_truth = validation.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(training_data, rank, numIter, lmbda)
        predictions = model.predictAll(validation_data).map(lambda r: ((r[0], r[1]), r[2]))
        ratings_and_predictions = validation_truth.join(predictions)
        error = sqrt(ratings_and_predictions.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
        # print(...) with one pre-formatted string works on Python 2 and 3.
        print('For rank %s the RMSE is %s' % (rank, error))
        if error < min_error:
            min_error = error
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

    print('The best model was trained with rank %s' % bestRank)
示例15: grid_search
# 需要导入模块: from pyspark.mllib.recommendation import ALS [as 别名]
# 或者: from pyspark.mllib.recommendation.ALS import train [as 别名]
def grid_search(train_df, test_df, X_test_df, y_test):
    """Grid-search ALS hyper-parameters and return the best model by RMSE.

    Trains one model per (rank, lambda, numIter) combination on ``train_df``
    and scores each with ``computeRMSE`` on ``test_df`` / ``X_test_df``.

    Args:
        train_df: ratings passed to ``ALS.train``.
        test_df: passed to ``computeRMSE`` as the evaluation data.
        X_test_df: passed to ``computeRMSE`` alongside ``test_df``.
        y_test: sized collection; only its length is used, as the last
            argument of ``computeRMSE``.

    Returns:
        The model with the lowest RMSE, or None if no model produced an RMSE
        below infinity.
    """
    ranks = [6]  # , 8, 12, 18]
    lambdas = list(np.arange(0.1, 0.5, 0.1))
    numIters = [20]
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    n_test = len(y_test)

    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(train_df, rank, numIter, lmbda)
        validationRmse = computeRMSE(model, test_df, X_test_df, n_test)
        # One format string replaces the original `"..." % a + "..." % b`
        # concatenation; the printed text is identical.
        print(
            "RMSE (validation) = %f for the model trained with "
            "rank = %d, lambda = %.1f, and numIter = %d."
            % (validationRmse, rank, lmbda, numIter)
        )
        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

    # Evaluate the best model on the test set.
    # NOTE(review): this reuses the same data the models were selected on.
    testRmse = computeRMSE(bestModel, test_df, X_test_df, n_test)
    print(
        "The best model was trained with rank = %d and lambda = %.1f, "
        "and numIter = %d, and its RMSE on the test set is %f."
        % (bestRank, bestLambda, bestNumIter, testRmse)
    )
    return bestModel