

Python ALS.trainImplicit Method Code Examples

This article collects typical usage examples of the pyspark.mllib.recommendation.ALS.trainImplicit method in Python. If you have been wondering what exactly ALS.trainImplicit does, how to call it, or what real code using it looks like, the curated examples below should help. You can also explore further usage examples of the containing class, pyspark.mllib.recommendation.ALS.


The following presents 15 code examples of the ALS.trainImplicit method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
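
Before the collected examples, here is a minimal, self-contained sketch of the basic call pattern. trainImplicit fits a matrix-factorization model to implicit-feedback data, where the "rating" is a confidence weight (e.g., a click or play count) rather than an explicit score. The SparkContext setup and the toy interaction data below are illustrative assumptions, not taken from any of the examples on this page.

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating

sc = SparkContext(appName="als-implicit-sketch")  # assumed local setup

# Implicit "ratings" are interaction strengths, not explicit scores.
interactions = sc.parallelize([
    Rating(0, 0, 4.0), Rating(0, 1, 1.0),
    Rating(1, 1, 3.0), Rating(1, 2, 2.0),
])

# rank = number of latent factors; alpha scales the confidence placed
# in observed interactions; lambda_ is the regularization parameter.
model = ALS.trainImplicit(interactions, rank=5, iterations=10,
                          lambda_=0.01, alpha=0.01, seed=42)

# Predict a preference score for every (user, item) pair seen in training.
pairs = interactions.map(lambda r: (r.user, r.product))
print(model.predictAll(pairs).collect())

sc.stop()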

Example 1: _recommend

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
    def _recommend(self, train_ratings, users):
        from pyspark.mllib.recommendation import ALS, Rating
        import pandas as pd  # pd.DataFrame is used below

        # Preparing the user/item mapping as integers, since Spark's ALS implementation only works with integer values
        train_ratings['user'] = train_ratings['user'].astype('category')
        train_ratings['item'] = train_ratings['item'].astype('category')
        user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
        self.user_cat = user_cat
        self.item_cat = item_cat
        self.train_ratings = train_ratings

        # Training the model
        self.ratings = self.sc.parallelize(Rating(u, i, rating) for u, i, rating in zip(user_cat.codes, item_cat.codes, train_ratings.rating))
        if self.implicit:
            model = ALS.trainImplicit(self.ratings, **self.spark_args)
        else:
            model = ALS.train(self.ratings, **self.spark_args)

        # Getting predictions from the model
        self.ratings_to_predict = self.sc.parallelize((user, item) for user in users for item in item_cat.codes.unique())
        self.predictions = model.predictAll(self.ratings_to_predict).collect()
        # Presenting the recommendations as a DataFrame
        self.predictions = [(user_cat.categories[p.user], item_cat.categories[p.product], p.rating) for p in self.predictions]
        self.predictions_df = pd.DataFrame(self.predictions, columns=['user', 'item', 'rating'])
        return self.predictions_df
Author: halflings | Project: receval | Lines: 27 | Source: recommender.py

Example 2: main

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in HDFS
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a,b,c): (a,hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser,["user","hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="userhash", keyspace =  keyspace).save(mode="append")
    

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a,b,c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,["product","hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="producthash", keyspace =  keyspace).save(mode="append")

    # Ratings for training
    # ALS requires a Java hash of the string IDs; hashFunction applies it and stores the
    # result as a Rating object for the algorithm to consume
    ratings = rawDF.map(lambda (a,b,c) : Rating(hashFunction(a),hashFunction(b),float(c)))

    
    model = ALS.trainImplicit(ratings,10,10,alpha=0.01,seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
Author: Swebask | Project: RedditR--Insight-Data-Engineering-Project | Lines: 36 | Source: engine.py

Example 3: evaluate

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def evaluate(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model =  ALS.train(ratings, rank, num_iterations, lam)
            user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print predictions.take(3)
            rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
            print rates_and_preds.take(3)
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            print "(rank: %d, lambda: %f) Root Mean Squared Error = %f" % (rank, lam, rmse)
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
                print rates_and_preds.take(3)
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
                print "(rank: %d, lambda: %f, alpha: %f, implicit) Root Mean Squared Error = %f" % (rank, lam, alpha, rmse)
Author: dengshilong | Project: douban_recommender | Lines: 27 | Source: recommender.py

Example 4: main

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def main(sc):

    seed = 5L
    iterations = 10
    regularization_parameter = 0.1
    rank = 4


    data = sc.textFile("file:///Expedia/data/train1.csv")
    
    ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    
    new_data = sc.textFile("file:///Expedia/data/new_set.csv")
    
    new_ratings = new_data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    new_ratings_for_predict_RDD = new_ratings.map(lambda x: (x[0], x[1])).cache()
    
    complete_data = ratings.union(new_ratings).cache()
    
    new_ratings_model = ALS.trainImplicit(complete_data, rank, seed=seed, 
                              iterations=iterations, lambda_=regularization_parameter)
                              
    
    # this does not work as-is and needs more investigation:
    #predictions = new_ratings_model.predictAll(0,'83').collect()
    predictions = new_ratings_model.predictAll(new_ratings_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2])).collect()
    # predictions are ((user, item), rating) pairs, so sort by the rating at index 1
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:5]

    print recommendations
Author: aaabed | Project: Capstone_Hotel_recommendation_system | Lines: 31 | Source: cluster-recommend.py

Example 5: train

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
    def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
        """
        train a mf model against the given parameters
        """
        if alpha:
            model = ALS.trainImplicit(self.train_data, rank, iterations, lambda_, blocks, alpha)
        else:
            model = ALS.train(self.train_data, rank, iterations, lambda_)

        return model
Author: farcryzry | Project: spala | Lines: 12 | Source: MatrixFactorization.py

Example 6: training_models

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
    def training_models(self, rank=5, seed=32, iterations=20, alpha=0.01, reg=0.01):
        '''ALS training parameters:
            rank - Number of latent factors.
            iterations - Number of iterations of ALS. (default: 5)
            lambda_ - Regularization parameter. (default: 0.01)
            alpha - constant used in computing confidence. (default: 0.01)
            seed - Random seed for initial matrix factorization model. (default: None)
        '''

        print (self.training.take(5), self.test.take(5))

        weights = [.8, .2]
        trainData_RDD, valData_RDD = self.training.randomSplit(weights, seed)  # split training to training and validation sets

        trainData_RDD.cache(), valData_RDD.cache()

        print (trainData_RDD.count(), valData_RDD.count())


        #X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).filter(lambda x: x[0] in set({92396, 198196, 111182, 2350, 46158})).cache()
        X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).cache()
   
        sum_ratings_val = valData_RDD.map(lambda x: x.rating).sum()

        product_nums_for_users = X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).map(lambda x: x[1]).collect()
        #print (X_val_RDD.collect())
        print ('num of users', X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).count())
        #print (product_num_for_users)
        rank_lists = Rank_list(product_nums_for_users)

        print (rank_lists)
        #print (rank_lists[4])

        #return

        model = ALS.trainImplicit(trainData_RDD, rank, iterations=iterations,\
                            lambda_=reg, blocks=-1, alpha=alpha, nonnegative=False, seed=seed)

        # predicted results on the validation set
        predictions_RDD = model.predictAll(X_val_RDD).map(lambda x: ((x[0], x[1]), x[2]))
        ratings_and_preds_RDD = valData_RDD.map(lambda x: ((x[0], x[1]), x[2])).join(predictions_RDD)

        print()
        print('model training converged')
        print()
        #return

        MPR = self.percentage_ranking(ratings_and_preds_RDD, rank_lists, sum_ratings_val)


        print ('Rank %s, reg %s, alpha %s, AvgRank = %s' % (rank, reg, alpha, MPR))
Author: HsiangHung | Project: DataScienceProjects | Lines: 53 | Source: alp.py
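
Example 6 scores the model with a mean-percentage-ranking (MPR) style metric via the helpers Rank_list and percentage_ranking, which are not shown. As a rough plain-Python sketch of what such a metric typically computes (an assumption, not the author's implementation): for each user, sort items by predicted score, convert each item's position to a percentile, and average the percentiles weighted by the observed ratings; lower is better, and about 50 corresponds to random ranking.

from collections import defaultdict

def mean_percentage_ranking(ratings_and_preds):
    # ratings_and_preds: iterable of ((user, item), (actual, predicted)) pairs,
    # e.g. the collected contents of ratings_and_preds_RDD above (hypothetical usage).
    by_user = defaultdict(list)
    for (user, _item), (actual, predicted) in ratings_and_preds:
        by_user[user].append((actual, predicted))
    weighted_rank_sum = 0.0
    rating_sum = 0.0
    for pairs in by_user.values():
        pairs.sort(key=lambda p: p[1], reverse=True)  # best-predicted first
        n = len(pairs)
        for pos, (actual, _pred) in enumerate(pairs):
            percentile = 100.0 * pos / (n - 1) if n > 1 else 0.0
            weighted_rank_sum += actual * percentile
            rating_sum += actual
    return weighted_rank_sum / rating_sum if rating_sum else 0.0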

Example 7: main

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def main(sc):
   

    #load files
    train_1 = sc.textFile("file:///Expedia/data/train_1.csv")
    training_RDD = train_1.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    
    # load the validation fold files
    train_2 = sc.textFile("file:///Expedia/data/train_1.csv")
    validation_RDD = train_2.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()


    validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1])).cache()
    train_RDD = training_RDD.map(lambda x: (x[0], x[1])).cache()


    # Train models on the training set and choose the one with
    # the best RMSE on the cross-validation set
    seed = 5L
    iterations = 10
    regularization_parameters = [0.1, 0.5, 1.0]
    ranks = [4, 8]
    errors = []

    min_error = float('inf')
    best_rank = -1
    best_lambda = -1.0

    for rank, regularization_parameter in itertools.product(ranks, regularization_parameters):
        # train an implicit model on the training set
        model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
        # predict on the validation set
        predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        
        # compute root-mean-square error on the validation predictions
        rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        errors.append(rmse)
    
        print 'For rank %s and lambda %s the RMSE is %s' % (rank, regularization_parameter, rmse)
        if rmse < min_error:
            min_error = rmse
            best_rank = rank
            best_lambda = regularization_parameter


    print "The best model was trained with rank = %d and lambda = %.1f, " % (
    best_rank , regularization_parameter) \
    + "and numIter = %d, and its RMSE on the validation set is %f." % (iterations,
    min_error)
Author: aaabed | Project: Capstone_Hotel_recommendation_system | Lines: 51 | Source: trainimplicit_model_script.py

Example 8: train_als

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
    def train_als(self):
        self.ratings = self.df.select("user_id", "repo_id")\
            .map(lambda x: Rating(x[0], x[1], 1.0))
        
        rank = 10
        numIterations = 20
        model = ALS.trainImplicit(self.ratings, rank, numIterations, alpha=0.01)

        testdata = self.ratings.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = self.ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        print("Mean Squared Error = " + str(MSE))

        model.save(self.sc, "ALS_model")
Author: viknat | Project: reinforce-collab | Lines: 17 | Source: build_recommender.py

Example 9: label

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
 def label(self, rank=50, numIterations=10, alpha=0.01):
     """
     INPUT:
     - rank: number of topics
     - numIterations: number of iterations for matrix factorization
     - alpha: constant used in computing confidence for implicit feedback
     OUTPUT:
     - data for training naive bayes with label, feature tuples
     """
     als_model = ALS.trainImplicit(self.tfidf_rating, rank, numIterations, alpha=alpha)
     index_label = als_model.userFeatures().map(lambda x: (x[0], np.argmax(x[1])))
     index_feature = self.tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))
     index_label_feature = index_label.join(index_feature)
     label_feature = index_label_feature.map(lambda x: x[1])
     self.train_data = label_feature.map(lambda x: LabeledPoint(x[0], x[1]))
Author: xiaoyubai | Project: wiki-search | Lines: 17 | Source: model.py

Example 10: prepare_model

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def prepare_model(sc, filename, user_id, ratings_train):
    if filename is None and os.path.exists(config.MSD_MODEL):
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        # train a new model
        print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
        rank, lambda_val = (
            evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE))
        rank, lambda_val = int(rank), float(lambda_val)
        model = ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                                  lambda_val, nonnegative=True)

    return model
Author: seanjh | Project: DSRecommendationSystems | Lines: 18 | Source: msd_recommend.py

Example 11: cross_validation

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def cross_validation(training, validation, test, candidates, id_title_map, ranks, lambdas, numIters, alphas):
# train models and evaluate them on the validation set

    result_dict = {}
    result_template = "rank:%d  iters:%d  lambda: %f"
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()
    if  not IMPLICIT:
        alphas = [1.0]
    for rank, lmbda, numIter, alpha in itertools.product(ranks, lambdas, numIters, alphas):
        if IMPLICIT:
            model = ALS.trainImplicit(training, rank, iterations=numIter, lambda_=lmbda, alpha=alpha, nonnegative=True)
        else:
            model = ALS.train(training, rank, iterations=numIter, lambda_=lmbda, nonnegative=True)
        validationRmse = 0.0 #computeRmse(model, validation, numValidation)
        print "RMSE (validation) = %f for the model trained with " % validationRmse + \
              "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha)

        qe_results = qualitative_evaluation(model, candidates, id_title_map)

        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse
    testRmse = 0.0 #computeRmse(bestModel, test, numTest)
    # evaluate the best model on the test set
    print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
      + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)
    result_dict['BEST Model on Test:' + result_template % (bestRank, bestNumIter, bestLambda)] = testRmse
    # compare the best model with a naive baseline that always returns the mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print "The best model improves the baseline by %.2f" % (improvement) + "%."
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
Author: cervisiarius | Project: wikimedia | Lines: 47 | Source: als.py

Example 12: main

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def main(cores, prefs):

	"""
	ALS Algorithm to Recommend Subreddits to User based on User-defined preferences
	
	args:
	cores (int) : number of cores for spark job
	prefs (list[str]) : list of strings containing subreddit names - capitalization is significant
	"""

	scfg=SparkConf()
	scfg.set("spark.cores.max",cores)
	sc=SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

	try:
		# prep data
		raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
		parsed_counts = raw_counts.map(lambda st: eval(st))
		all_ratings = parsed_counts.map( tup_to_rating )
		# assign user-identified preferred subreddits
		raw_prefs = [ (999, x, 100) for x in prefs ]
		my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

		# train model
		model_input = all_ratings.union(my_prefs)
		model = ALS.trainImplicit(model_input, 10, 10, alpha=.01, seed=5)

		# candidate prefs for prediction
		my_prefs_ids = set([javahash(x) for x in prefs])
		all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
		candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

		predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

		final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

		output = list( final.take(30) )
		sc.stop()
		return output
	except Exception, e:
		print("App failed. Stopping gracefully")
		sc.stop()
		raise Exception(e)
Author: calhank | Project: reddiculous | Lines: 45 | Source: subreddit_preferences.py

Example 13: execute_recommendation

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def execute_recommendation():

    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    #sc = SparkContext( 'local', 'pyspark')
    
    #Load train data and train
    train_file_name = get_training_file_name()
    train_data = sc.textFile(train_file_name)
    ratings = train_data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    
    rank = 10
    number_iteration = 10
    model = ALS.trainImplicit(ratings, rank, number_iteration)
    
    #load test data and do prediction 
    test_file_name = get_testing_file_name()
    test_data = sc.textFile(test_file_name)
    test_ranking=test_data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    
    
    testdata = test_ranking.map(lambda p: (p[0], p[1]))
    count_rdd=testdata.count()
    
    
    
    if count_rdd > 0:
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        
        #predictions_lines = predictions.map(toCSVLine)
        result_file = get_rdd_output()
        predictions.saveAsTextFile(result_file)
    
        count_rdd = predictions.count()
        print("after prediction: count_rdd=",count_rdd)
    else:
        print("Error: empty testdata")
        

    sc.stop()
Author: FREDOKOK | Project: Expedia | Lines: 43 | Source: USE_ALS_V0.3.PY

Example 14: home

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def home(request):

	prefs = ["IAmA","funny","nfl"]

	scfg=SparkConf()
	scfg.set("spark.cores.max",64)
	sc=SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

	try:
		# prep data
		raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
		parsed_counts = raw_counts.map(lambda st: eval(st))
		all_ratings = parsed_counts.map( tup_to_rating )
		# assign user-identified preferred subreddits
		raw_prefs = [ (999, x, 100) for x in prefs ]
		my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

		# train model
		model_input = all_ratings.union(my_prefs)
		model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

		# candidate prefs for prediction
		my_prefs_ids = set([javahash(x) for x in prefs])
		all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
		candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

		predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

		final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

		output = list( final.take(30) )
		sc.stop()
	except Exception, e:
		print("App failed. Stopping gracefully")
		sc.stop()
		raise Exception(e)
Author: calhank | Project: reddiculous | Lines: 38 | Source: views.py

Example 15: calc_cf_mllib

# Required import: from pyspark.mllib.recommendation import ALS [as alias]
# Or: from pyspark.mllib.recommendation.ALS import trainImplicit [as alias]
def calc_cf_mllib(y_training_data, num_partitions = 20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings

    Args:
        y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ].

    """

    # Predicted values can fall anywhere; since the content-based algorithms are normalized, we should normalize here as well
    max_rating = y_training_data.map(lambda (user, item, rating): rating).max()
    min_rating = y_training_data.map(lambda (user, item, rating): rating).min()

    if max_rating == min_rating:
        min_rating=0

    # MLlib has two methods, train() and trainImplicit(); implicit ratings fall between 0 and 1
    if min_rating==0 and max_rating==1:
        model = ALS.trainImplicit(y_training_data, rank = 10, iterations = 5)
    else:
        model = ALS.train(y_training_data, rank = 10, iterations = 5)

    #predict all user, item pairs
    item_ids = y_training_data.map(lambda (u,i,r): i).distinct()
    user_ids = y_training_data.map(lambda (u,i,r): u).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    norm_predictions = predicted.map(lambda (user,item,pred): (user,item, rechelp.squish_preds(pred,min_rating,max_rating)))


    return norm_predictions
Author: codeaudit | Project: hermes | Lines: 38 | Source: cf.py


Note: The pyspark.mllib.recommendation.ALS.trainImplicit examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code belongs to the original authors, and distribution and use should follow the corresponding project's license. Please do not reproduce without permission.