This page collects typical usage examples of the Python class pyspark.ml.regression.LinearRegression: what the class does, how to use it, and what working code looks like. The curated examples below should help.
Fifteen code examples of the LinearRegression class are shown below, ordered by popularity by default.
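As a quick orientation before the examples, a minimal end-to-end use of the class looks roughly like this (a sketch; the tiny dataset, the column names, and the existing SparkSession named spark are assumptions):
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
# assumes an existing SparkSession named `spark`
df = spark.createDataFrame(
    [(1.0, Vectors.dense([0.0, 1.1])),
     (0.0, Vectors.dense([2.0, 1.0]))],
    ["label", "features"])
lr = LinearRegression(maxIter=10, regParam=0.01)  # featuresCol/labelCol default to "features"/"label"
model = lr.fit(df)
print(model.coefficients, model.intercept)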
Example 1: train
def train(self, rdd):
"""
This ignores the optimizer parameter since it makes config difficult for Linear Regression.
:return: Trained model to be passed to test.
"""
options = self.options
if options.loss == "l2":
if options.reg_type in ["none", "l1", "l2"]:
return LinearRegressionWithSGD.train(data=rdd,
iterations=options.num_iterations,
step=options.step_size,
miniBatchFraction=1.0,
regParam=options.reg_param,
regType=options.reg_type)
elif options.reg_type == "elastic-net": # use spark.ml
lr = MLLinearRegression(maxIter=options.num_iterations, regParam=options.reg_param,
elasticNetParam=options.elastic_net_param)
# TODO: Do not include time for conversion to DataFrame (but this currently matches
# the Scala tests)
df = rdd.toDF()
lrModel = lr.fit(df)
return LinearRegressionModel(lrModel.weights, lrModel.intercept)
else:
raise Exception("GLMRegressionTest cannot run with loss = %s, reg_type = %s" \
% (options.loss, options.reg_type))
else:
raise Exception("GLMRegressionTest does not recognize loss: %s" % options.loss)
Example 2: test_java_object_gets_detached
def test_java_object_gets_detached(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
fitIntercept=False)
model = lr.fit(df)
summary = model.summary
self.assertIsInstance(model, JavaWrapper)
self.assertIsInstance(summary, JavaWrapper)
self.assertIsInstance(model, JavaParams)
self.assertNotIsInstance(summary, JavaParams)
error_no_object = 'Target Object ID does not exist for this gateway'
self.assertIn("LinearRegression_", model._java_obj.toString())
self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())
model.__del__()
with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
model._java_obj.toString()
self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())
try:
summary.__del__()
except:
pass
with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
model._java_obj.toString()
with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
summary._java_obj.toString()
Example 3: test_linear_regression_pmml_basic
def test_linear_regression_pmml_basic(self):
# Most of the validation is done in the Scala side, here we just check
# that we output text rather than parquet (e.g. that the format flag
# was respected).
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
lr = LinearRegression(maxIter=1)
model = lr.fit(df)
path = tempfile.mkdtemp()
lr_path = path + "/lr-pmml"
model.write().format("pmml").save(lr_path)
pmml_text_list = self.sc.textFile(lr_path).collect()
pmml_text = "\n".join(pmml_text_list)
self.assertIn("Apache Spark", pmml_text)
self.assertIn("PMML", pmml_text)
Example 4: test_linear_regression
def test_linear_regression(self):
lr = LinearRegression(maxIter=1)
path = tempfile.mkdtemp()
lr_path = path + "/lr"
lr.save(lr_path)
lr2 = LinearRegression.load(lr_path)
self.assertEqual(lr2.uid, lr2.maxIter.parent,
"Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
% (lr2.uid, lr2.maxIter.parent))
self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
"Loaded LinearRegression instance default params did not match " +
"original defaults")
try:
rmtree(path)
except OSError:
pass
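The same read/write machinery also applies to a fitted model, not only the estimator; a minimal sketch, assuming a fitted model and a writable path:
from pyspark.ml.regression import LinearRegressionModel
model_path = path + "/lr-model"
model.save(model_path)
model2 = LinearRegressionModel.load(model_path)
print(model2.coefficients, model2.intercept)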
Example 5: test_linear_regression_with_huber_loss
def test_linear_regression_with_huber_loss(self):
data_path = "data/mllib/sample_linear_regression_data.txt"
df = self.spark.read.format("libsvm").load(data_path)
lir = LinearRegression(loss="huber", epsilon=2.0)
model = lir.fit(df)
expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
1.2612, -0.333, -0.5694, -0.6311, 0.6053]
expectedIntercept = 0.1607
expectedScale = 9.758
self.assertTrue(
np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
Example 6: test_linear_regression_summary
def test_linear_regression_summary(self):
df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
(0.0, 2.0, Vectors.sparse(1, [], []))],
["label", "weight", "features"])
lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
fitIntercept=False)
model = lr.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
# test that api is callable and returns expected types
self.assertGreater(s.totalIterations, 0)
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.predictionCol, "prediction")
self.assertEqual(s.labelCol, "label")
self.assertEqual(s.featuresCol, "features")
objHist = s.objectiveHistory
self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
self.assertAlmostEqual(s.meanSquaredError, 0.0)
self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
self.assertAlmostEqual(s.r2, 1.0, 2)
self.assertAlmostEqual(s.r2adj, 1.0, 2)
self.assertTrue(isinstance(s.residuals, DataFrame))
self.assertEqual(s.numInstances, 2)
self.assertEqual(s.degreesOfFreedom, 1)
devResiduals = s.devianceResiduals
self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
coefStdErr = s.coefficientStandardErrors
self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
tValues = s.tValues
self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
pValues = s.pValues
self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
# test evaluation (with training dataset) produces a summary with same values
# one check is enough to verify a summary is returned
# The child class LinearRegressionTrainingSummary runs full test
sameSummary = model.evaluate(df)
self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
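Because model.evaluate only needs a DataFrame with the configured label, features (and here weight) columns, the same call works on held-out data as well; a minimal sketch with a hypothetical testDF split:
testSummary = model.evaluate(testDF)   # returns a LinearRegressionSummary
print(testSummary.rootMeanSquaredError, testSummary.r2)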
Example 7: daysSinceEpoch
ratingsPerDayDict = ratingsRDD.map(lambda x: x.split("\t")) \
.map(lambda x: daysSinceEpoch(int(x[3]))) \
.countByValue()
# prepare a DataFrame in the (label, features) form required by MLlib
data = spark.sparkContext.parallelize(ratingsPerDayDict.items()) \
.map(lambda x: (float(x[1]), Vectors.dense(float(x[0]))))
df = data.toDF(["label", "features"])
# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]
# Now create the linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
# Train the model using our training data
model = lir.fit(trainingDF)
# Generate predictions for test data using our linear regression model
fullPredictions = model.transform(testDF).cache()
# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])
# Zip them together
predictionAndLabel = predictions.zip(labels).collect()
# Print out the predicted and actual values for each point
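The excerpt ends at the comment; a minimal way to finish it, given that predictionAndLabel was collected above as (prediction, label) pairs:
for prediction, label in predictionAndLabel:
    print(prediction, label)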
Example 8: LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint
data = [LabeledPoint(0.0, Vectors.dense([0.0])),
        LabeledPoint(0.99, Vectors.dense([1.0])),
        LabeledPoint(2.0, Vectors.dense([2.0])),
        LabeledPoint(3.01, Vectors.dense([3.0]))]
training = sqlContext.createDataFrame(data)
lr = LinearRegression(maxIter=100, regParam=0.05, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))
Example 9: _train_model_spark
def _train_model_spark(self, data):
df = self._prepare_data_spark(data)
input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
self.TODAY_PRICE}))
if self.ann_hidden_nodes_num is None:
self.ann_hidden_nodes_num = input_num / 2 + 1
ann_layers = [input_num,
# input_num / 3 * 2,
# input_num / 3,
self.ann_hidden_nodes_num,
2]
self.logger.info('layer settings are {}'.format(ann_layers))
self.logger.info('training method is {}'.format(self._train_method))
self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
if isinstance(self._train_method, dict):
if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
self._model[self.CHANGE_AMOUNT].stop_server()
self._model = {self.CHANGE_AMOUNT: None,
self.CHANGE_DIRECTION: None}
if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
maxIter=self.linear_regression_training_times,
regParam=self.linear_regression_regularization_parameter,
predictionCol='AmountPrediction')
self._model[self.CHANGE_AMOUNT] = lr.fit(df)
elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
numTrees=self.random_forest_tree_number,
maxDepth=self.random_forest_tree_max_depth,
predictionCol='AmountPrediction')
self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
ann_layers[-1] = 1
self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
num_workers=self.spark_worker_numbers,
epoch=self.ann_epoch_number,
featuresCol="features",
labelCol=self.CHANGE_AMOUNT,
predictionCol='AmountPrediction'
)
self._model[self.CHANGE_AMOUNT].fit(df)
else:
self.logger.warn('Unsupported training method {}'.format(self._train_method))
raise ValueError('Unsupported training method {}'.format(self._train_method))
if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
maxIter=self.logistic_regression_training_times,
regParam=self.linear_regression_regularization_parameter,
predictionCol='DirPrediction')
self._model[self.CHANGE_DIRECTION] = lr.fit(df)
elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
numTrees=self.random_forest_tree_number,
maxDepth=self.random_forest_tree_max_depth,
predictionCol='DirPrediction')
self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
ann_layers[-1] = 2
mlpc = MultilayerPerceptronClassifier(featuresCol="features",
labelCol=self.CHANGE_DIRECTION,
layers=ann_layers,
predictionCol='DirPrediction')
self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
else:
self.logger.warn('Unsupported training method {}'.format(self._train_method))
raise ValueError('Unsupported training method {}'.format(self._train_method))
else:
if self._train_method == self.LINEAR_REGRESSION:
lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
regParam=self.linear_regression_regularization_parameter,
maxIter=self.linear_regression_training_times)
self._model = lr.fit(df)
elif self._train_method == self.RANDOM_FOREST:
rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
predictionCol='prediction',
numTrees=self.random_forest_tree_number,
maxDepth=self.random_forest_tree_max_depth)
self._model = rfr.fit(df)
elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
ann_layers[-1] = 1
if self._model is not None:
self._model.stop_server()
self.logger.warn('layers are {}'.format(ann_layers))
self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
num_workers=self.spark_worker_numbers, epoch=100,
featuresCol="features", labelCol=self.TARGET_PRICE,
predictionCol='prediction'
)
self._model.fit(df)
else:
self.logger.warn('Unsupported training method {}'.format(self._train_method))
#......... the rest of this code is omitted .........
Example 10: print
#VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = sqlContext.createDataFrame(energi_habis_train_labeled, ["label", "features"])  # assumes an SQLContext instance; column names passed as a separate schema argument
print(energi_habis_train_labeled_DF)
#VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(energi_habis_test_labeled, ["label", "features"])  # assumes an SQLContext instance; column names passed as a separate schema argument
print(energi_habis_test_labeled_DF)
#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)
#see what the model do
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))
#Predict On the tested data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()
#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
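evaluator.evaluate simply returns the metric as a float, so on its own the last line discards the result; a minimal sketch of reporting it (and switching metrics via a param override) under the same setup:
r2 = evaluator.evaluate(predictions)
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print("r2 = %g, rmse = %g" % (r2, rmse))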
Example 11: print
ZN_,
price_
FROM temp_sql_table """)
print (spark_sql_output.take(10))
trainingData=spark_sql_output.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()
featureIndexer =\
VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)
(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])
#################### SPARK ML ####################
# Define LinearRegression algorithm
lr = LinearRegression()
# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam:0.0})
modelB = lr.fit(trainingData, {lr.regParam:100.0})
# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)
predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
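The excerpt stops right after printing the MODEL B header; presumably the next step mirrors the MODEL A block:
predictionsB.select("prediction", "label", "features").show(30)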
Example 12: print
#VECTORIZE TRAIN DATA
energi_nuclear_train = ssc.textFileStream("train_nuclear.txt")
energi_nuclear_train_labeled = energi_nuclear_train.map(parse_train)
energi_nuclear_train_labeled_DF = sqlContext.createDataFrame(energi_nuclear_train_labeled, ["label", "features"])  # assumes an SQLContext instance; column names passed as a separate schema argument
print(energi_nuclear_train_labeled_DF)
#VECTORIZE TEST DATA
energi_nuclear_test = ssc.textFileStream("test_nuclear.txt")
energi_nuclear_test_labeled = energi_nuclear_test.map(parse_test)
energi_nuclear_test_labeled_DF = sqlContext.createDataFrame(energi_nuclear_test_labeled, ["label", "features"])  # assumes an SQLContext instance; column names passed as a separate schema argument
print(energi_nuclear_test_labeled_DF)
#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_nuclear_train_labeled_DF)
#see what the model do
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))
#Predict On the tested data
predictions = lrModel.transform(energi_nuclear_test_labeled_DF)
predictions.select("prediction","label", "features").show()
#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Example 13: return
pred = d_copy['success_metric']
d.pop('success_metric', None)
values = [float(x) for x in d.values()] ##this block is unusable until we have our Hive Data
return (pred, Vectors.dense(values))
# training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# test set
testParsed = sc.parallelize(map(parsePoint, test_dict))
## create validation set
trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])
lm_model = LinearRegression(featuresCol="features", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)
lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))
lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))
res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
metrics = RegressionMetrics(predsAndLabels)
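The lambda (p,l): ... tuple unpacking above is Python 2 only, and DataFrame.map was removed from the Python API in Spark 2.x; a rough Python 3 / Spark 2 equivalent of the MSE computation (a sketch, keeping the same prediction and label columns):
MSE = (lm_transform.select("prediction", "label").rdd
       .map(lambda r: (r[0] - r[1]) ** 2)
       .mean())
print("Linear Regression training Mean Squared Error = " + str(MSE))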
Example 14: LinearRegression
# Load the JSON strings as a Spark Dataframe.
natality_data = spark.read.json(table_json)
# Create a view so that Spark SQL queries can be run against the data.
natality_data.createOrReplaceTempView("natality")
# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
"""
clean_data = spark.sql(sql_query)
# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label",
"features"])
training_data.cache()
# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)
# Print the model summary.
print "Coefficients:" + str(model.coefficients)
print "Intercept:" + str(model.intercept)
print "R^2:" + str(model.summary.r2)
model.summary.residuals.show()
Example 15: LinearRegression
df = spark.read.load("/data/regression")
# COMMAND ----------
from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print lr.explainParams()
lrModel = lr.fit(df)
# COMMAND ----------
summary = lrModel.summary
summary.residuals.show()
print summary.totalIterations
print summary.objectiveHistory
print summary.rootMeanSquaredError
print summary.r2
# COMMAND ----------
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
.setFamily("gaussian")\
.setLink("identity")\
.setMaxIter(10)\
.setRegParam(0.3)\
.setLinkPredictionCol("linkOut")
print glr.explainParams()
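The excerpt ends before the generalized model is fit; a natural continuation, showing the extra linkOut column produced by setLinkPredictionCol, might look like:
glrModel = glr.fit(df)
glrModel.transform(df).select("prediction", "linkOut").show(5)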
Developer ID: yehonatc, Project: Spark-The-Definitive-Guide, Lines of code: 31, Source: Advanced_Analytics_and_Machine_Learning-Chapter_27_Regression.py