This article collects typical usage examples of the Python method pyspark.ml.tuning.CrossValidator.fit: what CrossValidator.fit does in practice, how to call it, and what working code looks like. The curated examples below may help; see also the documentation for the containing class, pyspark.ml.tuning.CrossValidator.
The following shows 15 code examples of CrossValidator.fit, ordered by popularity.
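Before the examples, here is a minimal, self-contained sketch of the CrossValidator.fit workflow they all follow. The SparkSession setup and the toy DataFrame are assumptions added for illustration; the tuning calls themselves are the standard pyspark.ml API.

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName("cv-minimal").getOrCreate()

# Toy dataset: one numeric feature, binary label (illustrative only).
df = spark.createDataFrame(
    [(Vectors.dense([0.0]), 0.0),
     (Vectors.dense([0.5]), 0.0),
     (Vectors.dense([0.6]), 1.0),
     (Vectors.dense([1.0]), 1.0)] * 10,
    ["features", "label"])

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)

# fit() runs k-fold cross-validation over the grid, then refits the
# best parameter combination on the full dataset.
cvModel = cv.fit(df)
print(cvModel.avgMetrics)  # one mean metric per grid point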
Example 1: train_with_tune
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def train_with_tune(input_df):
    # https://spark.apache.org/docs/latest/ml-tuning.html
    # Build the model-training pipeline.
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])
    # Build the hyperparameter grid.
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    # Single train/validation split (alternative to k-fold cross-validation):
    # tvs = TrainValidationSplit(estimator=pipeline,
    #                            estimatorParamMaps=paramGrid,
    #                            evaluator=BinaryClassificationEvaluator(),
    #                            # 80% of the data will be used for training, 20% for validation.
    #                            trainRatio=0.8)
    # k-fold cross-validation
    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               numFolds=3)
    # Train and return the best model found.
    cvModel = cross_val.fit(input_df)
    return cvModel.bestModel
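Once fit() returns, the cross-validated mean metric for each grid point can be matched back to its parameter map to see which combination won. A small sketch, assuming you keep handles on cross_val and cvModel from the example above rather than returning only the best model:

# avgMetrics holds one mean metric per grid point, in the same order
# as the parameter maps built by ParamGridBuilder.
for params, metric in zip(cross_val.getEstimatorParamMaps(), cvModel.avgMetrics):
    print({p.name: v for p, v in params.items()}, metric)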
Example 2: train_lg
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def train_lg(training_data, collection):
    # Configure an ML pipeline with three stages overall: hashingTF and idf
    # (fit first as pipeline1), then lr (added in pipeline2).
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])
    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)
    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)
    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)
    return cvModel
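The commented-out save above uses the old RDD-based mllib signature, save(sc, path); in pyspark.ml, a fitted CrossValidatorModel is persisted through its writer with a path only. A brief sketch of the ml-style round trip; the model_path value is an illustrative assumption:

import os
import time
from pyspark.ml.tuning import CrossValidatorModel

model_path = os.path.join("models", time.strftime("%Y%m%d-%H%M%S") + "_example")
cvModel.write().overwrite().save(model_path)     # persist the fitted model
reloaded = CrossValidatorModel.load(model_path)  # restore it in a later session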
Example 3: test_save_load_simple_estimator
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example 4: build_decisionTree
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)
    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
Example 5: buildModel
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def buildModel(data, label):
    """
    Build a pipeline that classifies `label` against the rest of the classes
    using binary logistic regression.
    :param data: the training data as a DF of LabeledPoints
    :param label: 0..C-1, where C is the number of classes
    :return: the model as a Transformer
    """
    logging.info('building model for label = %d, type = %s' % (label, type(label)))
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxIter, [100]) \
        .addGrid(lr.elasticNetParam, [0.0, 1.0]) \
        .addGrid(lr.fitIntercept, [True, False]) \
        .build()
    crossValidator = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                                    evaluator=BinaryClassificationEvaluator(), numFolds=15)
    # Relabel: the target class becomes 0, every other class becomes 1.
    dataDF = data.map(lambda point: LabeledPoint(0 if point.label == label else 1,
                                                 point.features)).toDF()
    model = crossValidator.fit(dataDF)
    return model
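Example 5 hand-rolls one-vs-rest by relabeling and training one binary model per class. pyspark.ml also ships a built-in OneVsRest meta-estimator that can be cross-validated directly, as Example 12 below does; a hedged sketch, assuming `data` is a DataFrame with `features` and a multiclass `label` column:

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

ovr = OneVsRest(classifier=LogisticRegression())
# Grid over whole classifier instances, mirroring the pattern in Example 12.
lr1 = LogisticRegression(elasticNetParam=0.0)
lr2 = LogisticRegression(elasticNetParam=1.0)
grid = ParamGridBuilder().addGrid(ovr.classifier, [lr1, lr2]).build()
cv = CrossValidator(estimator=ovr,
                    estimatorParamMaps=grid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=3)
ovrModel = cv.fit(data)  # trains all per-class models in one call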
Example 6: create_models
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def create_models(sqlContext, modelDataframe):
    modelDataframe.registerTempTable("modelDataframeTable")
    # Create dataframes to use on the positive and negative models
    pos = sqlContext.sql("SELECT pos_label AS label, features FROM modelDataframeTable")
    neg = sqlContext.sql("SELECT neg_label AS label, features FROM modelDataframeTable")
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
    # This is a binary classifier, so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 2-fold cross-validation pipeline.
    posCrossval = CrossValidator(
        estimator=poslr,
        evaluator=posEvaluator,
        estimatorParamMaps=posParamGrid,
        numFolds=2)
    negCrossval = CrossValidator(
        estimator=neglr,
        evaluator=negEvaluator,
        estimatorParamMaps=negParamGrid,
        numFolds=2)
    # Although cross-validation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.write().overwrite().save("models/posModel")
    negModel.write().overwrite().save("models/negModel")
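The two saved models can be restored in a later session and used to score new data; a minimal sketch, assuming a DataFrame shaped like posTest from the example above:

from pyspark.ml.tuning import CrossValidatorModel

posModel = CrossValidatorModel.load("models/posModel")
negModel = CrossValidatorModel.load("models/negModel")
scored = posModel.transform(posTest)  # transform() applies the selected best model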
Example 7: test_parallel_evaluation
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    # Serial vs. parallel fitting should produce identical metrics.
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cv.setParallelism(1)
    cvSerialModel = cv.fit(dataset)
    cv.setParallelism(2)
    cvParallelModel = cv.fit(dataset)
    self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
Example 8: pipelineRF
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def pipelineRF(dataDF):
    """
    Fit a random-forest pipeline on `dataDF` with 10-fold cross-validation.
    :param dataDF: input DataFrame with 'label' and 'features' columns
    :return: None; prints sample predictions and the AUC
    """
    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features", maxCategories=37) \
        .fit(dataDF)
    # dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features', labelCol='indexLabel', maxDepth=5,
    #                                      maxBins=40, minInstancesPerNode=1, minInfoGain=0.0, impurity='entropy')
    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)
    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])
    paramGrid = ParamGridBuilder() \
        .addGrid(rfEstimator.maxDepth, [5, 10, 30]) \
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)
    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel was fit using parameters:\n")
    pprint(cvModel.explainParams())
    predictionDF = cvModel.transform(dataDF)
    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        print(row)
    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is: %.3f" % aucMetric)
Example 9: test_expose_sub_models
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    numFolds = 3
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                        numFolds=numFolds, collectSubModels=True)

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)
    # Test that the default value of the "persistSubModels" option is "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)
    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)
    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
Example 10: buil_lrmodel
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def buil_lrmodel(path):
    df = load_data(path)
    # -------------------- preparing the dataset --------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    print("count = ", df.count())
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    # ------------------ Build a model ------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)
    prediction = model.transform(df)
    prediction.show(truncate=False)
    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation :", evaluator.evaluate(prediction))
    # -------------- selecting models with cross validation ---------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000]) \
        .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
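Beyond re-scoring the training data, the winning model's parameters and coefficients can be read off directly; a short sketch using the standard pyspark.ml attributes:

best = cvModel.bestModel                  # a LogisticRegressionModel here
print(best.coefficients, best.intercept)  # fitted weights
print(cvModel.avgMetrics)                 # mean AUC for each of the 21 grid points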
Example 11: main
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def main():
    '''
    Reads the training and test Parquet files and prints the area
    under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)
    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)
    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()
    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
Author: PranavGoel · Project: Python-Spark---Matrix-Multiplication---ML-pipeline · Lines: 41 · Source: ml_pipeline.py
Example 12: test_save_load_nested_estimator
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_save_load_nested_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()
    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    originalParamMap = cv.getEstimatorParamMaps()
    loadedParamMap = loadedCV.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])
    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example 13: test_fit_minimize_metric
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_fit_minimize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
    # InducedErrorEstimator is a helper defined in PySpark's own test suite,
    # not a public API; it produces predictions with a configurable error.
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")
    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    bestModel = cvModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
Example 14: main
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()
    # No parameter tuning: a fixed pipeline as the baseline.
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)
    # For cross-validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)
    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)
    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example 15: test_copy
# Required module: from pyspark.ml.tuning import CrossValidator [as alias]
# Or: from pyspark.ml.tuning.CrossValidator import fit [as alias]
def test_copy(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")
    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvCopied = cv.copy()
    self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid)
    cvModel = cv.fit(dataset)
    cvModelCopied = cvModel.copy()
    for index in range(len(cvModel.avgMetrics)):
        self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index])
                        < 0.0001)