当前位置: 首页>>代码示例>>Python>>正文


Python CrossValidator.fit方法代码示例

本文整理汇总了Python中pyspark.ml.tuning.CrossValidator.fit方法的典型用法代码示例。如果您正苦于以下问题:Python CrossValidator.fit方法的具体用法?Python CrossValidator.fit怎么用?Python CrossValidator.fit使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.tuning.CrossValidator的用法示例。


在下文中一共展示了CrossValidator.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: train_with_tune

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def train_with_tune(input_df):
    """Tune a logistic-regression pipeline with 3-fold cross validation.

    Modelled on https://spark.apache.org/docs/latest/ml-tuning.html.

    :param input_df: training DataFrame handed to ``CrossValidator.fit``
    :return: the ``bestModel`` of the fitted cross-validator model
    """
    # Training flow: a pipeline whose only stage is the logistic regression.
    classifier = LogisticRegression()
    tuned_pipeline = Pipeline(stages=[classifier])

    # Hyper-parameter search space: two candidate values for regParam.
    search_space = (ParamGridBuilder()
                    .addGrid(classifier.regParam, [0.1, 0.01])
                    .build())

    # A single train/validation split (TrainValidationSplit with
    # trainRatio=0.8) is the alternative; k-fold cross validation
    # is what is actually used here.
    validator = CrossValidator(
        estimator=tuned_pipeline,
        estimatorParamMaps=search_space,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=3)

    # Train every candidate and hand back the best one found.
    return validator.fit(input_df).bestModel
开发者ID:haiy,项目名称:test_project,代码行数:29,代码来源:3_applist_model.py

示例2: train_lg

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def train_lg(training_data, collection):
        """Cross-validate a HashingTF -> IDF -> logistic-regression pipeline.

        All three stages are placed, unfitted, in a single pipeline. The
        parameter grid tunes ``hashingTF.numFeatures``, so the IDF stage has
        to be refitted for every candidate feature dimension; fitting
        HashingTF/IDF once up front would pin the IDF model to one dimension
        and would also let it see the validation folds.

        :param training_data: DataFrame with a "filtered" column (the
            HashingTF input). The label column is whatever
            LogisticRegression and BinaryClassificationEvaluator use by
            default -- presumably "label"; confirm against the caller.
        :param collection: currently unused; it was only referenced by
            model-saving code that is disabled. Kept for interface
            compatibility.
        :return: the fitted cross-validator model (5 folds)
        """
        # Configure an ML pipeline, which consists of the following stages:
        # hashingTF, idf, and lr.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline = Pipeline(stages=[hashingTF, idf, lr])

        # 4 feature sizes x 2 regularisation values = 8 candidates.
        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        # Persisting the result is left to the caller.
        cvModel = crossval.fit(training_data)
        return cvModel
开发者ID:hosamshahin,项目名称:Spring2016_IR_Project,代码行数:32,代码来源:text_classification_02.py

示例3: test_save_load_simple_estimator

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_save_load_simple_estimator(self):
        """Round-trip a CrossValidator and its fitted model through save/load.

        The estimator is a plain LogisticRegression. The temporary directory
        used for persistence is removed when the test finishes, whether it
        passes or fails.
        """
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        try:
            dataset = self.spark.createDataFrame(
                [(Vectors.dense([0.0]), 0.0),
                 (Vectors.dense([0.4]), 1.0),
                 (Vectors.dense([0.5]), 0.0),
                 (Vectors.dense([0.6]), 1.0),
                 (Vectors.dense([1.0]), 1.0)] * 10,
                ["features", "label"])

            lr = LogisticRegression()
            grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
            evaluator = BinaryClassificationEvaluator()

            # test save/load of CrossValidator
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
            cvModel = cv.fit(dataset)
            cvPath = temp_path + "/cv"
            cv.save(cvPath)
            loadedCV = CrossValidator.load(cvPath)
            self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
            self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
            self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

            # test save/load of CrossValidatorModel
            cvModelPath = temp_path + "/cvModel"
            cvModel.save(cvModelPath)
            loadedModel = CrossValidatorModel.load(cvModelPath)
            self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
        finally:
            # mkdtemp() never cleans up after itself.
            shutil.rmtree(temp_path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:31,代码来源:test_tuning.py

示例4: build_decisionTree

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def build_decisionTree(path):
    """Cross-validate a decision tree over several depths and report its score.

    :param path: location passed to ``load_data``
    :return: tuple ``(cvModel, avg_age)`` -- the fitted cross-validator model
        and the value produced by ``find_avg_age``
    """
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # These columns are not used by the model.
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # Index the "Survived" column into "indexed", the tree's label column.
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()

    # NOTE(review): the evaluator keeps its default label column while the
    # tree is trained on 'indexed' -- confirm both carry the same labels.
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    # Predictions are made on the same DataFrame the model was fitted on.
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    # Single-argument print() runs on Python 2 and 3 alike; the text equals
    # what the former Python 2 print statement emitted.
    print("classification evaluation : %s" % (evaluator.evaluate(prediction),))

    return cvModel, avg_age
开发者ID:PranavGoel,项目名称:Apache_Spark-MlLiB-Titanic-Kaggle-Competition,代码行数:30,代码来源:spark.py

示例5: buildModel

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def buildModel(data, label):
    """
    Cross-validate a logistic-regression pipeline that separates `label` from all other classes.

    :param data: the training data; each record must expose `.label` and `.features`
    :param label: the class to single out; it is relabelled 0 and every other class 1
    :return: the model produced by `CrossValidator.fit` (15 folds)
    """
    logging.info('building model for label = %d, type = %s' % (label, type(label)))

    classifier = LogisticRegression()
    stages = [classifier]
    pipeline = Pipeline(stages=stages)

    # 1 x 2 x 2 = 4 parameter combinations.
    search_space = (ParamGridBuilder()
                    .addGrid(classifier.maxIter, [100])
                    .addGrid(classifier.elasticNetParam, [0.0, 1.0])
                    .addGrid(classifier.fitIntercept, [True, False])
                    .build())

    validator = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=search_space,
                               evaluator=BinaryClassificationEvaluator(),
                               numFolds=15)

    def to_binary(point):
        # The target class becomes 0, everything else 1.
        if point.label == label:
            binary_label = 0
        else:
            binary_label = 1
        return LabeledPoint(binary_label, point.features)

    binary_df = data.map(to_binary).toDF()
    return validator.fit(binary_df)
开发者ID:huylu,项目名称:iris-pyspark,代码行数:27,代码来源:iris_mlpipeline.py

示例6: create_models

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def create_models(sqlContext, modelDataframe):
    """Train and save one "positive" and one "negative" logistic-regression model.

    Each model is wrapped in a 2-fold CrossValidator with a single-value
    regParam grid, fitted on a random half of its data, and written to
    "models/posModel" / "models/negModel" (overwriting existing output).

    :param sqlContext: SQL context used to query the registered temp table
    :param modelDataframe: DataFrame queried for pos_label, neg_label and features
    :return: None
    """
    modelDataframe.registerTempTable("modelDataframeTable")

    # One labelled view per classifier; both share the same features column.
    pos_df = sqlContext.sql("SELECT pos_label AS label, features FROM modelDataframeTable")
    neg_df = sqlContext.sql("SELECT neg_label AS label, features FROM modelDataframeTable")

    # Two logistic regressions that differ only in their decision threshold.
    lr_settings = dict(labelCol="label", featuresCol="features", maxIter=10)
    pos_lr = LogisticRegression(**lr_settings).setThreshold(0.2)
    neg_lr = LogisticRegression(**lr_settings).setThreshold(0.25)

    # Binary classifiers need a binary-classification evaluator.
    pos_evaluator = BinaryClassificationEvaluator()
    neg_evaluator = BinaryClassificationEvaluator()

    # The grid holds a single regParam value (1.0) because a real grid search
    # is slow; extend the list to try more values.
    pos_grid = ParamGridBuilder().addGrid(pos_lr.regParam, [1.0]).build()
    neg_grid = ParamGridBuilder().addGrid(neg_lr.regParam, [1.0]).build()

    # 2-fold cross-validation for each classifier.
    pos_cv = CrossValidator(estimator=pos_lr, estimatorParamMaps=pos_grid,
                            evaluator=pos_evaluator, numFolds=2)
    neg_cv = CrossValidator(estimator=neg_lr, estimatorParamMaps=neg_grid,
                            evaluator=neg_evaluator, numFolds=2)

    # Split 50/50. Only the first half is used for training; the second half
    # is the held-out labelled set, which this function does not use.
    pos_train = pos_df.randomSplit([0.5, 0.5])[0]
    neg_train = neg_df.randomSplit([0.5, 0.5])[0]

    print("Training positive classifier...")
    pos_model = pos_cv.fit(pos_train)
    print("Training negative classifier...")
    neg_model = neg_cv.fit(neg_train)

    # Save both models so they can be loaded later instead of retrained.
    pos_model.write().overwrite().save("models/posModel")
    neg_model.write().overwrite().save("models/negModel")
开发者ID:jganley,项目名称:reddit-sentiment-analysis,代码行数:47,代码来源:analysis.py

示例7: test_parallel_evaluation

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_parallel_evaluation(self):
        """Fitting with parallelism 1 and with parallelism 2 must give equal avgMetrics."""
        base_rows = [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        dataset = self.spark.createDataFrame(base_rows * 10, ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                            evaluator=BinaryClassificationEvaluator())

        # Fit the same validator serially, then with two parallel workers.
        metrics = []
        for parallelism in (1, 2):
            cv.setParallelism(parallelism)
            metrics.append(cv.fit(dataset).avgMetrics)

        serial_metrics, parallel_metrics = metrics
        self.assertEqual(serial_metrics, parallel_metrics)
开发者ID:Brett-A,项目名称:spark,代码行数:22,代码来源:test_tuning.py

示例8: pipelineRF

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def pipelineRF(dataDF):
    """
    Tune a random-forest pipeline with 10-fold cross validation and print its AUC.

    :param dataDF: DataFrame with 'label' and 'features' columns; it is used
        both for fitting and for the final evaluation
    :return: None -- all results are printed
    """

    print('pipeline starting...')
    # Both indexers are fitted up front on the full DataFrame and enter the
    # pipeline as already-fitted stages.
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features", maxCategories=37)\
                                    .fit(dataDF)

    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)

    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])

    # 3 depths x 3 forest sizes = 9 candidates.
    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth, [5, 10, 30])\
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)

    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel  was fit using parameters:\n")
    pprint(cvModel.explainParams())

    predictionDF = cvModel.transform(dataDF)

    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        # print(row) with one argument behaves the same on Python 2 and 3;
        # the former `print row` statement was a SyntaxError on Python 3.
        print(row)

    # NOTE: the metric is computed on dataDF, i.e. the data the model was
    # fitted on, despite the wording of the message below.
    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
开发者ID:WeihuaLei,项目名称:LearnSpark,代码行数:48,代码来源:credit_prediction.py

示例9: test_expose_sub_models

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_expose_sub_models(self):
        """Check that collected sub-models are exposed, copied and persisted.

        With ``collectSubModels=True`` the fitted model must hold one list of
        models per fold, each as long as the parameter grid. Sub-models must
        survive save/load and copy(), and must be absent after saving with
        the "persistSubModels" option set to "false". The temporary directory
        is removed when the test finishes.
        """
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        try:
            dataset = self.spark.createDataFrame(
                [(Vectors.dense([0.0]), 0.0),
                 (Vectors.dense([0.4]), 1.0),
                 (Vectors.dense([0.5]), 0.0),
                 (Vectors.dense([0.6]), 1.0),
                 (Vectors.dense([1.0]), 1.0)] * 10,
                ["features", "label"])

            lr = LogisticRegression()
            grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
            evaluator = BinaryClassificationEvaluator()

            numFolds = 3
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                numFolds=numFolds, collectSubModels=True)

            def checkSubModels(subModels):
                # One entry per fold, each holding one model per grid point.
                self.assertEqual(len(subModels), numFolds)
                for i in range(numFolds):
                    self.assertEqual(len(subModels[i]), len(grid))

            cvModel = cv.fit(dataset)
            checkSubModels(cvModel.subModels)

            # A plain save() is expected to persist the sub-models, i.e.
            # option "persistSubModels" defaults to "true".
            testSubPath = temp_path + "/testCrossValidatorSubModels"
            savingPathWithSubModels = testSubPath + "cvModel3"
            cvModel.save(savingPathWithSubModels)
            cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
            checkSubModels(cvModel3.subModels)
            cvModel4 = cvModel3.copy()
            checkSubModels(cvModel4.subModels)

            # Saving with persistSubModels=false must drop the sub-models.
            savingPathWithoutSubModels = testSubPath + "cvModel2"
            cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
            cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
            self.assertIsNone(cvModel2.subModels)

            for i in range(numFolds):
                for j in range(len(grid)):
                    self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
        finally:
            # mkdtemp() never cleans up after itself.
            shutil.rmtree(temp_path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:45,代码来源:test_tuning.py

示例10: buil_lrmodel

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def buil_lrmodel(path):
    """Fit a baseline logistic regression, then a cross-validated one, and report both.

    :param path: location passed to ``load_data``
    :return: tuple ``(cvModel, avg_age)`` -- the fitted cross-validator model
        and the value produced by ``find_avg_age``
    """
    df = load_data(path)

    # -------------------- preparing the dataset --------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # The print() calls below take a single pre-formatted string so that they
    # run on Python 2 and 3 alike and emit exactly what the former Python 2
    # print statements did (hence the double space after "count =").
    print("count =  %s" % (df.count(),))

    # These columns are not used by the model.
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # -------------------- build a baseline model --------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    # Predictions are made on the same DataFrame the model was fitted on.
    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation : %s" % (evaluator.evaluate(prediction),))

    # -------------------- select a model with cross validation --------------------
    lr = LogisticRegression()
    # 7 iteration limits x 3 regularisation values = 21 candidates.
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation : %s" % (evaluator.evaluate(prediction),))

    return cvModel, avg_age
开发者ID:PranavGoel,项目名称:Apache_Spark-MlLiB-Titanic-Kaggle-Competition,代码行数:42,代码来源:spark.py

示例11: main

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def main():
    '''
    Compare an untuned text-classification pipeline with a cross-validated one.

    Takes no arguments: training data is read from "20news_train.parquet" and
    test data from "20news_test.parquet".
    :return: None. Prints the area under the ROC curve on the test data,
        first for the untuned pipeline, then for the cross-validated model.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data (the untuned baseline).
    model = pipeline.fit(trainDF)

    # 3 feature sizes x 9 regularisation values = 27 candidates.
    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures).addGrid(lr.regParam, regParam).build()

    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator()).setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the baseline model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    # Tune with cross validation and evaluate the result on the same test data.
    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)

    # Single-argument print() behaves identically on Python 2 and 3; the
    # former print statements were a SyntaxError on Python 3.
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
开发者ID:PranavGoel,项目名称:Python-Spark---Matrix-Multiplication---ML-pipeline,代码行数:41,代码来源:ml_pipeline.py

示例12: test_save_load_nested_estimator

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_save_load_nested_estimator(self):
        """Round-trip a CrossValidator whose grid values are themselves estimators.

        The tuned estimator is a OneVsRest and the grid varies its
        ``classifier`` param between two LogisticRegression instances, so
        loaded grid entries are compared by uid for that param and by value
        otherwise. The temporary directory is removed when the test finishes.
        """
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        try:
            dataset = self.spark.createDataFrame(
                [(Vectors.dense([0.0]), 0.0),
                 (Vectors.dense([0.4]), 1.0),
                 (Vectors.dense([0.5]), 0.0),
                 (Vectors.dense([0.6]), 1.0),
                 (Vectors.dense([1.0]), 1.0)] * 10,
                ["features", "label"])

            ova = OneVsRest(classifier=LogisticRegression())
            lr1 = LogisticRegression().setMaxIter(100)
            lr2 = LogisticRegression().setMaxIter(150)
            grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
            evaluator = MulticlassClassificationEvaluator()

            # test save/load of CrossValidator
            cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
            cvModel = cv.fit(dataset)
            cvPath = temp_path + "/cv"
            cv.save(cvPath)
            loadedCV = CrossValidator.load(cvPath)
            self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
            self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

            originalParamMap = cv.getEstimatorParamMaps()
            loadedParamMap = loadedCV.getEstimatorParamMaps()
            for i, param in enumerate(loadedParamMap):
                for p in param:
                    if p.name == "classifier":
                        # Estimator-valued entries are compared by uid.
                        self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                    else:
                        self.assertEqual(param[p], originalParamMap[i][p])

            # test save/load of CrossValidatorModel
            cvModelPath = temp_path + "/cvModel"
            cvModel.save(cvModelPath)
            loadedModel = CrossValidatorModel.load(cvModelPath)
            self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
        finally:
            # mkdtemp() never cleans up after itself.
            shutil.rmtree(temp_path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:41,代码来源:test_tuning.py

示例13: test_fit_minimize_metric

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_fit_minimize_metric(self):
        """With the RMSE evaluator, the selected model must be the zero-error candidate."""
        base_rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)]
        dataset = self.spark.createDataFrame(base_rows * 10, ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        # Three candidate induced errors; 0.0 is the one expected to win.
        candidate_errors = [100.0, 0.0, 10000.0]
        grid = ParamGridBuilder().addGrid(iee.inducedError, candidate_errors).build()

        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        best = cv.fit(dataset).bestModel
        best_rmse = evaluator.evaluate(best.transform(dataset))

        self.assertEqual(0.0, best.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, best_rmse, "Best model has RMSE of 0")
开发者ID:Brett-A,项目名称:spark,代码行数:24,代码来源:test_tuning.py

示例14: main

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
def main():
    """Evaluate an untuned and a cross-validated text pipeline, then save both scores.

    Reads the module-level names ``sc``, ``training_input``, ``testing_input``
    and ``output``. The two evaluator results are written, one per line, as a
    single text record to ``output``.
    """
    # Read training and testing data as DataFrames.
    sql_context = SQLContext(sc)
    train_df = sql_context.read.parquet(training_input)
    test_df = sql_context.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()
    words_col = tokenizer.getOutputCol()

    # Baseline: fixed parameters, no tuning.
    baseline_tf = HashingTF(inputCol=words_col, outputCol="features", numFeatures=1000)
    baseline_lr = LogisticRegression(maxIter=20, regParam=0.1)
    baseline_model = Pipeline(stages=[tokenizer, baseline_tf, baseline_lr]).fit(train_df)
    baseline_score = evaluator.evaluate(baseline_model.transform(test_df))

    # Tuned: same pipeline shape, parameters chosen by 2-fold cross validation.
    tuned_tf = HashingTF(inputCol=words_col, outputCol="features")
    tuned_lr = LogisticRegression(maxIter=20)
    tuned_pipeline = Pipeline(stages=[tokenizer, tuned_tf, tuned_lr])

    # 3 feature sizes x 9 regularisation values = 27 candidates.
    search_space = (ParamGridBuilder()
                    .addGrid(tuned_tf.numFeatures, [1000, 5000, 10000])
                    .addGrid(tuned_lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
                    .build())

    validator = CrossValidator(estimator=tuned_pipeline, estimatorParamMaps=search_space,
                               evaluator=evaluator, numFolds=2)
    tuned_model = validator.fit(train_df)

    # Transforming with the cross-validator model scores the test documents.
    tuned_score = evaluator.evaluate(tuned_model.transform(test_df))

    report = '\n'.join([str(baseline_score), str(tuned_score)])
    sc.parallelize([report]).saveAsTextFile(output)
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:40,代码来源:spark_ml_pipline.py

示例15: test_copy

# 需要导入模块: from pyspark.ml.tuning import CrossValidator [as 别名]
# 或者: from pyspark.ml.tuning.CrossValidator import fit [as 别名]
    def test_copy(self):
        """copy() must keep the estimator uid and the fitted model's avgMetrics."""
        base_rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)]
        dataset = self.spark.createDataFrame(base_rows * 10, ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        candidate_errors = [100.0, 0.0, 10000.0]
        grid = ParamGridBuilder().addGrid(iee.inducedError, candidate_errors).build()
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)

        # The copied validator must still point at the same estimator.
        cv_clone = cv.copy()
        self.assertEqual(cv.getEstimator().uid, cv_clone.getEstimator().uid)

        # The copied model must report the same metrics, within a tolerance.
        fitted = cv.fit(dataset)
        fitted_clone = fitted.copy()
        for position, metric in enumerate(fitted.avgMetrics):
            difference = abs(metric - fitted_clone.avgMetrics[position])
            self.assertTrue(difference < 0.0001)
开发者ID:Brett-A,项目名称:spark,代码行数:25,代码来源:test_tuning.py


注:本文中的pyspark.ml.tuning.CrossValidator.fit方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。