當前位置: 首頁>>代碼示例>>Python>>正文


Python tuning.CrossValidator類代碼示例

本文整理匯總了Python中pyspark.ml.tuning.CrossValidator的典型用法代碼示例。如果您正苦於以下問題：Python CrossValidator類的具體用法？Python CrossValidator怎麼用？Python CrossValidator使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了CrossValidator類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: train_with_tune

def train_with_tune(input_df):
    """Tune a logistic-regression pipeline with 3-fold cross validation.

    Tuning API reference: https://spark.apache.org/docs/latest/ml-tuning.html

    :param input_df: training data handed unchanged to ``CrossValidator.fit``.
    :return: the ``bestModel`` of the fitted cross-validator model.
    """
    # Training flow: a pipeline whose only stage is the classifier.
    classifier = LogisticRegression()
    flow = Pipeline(stages=[classifier])

    # Hyper-parameter space: two candidate regularisation strengths.
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(classifier.regParam, [0.1, 0.01])
    candidates = grid_builder.build()

    # A TrainValidationSplit (e.g. trainRatio=0.8) would split the data only
    # once; k-fold cross validation is used here instead.
    validator = CrossValidator(
        estimator=flow,
        estimatorParamMaps=candidates,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=3,
    )

    # Fit every candidate on every fold and hand back the winner.
    return validator.fit(input_df).bestModel
開發者ID:haiy,項目名稱:test_project,代碼行數:27,代碼來源:3_applist_model.py

示例2: train_lg

    def train_lg(training_data, collection):
        """Cross-validate a HashingTF -> IDF -> LogisticRegression pipeline.

        All three stages are kept unfitted inside a single pipeline so that
        CrossValidator re-fits them for every fold and every parameter
        combination. Pre-fitting HashingTF/IDF on the whole training set (as
        the previous version did) leaks statistics of the held-out fold into
        training and leaves the fitted IDF model out of sync with the
        ``numFeatures`` values tried by the grid.

        :param training_data: dataset passed to ``CrossValidator.fit``; it must
            provide the "filtered" column read by HashingTF.
        :param collection: not used by the active code; it was only referenced
            by model-persistence code that is disabled.
        :return: the fitted cross-validator model.
        """
        # Configure the stages: hashingTF, idf, and lr.
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline = Pipeline(stages=[hashingTF, idf, lr])

        # 4 feature sizes x 2 regularisation strengths = 8 candidates.
        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

        # NOTE: saving the model under a timestamped path built from
        # collection["Id"] / collection["name"] is intentionally not done here.
        return cvModel
開發者ID:hosamshahin,項目名稱:Spring2016_IR_Project,代碼行數:30,代碼來源:text_classification_02.py

示例3: build_decisionTree

def build_decisionTree(path):
    """Train a cross-validated decision tree on the data set found at *path*.

    The data is loaded and prepared with the helpers ``load_data``,
    ``find_avg_age`` and ``data_preparation``, the "Survived" column is
    string-indexed into "indexed", and a DecisionTreeClassifier is tuned over
    several ``maxDepth`` values.

    :param path: location handed to ``load_data``.
    :return: tuple ``(cvModel, avg_age)`` - the fitted cross-validator model
        and the value returned by ``find_avg_age``.
    """
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # Columns that are not fed to the model.
    df = df.drop('Cabin').drop('Ticket').drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()

    # NOTE(review): the evaluator is built with its default columns while the
    # tree is trained on 'indexed' - confirm both refer to equivalent labels.
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    # The score below is computed on the same data the model was fitted on.
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("classification evaluation : %s" % evaluator.evaluate(prediction))

    return cvModel, avg_age
開發者ID:PranavGoel,項目名稱:Apache_Spark-MlLiB-Titanic-Kaggle-Competition,代碼行數:28,代碼來源:spark.py

示例4: buildModel

def buildModel(data, label):
    """
    Build a cross-validated logistic-regression model that separates `label`
    from all other classes.

    Records whose label equals `label` are relabelled 0 and every other record
    1 before fitting.

    :param data: the training data; each record must expose `.label` and
        `.features`. Either a DataFrame or an RDD is accepted.
    :param label: 0..C-1 where C is the number of classes
    :return: the fitted cross-validator model
    """
    # Lazy %-args: the message is only formatted when INFO is enabled.
    logging.info('building model for label = %d, type = %s', label, type(label))
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])

    paramGrid = ParamGridBuilder()\
        .addGrid(lr.maxIter, [100])\
        .addGrid(lr.elasticNetParam, [0.0, 1.0])\
        .addGrid(lr.fitIntercept, [True, False])\
        .build()
    crossValidator = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                                    evaluator=BinaryClassificationEvaluator(), numFolds=15)

    # DataFrame.map no longer exists from Spark 2.0 on, so map over the
    # underlying RDD when `data` has one; otherwise `data` is used directly
    # (e.g. it already is an RDD).
    records = getattr(data, 'rdd', data)
    dataDF = records.map(
        lambda point: LabeledPoint(0 if point.label == label else 1, point.features)
    ).toDF()
    model = crossValidator.fit(dataDF)

    return model
開發者ID:huylu,項目名稱:iris-pyspark,代碼行數:25,代碼來源:iris_mlpipeline.py

示例5: test_save_load_simple_estimator

    def test_save_load_simple_estimator(self):
        """Round-trip a CrossValidator and its fitted model through save/load."""
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        # Remove the scratch directory even when an assertion below fails.
        self.addCleanup(shutil.rmtree, temp_path, ignore_errors=True)

        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
開發者ID:Brett-A,項目名稱:spark,代碼行數:29,代碼來源:test_tuning.py

示例6: pipelineRF

def pipelineRF(dataDF):
    """
    Cross-validate a random-forest pipeline on `dataDF` and print its AUC.

    :param dataDF: DataFrame with a 'label' column and a 'features' column
    :return: None; parameters, sample predictions and the metric are printed
    """

    print('pipeline starting...')
    # Both indexers are fitted once on the full data and reused as fixed
    # stages inside the pipeline.
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",
                                           maxCategories=37).fit(dataDF)

    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)

    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])

    # 3 depths x 3 forest sizes = 9 candidates.
    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth, [5, 10, 30])\
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)

    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel  was fit using parameters:\n")
    pprint(cvModel.explainParams())

    predictionDF = cvModel.transform(dataDF)

    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        # Function-call form: valid on Python 3, same output on Python 2.
        print(row)

    # NOTE(review): the metric is computed on dataDF, the same data used for
    # fitting, although the message calls it "test data".
    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
開發者ID:WeihuaLei,項目名稱:LearnSpark,代碼行數:46,代碼來源:credit_prediction.py

示例7: create_models

def create_models(sqlContext, modelDataframe):
    """Train and save one logistic-regression model per label column.

    :param sqlContext: SQL context used to query the registered temp table.
    :param modelDataframe: DataFrame providing the columns ``pos_label``,
        ``neg_label`` and ``features``.
    :return: None; the fitted models are written to "models/posModel" and
        "models/negModel", overwriting any existing ones.
    """
    modelDataframe.registerTempTable("modelDataframeTable")

    # Create dataframes to use on the positive and negative models
    pos = sqlContext.sql("SELECT pos_label AS label, features FROM modelDataframeTable")
    neg = sqlContext.sql("SELECT neg_label AS label, features FROM modelDataframeTable")

    # Two logistic regression models that differ only in decision threshold.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
    # These are binary classifiers, so a binary evaluator is used for each.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # The grid holds a single regParam value (1.0) to keep the search short;
    # replace [1.0] with a list of values to actually search.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # 2-fold cross-validation for each model.
    posCrossval = CrossValidator(
        estimator=poslr,
        evaluator=posEvaluator,
        estimatorParamMaps=posParamGrid,
        numFolds=2)
    negCrossval = CrossValidator(
        estimator=neglr,
        evaluator=negEvaluator,
        estimatorParamMaps=negParamGrid,
        numFolds=2)
    # Split the data 50/50 and train on the first half only. The second half
    # is a held-out set that this function does not use.
    posTrain, _ = pos.randomSplit([0.5, 0.5])
    negTrain, _ = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Persist the models so they can be loaded later instead of retrained.
    posModel.write().overwrite().save("models/posModel")
    negModel.write().overwrite().save("models/negModel")
開發者ID:jganley,項目名稱:reddit-sentiment-analysis,代碼行數:45,代碼來源:analysis.py

示例8: test_expose_sub_models

    def test_expose_sub_models(self):
        """Check that collected sub-models are exposed, copied and persisted."""
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        # Remove the scratch directory even when an assertion below fails.
        self.addCleanup(shutil.rmtree, temp_path, ignore_errors=True)

        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            # Expect one list per fold, each holding one model per grid point.
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModels" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        # With persistSubModels disabled the reloaded model carries none.
        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertIsNone(cvModel2.subModels)

        # Reloaded sub-models keep the uids of the originals.
        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
開發者ID:Brett-A,項目名稱:spark,代碼行數:43,代碼來源:test_tuning.py

示例9: buil_lrmodel

def buil_lrmodel(path):
    """Fit a fixed and a cross-validated logistic regression on the data at *path*.

    The data is loaded and prepared with the helpers ``load_data``,
    ``find_avg_age`` and ``data_preparation``. A model with fixed parameters is
    fitted and scored first, then a grid over ``maxIter`` and ``regParam`` is
    searched with CrossValidator.

    :param path: location handed to ``load_data``.
    :return: tuple ``(cvModel, avg_age)`` - the fitted cross-validator model
        and the value returned by ``find_avg_age``.
    """
    df = load_data(path)

    # -------------------- preparing the dataset --------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    # Pre-formatted single argument: prints the same text on Python 2 and 3
    # (the Python 2 statement emitted two spaces after '=').
    print("count =  %s" % df.count())

    df = df.drop('Cabin').drop('Ticket').drop('Name')

    # -------------------- build a model --------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    # Scores in this function are computed on the data used for fitting.
    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation : %s" % evaluator.evaluate(prediction))

    # -------------------- selecting models with cross validation --------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation : %s" % evaluator.evaluate(prediction))

    return cvModel, avg_age
開發者ID:PranavGoel,項目名稱:Apache_Spark-MlLiB-Titanic-Kaggle-Competition,代碼行數:40,代碼來源:spark.py

示例10: main

def main():
    '''
    Compare a fixed ML pipeline with a cross-validated one.

    Takes no arguments: the training and test data are read from the
    hard-coded files "20news_train.parquet" and "20news_test.parquet".

    :return: None. Prints the evaluator score of the fixed pipeline and then
             that of the cross-validated model, both computed on the test data.
    '''

    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)
    try:
        # Read training data as a DataFrame
        sqlCt = SQLContext(sc)
        trainDF = sqlCt.read.parquet("20news_train.parquet")

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
        lr = LogisticRegression(maxIter=20, regParam=0.1)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

        # Fit the pipeline to training data.
        model = pipeline.fit(trainDF)

        # 3 feature sizes x 9 regularisation strengths = 27 candidates.
        numFeatures = (1000, 5000, 10000)
        regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
        paramGrid = (ParamGridBuilder()
                     .addGrid(hashingTF.numFeatures, numFeatures)
                     .addGrid(lr.regParam, regParam)
                     .build())

        cv = (CrossValidator()
              .setEstimator(pipeline)
              .setEvaluator(BinaryClassificationEvaluator())
              .setEstimatorParamMaps(paramGrid)
              .setNumFolds(2))

        # Evaluate the model on testing data
        testDF = sqlCt.read.parquet("20news_test.parquet")
        prediction = model.transform(testDF)
        evaluator = BinaryClassificationEvaluator()

        model_cv = cv.fit(trainDF)
        prediction_cv = model_cv.transform(testDF)
        # Function-call form: valid on Python 3, same output on Python 2.
        print(evaluator.evaluate(prediction))
        print(evaluator.evaluate(prediction_cv))
    finally:
        # Release the context created above, also when a step fails.
        sc.stop()
開發者ID:PranavGoel,項目名稱:Python-Spark---Matrix-Multiplication---ML-pipeline,代碼行數:39,代碼來源:ml_pipeline.py

示例11: test_fit_minimize_metric

    def test_fit_minimize_metric(self):
        """With an RMSE evaluator the candidate with zero induced error must win."""
        rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10
        dataset = self.spark.createDataFrame(rows, ["feature", "label"])

        estimator = InducedErrorEstimator()
        rmse = RegressionEvaluator(metricName="rmse")

        # Three candidate error levels; 0.0 is deliberately not listed first.
        builder = ParamGridBuilder()
        builder = builder.addGrid(estimator.inducedError, [100.0, 0.0, 10000.0])
        param_maps = builder.build()

        validator = CrossValidator(estimator=estimator,
                                   estimatorParamMaps=param_maps,
                                   evaluator=rmse)
        winner = validator.fit(dataset).bestModel
        winner_rmse = rmse.evaluate(winner.transform(dataset))

        self.assertEqual(0.0, winner.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, winner_rmse, "Best model has RMSE of 0")
開發者ID:Brett-A,項目名稱:spark,代碼行數:22,代碼來源:test_tuning.py

示例12: main

def main():
    """Score an untuned text pipeline against a cross-validated one.

    Uses ``sc``, ``training_input``, ``testing_input`` and ``output``, which
    are defined outside this function. Both evaluator scores (untuned first,
    tuned second) are written as one newline-separated string to ``output``.
    """
    sql_context = SQLContext(sc)
    train_set = sql_context.read.parquet(training_input)
    test_set = sql_context.read.parquet(testing_input)

    word_splitter = Tokenizer(inputCol="text", outputCol="words")
    scorer = BinaryClassificationEvaluator()

    # Baseline: fixed feature size and regularisation, no search.
    baseline_tf = HashingTF(inputCol=word_splitter.getOutputCol(),
                            outputCol="features", numFeatures=1000)
    baseline_lr = LogisticRegression(maxIter=20, regParam=0.1)
    baseline_pipeline = Pipeline(stages=[word_splitter, baseline_tf, baseline_lr])
    baseline_model = baseline_pipeline.fit(train_set)
    baseline_score = scorer.evaluate(baseline_model.transform(test_set))

    # Tuned: the same stages, with feature size and regularisation searched.
    tuned_tf = HashingTF(inputCol=word_splitter.getOutputCol(), outputCol="features")
    tuned_lr = LogisticRegression(maxIter=20)

    search_space = (
        ParamGridBuilder()
        .addGrid(tuned_tf.numFeatures, [1000, 5000, 10000])
        .addGrid(tuned_lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
        .build()
    )

    tuned_pipeline = Pipeline(stages=[word_splitter, tuned_tf, tuned_lr])
    validator = CrossValidator(estimator=tuned_pipeline,
                               estimatorParamMaps=search_space,
                               evaluator=scorer,
                               numFolds=2)
    tuned_model = validator.fit(train_set)

    # The fitted cross-validator model predicts with the best model it found.
    tuned_score = scorer.evaluate(tuned_model.transform(test_set))

    report = '\n'.join([str(baseline_score), str(tuned_score)])
    sc.parallelize([report]).saveAsTextFile(output)
開發者ID:Veterun,項目名稱:SparkPythonHanhan,代碼行數:38,代碼來源:spark_ml_pipline.py

示例13: test_copy

    def test_copy(self):
        """Copies of a CrossValidator and of its fitted model mirror the originals."""
        rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10
        dataset = self.spark.createDataFrame(rows, ["feature", "label"])

        estimator = InducedErrorEstimator()
        rmse = RegressionEvaluator(metricName="rmse")

        builder = ParamGridBuilder()
        builder = builder.addGrid(estimator.inducedError, [100.0, 0.0, 10000.0])
        param_maps = builder.build()

        validator = CrossValidator(estimator=estimator,
                                   estimatorParamMaps=param_maps,
                                   evaluator=rmse)
        validator_clone = validator.copy()
        # The copy must wrap an estimator with the same uid.
        self.assertEqual(validator.getEstimator().uid,
                         validator_clone.getEstimator().uid)

        fitted = validator.fit(dataset)
        fitted_clone = fitted.copy()
        # Every averaged metric must survive the copy within a small tolerance.
        for position, metric in enumerate(fitted.avgMetrics):
            gap = abs(metric - fitted_clone.avgMetrics[position])
            self.assertTrue(gap < 0.0001)
開發者ID:Brett-A,項目名稱:spark,代碼行數:23,代碼來源:test_tuning.py

示例14: test_save_load_trained_model

    def test_save_load_trained_model(self):
        """Round-trip the best model of a cross-validation run through save/load."""
        # This tests saving and loading the trained model only.
        # Save/load for CrossValidator will be added later: SPARK-13786
        import shutil  # local import: only needed for temp-dir cleanup

        temp_path = tempfile.mkdtemp()
        # Remove the scratch directory even when an assertion below fails.
        self.addCleanup(shutil.rmtree, temp_path, ignore_errors=True)

        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        lrModel = cvModel.bestModel

        # Only the winning logistic-regression model is persisted here.
        cvModelPath = temp_path + "/cvModel"
        lrModel.save(cvModelPath)
        loadedLrModel = LogisticRegressionModel.load(cvModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
開發者ID:Brett-A,項目名稱:spark,代碼行數:23,代碼來源:test_tuning.py

示例15: train_with_tune

def train_with_tune(input_df):
    """Tune an XGBoost classifier pipeline with 3-fold cross validation.

    Tuning API reference: https://spark.apache.org/docs/latest/ml-tuning.html

    :param input_df: training data handed unchanged to ``CrossValidator.fit``.
    :return: the ``bestModel`` of the fitted cross-validator model.
    """
    # Base settings; max_depth and min_child_weight are also searched below.
    booster_settings = dict(
        eta=0.023,
        max_depth=10,
        min_child_weight=0.3,
        subsample=0.7,
        colsample_bytree=0.82,
        colsample_bylevel=0.9,
        eval_metric="auc",
        seed=49,
        silent=1,
        objective="binary:logistic",
        round=10,
        nWorkers=2,
    )
    booster = XGBoostClassifier(booster_settings)
    flow = Pipeline(stages=[booster])

    # Hyper-parameter space: 2 depths x 3 child weights = 6 candidates.
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(booster.max_depth, [3, 7])
    grid_builder = grid_builder.addGrid(booster.min_child_weight, [0.1, 0.2, 0.3])
    candidates = grid_builder.build()

    # k-fold cross validation; the evaluator reads the "probabilities" column.
    validator = CrossValidator(
        estimator=flow,
        estimatorParamMaps=candidates,
        evaluator=BinaryClassificationEvaluator(rawPredictionCol="probabilities"),
        numFolds=3,
    )

    # Fit every candidate on every fold and hand back the winner.
    return validator.fit(input_df).bestModel
開發者ID:haiy,項目名稱:test_project,代碼行數:36,代碼來源:4_xgb.py


注:本文中的pyspark.ml.tuning.CrossValidator類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。