

Python classification.LogisticRegression Class Code Examples

This article collects typical usage examples of the pyspark.ml.classification.LogisticRegression class in Python. If you are wondering what the LogisticRegression class does, how to use it, or what working examples look like, the curated examples below should help.


Below are 15 code examples of the LogisticRegression class, sorted by popularity by default.
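Before the examples, here is a minimal end-to-end sketch of the class in use (a sketch only: the app name, data and hyperparameter values are illustrative, and a local Spark session is assumed):

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("lr-quickstart").getOrCreate()  # illustrative app name

# spark.ml estimators conventionally expect "label" and "features" columns.
df = spark.createDataFrame(
    [(1.0, Vectors.dense([0.0, 1.1])),
     (0.0, Vectors.dense([2.0, 1.0]))],
    ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)  # illustrative hyperparameters
model = lr.fit(df)            # Estimator -> fitted Model (a Transformer)
model.transform(df).show()    # adds rawPrediction, probability and prediction columns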

Example 1: test_logistic_regression_summary

 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Author: Bella-Lin, Project: spark, Lines: 28, Source: tests.py
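Outside a test harness, the summary objects exercised by Example 1 are usually read off the fitted model directly; a short sketch, assuming a fitted binary model like the one above:

s = model.summary            # BinaryLogisticRegressionTrainingSummary
print(s.areaUnderROC)        # scalar AUC on the training data
s.roc.show(5)                # DataFrame of (FPR, TPR) points
print(s.objectiveHistory)    # objective value per training iteration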

Example 2: test_default_read_write

    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(.75)
        writer = DefaultParamsWriter(lr)

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Author: JingchengDu, Project: spark, Lines: 26, Source: test_persistence.py

Example 3: test_binomial_logistic_regression_with_bound

    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Author: Brett-A, Project: spark, Lines: 15, Source: test_algorithms.py
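A note on Example 3: Matrices.dense(numRows, numCols, values) fills values in column-major order, with one row per coefficient set and one column per feature. A minimal sketch of the bound objects the test builds:

from pyspark.ml.linalg import Matrices, Vectors

# The 1x2 matrix [[-1.0, -1.0]]: a lower bound of -1.0 on each of the two coefficients.
lowerBounds = Matrices.dense(1, 2, [-1.0, -1.0])
upperIntercepts = Vectors.dense(0.0)  # cap the single intercept at 0.0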

Example 4: test_logistic_regression

 def test_logistic_regression(self):
     lr = LogisticRegression(maxIter=1)
     path = tempfile.mkdtemp()
     lr_path = path + "/logreg"
     lr.save(lr_path)
     lr2 = LogisticRegression.load(lr_path)
     self.assertEqual(lr2.uid, lr2.maxIter.parent,
                      "Loaded LogisticRegression instance uid (%s) "
                      "did not match Param's uid (%s)"
                      % (lr2.uid, lr2.maxIter.parent))
     self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                      "Loaded LogisticRegression instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Author: Bella-Lin, Project: spark, Lines: 17, Source: tests.py
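Example 4 round-trips the unfitted estimator. A fitted model persists the same way; a hedged sketch (the training DataFrame and sub-path are illustrative):

from pyspark.ml.classification import LogisticRegressionModel

model = lr.fit(df)                       # assumes a suitable training DataFrame
model.save(path + "/logreg_model")       # illustrative sub-path
model2 = LogisticRegressionModel.load(path + "/logreg_model")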

Example 5: test_multinomial_logistic_regression_with_bound

    def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(regParam=0.01,
                                 lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
Author: Brett-A, Project: spark, Lines: 17, Source: test_algorithms.py

Example 6: test_int_to_float

 def test_int_to_float(self):
     from pyspark.mllib.linalg import Vectors
     from pyspark.sql import Row  # needed for the Row(...) below
     df = self.sc.parallelize([
         Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
     lr = LogisticRegression(elasticNetParam=0)
     lr.fit(df)
     lr.setElasticNetParam(0)
     lr.fit(df)
Author: dikei, Project: spark, Lines: 8, Source: tests.py

Example 7: train

 def train(self, rdd):
     """
     :return:  Trained model to be passed to test.
     """
     options = self.options
     if options.reg_type == "elastic-net":  # use spark.ml
         lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                   elasticNetParam=options.elastic_net_param)
         # TODO: Do not include time for conversion to DataFrame (but this currently matches
         #       the Scala tests)
         df = rdd.toDF()
         lrModel = lr.fit(df)
         numFeatures = len(lrModel.weights)
         numClasses = 2
         return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                        numFeatures, numClasses)
     else:
         if options.loss == "logistic":
             if options.optimizer == "sgd":
                 return LogisticRegressionWithSGD.train(data=rdd,
                                                        iterations=options.num_iterations,
                                                        step=options.step_size,
                                                        miniBatchFraction=1.0,
                                                        regParam=options.reg_param,
                                                        regType=options.reg_type)
             elif options.optimizer == "l-bfgs":
                 return LogisticRegressionWithLBFGS.train(data=rdd,
                                                          iterations=options.num_iterations,
                                                          regParam=options.reg_param,
                                                          regType=options.reg_type,
                                                          tolerance=0.0)
             else:
                 raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                 " optimizer = %s" % (options.loss, options.optimizer))
         elif options.loss == "hinge":
             if options.optimizer == "sgd":
                 return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                         step=options.step_size, regParam=options.reg_param,
                                         miniBatchFraction=1.0, regType=options.reg_type)
         else:
             raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)
Author: AlexGKing, Project: spark-perf, Lines: 41, Source: mllib_tests.py
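The options argument in Example 7 is never constructed in the excerpt; it behaves like a parsed-arguments object. A hypothetical stand-in showing the attributes train() reads (all names inferred from the method body):

from types import SimpleNamespace

# Hypothetical options object; "elastic-net" selects the spark.ml branch above.
options = SimpleNamespace(
    reg_type="elastic-net",
    num_iterations=20,
    reg_param=0.01,
    elastic_net_param=0.5,
    step_size=0.1,
    loss="logistic",
    optimizer="l-bfgs")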

Example 8: buil_lrmodel

def buil_lrmodel(path):

    df = load_data(path)

    #-------------------- preparing the dataset -------------------------------------------

    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print "count = " , df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :" , evaluator.evaluate(prediction)


    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)


    return cvModel,avg_age
Author: PranavGoel, Project: Apache_Spark-MlLiB-Titanic-Kaggle-Competition, Lines: 40, Source: spark.py
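After the cross-validation in Example 8, the winning configuration can be recovered from the fitted CrossValidatorModel; a short sketch (on recent Spark versions the fitted model carries the chosen parameter values):

best_lr = cvModel.bestModel        # LogisticRegressionModel refit on the full dataset
print(best_lr.extractParamMap())   # includes the selected maxIter / regParam
print(cvModel.avgMetrics)          # mean evaluator metric for each grid point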

Example 9: anom_with_lr

def anom_with_lr():
  try:
    prepared_data = split_data()
    train = prepared_data['train']
    test = prepared_data['test']
    for_finding_more = prepared_data['for_finding_more']
    # regParam = 0 makes this comparable with the LogisticRegressionWithSGD used
    # before, which does no regularization by default. With regParam = 0 the value
    # of elasticNetParam should not matter: elasticNetParam = 0 is ridge regression
    # (L2, keeps all features); elasticNetParam = 1 is LASSO (L1, performs feature
    # selection). With regParam = 0, test accuracy is 0.9454, fpr is 0.0713,
    # fnr is 0.0375, on a sample of 50K test data points.
    lr = LogisticRegression(maxIter = 10, regParam = 0.0, elasticNetParam = 0.0)
    t0 = time()
    model = lr.fit(train)
    tt = time() - t0
    print("Classifier trained in {0} seconds".format(round(tt, 3)))

    t0 = time()
    predictions = model.transform(test) # feed the test DataFrame as-is; no need to extract the features column
    tt = time() - t0
    print("Prediction made in {0} seconds".format(round(tt, 3)))

    # Add probability to the test data set for calibration
    labelsAndPreds = predictions.rdd.map(lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
    labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')

    test_accuracy = labelsAndPreds.filter(lambda t: t[0] == t[1]).count()/float(test_data_size)
    fpr = labelsAndPreds.filter(lambda t: t[0] == 0 and t[1] == 1).count()/float(labelsAndPreds.filter(lambda t: t[0] == 0).count())
    fnr = labelsAndPreds.filter(lambda t: t[0] == 1 and t[1] == 0).count()/float(labelsAndPreds.filter(lambda t: t[0] == 1).count())
    print("Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))

    # toDF() below did not work without round(): some issue with float
    for_finding_more = model.transform(for_finding_more).rdd.map(lambda p: (p.label, round(p.probability[1], 5)))
    for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
    for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
    # Top row has probability 0.9999, the last 0.05159; 75 of them are above 0.99
    for_finding_more.select('predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark')

  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return
Author: bibudhlahiri, Project: healthcare, Lines: 38, Source: analyze_anomaly_full_data.py

Example 10: test_multiclass_logistic_regression_summary

 def test_multiclass_logistic_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], [])),
                                      (2.0, 2.0, Vectors.dense(2.0)),
                                      (2.0, 2.0, Vectors.dense(1.9))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertAlmostEqual(s.accuracy, 0.75, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
     self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
     self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Author: Brett-A, Project: spark, Lines: 37, Source: test_training_summary.py

Example 11: map

# input
# Assumed from earlier in the source file: `sc`, `sqlContext`, and `titile`
# (sic), which holds the CSV header line so it can be filtered out below.
from pyspark.ml.feature import PCA
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors  # on Spark 1.x use pyspark.mllib.linalg

rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
map(lambda x: x.split(","))
D = 2 ** 24

def helper1(r):
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, [0.0] * 1932)

new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])
pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label", "pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")

Author: bgutta, Project: Marketing-Response, Lines: 29, Source: logistic_SGD_ML.py
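Example 11 hashes category strings into numeric feature values by hand inside helper1. A related, more standard approach on Spark 2.3+ is pyspark.ml.feature.FeatureHasher, which hashes input columns into a sparse index space instead; a hedged sketch, assuming a hypothetical DataFrame df_raw whose feature columns are named VAR_0000 through VAR_1931:

from pyspark.ml.feature import FeatureHasher

hasher = FeatureHasher(
    inputCols=["VAR_{0:04}".format(i) for i in range(1932)],  # hypothetical column names
    outputCol="features",
    numFeatures=2 ** 24)          # same dimensionality as D above
hashed = hasher.transform(df_raw)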

Example 12: LogisticRegression

from pyspark.ml.classification import LogisticRegression  # import elided in the excerpt
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
Author: lhfei, Project: spark-in-action, Lines: 31, Source: multiclass_logistic_regression_with_elastic_net.py

Example 13: time()

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()
Author: pifouuu, Project: ProjetBigData, Lines: 30, Source: script3_ter.py
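One caveat on Example 13: metricName='precision' was accepted by MulticlassClassificationEvaluator in Spark 1.x, but it was deprecated in 2.0 (micro-averaged precision equals accuracy) and later removed. On current versions the equivalent is, as a sketch:

evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='target_indexed',
    metricName='accuracy')    # replaces the removed 'precision' option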

Example 14: SparkContext

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors  # on Spark 1.x use pyspark.mllib.linalg instead
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
Author: cnglen, Project: learning_spark, Lines: 31, Source: ml_demo.py
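The excerpt stops at the paramMap definition. In Spark's standard pipeline demo, which this script follows, the dictionary is passed to fit() to override the estimator's stored parameters for that call; a sketch of the likely continuation:

# Params supplied to fit() override the values stored on the estimator.
model2 = lr.fit(training, paramMap)       # trains with maxIter=20 instead of 10
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())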

Example 15: test_default_read_write_default_params

    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=(',', ':'))
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata['defaultParamMap']
        metadataStr = json.dumps(metadata, separators=(',', ':'))
        loadedMetadata = reader._parseMetaData(metadataStr)
        with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata['sparkVersion'] = '2.3.0'
        metadataStr = json.dumps(metadata, separators=(',', ':'))
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)
Author: JingchengDu, Project: spark, Lines: 37, Source: test_persistence.py


Note: The pyspark.ml.classification.LogisticRegression class examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers, and copyright remains with the original authors. Consult the corresponding project's license before distributing or using the code; do not reproduce without permission.