This article collects typical usage examples of the Python class pyspark.ml.classification.LogisticRegression. If you are unsure what the LogisticRegression class does, how to use it, or are looking for working examples, the curated code samples below may help.
The following presents 15 code examples of the LogisticRegression class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: test_logistic_regression_summary
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
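As a short follow-up, a minimal sketch of how the summary's curve DataFrames could be consumed outside the test, assuming the `model` fitted above; the column names (`FPR`/`TPR` for the ROC curve, `recall`/`precision` for the PR curve) are the usual ones for the binary summary but worth verifying against your Spark version.

# Minimal sketch (assumes `model` from the example above). The curves are
# ordinary DataFrames, so they can be collected, filtered, or plotted.
summary = model.summary
roc_points = [(row["FPR"], row["TPR"]) for row in summary.roc.collect()]
pr_points = [(row["recall"], row["precision"]) for row in summary.pr.collect()]
print("areaUnderROC: %f, ROC points: %d" % (summary.areaUnderROC, len(roc_points)))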
Example 2: test_default_read_write
def test_default_read_write(self):
    temp_path = tempfile.mkdtemp()
    lr = LogisticRegression()
    lr.setMaxIter(50)
    lr.setThreshold(.75)
    writer = DefaultParamsWriter(lr)
    savePath = temp_path + "/lr"
    writer.save(savePath)
    reader = DefaultParamsReadable.read()
    lr2 = reader.load(savePath)
    self.assertEqual(lr.uid, lr2.uid)
    self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())
    # test overwrite
    lr.setThreshold(.8)
    writer.overwrite().save(savePath)
    reader = DefaultParamsReadable.read()
    lr3 = reader.load(savePath)
    self.assertEqual(lr.uid, lr3.uid)
    self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Example 3: test_binomial_logistic_regression_with_bound
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "features"])

    lor = LogisticRegression(regParam=0.01, weightCol="weight",
                             lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                             upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)
    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
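The test sets only two of the four available bound parameters. Below is a hedged sketch of the full set with illustrative values; for binomial logistic regression the coefficient bounds are 1 x numFeatures matrices and the intercept bounds are length-1 vectors.

# Illustrative sketch only: all four box-constraint parameters for binomial LR.
from pyspark.ml.linalg import Matrices, Vectors

bounded_lor = LogisticRegression(
    regParam=0.01, weightCol="weight",
    lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
    upperBoundsOnCoefficients=Matrices.dense(1, 2, [1.0, 1.0]),
    lowerBoundsOnIntercepts=Vectors.dense(-1.0),
    upperBoundsOnIntercepts=Vectors.dense(0.0))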
Example 4: test_logistic_regression
def test_logistic_regression(self):
    lr = LogisticRegression(maxIter=1)
    path = tempfile.mkdtemp()
    lr_path = path + "/logreg"
    lr.save(lr_path)
    lr2 = LogisticRegression.load(lr_path)
    self.assertEqual(lr2.uid, lr2.maxIter.parent,
                     "Loaded LogisticRegression instance uid (%s) "
                     "did not match Param's uid (%s)"
                     % (lr2.uid, lr2.maxIter.parent))
    self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                     "Loaded LogisticRegression instance default params did not match " +
                     "original defaults")
    try:
        rmtree(path)
    except OSError:
        pass
Example 5: test_multinomial_logistic_regression_with_bound
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lor = LogisticRegression(regParam=0.01,
                             lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                             upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)
    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    for i in range(0, len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
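A brief follow-up, assuming the `model` fitted above: the multinomial model stores a numClasses x numFeatures coefficient matrix and a per-class intercept vector, which is why the bound matrix above is declared as 3 x 4.

# Sketch (assumes `model` from the example above): inspect the fitted shapes.
coef = model.coefficientMatrix                              # numClasses x numFeatures
print("shape: %d x %d" % (coef.numRows, coef.numCols))      # expected: 3 x 4
print(model.interceptVector)                                # one intercept per class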
Example 6: test_int_to_float
def test_int_to_float(self):
    from pyspark.mllib.linalg import Vectors
    df = self.sc.parallelize([
        Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
    lr = LogisticRegression(elasticNetParam=0)
    lr.fit(df)
    lr.setElasticNetParam(0)
    lr.fit(df)
Example 7: train
def train(self, rdd):
    """
    :return: Trained model to be passed to test.
    """
    options = self.options
    if options.reg_type == "elastic-net":  # use spark.ml
        lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                  elasticNetParam=options.elastic_net_param)
        # TODO: Do not include time for conversion to DataFrame (but this currently matches
        #       the Scala tests)
        df = rdd.toDF()
        lrModel = lr.fit(df)
        numFeatures = len(lrModel.weights)
        numClasses = 2
        return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                       numFeatures, numClasses)
    else:
        if options.loss == "logistic":
            if options.optimizer == "sgd":
                return LogisticRegressionWithSGD.train(data=rdd,
                                                       iterations=options.num_iterations,
                                                       step=options.step_size,
                                                       miniBatchFraction=1.0,
                                                       regParam=options.reg_param,
                                                       regType=options.reg_type)
            elif options.optimizer == "l-bfgs":
                return LogisticRegressionWithLBFGS.train(data=rdd,
                                                         iterations=options.num_iterations,
                                                         regParam=options.reg_param,
                                                         regType=options.reg_type,
                                                         tolerance=0.0)
            else:
                raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                " optimizer = %s" % (options.loss, options.optimizer))
        elif options.loss == "hinge":
            if options.optimizer == "sgd":
                return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                        step=options.step_size, regParam=options.reg_param,
                                        miniBatchFraction=1.0, regType=options.reg_type)
        else:
            raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)
Example 8: buil_lrmodel
def buil_lrmodel(path):
    df = load_data(path)

    # -------------------- preparing the dataset -------------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    print "count = ", df.count()

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # ------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :", evaluator.evaluate(prediction)

    # -------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print "classification evaluation :", evaluator.evaluate(prediction)

    return cvModel, avg_age
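Once the cross-validation has finished, the tuned model can be inspected. A hedged sketch, assuming the `cvModel` returned by buil_lrmodel above:

# Sketch (assumes `cvModel` from above): the winning model and the averaged
# evaluator score for each point of the parameter grid are both exposed.
best_model = cvModel.bestModel        # a fitted LogisticRegressionModel
print(best_model.coefficients)
print(best_model.intercept)
print(cvModel.avgMetrics)             # one score per grid combination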
Example 9: anom_with_lr
def anom_with_lr():
    try:
        prepared_data = split_data()
        train = prepared_data['train']
        test = prepared_data['test']
        for_finding_more = prepared_data['for_finding_more']

        # We set regParam = 0 to make it comparable with LogisticRegressionWithSGD that we used
        # before, which does not do any regularization by default. With regParam = 0, the value of
        # elasticNetParam should not matter: elasticNetParam = 0 is ridge regression (L2), which
        # keeps all features; elasticNetParam = 1 is LASSO (L1), which performs feature selection.
        # With regParam = 0, test accuracy is 0.9454, fpr is 0.0713, fnr is 0.0375, on a sample of
        # 50K test data points.
        lr = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0.0)

        t0 = time()
        model = lr.fit(train)
        tt = time() - t0
        print "Classifier trained in {0} seconds".format(round(tt, 3))

        t0 = time()
        predictions = model.transform(test)  # Feed the test DataFrame as-is; no need to feed the features only
        tt = time() - t0
        print "Prediction made in {0} seconds".format(round(tt, 3))

        # Adding probability to test data set for calibration
        labelsAndPreds = predictions.map(lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')

        test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data_size)
        fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 0).count()
        fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 1).count()
        print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))

        # toDF() in the next line did not work without round(): some issue with float
        for_finding_more = model.transform(for_finding_more).map(lambda p: (p.label, round(p.probability[1], 5)))
        for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
        for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
        # Top one has probability of 0.9999, last one has probability 0.05159, 75 of them above 0.99
        for_finding_more.select('predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark')
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
Example 10: test_multiclass_logistic_regression_summary
def test_multiclass_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], [])),
                                     (2.0, 2.0, Vectors.dense(2.0)),
                                     (2.0, 2.0, Vectors.dense(1.9))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertAlmostEqual(s.accuracy, 0.75, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Example 11: map
# input
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
    map(lambda x: x.split(","))
D = 2 ** 24

def helper1(r):
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, [0.0] * 1932)

new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])

pca = PCA(k=1000, inputCol="features", outputCol="pca_features")
model_pca = pca.fit(df)
rdd_pca = model_pca.transform(df).select(["label", "pca_features"])
rdd_pca1 = rdd_pca.withColumnRenamed('pca_features', 'features')
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
lr = LogisticRegression(maxIter=100, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/lr_pca_1000_001")
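The snippet saves raw probabilities but never scores the held-out split. A minimal, hedged sketch of adding an evaluation step, assuming `model` and `testData` from above:

# Sketch (assumes `model` and `testData` from above): area under ROC on the test split.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
print(evaluator.evaluate(model.transform(testData)))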
Example 12: LogisticRegression
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MulticlassLogisticRegressionWithElasticNet") \
        .getOrCreate()

    # $example on$
    # Load training data
    training = spark \
        .read \
        .format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for multinomial logistic regression
    print("Coefficients: \n" + str(lrModel.coefficientMatrix))
    print("Intercept: " + str(lrModel.interceptVector))

    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
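In recent Spark releases the training summary also exposes aggregate and per-label metrics; a hedged continuation of the example above (the per-label fields require a reasonably recent Spark version):

    # Sketch (assumes `trainingSummary` from the example above).
    print("Accuracy: " + str(trainingSummary.accuracy))
    print("FPR by label: " + str(trainingSummary.falsePositiveRateByLabel))
    print("TPR by label: " + str(trainingSummary.truePositiveRateByLabel))
    print("Weighted F-measure: " + str(trainingSummary.weightedFMeasure()))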
Example 13: time
tt = time() - t0
print "Done in {} second".format(round(tt,3))
# In[18]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
print "Fitting the classifier on selected features"
t0 = time()
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)
tt = time() - t0
print "Done in {} second".format(round(tt,3))
# In[19]:
print "Testing precision of the model"
t0 = time()
Example 14: SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)
# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
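The snippet stops mid-way; a hedged sketch of how such a paramMap is typically used next, passing it to fit() so the supplied values override those stored on `lr` for that one call (the extra values below are illustrative only):

# Sketch (assumes `lr`, `training`, and `paramMap` from above).
paramMap[lr.maxIter] = 30                                 # overwrite the earlier entry
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})   # illustrative extra params
model2 = lr.fit(training, paramMap)                       # params override lr's own settings
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())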
Example 15: test_default_read_write_default_params
def test_default_read_write_default_params(self):
    lr = LogisticRegression()
    self.assertFalse(lr.isSet(lr.getParam("threshold")))

    lr.setMaxIter(50)
    lr.setThreshold(.75)

    # `threshold` is set by user, default param `predictionCol` is not set by user.
    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    writer = DefaultParamsWriter(lr)
    metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
    self.assertTrue("defaultParamMap" in metadata)

    reader = DefaultParamsReadable.read()
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr, )
    reader.getAndSetParams(lr, loadedMetadata)

    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    # manually create metadata without `defaultParamMap` section.
    del metadata['defaultParamMap']
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr, )
    with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"):
        reader.getAndSetParams(lr, loadedMetadata)

    # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
    metadata['sparkVersion'] = '2.3.0'
    metadataStr = json.dumps(metadata, separators=[',', ':'])
    loadedMetadata = reader._parseMetaData(metadataStr, )
    reader.getAndSetParams(lr, loadedMetadata)
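DefaultParamsWriter and DefaultParamsReadable are internal helpers that this test exercises directly; in application code the public read/write API is the usual route. A hedged sketch (the path is illustrative, and an active SparkSession is assumed):

# Sketch: round-tripping an estimator with the public read/write API.
import tempfile
from pyspark.ml.classification import LogisticRegression

save_path = tempfile.mkdtemp() + "/lr_public"   # illustrative location
lr = LogisticRegression(maxIter=50, threshold=0.75)
lr.write().overwrite().save(save_path)
lr_loaded = LogisticRegression.load(save_path)
print(lr_loaded.getMaxIter())    # 50
print(lr_loaded.getThreshold())  # 0.75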