This article collects typical usage examples of the Python method pyspark.ml.classification.LogisticRegression.fit. If you are unsure what LogisticRegression.fit does, how to call it, or what working code looks like, the curated examples below should help; you can also explore other uses of the containing class, pyspark.ml.classification.LogisticRegression.
The 15 code examples of LogisticRegression.fit shown below are ordered by popularity by default.
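Before the collected examples, here is a minimal, self-contained sketch of the basic call pattern. The toy DataFrame and parameter values are made up for illustration, and it assumes Spark 2.x or later, where feature vectors come from pyspark.ml.linalg:

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()

# Toy training data: a label column and a vector-valued features column.
df = spark.createDataFrame(
    [(1.0, Vectors.dense(0.0, 1.1)),
     (0.0, Vectors.dense(2.0, 1.0)),
     (1.0, Vectors.dense(0.5, 2.3))],
    ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
model = lr.fit(df)  # fit() returns a LogisticRegressionModel
print(model.coefficients, model.intercept)
print(model.transform(df).select("prediction").collect())

fit() returns a LogisticRegressionModel, which is what the examples below inspect (coefficients, summary) or use to transform new data.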
Example 1: test_int_to_float
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def test_int_to_float(self):
    from pyspark.mllib.linalg import Vectors
    df = self.sc.parallelize([
        Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
    lr = LogisticRegression(elasticNetParam=0)
    lr.fit(df)
    lr.setElasticNetParam(0)
    lr.fit(df)
Example 2: test_logistic_regression_summary
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example 3: test_binomial_logistic_regression_with_bound
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def test_binomial_logistic_regression_with_bound(self):
    df = self.spark.createDataFrame(
        [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
         (0.0, 2.0, Vectors.dense(1.0, 2.0)),
         (1.0, 3.0, Vectors.dense(2.0, 1.0)),
         (0.0, 4.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "features"])
    lor = LogisticRegression(regParam=0.01, weightCol="weight",
                             lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                             upperBoundsOnIntercepts=Vectors.dense(0.0))
    model = lor.fit(df)
    self.assertTrue(
        np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
    self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example 4: test_multinomial_logistic_regression_with_bound
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    lor = LogisticRegression(regParam=0.01,
                             lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                             upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)
    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    for i in range(0, len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
Example 5: train
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def train(self, rdd):
    """
    :return: Trained model to be passed to test.
    """
    options = self.options
    if options.reg_type == "elastic-net":  # use spark.ml
        lr = MLLogisticRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                  elasticNetParam=options.elastic_net_param)
        # TODO: Do not include time for conversion to DataFrame (but this currently matches
        # the Scala tests)
        df = rdd.toDF()
        lrModel = lr.fit(df)
        numFeatures = len(lrModel.weights)
        numClasses = 2
        return LogisticRegressionModel(lrModel.weights, lrModel.intercept,
                                       numFeatures, numClasses)
    else:
        if options.loss == "logistic":
            if options.optimizer == "sgd":
                return LogisticRegressionWithSGD.train(data=rdd,
                                                       iterations=options.num_iterations,
                                                       step=options.step_size,
                                                       miniBatchFraction=1.0,
                                                       regParam=options.reg_param,
                                                       regType=options.reg_type)
            elif options.optimizer == "l-bfgs":
                return LogisticRegressionWithLBFGS.train(data=rdd,
                                                         iterations=options.num_iterations,
                                                         regParam=options.reg_param,
                                                         regType=options.reg_type,
                                                         tolerance=0.0)
            else:
                raise Exception("GLMClassificationTest cannot run with loss = %s,"
                                " optimizer = %s" % (options.loss, options.optimizer))
        elif options.loss == "hinge":
            if options.optimizer == "sgd":
                return SVMWithSGD.train(data=rdd, iterations=options.num_iterations,
                                        step=options.step_size, regParam=options.reg_param,
                                        miniBatchFraction=1.0, regType=options.reg_type)
        else:
            raise Exception("GLMClassificationTest does not recognize loss: %s" % options.loss)
Example 6: buil_lrmodel
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def buil_lrmodel(path):
    df = load_data(path)

    # -------------------- preparing the dataset -------------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    print "count = ", df.count()
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # ------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)
    prediction = model.transform(df)
    prediction.show(truncate=False)
    evaluator = BinaryClassificationEvaluator()
    print "classification evaluation :", evaluator.evaluate(prediction)

    # -------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000]) \
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print "classification evaluation :", evaluator.evaluate(prediction)

    return cvModel, avg_age
Example 7: anom_with_lr
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def anom_with_lr():
    try:
        prepared_data = split_data()
        train = prepared_data['train']
        test = prepared_data['test']
        for_finding_more = prepared_data['for_finding_more']
        # We set regParam = 0 to make it comparable with LogisticRegressionWithSGD that we used before,
        # which does not do any regularization by default. With regParam = 0, the value of elasticNetParam
        # should not matter: elasticNetParam = 0 is ridge regression (L2) and keeps all features, while
        # elasticNetParam = 1 is LASSO (L1) and performs feature selection.
        # With regParam = 0, test accuracy is 0.9454, fpr is 0.0713, fnr is 0.0375, on a sample of 50K test data points.
        lr = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0.0)
        t0 = time()
        model = lr.fit(train)
        tt = time() - t0
        print "Classifier trained in {0} seconds".format(round(tt, 3))
        t0 = time()
        predictions = model.transform(test)  # Feed the test DataFrame as-is, no need to feed the features only
        tt = time() - t0
        print "Prediction made in {0} seconds".format(round(tt, 3))
        # Adding probability to the test data set for calibration
        labelsAndPreds = predictions.map(lambda p: (p.label, p.prediction, round(p.probability[1], 5)))
        labelsAndPreds.toDF(["label", "predicted_label", "predicted_prob"]).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/labelsAndPreds/logistic_regression')
        test_accuracy = labelsAndPreds.filter(lambda (v, p, r): v == p).count()/float(test_data_size)
        fpr = labelsAndPreds.filter(lambda (v, p, r): (v == 0 and p == 1)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 0).count()
        fnr = labelsAndPreds.filter(lambda (v, p, r): (v == 1 and p == 0)).count()/labelsAndPreds.filter(lambda (v, p, r): v == 1).count()
        print "Test accuracy is {0}, fpr is {1}, fnr is {2}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))
        # toDF() in the next line did not work without round(): some issue with float
        for_finding_more = model.transform(for_finding_more).map(lambda p: (p.label, round(p.probability[1], 5)))
        for_finding_more = for_finding_more.toDF(["label", "predicted_prob"])
        for_finding_more = for_finding_more.orderBy(for_finding_more.predicted_prob.desc())
        # Top one has probability of 0.9999, last one has probability 0.05159, 75 of them above 0.99
        for_finding_more.select('predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save(home_folder + '/healthcare/data/cloudera_challenge/additional_10000_from_spark')
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return
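The comments in Example 7 explain that elasticNetParam only matters once regParam is non-zero: 0.0 selects a pure L2 (ridge) penalty, 1.0 a pure L1 (LASSO) penalty that can zero out coefficients. A minimal sketch contrasting the two settings on made-up toy data (the column names, feature values, and regParam=0.3 are hypothetical; assumes Spark 2.x or later):

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0, 0.0, 3.0)),
     (0.0, Vectors.dense(2.0, 1.0, 0.5)),
     (1.0, Vectors.dense(0.0, 2.0, 4.0)),
     (0.0, Vectors.dense(3.0, 1.0, 1.0))],
    ["label", "features"])

# elasticNetParam=0.0 -> pure L2 penalty (ridge): coefficients shrink but stay non-zero.
ridge = LogisticRegression(maxIter=50, regParam=0.3, elasticNetParam=0.0).fit(df)
# elasticNetParam=1.0 -> pure L1 penalty (LASSO): some coefficients are driven to exactly zero.
lasso = LogisticRegression(maxIter=50, regParam=0.3, elasticNetParam=1.0).fit(df)

print("L2 coefficients:", ridge.coefficients)
print("L1 coefficients:", lasso.coefficients)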
Example 8: test_multiclass_logistic_regression_summary
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def test_multiclass_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], [])),
                                     (2.0, 2.0, Vectors.dense(2.0)),
                                     (2.0, 2.0, Vectors.dense(1.9))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertAlmostEqual(s.accuracy, 0.75, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Example 9: sex_to_bin
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
test = test.select('Pclass', 'Sex', 'SibSp', 'Parch')
train = sex_to_bin(train)
test = sex_to_bin(test)
print "number of men in train and test resp. : %d, %d" \
%(train.select('Sex').map(lambda x: x.Sex).sum() \
,test.select('Sex').map(lambda x: x.Sex).sum())
# format train for Logistic Regression as (label, features)
ntrain = train.map(lambda x: Row(label = float(x[0]) \
,features = Vectors.dense(x[1:]))).toDF().cache() # Logistic Regression is iterative, need caching
ntest = test.map(lambda x: Row(features = Vectors.dense(x[0:]))).toDF()
lr = LogisticRegression(maxIter = 100, regParam = 0.1)
model = lr.fit(ntrain)
pred = model.transform(ntest).select('prediction').map(lambda x: x.prediction)
# configure the submission format as follows
submit = sqlCtx.createDataFrame(testPassengerId.zip(pred), ["PassengerId", "Survived"])
"""
NOTE: rdd1.zip(rdd2) works provided that both RDDs have the same partitioner and the same number
of elements per partition, otherwise should either repartition or can do:
submit = sqlCtx.createDataFrame(pred.zipWithIndex().map(lambda x: (x[1]+892L, x[0])), ["PassengerId", "Survived"])
where 891L is the number of training samples
"""
os.chdir(DATADIR)
# file is small so can save pandas.DataFrame as csv
submit.toPandas().to_csv("prediction.csv", index = False)
# if not, should saveAsTextFile:
# submit.rdd.saveAsTextFile("/home/ehsan/Python/PySpark/Titanic/data/prediction")
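The note in Example 9 warns that rdd1.zip(rdd2) requires both RDDs to share the same partitioning and per-partition element counts, and suggests a zipWithIndex() workaround. A minimal sketch of that workaround with a stand-in prediction RDD (the offset 892 and the column names come from the example; everything else is made up):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Stand-in for the `pred` RDD of predicted labels produced above.
pred = spark.sparkContext.parallelize([0.0, 1.0, 1.0, 0.0])

# zipWithIndex() yields (element, index); shift the index to build PassengerId.
submit = spark.createDataFrame(
    pred.zipWithIndex().map(lambda x: (x[1] + 892, x[0])),
    ["PassengerId", "Survived"])
submit.show()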
Example 10: time
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
print "Fitting the classifier on selected features"
t0 = time()
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)
tt = time() - t0
print "Done in {} second".format(round(tt,3))
# In[19]:
print "Testing precision of the model"
t0 = time()
dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res=evaluator.evaluate(df_valid_pred)
print res
Example 11: Row
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))])
# Create a LogisticRegression instance with maxIter = 10.
# This instance is an Estimator.
lr = LogisticRegression(maxIter=10)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# We may also set parameters using setter methods.
lr.setRegParam(0.01)
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
# Since model1 is a Model (i.e., a Transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters:\n")
pprint.pprint(model1.extractParamMap())
# We may alternatively specify parameters using a parameter map.
# paramMap overrides all lr parameters set earlier.
paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
# Now learn a new model using the new parameters.
model2 = lr.fit(training, paramMap)
print("Model 2 was fit using parameters:\n")
Example 12: LogisticRegression
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
# COMMAND ----------
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")
# COMMAND ----------
print lr.explainParams()
# COMMAND ----------
fittedLR = lr.fit(train)
# COMMAND ----------
train, test = df.randomSplit([0.7, 0.3])
# COMMAND ----------
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")
# COMMAND ----------
Author: yehonatc, Project: Spark-The-Definitive-Guide, Lines: 31, Source: Advanced_Analytics_and_Machine_Learning-Chapter_24_Advanced_Analytics_and_Machine_Learning.py
Example 13: VectorAssembler
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
(161.6, 61.2, 28)]).toDF("height", "weight", "age")
training.show(truncate=False)
assembler = VectorAssembler(inputCols=["height", "weight", "age"], outputCol="features")
# add a features column to the training data
assembled_training = assembler.transform(training)
assembled_training.show(truncate=False)
# model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")
# build the model
model = lr.fit(assembled_training)
# generate predictions
model.transform(assembled_training).show()
# pipeline
pipeline = Pipeline(stages=[assembler, lr])
# build the pipeline model
pipelineModel = pipeline.fit(training)
# generate predictions with the pipeline model
pipelineModel.transform(training).show()
path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"
Example 14: train_logistic
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
def train_logistic(df):
    lr = LogisticRegression(maxIter=LR_MAX_ITER, regParam=LR_REG_PARAM)
    return lr, lr.fit(df)
Example 15: oneHotEncodeColumns
# Required import: from pyspark.ml.classification import LogisticRegression [as alias]
# Or: from pyspark.ml.classification.LogisticRegression import fit [as alias]
dfhot = oneHotEncodeColumns(dfnumeric, ["workclass", "education", "marital_status", "occupation", "relationship", "race", "native_country"])
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfhot.columns[0:-1])
lpoints = va.transform(dfhot).select("features", "income").withColumnRenamed("income", "label")
#section 8.2.3
splits = lpoints.randomSplit([0.8, 0.2])
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept
#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)