This article collects typical Python usage examples of pyspark.ml.classification.LogisticRegression. If you have been wondering what classification.LogisticRegression does, how to use it, or where to find working examples, the curated code samples below should help. You can also explore the containing module, pyspark.ml.classification, for further details.
The 12 code examples of classification.LogisticRegression below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
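Before the collected examples, here is a minimal self-contained sketch of the basic fit/transform cycle. The toy data and values are illustrative assumptions, not taken from any example below; "label" and "features" are the estimator's default column names:

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[*]").appName("lr-minimal").getOrCreate()

# Tiny binary-labeled DataFrame using the default column names.
df = spark.createDataFrame([
    (0.0, Vectors.dense([0.0, 1.1])),
    (1.0, Vectors.dense([2.0, 1.0])),
    (0.0, Vectors.dense([0.5, 0.3])),
    (1.0, Vectors.dense([1.5, 1.2])),
], ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
model = lr.fit(df)
model.transform(df).select("features", "probability", "prediction").show()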
Example 1: test_featurizer_in_pipeline
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))
    lrModel = pipeline.fit(train_df)

    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
Example 2: test_one_vs_rest
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def test_one_vs_rest(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(data)

    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 3: main
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
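As a follow-up to this example, the winning grid point can be inspected directly on the fitted CrossValidatorModel. A minimal hedged sketch, assuming the cvModel and paramGrid objects built above (the evaluator's default metric, areaUnderROC, is larger-is-better):

# Pair each averaged metric with its parameter combination.
for params, metric in zip(paramGrid, cvModel.avgMetrics):
    print(metric, {p.name: v for p, v in params.items()})

# bestModel is the fitted PipelineModel for the best grid point;
# its last stage is the fitted LogisticRegressionModel.
best_lr_model = cvModel.bestModel.stages[-1]
print(best_lr_model.coefficients.size, "coefficients")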
Example 4: main
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]  # the fitted LogisticRegressionModel
    print(lrModel)  # summary only
Example 5: dump_training_info
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def dump_training_info(blorModel):
    """
    This function is useful for debugging when we do not converge to a
    solution during LogisticRegression.
    """
    trainingSummary = blorModel.summary
    print("Total iterations: %d" % trainingSummary.totalIterations)
    print("Intercepts: " + str(blorModel.intercept))
    print("Coefficients: " + str(blorModel.coefficients))

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)
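A hedged usage sketch for the helper above; the toy DataFrame and the active SparkSession named spark are illustrative assumptions, and any fitted binary LogisticRegressionModel would work:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

# Assumes an active SparkSession named `spark`; toy data for illustration only.
df = spark.createDataFrame([
    (0.0, Vectors.dense([0.0, 1.0])),
    (1.0, Vectors.dense([1.0, 0.0])),
], ["label", "features"])

blorModel = LogisticRegression(maxIter=10).fit(df)
dump_training_info(blorModel)  # prints iterations, intercept, coefficients, objective history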
Example 6: compute_regression
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def compute_regression(spark, rdd_list, regParam, elasticNetParam):
    df0 = spark.sparkContext.union(rdd_list).toDF()
    blor = LogisticRegression(
        maxIter=50,
        regParam=regParam,
        weightCol="weight",
        elasticNetParam=elasticNetParam,
    )
    blorModel = blor.fit(df0)
    return blorModel
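Because the estimator above sets weightCol="weight", each RDD in rdd_list must yield rows with label, features, and weight columns after toDF(). A minimal hedged sketch of building such input (the values are illustrative):

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Assumes an active SparkSession named `spark`.
rdd = spark.sparkContext.parallelize([
    Row(label=0.0, features=Vectors.dense([0.2, 0.9]), weight=1.0),
    Row(label=1.0, features=Vectors.dense([0.9, 0.1]), weight=2.0),
])
model = compute_regression(spark, [rdd], regParam=0.1, elasticNetParam=0.0)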
Example 7: spark_model_iris
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example 8: spark_model_estimator
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def spark_model_estimator(iris_df, spark_context):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = assembler.transform(iris_spark_df)
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model on the pre-assembled features
    model = lr.fit(features_df)
    preds_df = model.transform(features_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=features_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
Example 9: test_LogisticRegression_spark2skl
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def test_LogisticRegression_spark2skl(self):
    lr = LogisticRegression().fit(self.df)
    skl_lr = self.converter.toSKLearn(lr)
    self.assertTrue(isinstance(skl_lr, SKL_LogisticRegression),
                    "Expected sklearn LogisticRegression but found type %s" % type(skl_lr))
    self._compare_GLMs(skl_lr, lr)
    # Make sure this doesn't throw an error
    skl_lr.predict_proba(self.X)
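This test appears to exercise the spark-sklearn package's Converter. A hedged sketch of how such a conversion would be used for fast local scoring, assuming that package's API (Converter, toSKLearn), an active SparkContext sc, and a fitted Spark model spark_lr_model with four features:

import numpy
from spark_sklearn import Converter  # assumption: the spark-sklearn package

converter = Converter(sc)
skl_lr = converter.toSKLearn(spark_lr_model)
# The result is a plain scikit-learn LogisticRegression, so numpy input
# works without going through a Spark DataFrame.
probs = skl_lr.predict_proba(numpy.array([[0.1, 0.2, 0.3, 0.4]]))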
Example 10: test_model_logistic_regression_binary_class
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def test_model_logistic_regression_binary_class(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # truncate the features
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(5, range(0, 5), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("label", "truncateFeatures(features) as features")

    lr = LogisticRegression(maxIter=100, tol=0.0001)
    model = lr.fit(data)
    # the name of the input for Logistic Regression is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, C]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    import pandas
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    # known error in onnxruntime 0.3.0 case
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLogisticRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
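The run_onnx_model helper is project-specific; for reference, the same scoring can be done directly with onnxruntime. A hedged sketch reusing onnx_model_path and data_np from above ('features' is the input name declared in convert_sparkml):

import onnxruntime

sess = onnxruntime.InferenceSession(onnx_model_path)
# Feed the float32 feature matrix under the declared input name and
# fetch both declared outputs.
output = sess.run(['prediction', 'probability'], {'features': data_np})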
Example 11: main
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def main(argv):
    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetClassifier") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)
    print(f"Feature count : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count : {classCount}")

    print(f"Dataset size (unbalanced) : {data.count()}")
    data.groupby(label).count().show(classCount)
    data = datasetBalancer.downsample(data, label, 1)
    print(f"Dataset size (balanced) : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
        .setBlockSize(128) \
        .setSeed(1234) \
        .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f sec." % (end - start))
Example 12: test_model_pipeline_4_stage
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import LogisticRegression [as alias]
def test_model_pipeline_4_stage(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv') \
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

    stages = []
    for col in cols:
        stages.append(StringIndexer(inputCol=col, outputCol=col + '_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(inputCols=[col + '_index'], outputCols=[col + '_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c + '_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
        ('income', StringTensorType([1, 1])),
        ('workclass', StringTensorType([1, 1])),
        ('education', StringTensorType([1, 1])),
        ('marital_status', StringTensorType([1, 1]))
    ])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'income': test_data.select('income').toPandas().values,
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    expected = [
        predicted.toPandas().label.values.astype(numpy.float32),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
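A version caveat for this pipeline: OneHotEncoderEstimator exists only in Spark 2.3–2.4 and was renamed to OneHotEncoder in Spark 3.0. On a current Spark, the loop body above would build the encoder stage like this (a hedged sketch; the rest of the pipeline is unchanged):

from pyspark.ml.feature import OneHotEncoder

# Spark >= 3.0: OneHotEncoder carries the multi-column estimator API.
stages.append(OneHotEncoder(inputCols=[col + '_index'], outputCols=[col + '_vec'], dropLast=False))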