This article collects typical usage examples of the Python method pyspark.ml.linalg.Vectors.sparse. If you have been wondering what Vectors.sparse does, how to call it, or where to find working examples, the curated snippets below should help. You can also explore the containing class, pyspark.ml.linalg.Vectors, for further usage examples.
The following presents 15 code examples of Vectors.sparse, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
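Before diving in, a quick orientation: Vectors.sparse(size, ...) accepts its nonzero entries in three equivalent forms, and all three appear in the examples below. A minimal sketch:

from pyspark.ml.linalg import Vectors

v1 = Vectors.sparse(4, [1, 3], [1.0, 5.5])      # size, index list, value list
v2 = Vectors.sparse(4, {1: 1.0, 3: 5.5})        # size, {index: value} dict
v3 = Vectors.sparse(4, [(1, 1.0), (3, 5.5)])    # size, [(index, value), ...] pairs

assert v1 == v2 == v3
assert list(v1.toArray()) == [0.0, 1.0, 0.0, 5.5]

The index/value-list form dominates these tests; the pair-list form shows up in Example 7 and the dict form in Example 10.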
Example 1: test_small_sparse
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_small_sparse(self):
    # XOR truth table encoded as (label, sparse feature vector) rows
    xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
           (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
           (1.0, Vectors.sparse(2, [0], [1.0])),
           (1.0, Vectors.sparse(2, [1], [1.0]))]
    processed = self.spark.createDataFrame(xor, ["label", "features"])
    mg = build_graph(SparkFlowTests.create_model)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=35,
        partitions=2,
        predictionCol='predicted',
        labelCol='label'
    )
    assert spark_model.fit(processed).transform(processed).collect() is not None
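Note that the first XOR row stores explicit zero values: Vectors.sparse does not require stored entries to be nonzero. A small sketch of the consequence:

from pyspark.ml.linalg import Vectors

v = Vectors.sparse(2, [0, 1], [0.0, 0.0])
print(v.numNonzeros())    # 0 -- both stored entries happen to be zero
print(list(v.toArray()))  # [0.0, 0.0], same contents as Vectors.dense(0.0, 0.0)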
Example 2: test_linear_regression_pmml_basic
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (i.e. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
Example 3: test_onevsrest
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
Example 4: test_gaussian_mixture_summary
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
Example 5: test_model_linear_regression_basic
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_model_linear_regression_basic(self):
    data = self.spark.createDataFrame([
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], []))
    ], ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
    model = lr.fit(data)
    # the name of the input is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic',
                                 [('features', FloatTensorType([1, C]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    import pandas
    predicted = model.transform(data)
    # flatten the vector column into a 2-D float32 numpy array
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [predicted.toPandas().prediction.values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLinearRegressor_Basic")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 6: test_aft_regression_survival
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_aft_regression_survival(self):
    data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0), 1.0),
        (1e-40, Vectors.sparse(1, [], []), 0.0)
    ], ["label", "features", "censor"])
    aft = AFTSurvivalRegression()
    model = aft.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml AFTSurvivalRegression', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlAFTSurvivalRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 7: test_model_polynomial_expansion
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame([
        (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)
    ], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)
    # the ONNX input name should match the estimator's inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(model, 'Sparkml PCA',
                                 [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlPCA")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['pca_features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 8: test_gbt_regressor
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_gbt_regressor(self):
    data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))
    ], ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml GBTRegressor', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 9: test_java_object_gets_detached
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    summary = model.summary
    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)
    error_no_object = 'Target Object ID does not exist for this gateway'
    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())
    model.__del__()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())
    try:
        summary.__del__()
    except:
        pass
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
Example 10: test_persistence
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
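The LDA example above passes the dict form, Vectors.sparse(2, {0: 1.0}); it constructs exactly the same SparseVector as the index/value-list form:

from pyspark.ml.linalg import Vectors

assert Vectors.sparse(2, {0: 1.0}) == Vectors.sparse(2, [0], [1.0])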
Example 11: test_linear_regression_summary
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_linear_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertAlmostEqual(s.r2adj, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    self.assertEqual(s.degreesOfFreedom, 1)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class LinearRegressionTrainingSummary runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
Example 12: test_binary_logistic_regression_summary
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_binary_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    self.assertAlmostEqual(s.accuracy, 1.0, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
    self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
    self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example 13: test_multiclass_logistic_regression_summary
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_multiclass_logistic_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], [])),
                                     (2.0, 2.0, Vectors.dense(2.0)),
                                     (2.0, 2.0, Vectors.dense(1.9))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    self.assertAlmostEqual(s.accuracy, 0.75, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
    self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
    self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
Example 14: test_bisecting_kmeans_summary
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_bisecting_kmeans_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2)
    model = bkm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
Example 15: test_copy
# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import sparse [as alias]
def test_copy(self):
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))],
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")