This article collects typical usage examples of the Python method pyspark.ml.linalg.Vectors.dense. If you have been wondering what Vectors.dense does, how to call it, or where to find working examples, the curated code samples below should help. You may also want to explore further usage examples of the enclosing class, pyspark.ml.linalg.Vectors.
The following shows 15 code examples of Vectors.dense, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
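Before the examples, a minimal self-contained sketch of the method itself: Vectors.dense builds a DenseVector from individual numbers or from a single iterable, and is most often used to assemble a vector-typed "features" column (the SparkSession setup here is illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# Vectors.dense accepts varargs of floats or a single list/array.
v1 = Vectors.dense(1.0, 2.0, 3.0)
v2 = Vectors.dense([1.0, 2.0, 3.0])
assert v1 == v2

# Typical pattern: build a DataFrame with a vector-typed "features" column.
df = spark.createDataFrame(
    [(0.0, Vectors.dense(1.0, 0.5)),
     (1.0, Vectors.dense(0.2, 0.8))],
    ["label", "features"])
df.show()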
Example 1: append_features
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# This excerpt (from the mjolnir project) also assumes, not shown: import numpy as np;
# from pyspark.sql import functions as F; from pyspark.ml.linalg import VectorUDT;
# import mjolnir.spark
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    # cols is an argument tuple; coerce it to a list before concatenating with
    # the metadata list (list + tuple would raise TypeError).
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example 2: zero_features
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes the same imports as Example 1 (np, F, VectorUDT, mjolnir.spark)
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
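Examples 1 and 2 share one core pattern: a plain Python function over vectors, wrapped as a UDF with VectorUDT declared as the return type. A self-contained sketch of that pattern, leaving out the mjolnir-specific metadata handling:

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense(1.0, 2.0),), (Vectors.dense(3.0, 4.0),)], ["features"])

# Wrap a vector -> vector function as a UDF; VectorUDT() declares the return type.
scale_udf = F.udf(lambda v: Vectors.dense(v.toArray() * 2.0), VectorUDT())
df.withColumn("features", scale_udf("features")).show()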
Example 3: test_vector_to_array
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# This excerpt (from petastorm's test suite) also assumes, not shown: import numpy as np;
# import tensorflow as tf (1.x graph API); from petastorm.spark import make_spark_converter;
# spark_test_ctx is a pytest fixture providing a SparkSession
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame([
        (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
        (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))
    ], ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
        ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
        ([50., 60., 70.] == old_vec_col[1]).all()
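The iterator/Session calls above are TensorFlow 1.x graph-mode API. Under TensorFlow 2 eager execution, the same petastorm dataset can typically be consumed by iterating directly (a sketch, reusing converter1 from the example and assuming a TF2-compatible petastorm version):

with converter1.make_tf_dataset(num_epochs=1) as dataset:
    for batch in dataset:  # each batch is a named tuple of tensors
        print(batch.vec.numpy(), batch.oldVec.numpy())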
Example 4: test_vector_size_hint
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: from pyspark.ml.feature import VectorSizeHint;
# from pyspark.ml.linalg import DenseVector; self.spark is a SparkSession from the test harness
def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])
    sizeHint = VectorSizeHint(
        inputCol="vector",
        handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)
    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
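Why handleInvalid="skip" matters here: the input vectors have lengths 3, 4, and 2, and "skip" drops every row whose size differs from the hint (the alternatives are "error", which throws, and "optimistic", which keeps rows unchecked). A quick check continuing from the example:

sized = sizeHint.transform(df)
print(sized.count())  # 1 -- the length-4 and length-2 vectors are skipped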
Example 5: test_save_load_trained_model
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import tempfile; from pyspark.ml.classification import
# LogisticRegression, LogisticRegressionModel; from pyspark.ml.evaluation import
# BinaryClassificationEvaluator; from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for CrossValidator will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    lrModel = cvModel.bestModel
    cvModelPath = temp_path + "/cvModel"
    lrModel.save(cvModelPath)
    loadedLrModel = LogisticRegressionModel.load(cvModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
Example 6: test_parallel_evaluation
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: from pyspark.ml.classification import LogisticRegression;
# from pyspark.ml.evaluation import BinaryClassificationEvaluator;
# from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example 7: test_linear_regression_pmml_basic
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import tempfile; from pyspark.ml.regression import LinearRegression;
# self.sc is the SparkContext from the test harness
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (i.e. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
Example 8: test_onevsrest
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import tempfile; from pyspark.ml.classification import
# LogisticRegression, OneVsRest, OneVsRestModel
def test_onevsrest(self):
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                     (1.0, Vectors.sparse(2, [], [])),
                                     (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                    ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    self._compare_pipelines(ovr, loadedOvr)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    self._compare_pipelines(model, loadedModel)
Example 9: test_gaussian_mixture_summary
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: from pyspark.ml.clustering import GaussianMixture;
# from pyspark.sql import DataFrame
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
Example 10: test_multinomial_logistic_regression_with_bound
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import numpy as np; from pyspark.ml.classification import
# LogisticRegression; from pyspark.ml.linalg import Matrices; the data path is
# relative to the Spark source tree
def test_multinomial_logistic_regression_with_bound(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    lor = LogisticRegression(regParam=0.01,
                             lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                             upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
    model = lor.fit(df)
    expected = [[4.593, 4.5516, 9.0099, 12.2904],
                [1.0, 8.1093, 7.0, 10.0],
                [3.041, 5.0, 8.0, 11.0]]
    for i in range(0, len(expected)):
        self.assertTrue(
            np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
    self.assertTrue(
        np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
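One detail worth calling out in the bounds above: Matrices.dense(3, 4, range(12)) is filled in column-major order, so the entry for (class i, feature j) is values[i + 3*j]. A quick illustration:

from pyspark.ml.linalg import Matrices

m = Matrices.dense(3, 4, range(12))  # 3 classes x 4 features
arr = m.toArray()
assert arr[1, 2] == 1 + 3 * 2  # row 1, column 2 holds values[7]
print(arr)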
Example 11: test_raw_and_probability_prediction
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import numpy as np; from pyspark.ml.classification import
# MultilayerPerceptronClassifier; from pyspark.sql import Row
def test_raw_and_probability_prediction(self):
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                         blockSize=128, seed=123)
    model = mlp.fit(df)
    test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
    result = model.transform(test).head()
    expected_prediction = 2.0
    expected_probability = [0.0, 0.0, 1.0]
    expected_rawPrediction = [57.3955, -124.5462, 67.9943]
    # assertEqual, not assertTrue: the original assertTrue(x, y) treated the
    # expected value as the assertion message and never compared the two.
    self.assertEqual(result.prediction, expected_prediction)
    self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
    self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
Example 12: _toSparkGLM
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# This excerpt is from spark-sklearn's Converter class; it also assumes, not shown:
# import numpy as np, plus the project-internal _randomUID, _new_java_obj,
# SKL_LogisticRegression, and self._skl2spark_classes
def _toSparkGLM(self, model):
    """Private method for converting a GLM to a Spark model.

    TODO: Add model parameters as well.
    """
    skl_cls = type(model)
    py_cls = self._skl2spark_classes[skl_cls].py
    jvm_cls_name = self._skl2spark_classes[skl_cls].jvm
    intercept = model.intercept_
    weights = model.coef_
    if len(np.shape(weights)) == 1 \
            or (len(np.shape(weights)) == 2 and np.shape(weights)[0] == 1):
        # Binary classification (or regression): a single weight vector
        uid = _randomUID(skl_cls)
        _java_model = _new_java_obj(self.sc, jvm_cls_name, uid,
                                    Vectors.dense(weights), float(intercept))
        return py_cls(_java_model)
    elif len(np.shape(weights)) == 2 and skl_cls == SKL_LogisticRegression:
        # Multiclass label
        raise ValueError("Converter.toSpark cannot convert a multiclass sklearn Logistic"
                         " Regression model to Spark because Spark does not yet support"
                         " multiclass. Given model is for %d classes." %
                         np.shape(weights)[0])
    else:
        # %s/%d formatting here: the original concatenated a type object and an
        # int onto a str, which raises TypeError.
        raise Exception("Converter.toSpark experienced unknown error when trying to convert"
                        " a model of type: %s (coefficient array of rank %d)"
                        % (type(model), len(np.shape(weights))))
Example 13: test_chi_sq_selector
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import numpy, pandas; from pyspark.ml.feature import ChiSqSelector;
# convert_sparkml and FloatTensorType from onnxmltools, plus the project's test helpers
# (save_data_models, run_onnx_model, compare_results)
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame([
        (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)
    ], ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)
    print(model.selectedFeatures)
    # the input name should match that of StringIndexer.inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(model, 'Sparkml ChiSqSelector',
                                 [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 14: test_vector_slicer
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: from pyspark.ml.feature import VectorSlicer, plus the same
# onnxmltools imports and test helpers as Example 13
def test_vector_slicer(self):
    data = self.spark.createDataFrame([
        (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
        (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
        (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
    model = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])
    feature_count = data.first()[0].array.size
    model_onnx = convert_sparkml(model, 'Sparkml VectorSlicer',
                                 [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().sliced.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorSlicer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['sliced'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example 15: test_model_linear_regression_basic
# Required module: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Also assumes, not shown: import numpy; from pyspark.ml.regression import LinearRegression,
# plus the same onnxmltools imports and test helpers as Example 13 (pandas is imported
# inside the function body below)
def test_model_linear_regression_basic(self):
    data = self.spark.createDataFrame([
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], []))
    ], ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
    model = lr.fit(data)
    # the name of the input is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic',
                                 [('features', FloatTensorType([1, C]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    import pandas
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [predicted.toPandas().prediction.values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLinearRegressor_Basic")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)