

Python Vectors.dense Method Code Examples

This article collects typical usage examples of the Python method pyspark.ml.linalg.Vectors.dense, drawn from open-source projects. If you are unsure what Vectors.dense does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore the other methods of pyspark.ml.linalg.Vectors.


The following presents 15 code examples of Vectors.dense, ordered by popularity by default.
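
Before diving in, a minimal, self-contained sketch of Vectors.dense itself may help; the Spark session setup and column names below are illustrative and not taken from any of the examples that follow.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[2]").appName("dense-demo").getOrCreate()

# Vectors.dense accepts either varargs or a single iterable of numbers.
v1 = Vectors.dense(1.0, 2.0, 3.0)
v2 = Vectors.dense([4.0, 5.0, 6.0])

# Dense vectors are the usual payload of the 'features' column consumed
# by pyspark.ml estimators and transformers.
df = spark.createDataFrame([(v1, 0.0), (v2, 1.0)], ["features", "label"])
df.show(truncate=False)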

Example 1: append_features

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Additional imports used by this snippet (from its source module):
import numpy as np
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
import mjolnir.spark
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list})) 
Author: wikimedia, Project: search-MjoLniR, Lines: 21, Source: feature_engineering.py
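
A hypothetical call might look like the following, assuming a DataFrame df whose 'features' column is a VectorUDT carrying the 'features' metadata list this project expects; the column names 'clicks' and 'dwell_time' are invented for illustration.

# Hypothetical usage sketch, not from the original project:
df_with_extra = append_features(df, 'clicks', 'dwell_time')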

Example 2: zero_features

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# Additional imports used by this snippet (from its source module):
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
import mjolnir.spark
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})) 
Author: wikimedia, Project: search-MjoLniR, Lines: 25, Source: feature_engineering.py
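
A matching hypothetical call for zero_features, e.g. for a feature-ablation experiment; the feature names are again invented.

# Hypothetical usage sketch, not from the original project:
df_ablated = zero_features(df_with_extra, 'clicks', 'dwell_time')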

Example 3: test_vector_to_array

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
# This test also uses numpy as np, tensorflow as tf (1.x API), and
# make_spark_converter from petastorm.spark at module scope.
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame([
        (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
        (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))
    ], ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
           ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
           ([50., 60., 70] == old_vec_col[1]).all() 
Author: uber, Project: petastorm, Lines: 25, Source: test_spark_dataset_converter.py

Example 4: test_vector_size_hint

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: tests.py

Example 5: test_save_load_trained_model

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for CrossValidator will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        lrModel = cvModel.bestModel

        cvModelPath = temp_path + "/cvModel"
        lrModel.save(cvModelPath)
        loadedLrModel = LogisticRegressionModel.load(cvModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 25, Source: tests.py

Example 6: test_parallel_evaluation

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvs.setParallelism(1)
        tvsSerialModel = tvs.fit(dataset)
        tvs.setParallelism(2)
        tvsParallelModel = tvs.fit(dataset)
        self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 19, Source: tests.py

Example 7: test_linear_regression_pmml_basic

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_linear_regression_pmml_basic(self):
        # Most of the validation is done in the Scala side, here we just check
        # that we output text rather than parquet (e.g. that the format flag
        # was respected).
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1)
        model = lr.fit(df)
        path = tempfile.mkdtemp()
        lr_path = path + "/lr-pmml"
        model.write().format("pmml").save(lr_path)
        pmml_text_list = self.sc.textFile(lr_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: tests.py

Example 8: test_onevsrest

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_onevsrest(self):
        temp_path = tempfile.mkdtemp()
        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                         (1.0, Vectors.sparse(2, [], [])),
                                         (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                        ["label", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)
        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self._compare_pipelines(ovr, loadedOvr)
        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        self._compare_pipelines(model, loadedModel) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 19, Source: tests.py

Example 9: test_gaussian_mixture_summary

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_gaussian_mixture_summary(self):
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertTrue(isinstance(s.predictions, DataFrame))
        self.assertEqual(s.probabilityCol, "probability")
        self.assertTrue(isinstance(s.probability, DataFrame))
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertTrue(isinstance(s.cluster, DataFrame))
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 19, Source: tests.py

Example 10: test_multinomial_logistic_regression_with_bound

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_multinomial_logistic_regression_with_bound(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lor = LogisticRegression(regParam=0.01,
                                 lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
        model = lor.fit(df)
        expected = [[4.593, 4.5516, 9.0099, 12.2904],
                    [1.0, 8.1093, 7.0, 10.0],
                    [3.041, 5.0, 8.0, 11.0]]
        for i in range(0, len(expected)):
            self.assertTrue(
                np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
        self.assertTrue(
            np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 19, Source: tests.py

Example 11: test_raw_and_probability_prediction

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_raw_and_probability_prediction(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [57.3955, -124.5462, 67.9943]
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
        self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4)) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: tests.py

Example 12: _toSparkGLM

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def _toSparkGLM(self, model):
        """ Private method for converting a GLM to a Spark model
        TODO: Add model parameters as well.
        """
        skl_cls = type(model)
        py_cls = self._skl2spark_classes[skl_cls].py
        jvm_cls_name = self._skl2spark_classes[skl_cls].jvm
        intercept = model.intercept_
        weights = model.coef_
        if len(np.shape(weights)) == 1\
                or (len(np.shape(weights)) == 2 and np.shape(weights)[0] == 1):
            # Binary classification
            uid = _randomUID(skl_cls)
            _java_model = _new_java_obj(self.sc, jvm_cls_name, uid, Vectors.dense(weights), float(intercept))
            return py_cls(_java_model)
        elif len(np.shape(weights)) == 2 and skl_cls == SKL_LogisticRegression:
            # Multiclass label
            raise ValueError("Converter.toSpark cannot convert a multiclass sklearn Logistic" +
                             " Regression model to Spark because Spark does not yet support" +
                             " multiclass.  Given model is for %d classes." %
                             np.shape(weights)[0])
        else:
            raise Exception("Converter.toSpark experienced unknown error when trying to convert" +
                            " a model of type: " + type(model) + "  " + len(np.shape(weights))) 
Author: databricks, Project: spark-sklearn, Lines: 26, Source: converter.py
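
For context on the shape checks above: a binary sklearn GLM stores coef_ with shape (1, n_features), i.e. rank 2 with a leading dimension of 1, which is why the converter can flatten it into a single Vectors.dense. A standalone scikit-learn illustration of that shape logic, independent of the converter (assumes scikit-learn is installed; the toy data is invented):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [0.4], [0.5], [0.6], [1.0]])
y = np.array([0, 1, 0, 1, 1])
clf = LogisticRegression().fit(X, y)

# Binary classification: rank-2 coef_ with a leading dimension of 1,
# so the weights fit into one dense vector plus a scalar intercept.
print(np.shape(clf.coef_))    # (1, 1)
print(clf.intercept_[0])      # scalar intercept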

Example 13: test_chi_sq_selector

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_chi_sq_selector(self):
        data = self.spark.createDataFrame([
            (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
            (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
            (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)
        ], ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
        model = selector.fit(data)
        print(model.selectedFeatures)

        # the input name should match the model's input column name ('features')
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(model, 'Sparkml ChiSqSelector', [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlChiSqSelector")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Author: onnx, Project: onnxmltools, Lines: 26, Source: test_chi_sql_selector.py

Example 14: test_vector_slicer

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_vector_slicer(self):
        data = self.spark.createDataFrame([
            (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
            (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
            (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
        model = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])

        feature_count = data.first()[0].array.size
        model_onnx = convert_sparkml(model, 'Sparkml VectorSlicer',
                                     [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().sliced.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlVectorSlicer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['sliced'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Author: onnx, Project: onnxmltools, Lines: 22, Source: test_vector_slicer.py

Example 15: test_model_linear_regression_basic

# Required import: from pyspark.ml.linalg import Vectors [as alias]
# Or: from pyspark.ml.linalg.Vectors import dense [as alias]
def test_model_linear_regression_basic(self):
        data = self.spark.createDataFrame([
            (1.0, 2.0, Vectors.dense(1.0)),
            (0.0, 2.0, Vectors.sparse(1, [], []))
        ], ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
        model = lr.fit(data)
        # the name of the input is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic', [('features', FloatTensorType([1, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        import pandas
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [ predicted.toPandas().prediction.values.astype(numpy.float32) ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlLinearRegressor_Basic")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5) 
Author: onnx, Project: onnxmltools, Lines: 23, Source: test_linear_regressor.py


Note: the pyspark.ml.linalg.Vectors.dense examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are excerpted from open-source projects contributed by their respective developers; copyright belongs to the original authors, and any use or redistribution must follow the corresponding project's license. Do not reproduce without permission.