当前位置: 首页>>代码示例>>Python>>正文


Python Vectors.sparse方法代码示例

本文整理汇总了Python中pyspark.ml.linalg.Vectors.sparse方法的典型用法代码示例。如果您正苦于以下问题:Python Vectors.sparse方法的具体用法?Python Vectors.sparse怎么用?Python Vectors.sparse使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark.ml.linalg.Vectors的用法示例。


在下文中一共展示了Vectors.sparse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: ztest_toPandas

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def ztest_toPandas(self):
     """Round-trip dense and sparse ML vectors through converter.toPandas.

     Verifies the converted column holds scipy ``csr_matrix`` values with
     the expected shape and entries.  (The ``z`` prefix keeps the method
     outside unittest's default ``test*`` discovery — presumably disabled
     on purpose.)
     """
     rows = [
         (Vectors.dense([0.1, 0.2]),),
         (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
         (Vectors.sparse(2, {0: 0.5, 1: 0.6}),),
     ]
     spark_df = self.sql.createDataFrame(rows, ["features"])
     self.assertEqual(spark_df.count(), 3)
     pandas_df = self.converter.toPandas(spark_df)
     self.assertEqual(len(pandas_df), 3)
     first = pandas_df.features[0]
     self.assertTrue(isinstance(first, csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(first))
     self.assertEqual(first.shape[0], 3)
     self.assertEqual(first.shape[1], 2)
     self.assertEqual(first[0, 0], 0.1)
     self.assertEqual(first[0, 1], 0.2)
开发者ID:Sandy4321,项目名称:spark-sklearn,代码行数:17,代码来源:converter_test.py

示例2: test_java_object_gets_detached

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
    def test_java_object_gets_detached(self):
        """Deleting a model must detach its Java object from the py4j gateway.

        Fits a trivial LinearRegression, then checks that ``model.__del__``
        detaches only the model's JVM object (the summary stays reachable),
        and that deleting the summary detaches both.
        """
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        # Summary's JVM object must survive the model's deletion.
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        # Best-effort deletion: the summary may already be partially detached.
        # A bare ``except:`` would also swallow SystemExit/KeyboardInterrupt,
        # so only ordinary exceptions are ignored here.
        try:
            summary.__del__()
        except Exception:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
开发者ID:Brett-A,项目名称:spark,代码行数:37,代码来源:test_wrapper.py

示例3: test_persistence

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_persistence(self):
     """Round-trip LDA, DistributedLDAModel and LocalLDAModel through save/load."""
     train_df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # The "em" optimizer produces a distributed model; toLocal() derives
     # the local variant from it.
     estimator = LDA(k=2, seed=1, optimizer="em")
     dist_model = estimator.fit(train_df)
     self.assertTrue(dist_model.isDistributed())
     local_model = dist_model.toLocal()
     self.assertFalse(local_model.isDistributed())
     # One temp directory, one sub-path per artifact.
     base_dir = tempfile.mkdtemp()
     lda_path = base_dir + "/lda"
     dist_path = base_dir + "/distLDAModel"
     local_path = base_dir + "/localLDAModel"
     # Estimator round-trip.
     estimator.save(lda_path)
     self._compare(estimator, LDA.load(lda_path))
     # Distributed model round-trip.
     dist_model.save(dist_path)
     self._compare(dist_model, DistributedLDAModel.load(dist_path))
     # Local model round-trip.
     local_model.save(local_path)
     self._compare(local_model, LocalLDAModel.load(local_path))
     # Best-effort cleanup of the temp directory.
     try:
         rmtree(base_dir)
     except OSError:
         pass
开发者ID:Brett-A,项目名称:spark,代码行数:36,代码来源:test_algorithms.py

示例4: mldemo

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
def mldemo():
    """Demonstrate pyspark.ml Correlation on mixed dense/sparse vectors.

    Builds a four-row DataFrame with a single "features" vector column and
    prints both the Pearson and Spearman correlation matrices.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    try:
        data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
                (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
                (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
                (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
        df = spark.createDataFrame(data, ["features"])

        r1 = Correlation.corr(df, "features").head()
        print("Pearson correlation matrix:\n" + str(r1[0]))

        r2 = Correlation.corr(df, "features", "spearman").head()
        print("Spearman correlation matrix:\n" + str(r2[0]))
    finally:
        # The original leaked the session; stop it so repeated invocations
        # do not accumulate JVM resources.
        spark.stop()
开发者ID:cyliustack,项目名称:benchmark,代码行数:20,代码来源:spark_ml_basic.py

示例5: test_output_columns

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_output_columns(self):
     """OneVsRest.transform must append rawPrediction and prediction columns."""
     rows = [
         (0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5)),
     ]
     train_df = self.spark.createDataFrame(rows, ["label", "features"])
     base_clf = LogisticRegression(maxIter=5, regParam=0.01)
     fitted = OneVsRest(classifier=base_clf, parallelism=1).fit(train_df)
     transformed = fitted.transform(train_df)
     self.assertEqual(transformed.columns,
                      ["label", "features", "rawPrediction", "prediction"])
开发者ID:Brett-A,项目名称:spark,代码行数:12,代码来源:test_algorithms.py

示例6: test_parallelism_doesnt_change_output

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_parallelism_doesnt_change_output(self):
     """Fitting OneVsRest with parallelism 1 vs 2 must yield identical models."""
     train_df = self.spark.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8)),
          (1.0, Vectors.sparse(2, [], [])),
          (2.0, Vectors.dense(0.5, 0.5))],
         ["label", "features"])

     def fit_with(parallelism):
         # Fresh base classifier per fit so the two runs are independent.
         clf = LogisticRegression(maxIter=5, regParam=.01)
         return OneVsRest(classifier=clf, parallelism=parallelism).fit(train_df)

     serial_model = fit_with(1)
     parallel_model = fit_with(2)
     for serial_sub, parallel_sub in zip(serial_model.models, parallel_model.models):
         self.assertTrue(np.allclose(serial_sub.coefficients.toArray(),
                                     parallel_sub.coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(serial_sub.intercept, parallel_sub.intercept,
                                     atol=1E-4))
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_algorithms.py

示例7: test_support_for_weightCol

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_support_for_weightCol(self):
     """OneVsRest must fit whether or not the base classifier takes weights."""
     train_df = self.spark.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8), 1.0),
          (1.0, Vectors.sparse(2, [], []), 1.0),
          (2.0, Vectors.dense(0.5, 0.5), 1.0)],
         ["label", "features", "weight"])
     # LogisticRegression inherits hasWeightCol.
     with_weight = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01),
                             weightCol="weight")
     self.assertIsNotNone(with_weight.fit(train_df))
     # DecisionTreeClassifier does not inherit hasWeightCol.
     without_weight = OneVsRest(classifier=DecisionTreeClassifier(),
                                weightCol="weight")
     self.assertIsNotNone(without_weight.fit(train_df))
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_algorithms.py

示例8: test_copy

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_copy(self):
     """copy() must produce a modified clone while leaving the original intact."""
     train_df = self.spark.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8)),
          (1.0, Vectors.sparse(2, [], [])),
          (2.0, Vectors.dense(0.5, 0.5))],
         ["label", "features"])
     base_lr = LogisticRegression(maxIter=5, regParam=0.01)
     original = OneVsRest(classifier=base_lr)
     clone = original.copy({base_lr.maxIter: 10})
     # Only the clone picks up the overridden param.
     self.assertEqual(original.getClassifier().getMaxIter(), 5)
     self.assertEqual(clone.getClassifier().getMaxIter(), 10)
     fitted = original.fit(train_df)
     fitted_clone = fitted.copy({fitted.predictionCol: "indexed"})
     self.assertEqual(fitted_clone.getPredictionCol(), "indexed")
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_algorithms.py

示例9: test_bisecting_kmeans_summary

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_bisecting_kmeans_summary(self):
     """BisectingKMeans training must expose a fully populated summary."""
     feature_rows = [(Vectors.dense(1.0),), (Vectors.dense(5.0),),
                     (Vectors.dense(10.0),), (Vectors.sparse(1, [], []),)]
     train_df = self.spark.createDataFrame(feature_rows, ["features"])
     fitted = BisectingKMeans(k=2).fit(train_df)
     self.assertTrue(fitted.hasSummary)
     summary = fitted.summary
     self.assertTrue(isinstance(summary.predictions, DataFrame))
     self.assertEqual(summary.featuresCol, "features")
     self.assertEqual(summary.predictionCol, "prediction")
     self.assertTrue(isinstance(summary.cluster, DataFrame))
     self.assertEqual(len(summary.clusterSizes), 2)
     self.assertEqual(summary.k, 2)
     self.assertEqual(summary.numIter, 20)
开发者ID:Brett-A,项目名称:spark,代码行数:17,代码来源:test_training_summary.py

示例10: test_linear_regression_pmml_basic

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_linear_regression_pmml_basic(self):
     """Saving with format("pmml") must emit PMML text, not parquet.

     The heavy validation lives on the Scala side; here we only confirm
     the format flag was honoured by checking for PMML markers in the
     exported text.
     """
     train_df = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     fitted = LinearRegression(maxIter=1).fit(train_df)
     export_path = tempfile.mkdtemp() + "/lr-pmml"
     fitted.write().format("pmml").save(export_path)
     exported_lines = self.sc.textFile(export_path).collect()
     exported = "\n".join(exported_lines)
     self.assertIn("Apache Spark", exported)
     self.assertIn("PMML", exported)
开发者ID:JingchengDu,项目名称:spark,代码行数:18,代码来源:test_persistence.py

示例11: test_gaussian_mixture_summary

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_gaussian_mixture_summary(self):
     """GaussianMixture training must expose a fully populated summary."""
     feature_rows = [(Vectors.dense(1.0),), (Vectors.dense(5.0),),
                     (Vectors.dense(10.0),), (Vectors.sparse(1, [], []),)]
     train_df = self.spark.createDataFrame(feature_rows, ["features"])
     fitted = GaussianMixture(k=2).fit(train_df)
     self.assertTrue(fitted.hasSummary)
     summary = fitted.summary
     self.assertTrue(isinstance(summary.predictions, DataFrame))
     self.assertEqual(summary.probabilityCol, "probability")
     self.assertTrue(isinstance(summary.probability, DataFrame))
     self.assertEqual(summary.featuresCol, "features")
     self.assertEqual(summary.predictionCol, "prediction")
     self.assertTrue(isinstance(summary.cluster, DataFrame))
     self.assertEqual(len(summary.clusterSizes), 2)
     self.assertEqual(summary.k, 2)
     self.assertEqual(summary.numIter, 3)
开发者ID:Brett-A,项目名称:spark,代码行数:19,代码来源:test_training_summary.py

示例12: test_onevsrest

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_onevsrest(self):
     """Save/load round-trip for OneVsRest and its fitted OneVsRestModel."""
     base_dir = tempfile.mkdtemp()
     train_df = self.spark.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8)),
          (1.0, Vectors.sparse(2, [], [])),
          (2.0, Vectors.dense(0.5, 0.5))] * 10,
         ["label", "features"])
     estimator = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01))
     fitted = estimator.fit(train_df)
     # Estimator round-trip.
     estimator_path = base_dir + "/ovr"
     estimator.save(estimator_path)
     self._compare_pipelines(estimator, OneVsRest.load(estimator_path))
     # Model round-trip.
     model_path = base_dir + "/ovrModel"
     fitted.save(model_path)
     self._compare_pipelines(fitted, OneVsRestModel.load(model_path))
开发者ID:JingchengDu,项目名称:spark,代码行数:19,代码来源:test_persistence.py

示例13: parse

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
def parse(line):
    """Parse one JSON line and return its feature column as a SparseVector.

    Three encodings of the feature column are handled:
      * a plain list of values (no "size"/"type" keys) — every position
        becomes an explicit entry;
      * a ``{"type": 1, "values": [...]}`` dense record;
      * a ``{"size": n, "indices": [...], "values": [...]}`` sparse record.

    Returns:
        pyspark.ml.linalg.SparseVector built from the parsed column.
    """
    obj = json.loads(line)
    fc = obj[featureCol]
    if "size" not in fc and "type" not in fc:
        # Bare list of values; enumerate yields the (index, value) pairs
        # SparseVector expects.
        sv = SparseVector(len(fc), list(enumerate(fc)))
    elif "size" not in fc and fc.get("type") == 1:
        values = fc["values"]
        sv = SparseVector(len(values), list(enumerate(values)))
    else:
        # NOTE(review): assumes every remaining record carries "size",
        # "indices" and "values"; anything else raises KeyError, exactly as
        # the original did — confirm against the upstream producer's schema.
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))
    return sv
开发者ID:allwefantasy,项目名称:streamingpro,代码行数:19,代码来源:batchPredict.py

示例14: test_binary_logistic_regression_summary

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_binary_logistic_regression_summary(self):
     """Smoke-test every accessor on a binary LogisticRegression summary."""
     train_df = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     fitted = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight",
                                 fitIntercept=False).fit(train_df)
     self.assertTrue(fitted.hasSummary)
     summary = fitted.summary
     # Each accessor should be callable and return the expected type.
     self.assertTrue(isinstance(summary.predictions, DataFrame))
     self.assertEqual(summary.probabilityCol, "probability")
     self.assertEqual(summary.labelCol, "label")
     self.assertEqual(summary.featuresCol, "features")
     self.assertEqual(summary.predictionCol, "prediction")
     history = summary.objectiveHistory
     self.assertTrue(isinstance(history, list) and isinstance(history[0], float))
     self.assertGreater(summary.totalIterations, 0)
     for per_label in (summary.labels,
                       summary.truePositiveRateByLabel,
                       summary.falsePositiveRateByLabel,
                       summary.precisionByLabel,
                       summary.recallByLabel,
                       summary.fMeasureByLabel(),
                       summary.fMeasureByLabel(1.0)):
         self.assertTrue(isinstance(per_label, list))
     for frame in (summary.roc, summary.pr, summary.fMeasureByThreshold,
                   summary.precisionByThreshold, summary.recallByThreshold):
         self.assertTrue(isinstance(frame, DataFrame))
     # The training set is separable, so every aggregate metric is perfect.
     self.assertAlmostEqual(summary.areaUnderROC, 1.0, 2)
     self.assertAlmostEqual(summary.accuracy, 1.0, 2)
     self.assertAlmostEqual(summary.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(summary.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(summary.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(summary.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(summary.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(summary.weightedFMeasure(1.0), 1.0, 2)
     # Evaluating on the training data must reproduce the training summary;
     # one value is enough — the Scala side runs the full comparison.
     self.assertAlmostEqual(fitted.evaluate(train_df).areaUnderROC,
                            summary.areaUnderROC)
开发者ID:Brett-A,项目名称:spark,代码行数:43,代码来源:test_training_summary.py

示例15: test_linear_regression_summary

# 需要导入模块: from pyspark.ml.linalg import Vectors [as 别名]
# 或者: from pyspark.ml.linalg.Vectors import sparse [as 别名]
 def test_linear_regression_summary(self):
     """Smoke-test every accessor on a LinearRegression training summary."""
     train_df = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     fitted = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                               weightCol="weight", fitIntercept=False).fit(train_df)
     self.assertTrue(fitted.hasSummary)
     summary = fitted.summary
     # Each accessor should be callable and return the expected type/value.
     self.assertGreater(summary.totalIterations, 0)
     self.assertTrue(isinstance(summary.predictions, DataFrame))
     self.assertEqual(summary.predictionCol, "prediction")
     self.assertEqual(summary.labelCol, "label")
     self.assertEqual(summary.featuresCol, "features")
     history = summary.objectiveHistory
     self.assertTrue(isinstance(history, list) and isinstance(history[0], float))
     self.assertAlmostEqual(summary.explainedVariance, 0.25, 2)
     self.assertAlmostEqual(summary.meanAbsoluteError, 0.0)
     self.assertAlmostEqual(summary.meanSquaredError, 0.0)
     self.assertAlmostEqual(summary.rootMeanSquaredError, 0.0)
     self.assertAlmostEqual(summary.r2, 1.0, 2)
     self.assertAlmostEqual(summary.r2adj, 1.0, 2)
     self.assertTrue(isinstance(summary.residuals, DataFrame))
     self.assertEqual(summary.numInstances, 2)
     self.assertEqual(summary.degreesOfFreedom, 1)
     for float_list in (summary.devianceResiduals,
                        summary.coefficientStandardErrors,
                        summary.tValues,
                        summary.pValues):
         self.assertTrue(isinstance(float_list, list) and
                         isinstance(float_list[0], float))
     # Evaluating on the training data must reproduce the training summary;
     # one value is enough — LinearRegressionTrainingSummary runs the full test.
     self.assertAlmostEqual(fitted.evaluate(train_df).explainedVariance,
                            summary.explainedVariance)


注:本文中的pyspark.ml.linalg.Vectors.sparse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。