

Python KMeans.train Method Code Examples

This article collects typical usage examples of the Python method pyspark.mllib.clustering.KMeans.train. If you are unsure what KMeans.train does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the enclosing class, pyspark.mllib.clustering.KMeans.


The sections below present 15 code examples of the KMeans.train method, sorted by popularity by default.
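
Before diving in, here is a minimal, self-contained sketch of a typical KMeans.train call (the toy data and the choice of k are illustrative only, not taken from any example below):

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext(appName="kmeans-train-demo")

# A tiny 2-D dataset with two well-separated clusters.
data = sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.1]])

# Train with k=2; "k-means||" is MLlib's default parallel initializer.
model = KMeans.train(data, 2, maxIterations=10, initializationMode="k-means||")

print(model.clusterCenters)          # one numpy array per cluster center
print(model.predict([0.05, 0.05]))   # index of the nearest cluster center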

Example 1: detect

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def detect(self, k, t):
        #Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        #Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        # NOTE: 'runs' was deprecated in Spark 1.6, ignored in 2.0, and later removed;
        # drop this argument on newer Spark versions.
        model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        #Adding the prediction column to df1
        modelBC = sc.broadcast(model)  # assumes 'sc' is the active SparkContext
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        #Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t) 
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines: 22, Source: anomalies_detection.py
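
Examples 1 and 2 both call a self.addScore helper that the snippets do not show. The following is a hedged sketch only: a common scoring scheme for KMeans-based anomaly detection, consistent with the comment "the higher the score, the more likely it is an anomaly", ranks clusters by size so that points in small clusters score close to 1. The column names mirror the snippet; the formula itself is an assumption, not code from the source project:

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def addScore(self, df):
    # Count the points in each cluster; smaller clusters are treated as more anomalous.
    counts = dict(df.groupBy("prediction").count().rdd
                    .map(lambda r: (r["prediction"], r["count"])).collect())
    n_max, n_min = max(counts.values()), min(counts.values())
    # Normalized size-based score in [0, 1]; assumes n_max > n_min.
    score_udf = udf(lambda c: float(n_max - counts[c]) / (n_max - n_min), DoubleType())
    return df.withColumn("score", score_udf(df.prediction))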

Example 2: detect

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        modelBC = sparkCt.broadcast(model)  # assumes 'sparkCt' is the active SparkContext
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t) 
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines: 22, Source: anomalies_detection.py

Example 3: test_kmeans_deterministic

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans_deterministic(self):
        from pyspark.mllib.clustering import KMeans
        from numpy import array_equal
        X = range(0, 100, 10)
        Y = range(0, 100, 10)
        data = [[x, y] for x, y in zip(X, Y)]
        clusters1 = KMeans.train(self.sc.parallelize(data),
                                 3, initializationMode="k-means||",
                                 seed=42, initializationSteps=7, epsilon=1e-4)
        clusters2 = KMeans.train(self.sc.parallelize(data),
                                 3, initializationMode="k-means||",
                                 seed=42, initializationSteps=7, epsilon=1e-4)
        centers1 = clusters1.centers
        centers2 = clusters2.centers
        for c1, c2 in zip(centers1, centers2):
            # TODO: Allow small numeric difference.
            self.assertTrue(array_equal(c1, c2)) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: tests.py

Example 4: train_coarse

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """

    # Cluster first split
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print('Total training set size: %d' % first.count())
    print('Starting training coarse quantizer...')
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print('... done training coarse quantizer.')
    first.unpersist()

    # Cluster second split
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print('Starting training coarse quantizer...')
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print('... done training coarse quantizer.')
    second.unpersist()

    # np is assumed to be numpy, imported at module level as 'import numpy as np'
    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
Developer: yahoo, Project: lopq, Lines: 25, Source: train_model.py
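
A minimal driver sketch for train_coarse (the 8-dimensional vectors and the halving logic are illustrative assumptions; the real lopq pipeline prepares split_vecs elsewhere):

import numpy as np
from pyspark import SparkContext

sc = SparkContext(appName="coarse-quantizer-demo")

# Split each 8-D vector into two 4-D halves: (first_half, second_half).
vecs = sc.parallelize([np.random.rand(8) for _ in range(1000)])
split_vecs = vecs.map(lambda v: (v[:4], v[4:]))

C0, C1 = train_coarse(sc, split_vecs, V=16, seed=42)
print(C0.shape, C1.shape)  # (16, 4) each: V centroids per split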

Example 5: fit

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def fit(self, Z):
        """Compute k-means clustering.

        Parameters
        ----------
        Z : ArrayRDD or DictRDD containing array-like or sparse matrix
            Train data.

        Returns
        -------
        self
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))
        if self.init == 'k-means||':
            self._mllib_model = MLlibKMeans.train(
                X.unblock(),
                self.n_clusters,
                maxIterations=self.max_iter,
                initializationMode="k-means||")
            self.cluster_centers_ = self._mllib_model.centers
            return self  # per the docstring, fit() returns self
        else:
            models = X.map(lambda X: super(SparkKMeans, self).fit(X))
            models = models.map(lambda model: model.cluster_centers_).collect()
            return super(SparkKMeans, self).fit(np.concatenate(models)) 
Developer: lensacom, Project: sparkit-learn, Lines: 27, Source: k_means_.py
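
A hedged usage sketch for the fit method above. The splearn import paths, the ArrayRDD wrapper, and its bsize argument follow the sparkit-learn project's documented API, but treat them as assumptions rather than verified calls:

import numpy as np
from splearn.cluster import SparkKMeans
from splearn.rdd import ArrayRDD

# Block a plain RDD of rows into numpy blocks of 25 rows each.
raw = sc.parallelize(np.random.rand(200, 3).tolist(), 4)
Z = ArrayRDD(raw, bsize=25)

km = SparkKMeans(n_clusters=3, init='k-means||', max_iter=10)
km.fit(Z)            # delegates to MLlibKMeans.train via the branch above
print(km.cluster_centers_)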

Example 6: generate_kmeans_model

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def generate_kmeans_model(rdd, k):
    # NOTE: 'runs' was deprecated in Spark 1.6, ignored in 2.0, and later removed;
    # drop this argument on newer Spark versions.
    return KMeans.train(rdd, k, maxIterations=10, runs=30,
                        initializationMode="random", seed=50, initializationSteps=5, epsilon=1e-4)
Developer: hanhanwu, Project: Hanhan-Spark-Python, Lines: 5, Source: word2vec_kmeans.py
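
A hedged usage sketch pairing generate_kmeans_model with MLlib's Word2Vec, as the source file name (word2vec_kmeans.py) suggests; the corpus and word list are illustrative:

from pyspark.mllib.feature import Word2Vec

corpus = sc.parallelize([["spark", "mllib", "kmeans"], ["python", "spark", "cluster"]])
w2v = Word2Vec().setVectorSize(10).setMinCount(1).setSeed(42).fit(corpus)

# Cluster the learned word vectors. NOTE: generate_kmeans_model passes runs=,
# so as written this requires Spark 1.x.
vectors = sc.parallelize([w2v.transform(w) for w in ["spark", "mllib", "python"]])
model = generate_kmeans_model(vectors, 2)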

Example 7: test_bisecting_kmeans

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_bisecting_kmeans(self):
        from pyspark.mllib.clustering import BisectingKMeans
        from numpy import array
        data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
        bskm = BisectingKMeans()
        model = bskm.train(self.sc.parallelize(data, 2), k=4)
        p = array([0.0, 0.0])
        rdd_p = self.sc.parallelize([p])
        self.assertEqual(model.predict(p), model.predict(rdd_p).first())
        self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
        self.assertEqual(model.k, len(model.clusterCenters)) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 12, Source: tests.py

Example 8: test_kmeans

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans(self):
        from pyspark.mllib.clustering import KMeans
        data = [
            [0, 1.1],
            [0, 1.2],
            [1.1, 0],
            [1.2, 0],
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||",
                                initializationSteps=7, epsilon=1e-4)
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 14, Source: tests.py

Example 9: test_gmm

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_gmm(self):
        from pyspark.mllib.clustering import GaussianMixture
        data = self.sc.parallelize([
            [1, 2],
            [8, 9],
            [-4, -3],
            [-6, -7],
        ])
        clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                         maxIterations=10, seed=1)
        labels = clusters.predict(data).collect()
        self.assertEqual(labels[0], labels[1])
        self.assertEqual(labels[2], labels[3]) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 15, Source: tests.py
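
Besides hard labels, GaussianMixtureModel also exposes per-component membership probabilities. A minimal follow-on sketch using predictSoft with the clusters and data from the test above:

# Each row is a vector of membership probabilities, one per component.
soft = clusters.predictSoft(data).collect()
print(soft[0])  # entries sum to roughly 1.0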

Example 10: test_gmm_deterministic

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_gmm_deterministic(self):
        from pyspark.mllib.clustering import GaussianMixture
        x = range(0, 100, 10)
        y = range(0, 100, 10)
        data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
        clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                          maxIterations=10, seed=63)
        clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                          maxIterations=10, seed=63)
        for c1, c2 in zip(clusters1.weights, clusters2.weights):
            self.assertEqual(round(c1, 7), round(c2, 7)) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 13, Source: tests.py

Example 11: test_clustering

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_clustering(self):
        from pyspark.mllib.clustering import KMeans
        data = [
            self.scipy_matrix(3, {1: 1.0}),
            self.scipy_matrix(3, {1: 1.1}),
            self.scipy_matrix(3, {2: 1.0}),
            self.scipy_matrix(3, {2: 1.1})
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 13, Source: tests.py
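
The scipy_matrix helper used in Examples 11-13 is not shown in the snippets. A plausible sketch, modeled on how such test helpers typically build a single sparse row (an assumption about the test harness, not verbatim source):

from scipy.sparse import lil_matrix

def scipy_matrix(self, size, values):
    """Create a 1 x size sparse row from a {column_index: value} dict."""
    matrix = lil_matrix((1, size))
    for index, value in values.items():
        matrix[0, index] = value
    return matrix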

Example 12: test_classification

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.regression import LabeledPoint
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 39, Source: tests.py

Example 13: test_regression

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_regression(self):
        from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, \
            LassoWithSGD, RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 39, Source: tests.py

Example 14: test_trainOn_model

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        from pyspark.mllib.clustering import StreamingKMeans
        from numpy import array
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(
            centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        self.ssc.start()

        # Give enough time to train the model.
        def condition():
            finalModel = stkm.latestModel()
            self.assertTrue(all(finalModel.centers == array(initCenters)))
            self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
            return True
        self._eventually(condition, catch_assertions=True) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 28, Source: tests.py
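
For completeness: besides trainOn, a StreamingKMeans model can score a live stream with predictOn. A minimal follow-on sketch, where test_stream stands in for a DStream of feature vectors built the same way as input_stream above:

# Emits the nearest-cluster index for every point in the stream.
predictions = stkm.predictOn(test_stream)
predictions.pprint()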

Example 15: test_fpgrowth

# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Alternatively: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_fpgrowth(self):
        from pyspark.mllib.fpm import FPGrowth
        data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
        rdd = self.sc.parallelize(data, 2)
        model1 = FPGrowth.train(rdd, 0.6, 2)
        # use default data partition number when numPartitions is not specified
        model2 = FPGrowth.train(rdd, 0.6)
        self.assertEqual(sorted(model1.freqItemsets().collect()),
                         sorted(model2.freqItemsets().collect())) 
Developer: runawayhorse001, Project: LearningApacheSpark, Lines: 10, Source: tests.py


Note: the pyspark.mllib.clustering.KMeans.train examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their developers, and copyright remains with the original authors. Consult each project's License before redistributing or using the code; do not republish without permission.