This page collects typical usage examples of the Python method pyspark.mllib.clustering.KMeans.train, drawn from open-source projects. If you are wondering what KMeans.train does, how to call it, or what real-world uses look like, the curated examples below should help. You can also read more about the enclosing class, pyspark.mllib.clustering.KMeans.
The following shows 15 code examples of KMeans.train, ordered by popularity by default; examples that readers have rated useful rank higher.
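Before the examples, here is a minimal, self-contained sketch of the basic call pattern (the data and parameter values below are illustrative and are not taken from any of the examples):

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext(appName="kmeans-demo")
# Four 2-D points forming two well-separated clusters.
data = sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.1]])
# Train with k=2; "k-means||" is the default initialization mode.
model = KMeans.train(data, 2, maxIterations=10, initializationMode="k-means||")
print(model.clusterCenters)         # the two learned centroids
print(model.predict([0.05, 0.05]))  # index of the nearest centroid
sc.stop()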
Example 1: detect
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def detect(self, k, t):
    # Encode categorical features (columns 0 and 1) using one-hot encoding.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)
    # Cluster the points with KMeans. Note: `runs` was deprecated in
    # Spark 1.6 and removed in Spark 2.0; drop it on newer versions.
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10,
                         initializationMode="random", seed=20)
    # Add the prediction column to df1. `sc` is the SparkContext created
    # elsewhere; `udf` and `StringType` come from pyspark.sql. The prediction
    # is cast to str so it matches the declared StringType.
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: str(modelBC.value.predict(x)), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)
    # Add the score column to df2; the higher the score, the more likely
    # the point is an anomaly.
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)
    return df3.where(df3.score > t)
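cat2Num and addScore are helper methods of the surrounding class that this snippet does not show. As a rough illustration only, one common way to implement the scoring step (the formula and names here are assumptions, not code from the original project) is to rate each point by the rarity of its cluster:

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def addScore(self, df):
    # Hypothetical sketch: score = (N_max - N_cluster) / (N_max - N_min),
    # so points in the smallest cluster score 1.0 and the largest score 0.0.
    counts = {r["prediction"]: r["count"]
              for r in df.groupBy("prediction").count().collect()}
    n_max, n_min = max(counts.values()), min(counts.values())
    spread = float(n_max - n_min) or 1.0  # guard against all-equal cluster sizes
    score_udf = udf(lambda c: (n_max - counts[c]) / spread, DoubleType())
    return df.withColumn("score", score_udf(df.prediction))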
Example 2: detect
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def detect(self, k, t):
    # Encode categorical features (columns 0 and 1) using one-hot encoding.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)
    # Cluster the points with KMeans.
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40,
                         initializationMode="random", seed=20)
    # Add the prediction column to df1. `sparkCt` is the SparkContext created
    # elsewhere; the prediction is cast to str to match the declared StringType.
    modelBC = sparkCt.broadcast(model)
    predictUDF = udf(lambda x: str(modelBC.value.predict(x)), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)
    # Add the score column to df2; the higher the score, the more likely
    # the point is an anomaly.
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)
    return df3.where(df3.score > t)
Example 3: test_kmeans_deterministic
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans_deterministic(self):
    from pyspark.mllib.clustering import KMeans
    X = range(0, 100, 10)
    Y = range(0, 100, 10)
    data = [[x, y] for x, y in zip(X, Y)]
    clusters1 = KMeans.train(self.sc.parallelize(data),
                             3, initializationMode="k-means||",
                             seed=42, initializationSteps=7, epsilon=1e-4)
    clusters2 = KMeans.train(self.sc.parallelize(data),
                             3, initializationMode="k-means||",
                             seed=42, initializationSteps=7, epsilon=1e-4)
    centers1 = clusters1.centers
    centers2 = clusters2.centers
    for c1, c2 in zip(centers1, centers2):
        # TODO: Allow small numeric difference.
        # (`array_equal` is numpy's, imported at module level in the source.)
        self.assertTrue(array_equal(c1, c2))
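The TODO above notes that exact equality is fragile for floating-point centers. One way to tolerate a small numeric difference (a sketch of a drop-in replacement for the final loop; the original test keeps the strict check) is numpy.allclose:

import numpy as np
# Passes when corresponding centers agree within the given tolerances.
for c1, c2 in zip(centers1, centers2):
    self.assertTrue(np.allclose(c1, c2, rtol=1e-7, atol=1e-9))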
Example 4: train_coarse
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """
    # Cluster the first split.
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print('Total training set size: %d' % first.count())
    print('Starting training coarse quantizer...')
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print('... done training coarse quantizer.')
    first.unpersist()
    # Cluster the second split.
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print('Starting training coarse quantizer...')
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print('... done training coarse quantizer.')
    second.unpersist()
    # `np` is numpy, imported at module level in the original source.
    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
Example 5: fit
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def fit(self, Z):
    """Compute k-means clustering.

    Parameters
    ----------
    Z : ArrayRDD or DictRDD containing array-like or sparse matrix
        Train data.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))
    if self.init == 'k-means||':
        self._mllib_model = MLlibKMeans.train(
            X.unblock(),
            self.n_clusters,
            maxIterations=self.max_iter,
            initializationMode="k-means||")
        self.cluster_centers_ = self._mllib_model.centers
        return self  # match the documented return value on this branch
    else:
        models = X.map(lambda X: super(SparkKMeans, self).fit(X))
        models = models.map(lambda model: model.cluster_centers_).collect()
        return super(SparkKMeans, self).fit(np.concatenate(models))
Example 6: generate_kmeans_model
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def generate_kmeans_model(rdd, k):
    # Note: `runs` was deprecated in Spark 1.6 and removed in Spark 2.0;
    # drop the argument on newer versions.
    return KMeans.train(rdd, k, maxIterations=10, runs=30,
                        initializationMode="random", seed=50,
                        initializationSteps=5, epsilon=1e-4)
Example 7: test_bisecting_kmeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_bisecting_kmeans(self):
    from pyspark.mllib.clustering import BisectingKMeans
    data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
    bskm = BisectingKMeans()
    model = bskm.train(self.sc.parallelize(data, 2), k=4)
    p = array([0.0, 0.0])
    rdd_p = self.sc.parallelize([p])
    self.assertEqual(model.predict(p), model.predict(rdd_p).first())
    self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
    self.assertEqual(model.k, len(model.clusterCenters))
Example 8: test_kmeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans(self):
    from pyspark.mllib.clustering import KMeans
    data = [
        [0, 1.1],
        [0, 1.2],
        [1.1, 0],
        [1.2, 0],
    ]
    clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||",
                            initializationSteps=7, epsilon=1e-4)
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
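A common follow-up after KMeans.train is to check clustering quality with the model's computeCost method, which returns the sum of squared distances from each point to its nearest center (WSSSE). A standalone sketch on the same toy data as the test above:

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext(appName="kmeans-cost")
data = sc.parallelize([[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]])
model = KMeans.train(data, 2, initializationMode="k-means||", epsilon=1e-4)
print(model.computeCost(data))  # lower WSSSE means tighter clusters
sc.stop()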
Example 9: test_gmm
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_gmm(self):
    from pyspark.mllib.clustering import GaussianMixture
    data = self.sc.parallelize([
        [1, 2],
        [8, 9],
        [-4, -3],
        [-6, -7],
    ])
    clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=10, seed=1)
    labels = clusters.predict(data).collect()
    self.assertEqual(labels[0], labels[1])
    self.assertEqual(labels[2], labels[3])
Example 10: test_gmm_deterministic
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_gmm_deterministic(self):
    from pyspark.mllib.clustering import GaussianMixture
    x = range(0, 100, 10)
    y = range(0, 100, 10)
    data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
    clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=10, seed=63)
    clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=10, seed=63)
    for c1, c2 in zip(clusters1.weights, clusters2.weights):
        self.assertEqual(round(c1, 7), round(c2, 7))
Example 11: test_clustering
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_clustering(self):
    from pyspark.mllib.clustering import KMeans
    data = [
        self.scipy_matrix(3, {1: 1.0}),
        self.scipy_matrix(3, {1: 1.1}),
        self.scipy_matrix(3, {2: 1.0}),
        self.scipy_matrix(3, {2: 1.1})
    ]
    clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
Example 12: test_classification
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                            categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
Example 13: test_regression
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
Example 14: test_trainOn_model
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_trainOn_model(self):
    """Test the model on toy data with four clusters."""
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(
        centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = []
    for offset in offsets:
        batches.append([[offset[0] + center[0], offset[1] + center[1]]
                        for center in initCenters])

    batches = [self.sc.parallelize(batch, 1) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)
    self.ssc.start()

    # Give enough time to train the model.
    def condition():
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
        return True

    self._eventually(condition, catch_assertions=True)
Example 15: test_fpgrowth
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_fpgrowth(self):
    # `FPGrowth` comes from pyspark.mllib.fpm, imported at module level in the source.
    data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
    rdd = self.sc.parallelize(data, 2)
    model1 = FPGrowth.train(rdd, 0.6, 2)
    # Use the default number of data partitions when numPartitions is not specified.
    model2 = FPGrowth.train(rdd, 0.6)
    self.assertEqual(sorted(model1.freqItemsets().collect()),
                     sorted(model2.freqItemsets().collect()))