This article collects typical usage examples of the Python class pyspark.mllib.clustering.GaussianMixture. If you are wondering what GaussianMixture is for, how to call it, or what real uses look like, the curated class examples below should help.
The following shows 11 code examples of the GaussianMixture class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
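Before the individual examples, here is a minimal self-contained sketch of the typical GaussianMixture workflow; the toy points and the app name are illustrative and not taken from any example below.

from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture

sc = SparkContext(appName="GMMQuickStart")
# Two well-separated 2-D point clouds.
data = sc.parallelize([[1.0, 2.0], [1.5, 1.8], [8.0, 8.0], [8.2, 7.9]])
# Fit a 2-component Gaussian mixture with EM.
gmm = GaussianMixture.train(data, 2, convergenceTol=1e-3, maxIterations=100, seed=42)
for i in range(2):
    print("weight =", gmm.weights[i],
          "mu =", gmm.gaussians[i].mu,
          "sigma =", gmm.gaussians[i].sigma.toArray())
# Hard cluster assignment for every point.
print(gmm.predict(data).collect())
sc.stop()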
Example 1: test_gmm_deterministic
def test_gmm_deterministic(self):
    from pyspark.mllib.clustering import GaussianMixture
    x = range(0, 100, 10)
    y = range(0, 100, 10)
    data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
    # Two runs with identical settings and the same seed must produce the same mixture weights.
    clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=100, seed=63)
    clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=100, seed=63)
    for c1, c2 in zip(clusters1.weights, clusters2.weights):
        self.assertEqual(round(c1, 7), round(c2, 7))
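Continuing the test above as a sketch (the alternate seed here is hypothetical): two runs that differ only in seed are not guaranteed to agree, which is exactly why the assertion pins seed=63.

# Same data and settings as above, but a different (hypothetical) seed.
clusters3 = GaussianMixture.train(data, 5, convergenceTol=0.001, maxIterations=100, seed=7)
# These weights may differ from clusters1.weights; only identical seeds guarantee identical results.
print(clusters1.weights)
print(clusters3.weights)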
Example 2: test_gmm_with_initial_model
def test_gmm_with_initial_model(self):
    from pyspark.mllib.clustering import GaussianMixture
    data = self.sc.parallelize([
        (-10, -5), (-9, -4), (10, 5), (9, 4)
    ])
    gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                 maxIterations=10, seed=63)
    # Warm-starting from gmm1 should leave the converged weights unchanged.
    gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                 maxIterations=10, seed=63, initialModel=gmm1)
    self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)
Example 3: gmm_spark
def gmm_spark(sc, X=None, clusters=3):
    if X is None:
        # users_as_parallelizable_sparse_data and users are defined elsewhere in this project.
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    gmm = GaussianMixture.train(X, k=clusters)
    # Print every fitted component (the original looped over range(2) regardless of k).
    for i in range(clusters):
        print("weight =", gmm.weights[i], "mu =", gmm.gaussians[i].mu,
              "sigma =", gmm.gaussians[i].sigma.toArray())
Example 4: test_gmm
def test_gmm(self):
    from pyspark.mllib.clustering import GaussianMixture
    data = self.sc.parallelize([[1, 2], [8, 9], [-4, -3], [-6, -7]])
    clusters = GaussianMixture.train(data, 2, convergenceTol=0.001, maxIterations=100, seed=56)
    labels = clusters.predict(data).collect()
    # Nearby points should end up in the same cluster.
    self.assertEqual(labels[0], labels[1])
    self.assertEqual(labels[2], labels[3])
Example 5: array
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture  # was missing; GaussianMixture.train is used below

sc = SparkContext()
data = sc.textFile("./coord.txt")
# Parse each comma-separated line of coord.txt into a float vector.
parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(',')]))
k = 3
gmm = GaussianMixture.train(parsedData, k)
# Optional: inspect the fitted components.
# for i in range(k):
#     print("w =", gmm.weights[i])
#     print("sigma =", gmm.gaussians[i].sigma.toArray())
#     print("mu =", gmm.gaussians[i].mu)
# Write the mixture weights to disk, one per line.
with open("./GMM.txt", 'w') as f:
    for j in range(k):
        f.write(str(gmm.weights[j]) + '\n')
Example 6: enumerate
row_num = info_df.filter(info_df.high == 'IT').count()
# Flatten the per-repo records into the rows matrix (repos, cols and rows come from earlier in this script).
for index, repo in enumerate(repos):
    for pk_aids in repo:
        elements = repo.get(pk_aids)
        for element in elements:
            for col_index, col in enumerate(cols):
                if element.get(col) is not None:
                    rows[index].get(pk_aids)[col_index] = element.get(col)
                    print(element.get(col))
# Stack all non-empty row vectors into a single data matrix.
for index, row in enumerate(rows):
    for pk_aids in row:
        if rows[index].get(pk_aids) is not None:
            if index == 0:
                data = rows[index].get(pk_aids)
            else:
                data = np.concatenate((data, rows[index].get(pk_aids)), axis=0)
print(data)
# Parameters of GaussianMixture.train:
#   data – RDD of data points
#   k – number of components
#   convergenceTol – threshold for the convergence criterion; defaults to 1e-3
#   maxIterations – maximum number of iterations; defaults to 100
#   seed – random seed
#   initialModel – GaussianMixtureModel to initialize learning from
# Note: train expects an RDD, so data likely needs sc.parallelize(data) first.
model = GaussianMixture.train(data, 10, convergenceTol=0.0001, maxIterations=50)
labels = model.predict(data).collect()
Example 7: range
df = pd.DataFrame(l, index=['gp1_P', 'gp2_P', 'gp3_P', 'gp4_P', 'gp5_P', 'gp6_P'],
                  columns=['gp1_R', 'gp2_R', 'gp3_R', 'gp4_R', 'gp5_R', 'gp6_R'])
df
# ### Interpretation (to be finished)
# With Kmeans, two groups stand out: 4 and 6.
# Group gp1_P gathers 123 of the individuals and clearly mixes gp1_R / gp2_R / gp3_R.
# ## Gaussian Mixture
# In[12]:
from pyspark.mllib.clustering import GaussianMixture
# Build the model with the same dataTrain as for Kmeans.
gmm = GaussianMixture.train(dataTrain, 6)
# Print the parameters of all six components (the original looped over only the first two).
for i in range(6):
    print("weight =", gmm.weights[i], "mu =", gmm.gaussians[i].mu,
          "sigma =", gmm.gaussians[i].sigma.toArray())
# ### Interpretation (to be finished)
# # Evaluation measures (in progress)
# In[30]:
from pyspark.mllib.evaluation import MultilabelMetrics
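The evaluation section above stops at the import. As a minimal sketch of one way to inspect the fit (assuming the dataTrain RDD and the gmm model from this example), the component sizes and soft memberships can be read directly off the model:

# Hard assignments: how many points each Gaussian component captured.
sizes = gmm.predict(dataTrain).countByValue()
print(sorted(sizes.items()))
# Soft assignments: per-point membership probabilities over the 6 components.
print(gmm.predictSoft(dataTrain).take(3))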
Example 8: SparkConf
:param convergenceTol: convergence threshold; defaults to 1e-3
:param maxIterations: number of EM iterations to perform; defaults to 100
:param seed: random seed
"""
# argparse, random, SparkConf, SparkContext and parseVector are imported/defined earlier in this script.
parser = argparse.ArgumentParser()
parser.add_argument('inputFile', help='Input File')
parser.add_argument('k', type=int, help='Number of clusters')
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
parser.add_argument('--seed', default=random.getrandbits(19),
                    type=int, help='Random seed')  # was type=long (Python 2 only)
args = parser.parse_args()
conf = SparkConf().setAppName("GMM")
sc = SparkContext(conf=conf)
lines = sc.textFile(args.inputFile)
data = lines.map(parseVector)
model = GaussianMixture.train(data, args.k, args.convergenceTol,
                              args.maxIterations, args.seed)
for i in range(args.k):
    print("weight =", model.weights[i], "mu =", model.gaussians[i].mu,
          "sigma =", model.gaussians[i].sigma.toArray())
print()
print("The membership value of each vector to all mixture components (first 100):",
      model.predictSoft(data).take(100))
print()
print("Cluster labels (first 100):", model.predict(data).take(100))
sc.stop()
Example 9: SparkContext
# -*- coding:utf-8 -*-
"""
Program: GMM
Description: Example of calling Spark's built-in GMM algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:38:58
Last modified: 2016-01-14 13:50:11
Python release: 2.7
"""
# Use Spark's built-in clustering to reproduce the Chapter 10 example
# from "Machine Learning in Action".
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture

if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.textFile('testSet.txt')
    clusters_num = 4
    parseData = datas.map(lambda x: array([float(y) for y in x.split('\t')]))
    model = GaussianMixture.train(parseData, clusters_num, maxIterations=10)
    clusters = [[] for i in range(clusters_num)]
    labels = model.predict(parseData).collect()
    points = parseData.collect()  # collect once, not inside the loop as before
    for i in range(len(labels)):
        clusters[labels[i]].append(points[i])
    print(clusters)
    sc.stop()
Example 10: dict
# print(data1.take(5))
# Without converting the features into dense vectors, a transformation with zero mean will raise
# an exception on sparse vectors.
# data2 will have unit variance and zero mean.
data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
parsedData = data2.map(lambda x: x[1])
parsedData.cache()
modelList = []
d = dict()
noClusters = 5
convergenceTol = 1e-3
maxIterations = 1000
seed = random.getrandbits(19)  # label, scaler1, features, Vectors, random come from earlier in this script
# Build the model (cluster the data).
gmm = GaussianMixture.train(parsedData, noClusters, convergenceTol,
                            maxIterations, seed)
# Output the parameters of the model.
for i in range(noClusters):  # was noOfClusters, an undefined name
    print("weight =", gmm.weights[i], "mu =", gmm.gaussians[i].mu,
          "sigma =", gmm.gaussians[i].sigma.toArray())
"""
for clusterSize in range(2, 21, 2):
    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, clusterSize, maxIterations=10,
                            runs=10, initializationMode="random")
    modelList.append(clusters)

# Evaluate clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))
"""
Example 11: SparkContext
from numpy import array  # was missing; needed by the parsing lambda below
from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")

    ### Local default options
    k = 2                   # "k" (int): number of Gaussians in the mixture model; default 2
    convergenceTol = 0.001  # "convergenceTol" (double): largest change in log-likelihood at which convergence is considered reached
    maxIterations = 150     # "maxIterations" (int): maximum number of iterations to run; default 100
    seed = None             # "seed" (long): random seed

    # Load and parse the data
    data = sc.textFile("/var/mdp-cloud/gmm_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')]))
    # filteredData = data.filter(lambda arr: int(arr[1]) != 0)

    # Build the model (cluster the data), passing the options defined above
    gmm = GaussianMixture.train(parsedData, k, convergenceTol=convergenceTol,
                                maxIterations=maxIterations, seed=seed)
    # gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    # gmm.save(sc, "GaussianMixtureModel_CV")
    # The following line would load the model back:
    # sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # Output the parameters of the model
    for i in range(k):
        print("weight =", gmm.weights[i], "mu =", gmm.gaussians[i].mu,
              "sigma =", gmm.gaussians[i].sigma.toArray())
    sc.stop()