This article collects typical usage examples of the Python method pyspark.mllib.clustering.KMeans.train. If you have been wondering what exactly KMeans.train does, how to call it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage of the enclosing class, pyspark.mllib.clustering.KMeans.
The following 15 code examples of KMeans.train are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
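Before the examples, here is a minimal, self-contained sketch of the call they all build on. The keyword arguments shown are part of the MLlib API; the sample points and the application name are made up for illustration:

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext(appName="KMeansMinimalExample")
# Two well-separated groups of 2-D points, invented for illustration.
data = sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.1]])

# Train a 2-cluster model; "k-means||" is MLlib's parallel variant of k-means++.
model = KMeans.train(data, 2, maxIterations=10,
                     initializationMode="k-means||", seed=42)

print(model.clusterCenters)         # one numpy array per cluster center
print(model.predict([0.05, 0.05]))  # index of the nearest cluster
sc.stop()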
Example 1: test_kmeans_deterministic
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans_deterministic(self):
    from numpy import array_equal
    from pyspark.mllib.clustering import KMeans
    X = range(0, 100, 10)
    Y = range(0, 100, 10)
    data = [[x, y] for x, y in zip(X, Y)]
    clusters1 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    clusters2 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    centers1 = clusters1.centers
    centers2 = clusters2.centers
    for c1, c2 in zip(centers1, centers2):
        # TODO: Allow small numeric difference.
        self.assertTrue(array_equal(c1, c2))
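Because both calls receive the same data and the same seed, the k-means|| initialization picks identical starting centers and both runs converge to the same model, which is what makes this test deterministic.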
Example 2: clusterKMeanSpark
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
import numpy as np
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

def clusterKMeanSpark(matrix, k):
    # transformInRealMatrix is a project-local helper that converts the input
    # into a plain list of numeric rows.
    m = transformInRealMatrix(matrix)
    sc = SparkContext(appName="Jsonizer: Remove stop words")
    parsedData = sc.parallelize(m)
    y = []
    x = []
    # range(k, k + 1) runs exactly once; widen it to scan several values of k.
    clustersControl = range(k, k + 1)
    for kc in clustersControl:
        clusters = KMeans.train(parsedData, kc, maxIterations=50000, runs=200,
                                initializationMode="k-means||", epsilon=0.0001)
        clu = []

        def error(point, clust):
            # Euclidean distance from a point to its assigned cluster center.
            center = clust.centers[clust.predict(point)]
            return sqrt(sum([d ** 2 for d in (point - center)]))

        WSSSE = parsedData.map(lambda point: error(point, clusters)).reduce(lambda a, b: a + b)
        for n in m:
            clu += [clusters.predict(np.array(n))]
        x += [kc]
        y += [WSSSE]
        #print(kc, WSSSE)
    #plt.plot(x, y)
    #plt.ylabel('some numbers')
    #plt.show()
    # Group row indices by their assigned cluster.
    ret = [[] for i in range(0, max(clu) + 1)]
    for i in range(0, len(clu)):
        ret[clu[i]] += [i]
    sc.stop()
    return ret
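A note on the metric: despite the name WSSSE (Within Set Sum of Squared Errors), the error function above returns the plain Euclidean distance (the square root of the sum of squares), so what gets summed is distances rather than squared errors. Also, the runs parameter used here was deprecated in Spark 1.6 and has no effect from Spark 2.0 on, where training always performs a single run.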
Example 3: train_subquantizers
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def train_subquantizers(sc, split_vecs, M, subquantizer_clusters, model, seed=None):
    """
    Project each data point into its local space and compute subquantizers by clustering
    each fine split of the locally projected data.
    """
    # Broadcast the model once so executors do not reserialize it per task.
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in range(M):
        # Bind the loop variable as a default argument so the lazily evaluated
        # map does not pick up a later value of `split`.
        data = split_vecs.map(lambda x, s=split: x[s])
        data.cache()
        sub = KMeans.train(data, subquantizer_clusters, initializationMode='random',
                           maxIterations=10, seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    # Integer division keeps the split index an int under Python 3.
    half = len(subquantizers) // 2
    return (subquantizers[:half], subquantizers[half:])
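Broadcasting the model is the standard MLlib pattern for using a sizeable read-only object inside RDD transformations: each executor receives one copy of the broadcast value instead of having the model serialized into every task closure.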
Example 4: main
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load the pickled noun-to-vector dictionary (the file location, MIN_K,
    # OUT_FILES_LOC and logger are module-level names defined elsewhere)
    logger.info('Loading pickled noun to vector dictionary')
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as pickled:
        noun_to_vect_dict = pickle.load(pickled)
    # Create vector array from the mapping; list() is needed under Python 3,
    # where dict.values() returns a view rather than a list
    vectors = np.array(list(noun_to_vect_dict.values()))
    max_k = int(sqrt(len(vectors) / 2.0))
    # Define search space for k
    numbers_of_clusters = reversed(range(MIN_K, max_k))
    # For each k
    for i, k in enumerate(numbers_of_clusters):
        # Initialize Spark Context
        sc = ps.SparkContext()
        # Load data
        data = sc.parallelize(vectors, 1024)
        logger.info('Trial %i of %i, %i clusters', (i + 1), max_k - 1, k)
        # Calculate cluster
        kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                    initializationMode='k-means||')
        logger.info('Calculating WSSSE')
        # Calculate WSSSE; error() is a module-level helper defined elsewhere
        WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                    .reduce(lambda x, y: x + y)
        logger.info('Writing WSSSE')
        # Append k and WSSSE to the elbow data file
        with open(path.join(OUT_FILES_LOC, 'elbow_data.txt'), 'a') as elbow_data:
            elbow_data.write(str(k) + '\t' + str(WSSSE) + '\n')
        sc.stop()
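Note that each trial creates its own SparkContext and stops it at the end of the loop body; only one SparkContext may be active at a time, so the stop() call is what allows the next iteration to start cleanly.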
Example 5: main
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def main(sc):
    # stopwords comes from nltk.corpus; HashingTF and IDF come from
    # pyspark.mllib.feature.
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print(wordArr)
    #tokens = sc.textFile("hdfs:/adi/tokens1.txt")
    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
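HashingTF produces sparse vectors of the requested dimensionality (100,000 here), and KMeans.train accepts sparse vectors directly, so the TF-IDF output can be clustered without converting it to dense form.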
Example 6: test_kmeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def test_kmeans(self):
    from pyspark.mllib.clustering import KMeans
    data = [[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]]
    clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
    # The two points near (0, 1) should share a cluster, as should the two near (1, 0).
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
Example 7: main
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load the pickled noun-to-vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)
    # Create vectors array; list() is needed under Python 3
    vectors = list(noun_to_vect_dict.values())
    # Initialize Spark Context
    sc = ps.SparkContext('local[*]')
    # Load data
    data = sc.parallelize(vectors, 1024)
    # Create and fit a KMeans model to the data
    logger.info('Fitting KMeans model')
    kmeans_model = KMeans.train(data, N_CLUSTERS, maxIterations=10, runs=10,
                                initializationMode='k-means||')
    # Create a list of labels corresponding to vectors
    logger.info('Labeling vectors')
    labels = [kmeans_model.predict(vector) for vector in vectors]
    # Write to text file
    logger.info('Writing labels to file')
    with open(path.join(OUT_FILE_LOC, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(str(label) + '\n')
Example 8: KMeansModel
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def KMeansModel(dataPath, label, k, character, master):
    # km is pyspark.mllib.clustering.KMeans imported under an alias (see the
    # import note above); add is operator.add.
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))
    if label == 0:
        # The class label is in the first column.
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda fields: (float(fields[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda fields: float(fields[0])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda fields: [float(part) for part in fields[1:]])
    else:
        # The class label is in the last column.
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda fields: (float(fields[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda fields: float(fields[-1])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda fields: [float(part) for part in fields[:-1]])
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    # Purity-style accuracy: each true label is credited with the cluster that
    # captures most of its points.
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype=int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)
    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        string += str(i) + ":" + str(center[i]) + '\n'
    string += "Acc: " + str((float(acc) / train) * 100) + "%"
    sc.stop()
    return string
Example 9: kMeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def kMeans(vecs, clusterNum):
    clusters = KMeans.train(vecs, clusterNum, maxIterations=10, runs=10,
                            initializationMode="random")
    if pv.outputDebugMsg:
        Utils.logMessage("\nKmean cluster finished")
    return clusters
Example 10: fit
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def fit(self, Z):
    """Compute k-means clustering.

    Parameters
    ----------
    Z : ArrayRDD or DictRDD containing array-like or sparse matrix
        Train data.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))
    if self.init == 'k-means||':
        # Delegate to MLlib's distributed implementation.
        self._mllib_model = MLlibKMeans.train(
            X.unblock(),
            self.n_clusters,
            maxIterations=self.max_iter,
            initializationMode="k-means||")
        self.cluster_centers_ = self._mllib_model.centers
        return self
    else:
        # Fit a local scikit-learn KMeans per partition, then cluster the
        # collected per-partition centers into the final model.
        models = X.map(lambda X_part: super(SparkKMeans, self).fit(X_part))
        models = models.map(lambda model: model.cluster_centers_).collect()
        return super(SparkKMeans, self).fit(np.concatenate(models))
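Note the two code paths: with init='k-means||' the work is delegated entirely to MLlib's distributed implementation, while any other init value falls back to fitting a local scikit-learn KMeans on each partition and then clustering the collected per-partition centers into the final model.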
Example 11: train_model
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def train_model(self, dataframe, k, model_name):
    '''
    Use the given data to train a k-means model.
    :param dataframe: all columns used for training
    :param k: the number of clusters
    :param model_name: name under which the trained model is saved
    :return: None
    '''
    data = self.prepare_data(dataframe)
    # train to get the model
    model = KMeans.train(data, k)
    # create the model saving path
    path = self.base + model_name
    # try to delete the old model if it exists
    try:
        import subprocess
        subprocess.call(["hadoop", "fs", "-rm", "-f", path])
    except Exception:
        pass
    # save the new model on HDFS
    model.save(self.sc, path)
    # print every cluster center of the model, rounded to two decimal places
    for c in model.clusterCenters:
        l = []
        for i in c:
            i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
            l.append(float(i))
        print(l)
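The hadoop fs -rm step exists because MLlib's model.save will not overwrite an existing path; deleting the old model first makes the save behave like an overwrite.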
Example 12: kmeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def kmeans(iterations, theRdd):
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    # Note: the second positional argument of KMeans.train is k, so despite
    # its name, `iterations` is used here as the number of clusters; the
    # iteration cap is the separate maxIterations keyword.
    clusters = KMeans.train(theRdd, iterations, maxIterations=10,
                            runs=10, initializationMode="random")
    WSSSE = theRdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    return WSSSE, clusters
Example 13: main
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def main(arg1, arg2):
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(arg1)
    # parseVector is a module-level helper (defined elsewhere) that turns a
    # text line into a numeric vector.
    data = lines.map(parseVector)
    k = int(arg2)
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Example 14: spark_KMeans
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def spark_KMeans(train_data):
    maxIterations = 10
    runs = 20
    numClusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    errors = []
    for k in numClusters:
        model = KMeans.train(train_data, k, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
        WSSSE = model.computeCost(train_data)
        errors.append(WSSSE)
    # Plot the within-cluster cost against k (plt is matplotlib.pyplot).
    plt.plot(numClusters, errors, 'ro')
    plt.xlabel(r'k')
    plt.ylabel(r'inertia')
    plt.title(r'inertia vs. k')
    plt.savefig('kmeans_cross_validation.png')
    # k = 6 was chosen by inspecting the elbow in the saved plot.
    bestModel = KMeans.train(train_data, 6, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
    return bestModel
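This is the elbow method: WSSSE generally decreases as k grows, so instead of picking the k with the lowest cost, one picks the k where the curve stops dropping sharply; here that inspection led to the hard-coded choice of 6 clusters.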
Example 15: build_cluster_model
# Required import: from pyspark.mllib.clustering import KMeans [as alias]
# Or: from pyspark.mllib.clustering.KMeans import train [as alias]
def build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs):
    """Perform the clustering of vectors using K-means.

    Returns:
        k-means model learned from the training data in
        tfidf_vectors_rdd
    """
    # Build the model (cluster the training data)
    return KMeans.train(tfidf_vectors_rdd, num_clusters,
                        maxIterations=max_iterations, runs=runs)