当前位置: 首页>>代码示例>>Python>>正文


Python KMeans.fit方法代码示例

本文整理汇总了Python中pyspark.ml.clustering.KMeans.fit方法的典型用法代码示例。如果您正苦于以下问题:Python KMeans.fit方法的具体用法?Python KMeans.fit怎么用?Python KMeans.fit使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.clustering.KMeans的用法示例。


在下文中一共展示了KMeans.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: kmeans

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def kmeans(df):
    """Fit a 2-cluster KMeans model on ``df`` and write out the labelled rows.

    Prints the number of cluster centers, then passes the "features" and
    "prediction" columns of the transformed frame to ``dfwrite`` under the
    name ``'kmFeatures'``.

    :param df: DataFrame to cluster. No ``featuresCol`` is passed to KMeans,
        and "features" is selected from the transformed output.
    """
    estimator = KMeans(k=2, seed=1)
    model = estimator.fit(df)
    centers = model.clusterCenters()
    # Call-style print gives the same output on Python 2 and Python 3.
    print(len(centers))
    kmFeatures = model.transform(df).select("features", "prediction")
    dfwrite(kmFeatures, 'kmFeatures')
开发者ID:eason001,项目名称:imBot,代码行数:9,代码来源:yispark.py

示例2: test_kmeans_cosine_distance

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
 def test_kmeans_cosine_distance(self):
     """Check that consecutive pairs of rows share a cluster under cosine distance."""
     points = [[1.0, 1.0], [10.0, 10.0],
               [1.0, 0.5], [10.0, 4.4],
               [-1.0, 1.0], [-100.0, 90.0]]
     rows = [(Vectors.dense(p),) for p in points]
     df = self.spark.createDataFrame(rows, ["features"])
     model = KMeans(k=3, seed=1, distanceMeasure="cosine").fit(df)
     predicted = model.transform(df).collect()
     # The rows are laid out as three consecutive pairs; each pair must land
     # in the same cluster.
     for left, right in ((0, 1), (2, 3), (4, 5)):
         self.assertTrue(predicted[left].prediction == predicted[right].prediction)
开发者ID:Brett-A,项目名称:spark,代码行数:13,代码来源:test_algorithms.py

示例3: clustering

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def clustering(input_df, input_col_name, n):
    """Cluster rows with KMeans, then project them onto two principal components.

    Steps: keep the 'state', 'categories', 'stars' and ``input_col_name``
    columns, normalize ``input_col_name`` with p=1.0 into "features", fit
    KMeans with ``n`` clusters (seed 2), and run a 2-component PCA over the
    clustered rows.

    :param input_df: DataFrame containing the columns listed above.
    :param input_col_name: name of the column to normalize and cluster on.
    :param n: number of KMeans clusters.
    :return: cached DataFrame holding the KMeans output plus the "pc" column.
    """
    selected = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(selected)
    KMmodel = KMeans(k=n, seed=2).fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    # Fit and apply PCA on the clustered frame built above, so the result is
    # derived from this function's arguments rather than from an outside name.
    return pca.fit(predicted).transform(predicted).cache()
开发者ID:sam46,项目名称:Yelper,代码行数:13,代码来源:project.py

示例4: elbow

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def elbow(elbowset, clusters):
    """Collect the clustering cost for every candidate cluster count.

    For each ``k`` in ``clusters`` a ``KM`` model (seed 1) is trained on
    ``elbowset`` and ``computeCost`` is evaluated on the model's
    "features"/"prediction" output.

    :param elbowset: DataFrame to cluster.
    :param clusters: iterable of cluster counts to try.
    :return: list of costs, in the same order as ``clusters``.
    """
    costs = []
    for k in clusters:
        print("Training for cluster size {} ".format(k))
        model = KM(k=k, seed=1).fit(elbowset)
        labelled = model.transform(elbowset).select("features", "prediction")

        cost = computeCost(labelled, model)
        print("......................WSSE = {} ".format(cost))

        costs.append(cost)
    return costs
开发者ID:words-sdsc,项目名称:coursera,代码行数:16,代码来源:utils.py

示例5: test_kmeans_summary

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
 def test_kmeans_summary(self):
     """Verify the training summary exposed by a fitted two-cluster KMeans model."""
     points = [[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]]
     rows = [(Vectors.dense(p),) for p in points]
     df = self.spark.createDataFrame(rows, ["features"])
     model = KMeans(k=2, seed=1).fit(df)
     self.assertTrue(model.hasSummary)
     summary = model.summary
     # Column names and frame types reported by the summary.
     self.assertTrue(isinstance(summary.predictions, DataFrame))
     self.assertEqual(summary.featuresCol, "features")
     self.assertEqual(summary.predictionCol, "prediction")
     self.assertTrue(isinstance(summary.cluster, DataFrame))
     # Cluster count and iteration count expected for this input.
     self.assertEqual(len(summary.clusterSizes), 2)
     self.assertEqual(summary.k, 2)
     self.assertEqual(summary.numIter, 1)
开发者ID:Brett-A,项目名称:spark,代码行数:17,代码来源:test_training_summary.py

示例6: test_kmean_pmml_basic

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
 def test_kmean_pmml_basic(self):
     """Check that a KMeans model saved with format "pmml" is written as PMML text.

     The temporary directory created for the export is removed afterwards,
     whether or not the assertions pass.
     """
     import shutil  # local import: only needed for temp-dir cleanup

     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     try:
         km_path = path + "/km-pmml"
         model.write().format("pmml").save(km_path)
         pmml_text_list = self.sc.textFile(km_path).collect()
         pmml_text = "\n".join(pmml_text_list)
         self.assertIn("Apache Spark", pmml_text)
         self.assertIn("PMML", pmml_text)
     finally:
         # Do not leave the export directory behind on disk.
         shutil.rmtree(path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:18,代码来源:test_persistence.py

示例7: kmeans

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def kmeans(inputdir, df, alg, k):
    """Cluster ``df`` with KMeans, print an error figure and write the result.

    :param inputdir: forwarded unchanged to ``writeOutClu``.
    :param df: DataFrame with a "features" column (used for fitting) and a
        "labels" column (selected for the output).
    :param alg: forwarded unchanged to ``writeOutClu``.
    :param k: number of clusters; converted with ``int()`` for the estimator
        and forwarded unconverted to ``writeOutClu``.
    :return: whatever ``writeOutClu`` returns.
    """
    from math import sqrt
    from pyspark.ml.clustering import KMeans

    estimator = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20,
                       initMode="k-means||", featuresCol="features")
    model = estimator.fit(df)
    # Transform once and derive both projections from the same frame.
    transformed = model.transform(df)
    kmFeatures = transformed.select("labels", "prediction")
    erFeatures = transformed.select("features", "prediction")

    ###Evaluation
    # The centers do not change per row, so fetch them once.
    centers = model.clusterCenters()
    # NOTE: sqrt is applied per row, so this accumulates the Euclidean distance
    # of each point to its assigned center, even though the printed label says
    # "Squared Error".
    WSSSE = 0
    for features, prediction in erFeatures.collect():
        WSSSE += sqrt(sum([x**2 for x in (centers[prediction] - features)]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
开发者ID:eason001,项目名称:imPro,代码行数:19,代码来源:views.py

示例8: cluster

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def cluster():
    """Load the language dictionary, build a small demo frame and run KMeans on it.

    Creates (or reuses) a local SparkSession, shows a five-row frame with a
    single "id" column, assembles "feat1"/"feat2" into "features", fits a
    2-cluster KMeans model and shows the transformed frame.
    """
    # The context manager closes the JSON file. The doubled backslashes spell
    # the same path without relying on an invalid "\o" escape sequence.
    with open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8') as fh:
        # NOTE(review): ld is not used below; confirm whether it is still needed.
        ld = load(fh)

    spark = (
        SparkSession.builder
        .master("local")
        .appName("Word Count")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    df = spark.createDataFrame([["0"],
                                ["1"],
                                ["2"],
                                ["3"],
                                ["4"]],
                               ["id"])
    df.show()

    # NOTE(review): df as created above only has an "id" column; "feat1" and
    # "feat2" are not present in it. Confirm the intended input columns.
    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)

    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    # show() prints the table itself; wrapping it in print() would only add
    # a stray "None" line.
    transformed.show()
开发者ID:softlang,项目名称:wikionto,代码行数:26,代码来源:explore.py

示例9: KMeans

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Build (or reuse) the session for this example.
    spark = (
        SparkSession.builder
        .appName("KMeansExample")
        .getOrCreate()
    )

    # $example on$
    # Read the libsvm-formatted sample data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Fit k-means with two clusters and a fixed seed.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Assign every row to a cluster.
    predictions = model.transform(dataset)

    # Score the clustering with a default-configured ClusteringEvaluator.
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Print each cluster center on its own line.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
开发者ID:BaiBenny,项目名称:spark,代码行数:32,代码来源:kmeans_example.py

示例10: print

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
#Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)
# Apply centerAndScale (defined outside this excerpt) to every element of autoVector.
csAuto = autoVector.map(centerAndScale)
#csAuto.collect()
#csAuto.foreach(println)
print(csAuto)

#Create Spark Data Frame
# Each element of csAuto becomes a Row with a single "features" field.
autoRows = csAuto.map(lambda f:Row(features=f))
# NOTE(review): createDataFrame is called on the name SQLContext itself; confirm
# that it is bound to a context instance here rather than to the class.
autoDf = SQLContext.createDataFrame(autoRows)
autoDf.select("features").show(10)

# Fit a 3-cluster KMeans model (seed 1) and label the same frame it was fitted on.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
# The collected rows are not stored; the call's return value is discarded.
predictions.collect()
# NOTE(review): println is not a Python builtin; it must be defined elsewhere
# in the file for this line to run. Verify.
predictions.foreach(println)

#Plot the results in a scatter plot
# unstripData is defined outside this excerpt.
# NOTE(review): .map is called directly on the transform() output; confirm this
# works on the Spark version in use.
unstripped = predictions.map(unstripData)
predList=unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
# current_gni_final_maped and current_gni_rdd come from outside this excerpt.
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas=predictions.toPandas()
# NOTE(review): as_matrix() is not available in recent pandas releases; confirm
# the pandas version this script targets.
list_predictions_temp=list_predictions_pandas.as_matrix()
开发者ID:rzkhqq,项目名称:BigData4,代码行数:31,代码来源:current_gni.py

示例11: SparkContext

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]

from pyspark.mllib.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext

# sc = SparkContext(appName="test")
# sqlContext = SQLContext(sc)

# Four 2-D sample points, one single-column tuple per row.
data = [
    (Vectors.dense([0.0, 0.0]),),
    (Vectors.dense([1.0, 1.0]),),
    (Vectors.dense([9.0, 8.0]),),
    (Vectors.dense([8.0, 9.0]),),
]
df = sqlContext.createDataFrame(data, ["features"])

# Two clusters, fixed seed.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)

centers = model.clusterCenters()
# The collected rows are not stored; the expression's value is discarded.
model.transform(df).select("features", "prediction").collect()

开发者ID:zjffdu,项目名称:hadoop-spark,代码行数:18,代码来源:kmeans.py

示例12: display

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

# Train one KMeans model per iteration budget and record its cluster centers.
modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    # i is passed as maxIter; k, seed and initSteps stay fixed across runs.
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    # modelCenters ends up with one entry per value in iterations, in order.
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

# Print a header, then one line per recorded set of cluster centers.
# Call-style print gives the same output on Python 2 and Python 3.
print('modelCenters:')
for centroids in modelCenters:
  print(centroids)

# COMMAND ----------

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

def prepareSubplot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999',
开发者ID:smoltis,项目名称:spark,代码行数:33,代码来源:2-etl-kmeans_student.py

示例13: KMeans

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
# Run tfs.analyze over df and cache the result.
df0 = tfs.analyze(df).cache()


# Count both frames before any timing starts.
# NOTE(review): presumably this materializes the cached data so the timings
# below exclude loading. Confirm.
mllib_df.count()
df0.count()

# Seeded random starting centers, generated with shape (k, num_features).
np.random.seed(2)
init_centers = np.random.randn(k, num_features)
# start_centers and dataframe are aliases that are not used again in this excerpt.
start_centers = init_centers
dataframe = df0


# Timing A: KMeans.fit on mllib_df with "random" init and maxIter=num_iters.
ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
        "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

# Timing B: kmeanstf on df0 with tf_aggregate=False.
tb_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

# Timing C: kmeanstf on df0 with tf_aggregate=True; overwrites the results of B.
tc_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

# Elapsed wall-clock seconds for each of the three runs.
mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0

# Labels map as: "mllib" -> A, "tf+spark" -> B, "tf" -> C.
print("mllib:", mllib_dt, "tf+spark:",tf_dt, "tf:",tf2_dt)
开发者ID:databricks,项目名称:tensorframes,代码行数:33,代码来源:kmeans_demo.py

示例14: oneHotEncodeColumns

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

# One-hot encode the listed categorical columns.
dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out","GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])

dfhot.show(5)

# Training set
# Sort the column names so the assembled feature order is reproducible; a raw
# set has no guaranteed iteration order.
assembler = VectorAssembler(inputCols=sorted(set(dfhot.columns) | set(['stars','review_count'])), outputCol="features")
train = assembler.transform(dfhot)

# Kmeans set for 5 clusters
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
# Call-style print matches the other print calls here and runs on Python 2 and 3.
print("Model Created!")

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
for i in range(0,knum):
开发者ID:raul-arrabales,项目名称:Spark-Hands-on,代码行数:33,代码来源:Session6.py

示例15: assign_cluster

# 需要导入模块: from pyspark.ml.clustering import KMeans [as 别名]
# 或者: from pyspark.ml.clustering.KMeans import fit [as 别名]
def assign_cluster(data):
    """Fit a two-cluster KMeans on "features_scaled" and label ``data`` with it.

    :param data: DataFrame holding the "features_scaled" column.
    :return: ``data`` transformed by the fitted model, with the prediction
        written to the "label" column.
    """
    estimator = KMeans(
        k=2,
        seed=1,
        featuresCol="features_scaled",
        predictionCol="label",
    )
    return estimator.fit(data).transform(data)
开发者ID:datitran,项目名称:spark-tdd-example,代码行数:8,代码来源:clustering.py


注:本文中的pyspark.ml.clustering.KMeans.fit方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。