本文整理汇总了Python中pyspark.mllib.random.RandomRDDs.normalVectorRDD方法的典型用法代码示例。如果您正苦于以下问题:Python RandomRDDs.normalVectorRDD方法的具体用法?Python RandomRDDs.normalVectorRDD怎么用?Python RandomRDDs.normalVectorRDD使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.random.RandomRDDs的用法示例。
在下文中一共展示了RandomRDDs.normalVectorRDD方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_col_norms
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
def test_col_norms(self):
    """colStats should expose per-column L1/L2 norms of a vector RDD.

    First checks that the norm vectors have one entry per column for
    random data, then verifies exact values on a small deterministic RDD.
    """
    import math

    # 1000 random 10-dimensional standard-normal vectors in 10 partitions.
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    # Deterministic single-column data: the values 0..9.
    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    # L1 norm = 0 + 1 + ... + 9 = 45.
    self.assertEqual(array([45.0]), summary2.normL1())
    # L2 norm = sqrt(0^2 + ... + 9^2); allow tiny floating-point error.
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
示例2: test_col_with_different_rdds
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
def test_col_with_different_rdds(self):
    """colStats should accept RDDs of numpy vectors, sequences, and arrays."""
    # RDD of numpy-backed random vectors: 1000 rows x 10 columns.
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())

    # RDD of plain Python sequences (10 identical rows of 0..9).
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())

    # RDD of array.array ("pyarray") values, double-typed.
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
示例3: exec
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
# A script to execute k-means clustering in Spark.
# To run, enter: >>> exec(open("./dokmeans.py").read())
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans
# RandomRDDs generates the random-data RDDs we need.
from pyspark.mllib.random import RandomRDDs

# Generate random class data: add a cluster center to random 2D points.
# Use the default number of partitions, or a definite number, so that the
# union will have samples across clusters.
# NOTE(review): the original used Python 2 long literals (1L, 2L, ...), which
# are syntax errors in Python 3; plain ints behave identically here.
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

# Concatenate RDDs with the .union(other) function.
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this now has all points, as an RDD

# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
# Try: help(KMeans.train) to see all parameter options.
my_kmmodel = KMeans.train(my_data, k=1,
                          maxIterations=20, runs=1,
                          initializationMode='k-means||', seed=10)
示例4: len
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
if __name__ == "__main__":
    # Accept zero or one command-line argument; anything else is a usage error.
    # NOTE(review): indentation in this scraped snippet had been stripped;
    # restored here. `sys`, `SparkContext`, and `RandomRDDs` are presumably
    # imported earlier in the full script — verify against the original file.
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD — scalar samples from N(0, 1).
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print(' First 5 samples:')
    for sample in normalRDD.take(5):
        print(' ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD — length-2 vector samples.
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print(' First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print(' ' + str(sample))
    print()

    sc.stop()
示例5:
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""
# NOTE(review): indentation in this scraped snippet appears to have been
# stripped, and the `with tf.Graph()` block at the bottom continues beyond
# this excerpt — compare against the original source before running.
import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np
# Problem size: 100 rows of 4-dimensional random vectors, clustered into k=2.
num_features = 4
k = 2
# TODO: does not work with 1
# Each RDD element is wrapped in a singleton list so createDataFrame below
# yields rows with a single column.
data = RandomRDDs.normalVectorRDD(
sc,
numCols=num_features,
numRows=100,
seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")
# For now, analysis is still required.
df0 = tfs.analyze(df)
# Random initial centers: one row of num_features values per cluster.
init_centers = np.random.randn(k, num_features)
# For debugging
# Take 10 rows and index away the wrapping-list dimension.
block = np.array(data.take(10))[::,0,::]
# Find the distances first
with tf.Graph().as_default() as g:
# NOTE(review): the statements below belong inside this `with` block
# (indentation lost in scraping).
points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
num_points = tf.shape(points)[0]
#centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
示例6: test_col_norms
# 需要导入模块: from pyspark.mllib.random import RandomRDDs [as 别名]
# 或者: from pyspark.mllib.random.RandomRDDs import normalVectorRDD [as 别名]
def test_col_norms(self):
# NOTE(review): duplicate of 示例1 above, apparently truncated at the end of
# this excerpt; body indentation was also stripped by the scraper, so the
# code is left byte-identical rather than restyled.
# 1000 random 10-dimensional standard-normal vectors in 10 partitions.
data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
summary = Statistics.colStats(data)
# One norm entry per column is expected.
self.assertEqual(10, len(summary.normL1()))
self.assertEqual(10, len(summary.normL2()))