This article collects typical usage examples of the Python class pyspark.mllib.random.RandomRDDs. If you are wondering what RandomRDDs is for, how to use it, or where to find concrete examples of it, the curated class examples below may help.
The code examples of the RandomRDDs class shown below are sorted by popularity by default.
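Before the collected examples, here is a minimal sketch of the class itself, assuming an already running SparkContext named sc (for instance in a pyspark shell). RandomRDDs is a factory for RDDs of i.i.d. random values, either as scalar doubles or as MLlib vectors:

from pyspark.mllib.random import RandomRDDs

# 100 i.i.d. doubles drawn from U(0, 1), spread over 2 partitions.
uniform = RandomRDDs.uniformRDD(sc, size=100, numPartitions=2, seed=42)
# 100 i.i.d. doubles drawn from the standard normal N(0, 1).
normal = RandomRDDs.normalRDD(sc, size=100, numPartitions=2, seed=42)
# 100 rows of length-3 vectors with N(0, 1) entries.
vectors = RandomRDDs.normalVectorRDD(sc, numRows=100, numCols=3, seed=42)
print(uniform.mean(), normal.mean(), vectors.first())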
Example 1: test_col_norms
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())

    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
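For readers unfamiliar with the summary API used above: normL1() and normL2() return the column-wise L1 and L2 norms of the input vectors. A minimal check against NumPy, again assuming a SparkContext sc, might look like this (illustrative only):

import numpy as np
from pyspark.mllib.stat import Statistics

rows = [[1.0, -2.0], [3.0, 4.0]]
summary = Statistics.colStats(sc.parallelize(rows))
np_l1 = np.abs(np.array(rows)).sum(axis=0)           # column-wise L1 norms
np_l2 = np.sqrt((np.array(rows) ** 2).sum(axis=0))   # column-wise L2 norms
assert np.allclose(summary.normL1(), np_l1)
assert np.allclose(summary.normL2(), np_l2)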
Example 2: test_col_with_different_rdds
def test_col_with_different_rdds(self):
    # numpy-backed vectors
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())
    # plain Python lists
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
    # array.array values (imported as pyarray)
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
Example 3: exec
# A script to run K-means clustering in Spark.
# To run from the pyspark shell: >>> exec(open("./dokmeans.py").read())
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans
# RandomRDDs is needed to generate the random data RDDs.
from pyspark.mllib.random import RandomRDDs

# Generate random class data by adding a cluster center to random 2D points.
# Use the default number of partitions, or a definite number, so that the union
# will have samples across clusters.
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

# Concatenate two RDDs with the .union(other) method.
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this RDD now holds all the points

my_kmmodel = KMeans.train(my_data, k=1,
                          maxIterations=20, runs=1,
                          initializationMode='k-means||', seed=10)
# Try help(KMeans.train) to see the parameter options.
# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
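The script stops right after training. As a brief, hypothetical follow-up (not part of the original), the fitted KMeansModel can be inspected through its clusterCenters attribute and predict method:

# Hypothetical continuation: inspect the trained model.
print(my_kmmodel.clusterCenters)        # one center per cluster, as numpy arrays
print(my_kmmodel.predict([4.0, 6.0]))   # cluster index for a single point
labels = my_kmmodel.predict(my_data)    # RDD of cluster indices, one per input point
print(labels.countByValue())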
Example 4: len
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('  ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('  ' + str(sample))
    print()
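This snippet is truncated: the fraction variable is defined but never used here. As a hedged complement (not in the original file), a similar block for another factory method, RandomRDDs.poissonRDD, could continue the same __main__ block; the mean of 2.0 is an arbitrary choice:

    # Complementary sketch: i.i.d. samples from a Poisson distribution with mean 2.0.
    poissonRDD = RandomRDDs.poissonRDD(sc, mean=2.0, size=numExamples, seed=1)
    print('Generated RDD of %d examples sampled from Poisson(2.0)' % poissonRDD.count())
    print('  First 5 samples:')
    for sample in poissonRDD.take(5):
        print('  ' + str(sample))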
Example 5: test_to_java_object_rdd
def test_to_java_object_rdd(self):  # SPARK-6660
    data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
    self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example 6:
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""
import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np
num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(
sc,
numCols=num_features,
numRows=100,
seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")
# For now, analysis is still required.
df0 = tfs.analyze(df)
init_centers = np.random.randn(k, num_features)
# For debugging
block = np.array(data.take(10))[::,0,::]
# Find the distances first
with tf.Graph().as_default() as g:
points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
num_points = tf.shape(points)[0]
#centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
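The snippet breaks off inside the graph definition. As a rough, hypothetical continuation of the step the last comment announces (this is not the author's code), the squared distances from every point to every center can be computed by broadcasting and the nearest center picked with argmin, using the TensorFlow 1.x API the snippet already relies on:

# Hypothetical continuation: pairwise squared distances and cluster assignments.
with tf.Graph().as_default():
    points = tf.placeholder(tf.float64, shape=[None, num_features], name='points')
    centers = tf.constant(init_centers, dtype=tf.float64, name='centers')
    # Broadcast to shape [num_points, k, num_features], then reduce over the feature axis.
    diffs = tf.expand_dims(points, 1) - tf.expand_dims(centers, 0)
    sq_distances = tf.reduce_sum(tf.square(diffs), axis=2)   # shape [num_points, k]
    assignments = tf.argmin(sq_distances, axis=1)            # nearest center per point
    with tf.Session() as sess:
        print(sess.run(assignments, feed_dict={points: block}))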
Example 7: SparkContext
"""
Testing with Random data generation
https://spark.apache.org/docs/latest/mllib-statistics.html
"""
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext

sc = SparkContext("local", "Rubbish")

# Generate a random double RDD that contains 1 million i.i.d. values drawn
# uniformly from [0.0, 1.0], evenly distributed in 10 partitions.
u = RandomRDDs.uniformRDD(sc, 1000000, 10)
# Apply a transform to shift and scale the values onto the interval [1.0, 3.0].
v = u.map(lambda x: 1.0 + 2.0 * x)
print(v)
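Since the docstring points at the MLlib statistics guide, a natural follow-up (not in the original snippet) is to summarize randomly generated vectors with Statistics.colStats:

# Hypothetical follow-up: basic column statistics over randomly generated vectors.
from pyspark.mllib.stat import Statistics

vecs = RandomRDDs.normalVectorRDD(sc, numRows=1000, numCols=3, seed=7)
summary = Statistics.colStats(vecs)
print(summary.mean())      # per-column means, close to 0
print(summary.variance())  # per-column variances, close to 1
print(summary.count())     # number of rows (1000)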