This article collects typical usage examples of the Python class pyspark.mllib.random.RandomRDDs. If you are wondering what RandomRDDs is for, how to use it, or where to find concrete examples of it, the curated class examples below may help.
The code examples of the RandomRDDs class shown below are sorted by popularity by default.
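Before the collected examples, here is a minimal sketch of the class itself, assuming an already running SparkContext named sc (for instance in a pyspark shell). RandomRDDs is a factory for RDDs of i.i.d. random values, either as scalar doubles or as MLlib vectors:

from pyspark.mllib.random import RandomRDDs

# 100 i.i.d. doubles drawn from U(0, 1), spread over 2 partitions.
uniform = RandomRDDs.uniformRDD(sc, size=100, numPartitions=2, seed=42)
# 100 i.i.d. doubles drawn from the standard normal N(0, 1).
normal = RandomRDDs.normalRDD(sc, size=100, numPartitions=2, seed=42)
# 100 rows of length-3 vectors with N(0, 1) entries.
vectors = RandomRDDs.normalVectorRDD(sc, numRows=100, numCols=3, seed=42)
print(uniform.mean(), normal.mean(), vectors.first())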
Example 1: test_col_norms
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())

    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
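For readers unfamiliar with the summary API used above: normL1() and normL2() return the column-wise L1 and L2 norms of the input vectors. A minimal check against NumPy, again assuming a SparkContext sc, might look like this (illustrative only):

import numpy as np
from pyspark.mllib.stat import Statistics

rows = [[1.0, -2.0], [3.0, 4.0]]
summary = Statistics.colStats(sc.parallelize(rows))
np_l1 = np.abs(np.array(rows)).sum(axis=0)           # column-wise L1 norms
np_l2 = np.sqrt((np.array(rows) ** 2).sum(axis=0))   # column-wise L2 norms
assert np.allclose(summary.normL1(), np_l1)
assert np.allclose(summary.normL2(), np_l2)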
Example 2: test_col_with_different_rdds
def test_col_with_different_rdds(self):
    # numpy-backed vectors
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())
    # plain Python lists
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
    # array.array values (imported as pyarray)
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
Example 3: exec
# A script to run K-means clustering in Spark.
# To run from the pyspark shell: >>> exec(open("./dokmeans.py").read())
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans
# RandomRDDs is needed to generate the random data RDDs.
from pyspark.mllib.random import RandomRDDs

# Generate random class data by adding a cluster center to random 2D points.
# Use the default number of partitions, or a definite number, so that the union
# will have samples across clusters.
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3).map(lambda v: np.add([4, 6], v))

# Concatenate two RDDs with the .union(other) method.
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this RDD now holds all the points

my_kmmodel = KMeans.train(my_data, k=1,
                          maxIterations=20, runs=1,
                          initializationMode='k-means||', seed=10)
# Try help(KMeans.train) to see the parameter options.
# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
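The script stops right after training. As a brief, hypothetical follow-up (not part of the original), the fitted KMeansModel can be inspected through its clusterCenters attribute and predict method:

# Hypothetical continuation: inspect the trained model.
print(my_kmmodel.clusterCenters)        # one center per cluster, as numpy arrays
print(my_kmmodel.predict([4.0, 6.0]))   # cluster index for a single point
labels = my_kmmodel.predict(my_data)    # RDD of cluster indices, one per input point
print(labels.countByValue())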
Example 4: len
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: random_rdd_generation", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print('Generated RDD of %d examples sampled from the standard normal distribution'
          % normalRDD.count())
    print('  First 5 samples:')
    for sample in normalRDD.take(5):
        print('  ' + str(sample))
    print()

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
    print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
    print('  First 5 samples:')
    for sample in normalVectorRDD.take(5):
        print('  ' + str(sample))
    print()
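This snippet is truncated: the fraction variable is defined but never used here. As a hedged complement (not in the original file), a similar block for another factory method, RandomRDDs.poissonRDD, could continue the same __main__ block; the mean of 2.0 is an arbitrary choice:

    # Complementary sketch: i.i.d. samples from a Poisson distribution with mean 2.0.
    poissonRDD = RandomRDDs.poissonRDD(sc, mean=2.0, size=numExamples, seed=1)
    print('Generated RDD of %d examples sampled from Poisson(2.0)' % poissonRDD.count())
    print('  First 5 samples:')
    for sample in poissonRDD.take(5):
        print('  ' + str(sample))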
Example 5: test_to_java_object_rdd
def test_to_java_object_rdd(self):  # SPARK-6660
    data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
    self.assertEqual(_to_java_object_rdd(data).count(), 10)
Example 6:
""" Simple distributed implementation of the K-Means algorithm using Tensorflow.
"""
import tensorflow as tf
import tensorframes as tfs
from pyspark.mllib.random import RandomRDDs
import numpy as np
num_features = 4
k = 2
# TODO: does not work with 1
data = RandomRDDs.normalVectorRDD(
sc,
numCols=num_features,
numRows=100,
seed=1).map(lambda v: [v.tolist()])
df = sqlContext.createDataFrame(data).toDF("features")
# For now, analysis is still required.
df0 = tfs.analyze(df)
init_centers = np.random.randn(k, num_features)
# For debugging
block = np.array(data.take(10))[::,0,::]
# Find the distances first
with tf.Graph().as_default() as g:
points = tf.placeholder(tf.double, shape=[None, num_features], name='points')
num_points = tf.shape(points)[0]
#centers = tf.placeholder(tf.double, shape=[k, num_features], name='centers')
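The snippet breaks off inside the graph definition. As a rough, hypothetical continuation of the step the last comment announces (this is not the author's code), the squared distances from every point to every center can be computed by broadcasting and the nearest center picked with argmin, using the TensorFlow 1.x API the snippet already relies on:

# Hypothetical continuation: pairwise squared distances and cluster assignments.
with tf.Graph().as_default():
    points = tf.placeholder(tf.float64, shape=[None, num_features], name='points')
    centers = tf.constant(init_centers, dtype=tf.float64, name='centers')
    # Broadcast to shape [num_points, k, num_features], then reduce over the feature axis.
    diffs = tf.expand_dims(points, 1) - tf.expand_dims(centers, 0)
    sq_distances = tf.reduce_sum(tf.square(diffs), axis=2)   # shape [num_points, k]
    assignments = tf.argmin(sq_distances, axis=1)            # nearest center per point
    with tf.Session() as sess:
        print(sess.run(assignments, feed_dict={points: block}))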
Example 7: SparkContext
"""
Testing with Random data generation
https://spark.apache.org/docs/latest/mllib-statistics.html
"""
from pyspark.mllib.random import RandomRDDs
from pyspark import SparkContext

sc = SparkContext("local", "Rubbish")

# Generate a random double RDD that contains 1 million i.i.d. values drawn
# uniformly from [0.0, 1.0], evenly distributed in 10 partitions.
u = RandomRDDs.uniformRDD(sc, 1000000, 10)
# Apply a transform to shift and scale the values onto the interval [1.0, 3.0].
v = u.map(lambda x: 1.0 + 2.0 * x)
print(v)
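Since the docstring points at the MLlib statistics guide, a natural follow-up (not in the original snippet) is to summarize randomly generated vectors with Statistics.colStats:

# Hypothetical follow-up: basic column statistics over randomly generated vectors.
from pyspark.mllib.stat import Statistics

vecs = RandomRDDs.normalVectorRDD(sc, numRows=1000, numCols=3, seed=7)
summary = Statistics.colStats(vecs)
print(summary.mean())      # per-column means, close to 0
print(summary.variance())  # per-column variances, close to 1
print(summary.count())     # number of rows (1000)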