本文整理汇总了 Python 中 pyspark.rdd.RDD.map 方法的典型用法代码示例。如果您正苦于以下问题:Python RDD.map 方法的具体用法?Python RDD.map 怎么用?Python RDD.map 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pyspark.rdd.RDD 的用法示例。
在下文中一共展示了RDD.map方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: loadLabeledPoints
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Read back labeled points that were written with RDD.saveAsTextFile.

    The load itself is delegated to the JVM-side
    PythonMLLibAPI.loadLabeledPoints; every record of the resulting RDD
    is then turned into a Python object with _deserialize_labeled_point.

    @param sc: Spark context
    @param path: file or directory path in any Hadoop-supported file
                 system URI
    @param minPartitions: min number of partitions; when omitted (or
                          otherwise falsy) min(sc.defaultParallelism, 2)
                          is used instead
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    def _to_labeled_point(raw):
        # Each record is copied into a bytearray before being decoded.
        return _deserialize_labeled_point(bytearray(raw))

    # Fall back to the default partition count only when none was supplied.
    if not minPartitions:
        minPartitions = min(sc.defaultParallelism, 2)

    java_rdd = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_labeled_point)
示例2: normalVectorRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Build an RDD of vectors whose entries are i.i.d. samples from the
    standard normal distribution.

    Generation happens in the JVM (PythonMLLibAPI.normalVectorRDD); all
    arguments are forwarded to it unchanged, and each record that comes
    back is decoded with _deserialize_double_vector.

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - 0.0) < 0.1
    True
    >>> abs(mat.std() - 1.0) < 0.1
    True
    """
    def _to_vector(raw):
        return _deserialize_double_vector(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_vector)
示例3: uniformVectorRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Build an RDD of vectors whose entries are i.i.d. samples from the
    uniform distribution U(0.0, 1.0).

    The data is produced by the JVM-side PythonMLLibAPI.uniformVectorRDD,
    which receives every argument as given; the returned records are
    decoded one by one with _deserialize_double_vector.

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
    >>> mat.shape
    (10, 10)
    >>> mat.max() <= 1.0 and mat.min() >= 0.0
    True
    >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
    4
    """
    def _to_vector(raw):
        return _deserialize_double_vector(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_vector)
示例4: poissonRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
    """
    Build an RDD of i.i.d. samples drawn from the Poisson distribution
    with the given mean.

    The samples are generated by the JVM-side PythonMLLibAPI.poissonRDD
    (all arguments are handed over as-is) and each returned record is
    decoded with _deserialize_double.

    >>> mean = 100.0
    >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - sqrt(mean)) < 0.5
    True
    """
    def _to_double(raw):
        return _deserialize_double(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.poissonRDD(sc._jsc, mean, size, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_double)
示例5: poissonVectorRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Build an RDD of vectors whose entries are i.i.d. samples from the
    Poisson distribution with the given mean.

    Generation is delegated to the JVM-side
    PythonMLLibAPI.poissonVectorRDD with the arguments passed through
    untouched; each returned record is decoded with
    _deserialize_double_vector.

    >>> import numpy as np
    >>> mean = 100.0
    >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    def _to_vector(raw):
        return _deserialize_double_vector(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_vector)
示例6: normalRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Build an RDD of i.i.d. samples drawn from the standard normal
    distribution.

    To obtain samples shifted by `mean` and scaled by `sigma` instead,
    map over the result:
    C{RandomRDDGenerators.normalRDD(sc, n, p, seed)\
      .map(lambda v: mean + sigma * v)}

    The samples come from the JVM-side PythonMLLibAPI.normalRDD (all
    arguments are forwarded unchanged); each returned record is decoded
    with _deserialize_double.

    >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    def _to_double(raw):
        return _deserialize_double(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.normalRDD(sc._jsc, size, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_double)
示例7: createRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    .. note:: Experimental

    Create a RDD from Kafka using offset ranges for each topic and partition.

    :param sc: SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be omitted,
        None, or an empty map, in which case leaders will be looked up on the driver.
    :param keyDecoder: A function used to decode key (default is utf8_decoder)
    :param valueDecoder: A function used to decode value (default is utf8_decoder)
    :return: A RDD object
    :raises TypeError: if kafkaParams is not a dict or offsetRanges is not a list
    :raises Py4JJavaError: re-raised unchanged when the JVM helper fails
    """
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")
    # A None sentinel replaces the former shared mutable `{}` default.
    if leaders is None:
        leaders = {}

    try:
        helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
        helper = helperClass.newInstance()
        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        # Generator expression: no throwaway intermediate list of pairs.
        jleaders = dict((k._jTopicAndPartition(helper), v._jBroker(helper))
                        for (k, v) in leaders.items())
        jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        # Bare raise keeps the original traceback intact.
        raise

    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    rdd = RDD(jrdd, sc, ser)
    # Each record is a (key, value) pair; decode both halves independently.
    return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
示例8: uniformRDD
# 需要导入模块: from pyspark.rdd import RDD [as 别名]
# 注意: map 是 RDD 的实例方法, 无法单独导入, 请通过 RDD 对象调用 (例如 rdd.map(f))
def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Build an RDD of i.i.d. samples drawn from the uniform distribution
    on [0.0, 1.0].

    To obtain samples from U[a, b] instead, map over the result:
    C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
      .map(lambda v: a + (b - a) * v)}

    The samples come from the JVM-side PythonMLLibAPI.uniformRDD (all
    arguments are forwarded unchanged); each returned record is decoded
    with _deserialize_double.

    >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    def _to_double(raw):
        return _deserialize_double(bytearray(raw))

    api = sc._jvm.PythonMLLibAPI()
    java_rdd = api.uniformRDD(sc._jsc, size, numPartitions, seed)
    return RDD(java_rdd, sc, NoOpSerializer()).map(_to_double)