This article compiles typical usage examples of the pyspark.rdd.RDD class in Python. If you are unsure what the RDD class is used for, how to call it, or what working code that uses it looks like, the curated class code examples below may help.
The sections below present 15 code examples of the RDD class, ordered by popularity by default.
Example 1: loadLabeledPoints
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points saved using RDD.saveAsTextFile.

    @param sc: Spark context
    @param path: file or directory path in any Hadoop-supported file
                 system URI
    @param minPartitions: min number of partitions
    @return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    >>> type(loaded[0]) == LabeledPoint
    True
    >>> print examples[0]
    (1.1,(3,[0,2],[-1.23,4.56e-07]))
    >>> type(examples[1]) == LabeledPoint
    True
    >>> print examples[1]
    (0.0,[1.01,2.02,3.03])
    """
    minPartitions = minPartitions or min(sc.defaultParallelism, 2)
    jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(sc._jsc, path, minPartitions)
    serialized = RDD(jSerialized, sc, NoOpSerializer())
    return serialized.map(lambda bytes: _deserialize_labeled_point(bytearray(bytes)))
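The function above is exposed through MLUtils.loadLabeledPoints, which is also what its own doctest calls. Below is a minimal usage sketch, assuming a local Spark installation with the legacy pyspark.mllib package; the output path and app name are placeholders, not from the source.

from pyspark import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="load_labeled_points_demo")  # hypothetical app name

# Write a few labeled points as text, then read them back as an RDD of LabeledPoint.
points = [LabeledPoint(1.0, Vectors.dense([0.1, 0.2])),
          LabeledPoint(0.0, Vectors.sparse(2, [(1, 3.0)]))]
sc.parallelize(points, 1).saveAsTextFile("/tmp/labeled_points_demo")  # placeholder path

loaded = MLUtils.loadLabeledPoints(sc, "/tmp/labeled_points_demo")
print(loaded.take(2))
sc.stop()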
Example 2: __init__
def __init__(self, ctx, resource_read=None, query=None, **kwargs):
    kwargs = make_es_config(kwargs, resource_read=resource_read, query=query)
    kwargs = as_java_object(ctx._gateway, kwargs)
    jrdd = helper(ctx).esJsonRDD(ctx._jsc, kwargs)
    rdd = RDD(jrdd, ctx, NoOpSerializer())

    # read the rdd in batches of two (first key then value / doc)
    def pairwise(iterable):
        iterator = iter(iterable)
        return izip(iterator, iterator)

    kvRdd = rdd.mapPartitions(pairwise, True)
    super(EsRDD, self).__init__(kvRdd._jrdd, ctx)
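The pairwise helper works because both arguments to izip (zip in Python 3) are the same iterator, so consecutive elements are consumed as (key, document) pairs. A small standalone sketch of the same idea, independent of Elasticsearch and with made-up sample data:

# Zipping an iterator with itself pairs up consecutive elements
# (here: alternating document ids and documents).
def pairwise(iterable):
    iterator = iter(iterable)
    return zip(iterator, iterator)  # izip in Python 2

flat = ["id-1", {"title": "doc one"}, "id-2", {"title": "doc two"}]
print(list(pairwise(flat)))
# [('id-1', {'title': 'doc one'}), ('id-2', {'title': 'doc two'})]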
Example 3: rdd
def rdd(self):
    """Returns the content as an :class:`pyspark.RDD` of :class:`Row`.
    """
    if self._lazy_rdd is None:
        jrdd = self._jdf.javaToPython()
        rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            cls = _create_cls(schema)
            return map(cls, it)

        self._lazy_rdd = rdd.mapPartitions(applySchema)
    return self._lazy_rdd
Example 4: rdd
def rdd(self):
    """
    Return the content of the :class:`DataFrame` as an :class:`RDD`
    of :class:`Row` s.
    """
    if not hasattr(self, '_lazy_rdd'):
        jrdd = self._jdf.javaToPython()
        rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
        schema = self.schema

        def applySchema(it):
            cls = _create_cls(schema)
            return itertools.imap(cls, it)

        self._lazy_rdd = rdd.mapPartitions(applySchema)
    return self._lazy_rdd
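Examples 3 and 4 are two versions of the code behind the DataFrame.rdd property, so callers simply read the attribute and get back an RDD of Row objects. A short usage sketch; the column names and values are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("df_rdd_demo").getOrCreate()  # hypothetical app name
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# DataFrame.rdd converts the DataFrame into an RDD of Row objects;
# fields are available as attributes on each Row.
rows = df.rdd
print(rows.map(lambda row: row.id * 10).collect())  # [10, 20]
spark.stop()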
Example 5: normalVectorRDD
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the standard normal distribution.

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - 0.0) < 0.1
    True
    >>> abs(mat.std() - 1.0) < 0.1
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    normal = RDD(jrdd, sc, NoOpSerializer())
    return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
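A usage sketch of the public wrapper RandomRDDs.normalVectorRDD, written for Python 3 (the doctest above still uses Python 2's 1L literal). The shapes and seed are illustrative, and a local Spark installation is assumed:

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext(appName="normal_vector_demo")  # hypothetical app name

# 1000 vectors of length 10, each entry an i.i.d. standard normal sample.
vectors = RandomRDDs.normalVectorRDD(sc, numRows=1000, numCols=10, seed=1)
mat = np.array([v.toArray() for v in vectors.collect()])
print(mat.shape)                                   # (1000, 10)
print(round(mat.mean(), 2), round(mat.std(), 2))   # close to 0.0 and 1.0
sc.stop()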
Example 6: uniformVectorRDD
def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the uniform distribution U(0.0, 1.0).

    >>> import numpy as np
    >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
    >>> mat.shape
    (10, 10)
    >>> mat.max() <= 1.0 and mat.min() >= 0.0
    True
    >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
    4
    """
    jrdd = sc._jvm.PythonMLLibAPI().uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
    uniform = RDD(jrdd, sc, NoOpSerializer())
    return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example 7: poissonRDD
def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d samples from the Poisson
    distribution with the input mean.

    >>> mean = 100.0
    >>> x = RandomRDDGenerators.poissonRDD(sc, mean, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(stats.stdev() - sqrt(mean)) < 0.5
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
    poisson = RDD(jrdd, sc, NoOpSerializer())
    return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example 8: poissonVectorRDD
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of vectors containing i.i.d. samples drawn
    from the Poisson distribution with the input mean.

    >>> import numpy as np
    >>> mean = 100.0
    >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
    >>> mat = np.mat(rdd.collect())
    >>> mat.shape
    (100, 100)
    >>> abs(mat.mean() - mean) < 0.5
    True
    >>> from math import sqrt
    >>> abs(mat.std() - sqrt(mean)) < 0.5
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
    poisson = RDD(jrdd, sc, NoOpSerializer())
    return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
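Examples 7 and 8 are the scalar and vector variants of the same Poisson generator. The sketch below calls the current RandomRDDs entry points (RandomRDDGenerators in Example 7's doctest is the older name of the same class); the mean, sizes, and seed are illustrative, and a local Spark installation is assumed:

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext(appName="poisson_demo")  # hypothetical app name
mean = 100.0

# Scalar variant: 1000 i.i.d. Poisson(mean) samples.
stats = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1).stats()
print(stats.count(), round(stats.mean(), 1), round(stats.stdev(), 1))
# stdev should be close to sqrt(mean) = 10

# Vector variant: 100 vectors of 100 Poisson(mean) samples each.
rdd = RandomRDDs.poissonVectorRDD(sc, mean, numRows=100, numCols=100, seed=1)
print(len(rdd.first()))  # 100
sc.stop()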
Example 9: normalRDD
def normalRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d samples from the standard normal
    distribution.

    To transform the distribution in the generated RDD from standard normal
    to some other normal N(mean, sigma), use
    C{RandomRDDGenerators.normal(sc, n, p, seed)\
      .map(lambda v: mean + sigma * v)}

    >>> x = RandomRDDGenerators.normalRDD(sc, 1000, seed=1L)
    >>> stats = x.stats()
    >>> stats.count()
    1000L
    >>> abs(stats.mean() - 0.0) < 0.1
    True
    >>> abs(stats.stdev() - 1.0) < 0.1
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
    normal = RDD(jrdd, sc, NoOpSerializer())
    return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
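The transformation mentioned in the docstring is just an element-wise map over the generated RDD. A short sketch of that step, using the current RandomRDDs name and illustrative mean and sigma values:

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext(appName="normal_transform_demo")  # hypothetical app name

mean, sigma = 5.0, 2.0
# Draw standard normal samples, then shift and scale them to N(mean, sigma).
samples = RandomRDDs.normalRDD(sc, 10000, seed=1).map(lambda v: mean + sigma * v)

stats = samples.stats()
print(round(stats.mean(), 1), round(stats.stdev(), 1))  # roughly 5.0 and 2.0
sc.stop()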
Example 10: asDataFrames
def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, or as
    a dict of numpy arrays if only numpy is available, or as a dict with
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
                    indexed by the given columns.
    '''
    for c in index_by:
        if c in self.columns:
            raise ValueError('column %s cannot be used as index in the data'
                             'frames as it is a column by which the rows are spanned.' % c)

    columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._cjrdd, columns)
    rdd = RDD(jrdd, self.ctx)

    global pd
    if index_by and pd:
        return rdd.mapValues(lambda _: _.set_index(*[str(c) for c in index_by]))
    else:
        return rdd
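asDataFrames belongs to a third-party helper built around spanBy, so its exact behaviour depends on that library; the core idea, however, is a pair RDD whose values are pandas DataFrames being re-indexed via mapValues. A self-contained sketch of just that step, with made-up keys, columns, and data:

import pandas as pd
from pyspark import SparkContext

sc = SparkContext(appName="span_by_demo")  # hypothetical app name

# Pretend spanBy already grouped rows by key into per-key pandas DataFrames.
frames = sc.parallelize([
    ("sensor-a", pd.DataFrame({"ts": [1, 2], "value": [0.1, 0.2]})),
    ("sensor-b", pd.DataFrame({"ts": [1], "value": [0.5]})),
])

index_by = ["ts"]
indexed = frames.mapValues(lambda df: df.set_index([str(c) for c in index_by]))
print(indexed.mapValues(lambda df: df.index.name).collect())
# [('sensor-a', 'ts'), ('sensor-b', 'ts')]
sc.stop()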
Example 11: createRDD
def createRDD(sc, kafkaParams, offsetRanges, leaders={},
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
    """
    .. note:: Experimental

    Create a RDD from Kafka using offset ranges for each topic and partition.

    :param sc: SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty
        map, in which case leaders will be looked up on the driver.
    :param keyDecoder: A function used to decode key (default is utf8_decoder)
    :param valueDecoder: A function used to decode value (default is utf8_decoder)
    :return: A RDD object
    """
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")

    try:
        helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
        helper = helperClass.newInstance()
        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        jleaders = dict([(k._jTopicAndPartition(helper),
                          v._jBroker(helper)) for (k, v) in leaders.items()])
        jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        raise e

    ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    rdd = RDD(jrdd, sc, ser)
    return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
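A hedged usage sketch of the public wrapper KafkaUtils.createRDD, which drives the helper above. The broker address, topic name, and offsets are placeholders; this targets the old Kafka 0.8 integration (deprecated since Spark 2.3, see Example 15) and needs the matching spark-streaming-kafka jar on the classpath:

from pyspark import SparkContext
from pyspark.streaming.kafka import KafkaUtils, OffsetRange

sc = SparkContext(appName="kafka_rdd_demo")  # hypothetical app name

kafkaParams = {"metadata.broker.list": "localhost:9092"}                  # placeholder broker
offsetRanges = [OffsetRange("events", 0, fromOffset=0, untilOffset=100)]  # placeholder topic/offsets

# Each element is a (key, value) pair, decoded as UTF-8 strings by default.
rdd = KafkaUtils.createRDD(sc, kafkaParams, offsetRanges)
print(rdd.count())
sc.stop()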
Example 12: uniformRDD
def uniformRDD(sc, size, numPartitions=None, seed=None):
    """
    Generates an RDD comprised of i.i.d. samples from the
    uniform distribution on [0.0, 1.0].

    To transform the distribution in the generated RDD from U[0.0, 1.0]
    to U[a, b], use
    C{RandomRDDGenerators.uniformRDD(sc, n, p, seed)\
      .map(lambda v: a + (b - a) * v)}

    >>> x = RandomRDDGenerators.uniformRDD(sc, 100).collect()
    >>> len(x)
    100
    >>> max(x) <= 1.0 and min(x) >= 0.0
    True
    >>> RandomRDDGenerators.uniformRDD(sc, 100, 4).getNumPartitions()
    4
    >>> parts = RandomRDDGenerators.uniformRDD(sc, 100, seed=4).getNumPartitions()
    >>> parts == sc.defaultParallelism
    True
    """
    jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed)
    uniform = RDD(jrdd, sc, NoOpSerializer())
    return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
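As with normalRDD in Example 9, the rescaling mentioned in the docstring is a plain map over the samples. A brief sketch with illustrative bounds, again using the current RandomRDDs name:

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext(appName="uniform_rescale_demo")  # hypothetical app name

a, b = -3.0, 3.0
# Rescale U(0, 1) samples to U(a, b) with an element-wise map.
samples = RandomRDDs.uniformRDD(sc, 10000, seed=7).map(lambda v: a + (b - a) * v)
print(round(samples.min(), 1), round(samples.max(), 1))  # close to -3.0 and 3.0
sc.stop()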
Example 13: __init__
def __init__(self, jrdd, ctx, jrdd_deserializer):
    RDD.__init__(self, jrdd, ctx, jrdd_deserializer)
Example 14: update_dictionary
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD

def update_dictionary(map):
    map.update(test="alex")
    return map

if __name__ == "__main__":
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    # Obtain the Aleph2 context object from the JVM side via py4j.
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader().loadClass("com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils").newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    print aleph2.getRddInputNames()
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    # Convert the Java inputs to a Python RDD, add a field to every record, and emit the result.
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
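The Aleph2 calls (getAleph2, getAllRddInputs, emitRdd) are specific to the Ikanow Aleph2 platform and cannot be reproduced here. The Python-side pattern, mapping a dict-updating function over an RDD of JSON-like records, can be sketched standalone; the records below are made up:

from pyspark import SparkContext

def update_dictionary(record):
    # Add/overwrite a field on each JSON-like record before emitting it.
    record.update(test="alex")
    return record

sc = SparkContext(appName="update_dictionary_demo")  # hypothetical app name
records = sc.parallelize([{"id": 1}, {"id": 2, "test": "old"}])
print(records.map(update_dictionary).collect())
# [{'id': 1, 'test': 'alex'}, {'id': 2, 'test': 'alex'}]
sc.stop()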
Example 15: __init__
def __init__(self, jrdd, ctx, jrdd_deserializer):
    warnings.warn(
        "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
        "See SPARK-21893.",
        DeprecationWarning)
    RDD.__init__(self, jrdd, ctx, jrdd_deserializer)