This article collects typical usage examples of the Python pyspark.rdd.RDD attribute. If you are wondering how exactly to use rdd.RDD in Python, how it works, or what real code using it looks like, the hand-picked examples here may help. You can also explore further usage examples from the containing module, pyspark.rdd.
A total of 15 code examples of the rdd.RDD attribute are shown below, sorted by popularity by default.
Example 1: asDataFrames
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def asDataFrames(self, *index_by):
    '''
    Reads the spanned rows as DataFrames if pandas is available, as a
    dict of numpy arrays if only numpy is available, or as a dict of
    primitives and objects otherwise.

    @param index_by If pandas is available, the dataframes will be
        indexed by the given columns.
    '''
    for c in index_by:
        if c in self.columns:
            raise ValueError('column %s cannot be used as index in the data'
                             'frames as it is a column by which the rows are spanned.' % c)

    columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._crdd, columns)
    rdd = RDD(jrdd, self.ctx)

    global pd
    if index_by and pd:
        return rdd.mapValues(lambda _: _.set_index(*[str(c) for c in index_by]))
    else:
        return rdd
Example 2: joinWithCassandraTable
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def joinWithCassandraTable(left_rdd, keyspace, table):
    '''
    Join an RDD with a Cassandra table on the partition key. Use .on(...)
    to specify other columns to join on. .select(...), .where(...) and
    .limit(...) can be used as well.

    Arguments:
    @param left_rdd(RDD):
        The RDD to join. Equals self when invoking joinWithCassandraTable
        on a monkey-patched RDD.
    @param keyspace(string):
        The keyspace to join on.
    @param table(string):
        The CQL table to join on.
    '''
    return CassandraJoinRDD(left_rdd, keyspace, table)
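For orientation, here is a hedged usage sketch of the function above. It assumes pyspark-cassandra's monkey patching is active (so plain RDDs expose joinWithCassandraTable) and a live SparkContext `sc`; the keyspace, table, and column names are made up, and .on()/.select() are used only because the docstring above names them.
keys = sc.parallelize([(1,), (2,), (3,)])          # partition-key tuples (hypothetical)
joined = keys.joinWithCassandraTable("my_keyspace", "my_table") \
             .on("id") \
             .select("id", "value")                # methods named in the docstring above
print(joined.collect())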
Example 3: test_fold_by_key_mutable_zero_value
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def test_fold_by_key_mutable_zero_value(self):
    # Test for SPARK-9021; uses foldByKey to make a pair RDD that contains
    # lists of all values for each key in the original RDD
    tuples = [(i, range(i)) for i in range(10)] * 2
    # Show that single or multiple partitions work
    data1 = self.sc.parallelize(tuples, 1)
    data2 = self.sc.parallelize(tuples, 2)

    def comboOp(x, y):
        x.extend(y)
        return x

    values1 = data1.foldByKey([], comboOp).collect()
    values2 = data2.foldByKey([], comboOp).collect()
    # Sort lists to ensure clean comparison with ground_truth
    values1.sort()
    values2.sort()
    # list(range(...)) for Python 3.x compatibility
    ground_truth = [(i, list(range(i)) * 2) for i in range(10)]
    self.assertEqual(values1, ground_truth)
    self.assertEqual(values2, ground_truth)
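A minimal standalone sketch of the same pattern, assuming a live SparkContext `sc`. Building a new list with `x + y`, instead of mutating `x` in place, side-steps the zero-value sharing issue this regression test guards against.
pairs = sc.parallelize([("a", [1]), ("a", [2]), ("b", [3])], 1)
merged = pairs.foldByKey([], lambda x, y: x + y)   # new list each time; the [] zero value is never mutated
print(sorted(merged.collect()))                    # [('a', [1, 2]), ('b', [3])]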
Example 4: test_multiple_python_java_RDD_conversions
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def test_multiple_python_java_RDD_conversions(self):
    # Regression test for SPARK-5361
    data = [
        (u'1', {u'director': u'David Lean'}),
        (u'2', {u'director': u'Andrew Dominik'})
    ]
    data_rdd = self.sc.parallelize(data)
    data_java_rdd = data_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())
    # a second conversion between Python and Java RDDs used to throw exceptions
    data_java_rdd = converted_rdd._to_java_object_rdd()
    data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
    converted_rdd = RDD(data_python_rdd, self.sc)
    self.assertEqual(2, converted_rdd.count())
Example 5: transform
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self, x):
    """
    Transforms term frequency (TF) vectors to TF-IDF vectors.

    If `minDocFreq` was set for the IDF calculation,
    the terms which occur in fewer than `minDocFreq`
    documents will have an entry of 0.

    .. note:: In Python, transform cannot currently be used within
        an RDD transformation or action.
        Call transform directly on the RDD instead.

    :param x: an RDD of term frequency vectors or a term frequency
        vector
    :return: an RDD of TF-IDF vectors or a TF-IDF vector
    """
    return JavaVectorTransformer.transform(self, x)
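This transform appears to be IDFModel.transform from pyspark.mllib.feature. A minimal end-to-end sketch, assuming a live SparkContext `sc`; the toy documents are made up.
from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([["spark", "rdd"], ["spark", "sql"]])
tf = HashingTF(numFeatures=1 << 10).transform(docs)   # RDD of term-frequency vectors
tf.cache()
idf_model = IDF(minDocFreq=1).fit(tf)
tfidf = idf_model.transform(tf)                       # RDD of TF-IDF vectors
print(tfidf.first())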
Example 6: predict
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def predict(self, x):
    """
    Find the cluster that each of the points belongs to in this
    model.

    :param x:
        A data point (or RDD of points) to determine cluster index.
    :return:
        Predicted cluster index or an RDD of predicted cluster indices
        if the input is an RDD.
    """
    if isinstance(x, RDD):
        vecs = x.map(_convert_to_vector)
        return self.call("predict", vecs)

    x = _convert_to_vector(x)
    return self.call("predict", x)
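This predict follows the usual pyspark.mllib model pattern of accepting either a single point or an RDD of points. A hedged usage sketch with KMeansModel, assuming a live SparkContext `sc` and toy data.
from numpy import array
from pyspark.mllib.clustering import KMeans

points = sc.parallelize([array([0.0, 0.0]), array([1.0, 1.0]),
                         array([9.0, 8.0]), array([8.0, 9.0])])
model = KMeans.train(points, 2, maxIterations=10, initializationMode="random")
print(model.predict(array([0.5, 0.5])))    # single point -> cluster index
print(model.predict(points).collect())     # RDD of points -> RDD of indices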
Example 7: train
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def train(cls, rdd, k, maxIterations=100, initMode="random"):
    r"""
    :param rdd:
        An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
        affinity matrix, which is the matrix A in the PIC paper. The
        similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric
        matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\. For any (i, j) with
        nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
        (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored,
        because it is assumed s\ :sub:`ij`\ = 0.0.
    :param k:
        Number of clusters.
    :param maxIterations:
        Maximum number of iterations of the PIC algorithm.
        (default: 100)
    :param initMode:
        Initialization mode. This can be either "random" to use
        a random vector as vertex properties, or "degree" to use
        normalized sum similarities.
        (default: "random")
    """
    model = callMLlibFunc("trainPowerIterationClusteringModel",
                          rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode)
    return PowerIterationClusteringModel(model)
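A hedged usage sketch of PowerIterationClustering.train from pyspark.mllib.clustering, assuming a live SparkContext `sc`; the affinity triples are toy values.
from pyspark.mllib.clustering import PowerIterationClustering

affinities = sc.parallelize([(0, 1, 1.0), (1, 2, 1.0), (3, 4, 1.0), (4, 5, 1.0)])
model = PowerIterationClustering.train(affinities, k=2, maxIterations=10)
for a in model.assignments().collect():
    print(a.id, a.cluster)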
Example 8: coalesce
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def coalesce(self, numPartitions):
    """
    Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.

    :param numPartitions: int, to specify the target number of partitions

    Similar to coalesce defined on an :class:`RDD`, this operation results in a
    narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,
    there will not be a shuffle; instead each of the 100 new partitions will
    claim 10 of the current partitions. If a larger number of partitions is requested,
    it will stay at the current number of partitions.

    However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
    this may result in your computation taking place on fewer nodes than
    you like (e.g. one node in the case of numPartitions = 1). To avoid this,
    you can call repartition(). This will add a shuffle step, but means the
    current upstream partitions will be executed in parallel (per whatever
    the current partitioning is).

    >>> df.coalesce(1).rdd.getNumPartitions()
    1
    """
    return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx)
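A small illustration of the narrow-dependency behaviour described in the docstring, assuming a SparkSession `spark`.
df = spark.range(1000).repartition(100)
print(df.rdd.getNumPartitions())                    # 100
print(df.coalesce(10).rdd.getNumPartitions())       # 10 -- no shuffle, narrow dependency
print(df.coalesce(200).rdd.getNumPartitions())      # stays at 100: coalesce never increases partitions
print(df.repartition(200).rdd.getNumPartitions())   # 200 -- repartition adds a shuffle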
Example 9: cassandraTable
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def cassandraTable(self, keyspace, table):
    """Returns all the Rows in a Cassandra keyspace and table as an RDD.

    @param keyspace: Cassandra keyspace / schema name
    @param table: Cassandra table / column family name
    """
    # Unsure right now if we need CassandraSerializer, but likely do since
    # we'll get generic CassandraRow instances back that we'll need to
    # inspect?
    # return RDD(self._jcsc.cassandraTable(keyspace, table), self,
    #            CassandraSerializer())
    return RDD(self._jcsc.cassandraTable(keyspace, table),
               self, BatchedSerializer(PickleSerializer()))

# Unfortunately, we can't call rdd.saveToCassandra, as we'd have to dynamically
# bind a method to all RDD instances, which isn't feasible
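A hedged usage sketch, assuming this method lives on a Cassandra-aware (or monkey-patched) SparkContext `sc` as in the pyspark-cassandra style integrations; keyspace and table names are made up.
rows = sc.cassandraTable("my_keyspace", "my_table")   # RDD of CassandraRow-like records
print(rows.take(5))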
Example 10: joinWithCassandraTable
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def joinWithCassandraTable(left_rdd, keyspace, table):
    """
    Join an RDD with a Cassandra table on the partition key. Use .on(...)
    to specify other columns to join on. .select(...), .where(...) and
    .limit(...) can be used as well.

    Arguments:
    @param left_rdd(RDD):
        The RDD to join. Equals self when invoking
        joinWithCassandraTable on a monkey-patched RDD.
    @param keyspace(string):
        The keyspace to join on.
    @param table(string):
        The CQL table to join on.
    """
    return CassandraJoinRDD(left_rdd, keyspace, table)
Example 11: _rdd
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def _rdd(self):
    """Return an RDD of pandas DataFrame objects. This can be expensive,
    especially if we don't do a narrow transformation afterwards and get it
    back into Spark SQL land quickly."""
    columns = self._schema_rdd.columns
    index_names = self._index_names

    def fromRecords(records):
        if not records:
            return []
        else:
            loaded_df = pd.DataFrame.from_records([records],
                                                  columns=columns)
            indexed_df = _update_index_on_df(loaded_df, index_names)
            return [indexed_df]

    return self._schema_rdd.rdd.flatMap(fromRecords)
Example 12: _evil_apply_with_dataframes
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def _evil_apply_with_dataframes(self, func, preserves_cols=False):
    """Convert the underlying SchemaRDD to an RDD of DataFrames,
    apply the provided function, and convert the result back.
    This is hella slow."""
    source_rdd = self._rdd()
    result_rdd = func(source_rdd)
    # By default we don't know what the columns & indexes are, so we let
    # from_rdd_of_dataframes look at the first partition to determine them.
    column_idxs = None
    if preserves_cols:
        index_names = self._index_names
        # Remove indexes from the columns
        columns = self._schema_rdd.columns[len(self._index_names):]
        column_idxs = (columns, index_names)
    return self.from_rdd_of_dataframes(
        result_rdd, column_idxs=column_idxs)
Example 13: __init__
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def __init__(self, ctx, keyspace, table, row_format=None, read_conf=None, **read_conf_kwargs):
    if not keyspace:
        raise ValueError("keyspace not set")

    if not table:
        raise ValueError("table not set")

    if row_format is None:
        row_format = RowFormat.ROW
    elif row_format < 0 or row_format >= len(RowFormat.values):
        raise ValueError("invalid row_format %s" % row_format)

    self.keyspace = keyspace
    self.table = table
    self.row_format = row_format
    self.read_conf = ReadConf.build(read_conf, **read_conf_kwargs)
    self._limit = None

    # this jrdd is for compatibility with pyspark.rdd.RDD
    # while allowing this constructor to be used for type checking etc.
    # and setting _jrdd //after// invoking this constructor
    class DummyJRDD(object):
        def id(self):
            return -1

    jrdd = DummyJRDD()
    super(_CassandraRDD, self).__init__(jrdd, ctx)
Example 14: transform
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self, X_rdd, y_rdd=None):
    '''
    Given an X RDD (and optionally a y RDD), output a dataframe with a term
    frequency feature vector and labels.
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # get term frequencies
    X = X_rdd.map(self._term_frequency).cache()
    # convert to sparse
    X = X.map(lambda (hash, features): (hash, SparseVector(self.num_features, np.nonzero(features)[0], features[features > 0])))

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        data = X.join(y).map(lambda (idx, ((hash, features), label)): (hash, features, label))
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True),
                             StructField('label', StringType(), True)])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([StructField('hash', StringType(), True),
                             StructField("features", VectorUDT(), True)])
        data = X.toDF(schema)

    return data
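Two caveats about the transform above: it relies on Python 2 syntax (tuple-unpacking lambdas), and the `type(...) != RDD` check rejects RDD subclasses. A minimal, illustrative alternative for the type check (the helper name is made up):
from pyspark.rdd import RDD

def require_rdd(obj, name="argument"):
    # isinstance() also accepts RDD subclasses such as PipelinedRDD
    if not isinstance(obj, RDD):
        raise TypeError("%s must be a pySpark RDD" % name)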
Example 15: transform
# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self, X_rdd, y_rdd=None, train=True):
    '''
    Given an X RDD (and optionally a y RDD), output a dataframe with a term
    frequency feature vector and labels.
    '''
    # check input type
    if type(X_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd and type(y_rdd) != RDD:
        raise TypeError("Arguments must be pySpark RDDs")

    # word tokenization
    X = X_rdd.map(self._tokenize).cache()

    # create dictionary of words
    if train:
        self.dictionary = (X.map(lambda row: row[1])
                            .flatMap(lambda word: word)
                            .map(lambda word: (word, 1))
                            .reduceByKey(lambda acc, w: acc + w)
                            .filter(lambda x: x[1] >= self.min_df)
                            .collectAsMap())
        self.dictionary = dict(zip(self.dictionary, xrange(len(self.dictionary))))

    # create word vectors
    X = X.map(self._term_frequency)

    # check if labels exist
    if y_rdd:
        # combine X and y into a single dataframe
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        data = X.join(y).map(lambda (idx, ((hash, features), label)): (hash, features, label))
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True),
                             StructField('label', StringType(), True)])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([StructField('hash', StringType(), True),
                             StructField("features", VectorUDT(), True)])
        data = X.toDF(schema)

    return data
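The same Python 2 caveats apply here (xrange, tuple-unpacking lambdas). For reference, a minimal sketch of the zipWithIndex() alignment trick both transforms use to join X and y row by row, assuming a live SparkContext `sc` and toy data.
X = sc.parallelize(["a", "b", "c"])
y = sc.parallelize([0, 1, 0])
Xi = X.zipWithIndex().map(lambda r: (r[1], r[0]))   # (row_index, value)
yi = y.zipWithIndex().map(lambda r: (r[1], r[0]))
pairs = Xi.join(yi).sortByKey().map(lambda kv: kv[1])
print(pairs.collect())                              # [('a', 0), ('b', 1), ('c', 0)]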