Python rdd.RDD Attribute Code Examples

This article collects typical code examples that use the pyspark.rdd.RDD attribute in Python. If you are looking for concrete usage of rdd.RDD — what it is for and how to call it — the examples selected below should help. You can also explore further usage examples from the enclosing pyspark.rdd module.


The following presents 15 code examples of the rdd.RDD attribute, ordered by popularity.
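
Before the examples, here is a minimal sketch of the import pattern they all share: pyspark.rdd.RDD is the base class of every Python-side RDD, so it is usually imported either for isinstance checks or to wrap a Java RDD handle. The SparkContext setup below is an illustrative assumption, not part of any example.

from pyspark import SparkContext
from pyspark.rdd import RDD

sc = SparkContext.getOrCreate()
data = sc.parallelize([1, 2, 3])

# Every Python-side RDD, including the pair RDDs produced by transformations,
# is an instance of pyspark.rdd.RDD.
assert isinstance(data, RDD)
assert isinstance(data.map(lambda x: (x, x * x)), RDD)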

Example 1: asDataFrames

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def asDataFrames(self, *index_by):
        '''
            Reads the spanned rows as DataFrames if pandas is available, or as
            a dict of numpy arrays if only numpy is available or as a dict with
            primitives and objects otherwise.

            @param index_by If pandas is available, the dataframes will be
            indexed by the given columns.
        '''
        for c in index_by:
            if c in self.columns:
                raise ValueError('column %s cannot be used as index in the data'
                    'frames as it is a column by which the rows are spanned.' % c)

        columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
        jrdd = self._helper.spanBy(self._crdd, columns)
        rdd = RDD(jrdd, self.ctx)

        global pd
        if index_by and pd:
            return rdd.mapValues(lambda _: _.set_index(*[str(c) for c in index_by]))
        else:
            return rdd 
Author: TargetHolding; Project: pyspark-cassandra; Lines: 25; Source file: rdd.py
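
The mapValues step above indexes each spanned pandas DataFrame by the columns passed through index_by. A pandas-only sketch of what happens to a single span (the column names are made up for illustration):

import pandas as pd

# One hypothetical span of rows; 'ts' stands in for a column passed via index_by.
span = pd.DataFrame({"ts": [10, 20, 30], "value": [0.1, 0.2, 0.3]})
indexed = span.set_index("ts")  # the same call the lambda applies to every span
print(indexed)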

Example 2: joinWithCassandraTable

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def joinWithCassandraTable(left_rdd, keyspace, table):
    '''
        Join an RDD with a Cassandra table on the partition key. Use .on(...)
        to specify other columns to join on. .select(...), .where(...) and
        .limit(...) can be used as well.

        Arguments:
        @param left_rdd(RDD):
            The RDD to join. Equal to self when invoking joinWithCassandraTable on a monkey
            patched RDD.
        @param keyspace(string):
            The keyspace to join on
        @param table(string):
            The CQL table to join on.
    '''

    return CassandraJoinRDD(left_rdd, keyspace, table) 
Author: TargetHolding; Project: pyspark-cassandra; Lines: 19; Source file: rdd.py
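
A hedged usage sketch built only from the docstring above; it assumes pyspark_cassandra has been imported so that plain RDDs are monkey patched, and the keyspace, table and column names are placeholders:

import pyspark_cassandra  # noqa: F401 -- monkey patches joinWithCassandraTable onto RDD

# Elements carry the partition key column(s) of the target table.
users = sc.parallelize([{"user_id": 1}, {"user_id": 2}])
joined = (users
          .joinWithCassandraTable("my_keyspace", "events")
          .select("user_id", "event_type")
          .limit(10))
print(joined.collect())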

Example 3: test_fold_by_key_mutable_zero_value

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def test_fold_by_key_mutable_zero_value(self):
        # Test for SPARK-9021; uses foldByKey to make a pair RDD that contains
        # lists of all values for each key in the original RDD

        tuples = [(i, range(i)) for i in range(10)]*2
        # Show that single or multiple partitions work
        data1 = self.sc.parallelize(tuples, 1)
        data2 = self.sc.parallelize(tuples, 2)

        def comboOp(x, y):
            x.extend(y)
            return x

        values1 = data1.foldByKey([], comboOp).collect()
        values2 = data2.foldByKey([], comboOp).collect()
        # Sort lists to ensure clean comparison with ground_truth
        values1.sort()
        values2.sort()

        # list(range(...)) for Python 3.x compatibility
        ground_truth = [(i, list(range(i))*2) for i in range(10)]
        self.assertEqual(values1, ground_truth)
        self.assertEqual(values2, ground_truth) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 25; Source file: tests.py
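
The test guards against SPARK-9021, where a mutable zero value shared between keys would let folded results bleed into each other. A plain-Python sketch of the pitfall (no Spark required):

zero = []

def combo_op(x, y):
    x.extend(y)  # mutates x in place, just like comboOp above
    return x

a = combo_op(zero, [1, 2])
b = combo_op(zero, [3])
print(a is b, b)  # True [1, 2, 3] -- a single shared zero value mixes keys together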

Example 4: test_multiple_python_java_RDD_conversions

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def test_multiple_python_java_RDD_conversions(self):
        # Regression test for SPARK-5361
        data = [
            (u'1', {u'director': u'David Lean'}),
            (u'2', {u'director': u'Andrew Dominik'})
        ]
        data_rdd = self.sc.parallelize(data)
        data_java_rdd = data_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count())

        # conversion between python and java RDD threw exceptions
        data_java_rdd = converted_rdd._to_java_object_rdd()
        data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd)
        converted_rdd = RDD(data_python_rdd, self.sc)
        self.assertEqual(2, converted_rdd.count()) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 19; Source file: tests.py

Example 5: transform

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self, x):
        """
        Transforms term frequency (TF) vectors to TF-IDF vectors.

        If `minDocFreq` was set for the IDF calculation,
        the terms which occur in fewer than `minDocFreq`
        documents will have an entry of 0.

        .. note:: In Python, transform cannot currently be used within
            an RDD transformation or action.
            Call transform directly on the RDD instead.

        :param x: an RDD of term frequency vectors or a term frequency
                  vector
        :return: an RDD of TF-IDF vectors or a TF-IDF vector
        """
        return JavaVectorTransformer.transform(self, x) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 19; Source file: feature.py
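
A hedged end-to-end sketch of how this transform is usually reached in pyspark.mllib.feature, respecting the note above that transform must be called on the RDD directly; the corpus and the sc variable are assumptions for illustration:

from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([["spark", "rdd", "rdd"], ["spark", "sql"]])
tf = HashingTF(numFeatures=1 << 10).transform(docs)  # term-frequency vectors
tf.cache()
idf_model = IDF(minDocFreq=1).fit(tf)
tfidf = idf_model.transform(tf)  # the transform shown above, applied to the whole RDD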

Example 6: predict

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def predict(self, x):
        """
        Find the cluster that each of the points belongs to in this
        model.

        :param x:
          A data point (or RDD of points) to determine cluster index.
        :return:
          Predicted cluster index or an RDD of predicted cluster indices
          if the input is an RDD.
        """
        if isinstance(x, RDD):
            vecs = x.map(_convert_to_vector)
            return self.call("predict", vecs)

        x = _convert_to_vector(x)
        return self.call("predict", x) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 19; Source file: clustering.py
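
A hedged sketch using KMeansModel, whose predict follows this same single-point-or-RDD convention; the toy points and parameters are made up:

from pyspark.mllib.clustering import KMeans

points = sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])
model = KMeans.train(points, k=2, maxIterations=10)

print(model.predict([0.5, 0.5]))        # single point -> single cluster index
print(model.predict(points).collect())  # RDD of points -> RDD of cluster indices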

Example 7: train

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def train(cls, rdd, k, maxIterations=100, initMode="random"):
        r"""
        :param rdd:
          An RDD of (i, j, s\ :sub:`ij`\) tuples representing the
          affinity matrix, which is the matrix A in the PIC paper.  The
          similarity s\ :sub:`ij`\ must be nonnegative.  This is a symmetric
          matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\. For any (i, j) with
          nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or
          (j, i, s\ :sub:`ji`\) in the input.  Tuples with i = j are ignored,
          because it is assumed s\ :sub:`ij`\ = 0.0.
        :param k:
          Number of clusters.
        :param maxIterations:
          Maximum number of iterations of the PIC algorithm.
          (default: 100)
        :param initMode:
          Initialization mode. This can be either "random" to use
          a random vector as vertex properties, or "degree" to use
          normalized sum similarities.
          (default: "random")
        """
        model = callMLlibFunc("trainPowerIterationClusteringModel",
                              rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode)
        return PowerIterationClusteringModel(model) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 26; Source file: clustering.py
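
A hedged usage sketch matching the docstring above: a small symmetric affinity graph of (i, j, s_ij) tuples handed to PowerIterationClustering.train; the toy similarities are invented:

from pyspark.mllib.clustering import PowerIterationClustering

similarities = sc.parallelize([
    (0, 1, 1.0), (0, 2, 0.1), (1, 2, 0.1),  # only one direction per pair is required
    (3, 4, 1.0), (3, 5, 0.9), (4, 5, 0.9),
])
model = PowerIterationClustering.train(similarities, k=2, maxIterations=20)
for a in model.assignments().collect():
    print(a.id, a.cluster)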

Example 8: coalesce

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def coalesce(self, numPartitions):
        """
        Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.

        :param numPartitions: int, to specify the target number of partitions

        Similar to coalesce defined on an :class:`RDD`, this operation results in a
        narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,
        there will not be a shuffle, instead each of the 100 new partitions will
        claim 10 of the current partitions. If a larger number of partitions is requested,
        it will stay at the current number of partitions.

        However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
        this may result in your computation taking place on fewer nodes than
        you like (e.g. one node in the case of numPartitions = 1). To avoid this,
        you can call repartition(). This will add a shuffle step, but means the
        current upstream partitions will be executed in parallel (per whatever
        the current partitioning is).

        >>> df.coalesce(1).rdd.getNumPartitions()
        1
        """
        return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx) 
Author: runawayhorse001; Project: LearningApacheSpark; Lines: 25; Source file: dataframe.py
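
A hedged sketch of the behaviour described above, assuming a SparkSession bound to the name spark:

df = spark.range(1000).repartition(100)

print(df.coalesce(10).rdd.getNumPartitions())      # 10 -- narrowed without a shuffle
print(df.coalesce(200).rdd.getNumPartitions())     # stays at 100 -- coalesce never increases partitions
print(df.repartition(200).rdd.getNumPartitions())  # 200 -- repartition adds a shuffle step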

Example 9: cassandraTable

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def cassandraTable(self, keyspace, table):
        """Returns all the Rows in a Cassandra keyspace and table as an RDD.

        @param keyspace: Cassandra keyspace / schema name
        @param table: Cassandra table / column family name
        """
        # Unsure right now if we need CassandraSerializer, but likely do since
        # we'll get generic CassandraRow instances back that we'll need to
        # inspect?
        # return RDD(self._jcsc.cassandraTable(keyspace, table), self,
        #            CassandraSerializer())
        return RDD(self._jcsc.cassandraTable(keyspace, table),
                   self, BatchedSerializer(PickleSerializer()))


# Unfortunately, can't call rdd.saveToCassandra as we'd dynamically have to
# bind a method to all rdd instances which isn't feasible 
Author: Parsely; Project: pyspark-cassandra; Lines: 19; Source file: pyspark_cassandra.py

Example 10: joinWithCassandraTable

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def joinWithCassandraTable(left_rdd, keyspace, table):
    """
        Join an RDD with a Cassandra table on the partition key. Use .on(...)
        to specify other columns to join on. .select(...), .where(...) and
        .limit(...) can be used as well.

        Arguments:
        @param left_rdd(RDD):
            The RDD to join. Equal to self when invoking
            joinWithCassandraTable on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to join on
        @param table(string):
            The CQL table to join on.
    """

    return CassandraJoinRDD(left_rdd, keyspace, table) 
Author: anguenot; Project: pyspark-cassandra; Lines: 19; Source file: rdd.py

Example 11: _rdd

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def _rdd(self):
        """Return an RDD of Panda DataFrame objects. This can be expensive
        especially if we don't do a narrow transformation after and get it back
        to Spark SQL land quickly."""
        columns = self._schema_rdd.columns
        index_names = self._index_names

        def fromRecords(records):
            if not records:
                return []
            else:
                loaded_df = pd.DataFrame.from_records([records],
                                                      columns=columns)
                indexed_df = _update_index_on_df(loaded_df, index_names)
                return [indexed_df]

        return self._schema_rdd.rdd.flatMap(fromRecords) 
Author: sparklingpandas; Project: sparklingpandas; Lines: 19; Source file: dataframe.py
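
A pandas-only sketch of what fromRecords does for one record; the column names are placeholders:

import pandas as pd

columns = ["id", "label"]
record = (1, "a")  # a Spark SQL Row behaves like a tuple here
df = pd.DataFrame.from_records([record], columns=columns)
print(df)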

Example 12: _evil_apply_with_dataframes

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def _evil_apply_with_dataframes(self, func, preserves_cols=False):
        """Convert the underlying SchmeaRDD to an RDD of DataFrames.
        apply the provide function and convert the result back.
        This is hella slow."""
        source_rdd = self._rdd()
        result_rdd = func(source_rdd)
        # By default we don't know what the columns & indexes are so we let
        # from_rdd_of_dataframes look at the first partition to determine them.
        column_idxs = None
        if preserves_cols:
            index_names = self._index_names
            # Remove indexes from the columns
            columns = self._schema_rdd.columns[len(self._index_names):]
            column_idxs = (columns, index_names)
        return self.from_rdd_of_dataframes(
            result_rdd, column_idxs=column_idxs) 
Author: sparklingpandas; Project: sparklingpandas; Lines: 18; Source file: dataframe.py

Example 13: __init__

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def __init__(self, ctx, keyspace, table, row_format=None, read_conf=None, **read_conf_kwargs):
        if not keyspace:
            raise ValueError("keyspace not set")

        if not table:
            raise ValueError("table not set")

        if row_format is None:
            row_format = RowFormat.ROW
        elif row_format < 0 or row_format >= len(RowFormat.values):
            raise ValueError("invalid row_format %s" % row_format)

        self.keyspace = keyspace
        self.table = table
        self.row_format = row_format
        self.read_conf = ReadConf.build(read_conf, **read_conf_kwargs)
        self._limit = None

        # this jrdd is for compatibility with pyspark.rdd.RDD
        # while allowing this constructor to be used for type checking etc.
        # and setting _jrdd //after// invoking this constructor
        class DummyJRDD(object):
            def id(self):
                return -1
        jrdd = DummyJRDD()

        super(_CassandraRDD, self).__init__(jrdd, ctx) 
Author: TargetHolding; Project: pyspark-cassandra; Lines: 29; Source file: rdd.py

Example 14: transform

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self,X_rdd,y_rdd=None):
        '''
        given X RDD (and optionally y RDD), output dataframe with term frequency feature vector and labels
        '''    
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        
        #get term frequencies
        X = X_rdd.map(self._term_frequency).cache()
        
        #convert to sparse
        X = X.map(lambda (hash,features): (hash,SparseVector(self.num_features,np.nonzero(features)[0],features[features>0])))

        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1],r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1],r[0]))
            data = X.join(y).map(lambda (idx,((hash,features),label)): (hash,features,label))
            schema = StructType([StructField('hash',StringType(),True),StructField('features',VectorUDT(),True),StructField('label',StringType(),True)])
            data = data.toDF(schema)
            data = data.withColumn('label',data.label.cast(DoubleType()))
        
        else:
            schema = StructType([StructField('hash',StringType(),True),StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)
            
        return data 
Author: iamshang1; Project: Projects; Lines: 33; Source file: preprocessing_bytes.py
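
The lambda above keeps only the nonzero term counts. A standalone sketch of that SparseVector construction, assuming pyspark.mllib.linalg and an invented counts array:

import numpy as np
from pyspark.mllib.linalg import SparseVector

num_features = 5
counts = np.array([0.0, 3.0, 0.0, 1.0, 2.0])
sv = SparseVector(num_features, np.nonzero(counts)[0], counts[counts > 0])
print(sv)  # (5,[1,3,4],[3.0,1.0,2.0])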

Example 15: transform

# Required import: from pyspark import rdd [as alias]
# Or: from pyspark.rdd import RDD [as alias]
def transform(self,X_rdd,y_rdd=None,train=True):
        '''
        given X RDD (and optionally y RDD), output dataframe with term frequency feature vector and labels
        '''    
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        
        #word tokenization
        X = X_rdd.map(self._tokenize).cache()
        
        #create dictionary of words
        if train:
            self.dictionary = X.map(lambda row: row[1]).flatMap(lambda word: word).map(lambda word: (word,1)).reduceByKey(lambda acc, w: acc + w).filter(lambda x: x[1]>=self.min_df).collectAsMap()
            self.dictionary = dict(zip(self.dictionary,xrange(len(self.dictionary))))

        #create word vectors
        X = X.map(self._term_frequency)
        
        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1],r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1],r[0]))
            data = X.join(y).map(lambda (idx,((hash,features),label)): (hash,features,label))
            schema = StructType([StructField('hash',StringType(),True),StructField('features',VectorUDT(),True),StructField('label',StringType(),True)])
            data = data.toDF(schema)
            data = data.withColumn('label',data.label.cast(DoubleType()))
        
        else:
            schema = StructType([StructField('hash',StringType(),True),StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)
            
        return data 
Author: iamshang1; Project: Projects; Lines: 38; Source file: preprocessing_asm.py


Note: The pyspark.rdd.RDD attribute examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their authors; copyright of the source code belongs to the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.