

Python sql.Row Method Code Examples

This article collects typical, real-world usage examples of pyspark.sql.Row in Python. If you are wondering how sql.Row is used in practice, what it is for, or what working examples look like, the curated code examples below should help. You can also explore further usage examples from its parent module, pyspark.sql.


The 15 code examples of the sql.Row method shown below are sorted by popularity by default.
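
Before diving into the examples, here is a minimal, self-contained sketch of how pyspark.sql.Row is typically used (the session setup, column names, and values are illustrative and not taken from the examples below):

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("row-demo").getOrCreate()

# A Row behaves like a named tuple: fields are given as keyword arguments.
alice = Row(name="Alice", age=1)
print(alice.name, alice["age"])   # fields can be read by attribute or by key
print(alice.asDict())             # {'name': 'Alice', 'age': 1}

# A list of Row objects can be turned directly into a DataFrame;
# the schema is inferred from the Row fields.
df = spark.createDataFrame([alice, Row(name="Bob", age=5)])
df.show()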

Example 1: _build_local_features

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _build_local_features(np_dtype):
    """
    Build numpy array (i.e. local) features.
    """
    # Build local features and DataFrame from it
    local_features = []
    np.random.seed(997)
    for idx in range(100):
        _dict = {'idx': idx}
        for colname, _ in _input_mapping.items():
            colvalue = np.random.randn(_tensor_size) * 100
            _dict[colname] = colvalue.astype(np_dtype).tolist()

        local_features.append(Row(**_dict))

    return local_features 
Author: databricks, Project: spark-deep-learning, Lines: 18, Source: tf_transformer_test.py

Example 2: test_map_rows_sql_1

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_map_rows_sql_1(self):
        data = [Row(x=float(x)) for x in range(5)]
        df = self.sql.createDataFrame(data)
        with IsolatedSession() as issn:
            # The placeholder that receives one scalar value of column 'x' per row
            x = tf.placeholder(tf.double, shape=[], name="x")
            # The output that adds 3 to x
            z = tf.add(x, 3, name='z')
            # Let's register these computations in SQL.
            makeGraphUDF(issn.graph, "map_rows_sql_1", [z])

        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_rows_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert data2[0].z == 3.0, data2 
Author: databricks, Project: spark-deep-learning, Lines: 18, Source: keras_sql_udf_test.py

Example 3: test_map_blocks_sql_1

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_map_blocks_sql_1(self):
        data = [Row(x=float(x)) for x in range(5)]
        df = self.sql.createDataFrame(data)
        with IsolatedSession() as issn:
            # The placeholder that corresponds to column 'x' as a whole column
            x = tf.placeholder(tf.double, shape=[None], name="x")
            # The output that adds 3 to x
            z = tf.add(x, 3, name='z')
            # Let's register these computations in SQL.
            makeGraphUDF(issn.graph, "map_blocks_sql_1", [z], blocked=True)

        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_blocks_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert len(data2) == 5, data2
        assert data2[0].z == 3.0, data2 
Author: databricks, Project: spark-deep-learning, Lines: 19, Source: keras_sql_udf_test.py

Example 4: _monkey_patch_RDD

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _monkey_patch_RDD(sparkSession):
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts the current :class:`RDD` into a :class:`DataFrame`.

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``.

        :param schema: a :class:`pyspark.sql.types.StructType` or a list of column names
        :param sampleRatio: the sample ratio of rows used for inferring the schema
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    RDD.toDF = toDF 
Author: pingcap, Project: tidb-docker-compose, Lines: 19, Source: session.py
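
The patched toDF is just sugar over SparkSession.createDataFrame. A short usage sketch (assuming an already created SparkSession named spark; data values are illustrative):

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
rdd = spark.sparkContext.parallelize([Row(name='Alice', age=1)])

# After the monkey patch, these two calls produce equivalent DataFrames.
df1 = rdd.toDF()
df2 = spark.createDataFrame(rdd)
print(df1.collect())   # [Row(name='Alice', age=1)]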

Example 5: _inferSchemaFromList

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _inferSchemaFromList(self, data, names=None):
        """
        Infer schema from list of Row or tuple.

        :param data: list of Row or tuple
        :param names: list of column names
        :return: :class:`pyspark.sql.types.StructType`
        """
        if not data:
            raise ValueError("can not infer schema from empty dataset")
        first = data[0]
        if type(first) is dict:
            warnings.warn("inferring schema from dict is deprecated, "
                          "please use pyspark.sql.Row instead")
        schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
        if _has_nulltype(schema):
            raise ValueError("Some of types cannot be determined after inferring")
        return schema 
Author: pingcap, Project: tidb-docker-compose, Lines: 20, Source: session.py
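
The same inference path is exercised whenever createDataFrame receives a plain list of Row objects. A small sketch (column names and values are illustrative):

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
rows = [Row(field1=1, field2="row1"), Row(field1=2, field2="row2")]

# createDataFrame uses the session's schema-inference helpers internally,
# merging the types observed across all rows.
df = spark.createDataFrame(rows)
df.printSchema()
# root
#  |-- field1: long (nullable = true)
#  |-- field2: string (nullable = true)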

Example 6: _test

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _test():
    import os
    import sys
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1) 
Author: pingcap, Project: tidb-docker-compose, Lines: 26, Source: session.py

Example 7: _test

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1) 
Author: pingcap, Project: tidb-docker-compose, Lines: 26, Source: session.py

Example 8: convert_svmrank_to_xgboost

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb) 
Author: wikimedia, Project: search-MjoLniR, Lines: 21, Source: make_folds.py
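
mt.partition_per_row is a helper specific to the MjoLniR project. A rough, hypothetical sketch of the idea (each row ends up alone in its own partition, so each conversion runs in isolation) might look like the following; the real helper may be implemented differently:

def partition_per_row(rdd):
    """Hypothetical re-implementation: give every row its own partition."""
    n = rdd.count()
    return (rdd
            .zipWithIndex()                        # (row, index)
            .map(lambda pair: (pair[1], pair[0]))  # key each row by its index
            .partitionBy(n)                        # hash(int) == int, so index i lands in partition i
            .values())                             # drop the index key again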

Example 9: test_lf_applier_spark_preprocessor_memoized

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
        sc = SparkContext.getOrCreate()
        sql = SQLContext(sc)

        @preprocessor(memoize=True)
        def square_memoize(x: DataPoint) -> DataPoint:
            return Row(num=x.num, num_squared=x.num ** 2)

        @labeling_function(pre=[square_memoize])
        def fp_memoized(x: DataPoint) -> int:
            return 0 if x.num_squared > 42 else -1

        df = pd.DataFrame(dict(num=DATA))
        rdd = sql.createDataFrame(df).rdd
        applier = SparkLFApplier([f, fp_memoized])
        L = applier.apply(rdd)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED) 
Author: snorkel-team, Project: snorkel, Lines: 19, Source: test_spark.py

Example 10: test_decorator_mapper_memoized_none

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_decorator_mapper_memoized_none(self) -> None:
        square_hit_tracker = SquareHitTracker()

        @lambda_mapper(memoize=True)
        def square(x: DataPoint) -> DataPoint:
            fields = x.asDict()
            fields["num_squared"] = square_hit_tracker(x.num)
            if x.num == 21:
                return None
            return Row(**fields)

        x21 = self._get_x(21)
        x21_mapped = square(x21)
        self.assertIsNone(x21_mapped)
        self.assertEqual(square_hit_tracker.n_hits, 1)
        x21_mapped = square(x21)
        self.assertIsNone(x21_mapped)
        self.assertEqual(square_hit_tracker.n_hits, 1) 
Author: snorkel-team, Project: snorkel, Lines: 20, Source: test_spark.py

Example 11: test_string_indexer_handle_invalid

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_string_indexer_handle_invalid(self):
        df = self.spark.createDataFrame([
            (0, "a"),
            (1, "d"),
            (2, None)], ["id", "label"])

        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
        model1 = si1.fit(df)
        td1 = model1.transform(df)
        actual1 = td1.select("id", "indexed").collect()
        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
        self.assertEqual(actual1, expected1)

        si2 = si1.setHandleInvalid("skip")
        model2 = si2.fit(df)
        td2 = model2.transform(df)
        actual2 = td2.select("id", "indexed").collect()
        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
        self.assertEqual(actual2, expected2) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 22, Source: tests.py

Example 12: test_infer_schema

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_infer_schema(self):
        rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1),
                                   Row(label=0.0, features=self.sv1)])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: tests.py
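
self.dv1, self.sv1, and self.udt are fixtures defined elsewhere in the test class. Outside the test, the same UDT inference can be reproduced roughly like this (a sketch using pyspark.ml.linalg vectors; the exact vector values are illustrative):

from pyspark.ml.linalg import Vectors
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
rows = [Row(label=1.0, features=Vectors.dense([1.0, 2.0])),
        Row(label=0.0, features=Vectors.sparse(2, [0], [1.0]))]

df = spark.createDataFrame(rows)
df.printSchema()                         # 'features' is inferred as the vector UDT
print(df.schema["features"].dataType)    # the UserDefinedType backing ML vectors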

Example 13: approx_count_distinct

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: functions.py
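
A short usage sketch comparing the approximate and exact aggregates (assumes an active SparkSession named spark; the column name and data are illustrative):

from pyspark.sql import Row
from pyspark.sql.functions import approx_count_distinct, countDistinct

df = spark.createDataFrame([Row(age=2), Row(age=5), Row(age=5)])
df.agg(
    approx_count_distinct(df.age, rsd=0.05).alias("approx_ages"),
    countDistinct(df.age).alias("exact_ages"),
).show()
# Both aggregates report 2 distinct ages for this tiny dataset.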

Example 14: monotonically_increasing_id

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def monotonically_increasing_id():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    .. note:: The function is non-deterministic because its result depends on partition IDs.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonically_increasing_id()) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 22, Source: functions.py
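
The IDs listed in the docstring can be reproduced with plain integer arithmetic, which makes the bit layout concrete (pure Python, no Spark required):

# id = (partition_id << 33) + record_number_within_partition
ids = [(p << 33) + r for p in range(2) for r in range(3)]
print(ids)
# [0, 1, 2, 8589934592, 8589934593, 8589934594]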

Example 15: randn

# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    .. note:: The function is non-deterministic in general case.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
    Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: functions.py


Note: The pyspark.sql.Row examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.