This article collects typical usage examples of pyspark.sql.Row in Python. If you are unsure what sql.Row does, how to call it, or simply want to see it used in real code, the curated examples below should help. You can also explore the wider pyspark.sql module that Row belongs to.
The following lists 15 code examples involving sql.Row, ordered by popularity by default.
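Before the examples, here is a minimal sketch of the Row API itself (standard pyspark.sql.Row behavior; no Spark session required):

from pyspark.sql import Row

# Keyword form: field names become attributes.
person = Row(name='Alice', age=1)
person.name          # 'Alice'
person['age']        # 1
person.asDict()      # dict mapping field name -> value

# Factory form: declare the fields once, then build rows positionally.
Person = Row('name', 'age')
Person('Bob', 2)     # Row(name='Bob', age=2)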
Example 1: _build_local_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _build_local_features(np_dtype):
    """
    Build numpy array (i.e. local) features.
    """
    # Build local features and DataFrame from it
    local_features = []
    np.random.seed(997)
    for idx in range(100):
        _dict = {'idx': idx}
        for colname, _ in _input_mapping.items():
            colvalue = np.random.randn(_tensor_size) * 100
            _dict[colname] = colvalue.astype(np_dtype).tolist()
        local_features.append(Row(**_dict))

    return local_features
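_input_mapping and _tensor_size are module-level fixtures of the original test file and are not shown here. A minimal sketch of turning the returned Row list into a DataFrame, assuming a live SparkSession named spark and hypothetical stand-ins for those fixtures:

import numpy as np
from pyspark.sql import Row

_input_mapping = {'colA': 'tensor_a', 'colB': 'tensor_b'}   # hypothetical stand-in
_tensor_size = 10                                           # hypothetical stand-in

rows = _build_local_features(np.float32)
df = spark.createDataFrame(rows)   # schema is inferred from the Row fields
df.printSchema()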
Example 2: test_map_rows_sql_1
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_map_rows_sql_1(self):
    data = [Row(x=float(x)) for x in range(5)]
    df = self.sql.createDataFrame(data)
    with IsolatedSession() as issn:
        # The placeholder that corresponds to column 'x' as a whole column
        x = tf.placeholder(tf.double, shape=[], name="x")
        # The output that adds 3 to x
        z = tf.add(x, 3, name='z')
        # Let's register these computations in SQL.
        makeGraphUDF(issn.graph, "map_rows_sql_1", [z])
        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_rows_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
    assert data2[0].z == 3.0, data2
Example 3: test_map_blocks_sql_1
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_map_blocks_sql_1(self):
    data = [Row(x=float(x)) for x in range(5)]
    df = self.sql.createDataFrame(data)
    with IsolatedSession() as issn:
        # The placeholder that corresponds to column 'x' as a whole column
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # The output that adds 3 to x
        z = tf.add(x, 3, name='z')
        # Let's register these computations in SQL.
        makeGraphUDF(issn.graph, "map_blocks_sql_1", [z], blocked=True)
        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_blocks_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
    assert len(data2) == 5, data2
    assert data2[0].z == 3.0, data2
Example 4: _monkey_patch_RDD
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _monkey_patch_RDD(sparkSession):
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param sampleRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    RDD.toDF = toDF
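A hedged usage sketch of the patched toDF, assuming an existing SparkSession spark and its SparkContext sc:

from pyspark.sql import Row

_monkey_patch_RDD(spark)

rdd = sc.parallelize([Row(name='Alice', age=1), Row(name='Bob', age=2)])
df = rdd.toDF()    # shorthand for spark.createDataFrame(rdd, None, None)
df.collect()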
Example 5: _inferSchemaFromList
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _inferSchemaFromList(self, data, names=None):
    """
    Infer schema from list of Row or tuple.

    :param data: list of Row or tuple
    :param names: list of column names
    :return: :class:`pyspark.sql.types.StructType`
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")
    first = data[0]
    if type(first) is dict:
        warnings.warn("inferring schema from dict is deprecated, "
                      "please use pyspark.sql.Row instead")
    schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
    if _has_nulltype(schema):
        raise ValueError("Some of types cannot be determined after inferring")
    return schema
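_inferSchemaFromList is internal to SparkSession; the user-facing path that exercises it is createDataFrame on a list of Row objects. A minimal sketch, assuming a SparkSession named spark:

from pyspark.sql import Row

rows = [Row(field1=1, field2="row1"), Row(field1=2, field2="row2")]
spark.createDataFrame(rows).schema
# StructType with a long 'field1' field and a string 'field2' field

# A field whose values are all None cannot be typed, which is what triggers
# the "Some of types cannot be determined after inferring" error above.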
Example 6: _test
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _test():
    import os
    import sys
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
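In PySpark's own modules this doctest harness is invoked from the bottom of the file; a minimal sketch of that wiring:

if __name__ == "__main__":
    _test()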
Example 7: _test
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example 8: convert_svmrank_to_xgboost
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb)
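Row objects are immutable, so convert_one derives a new Row by merging the old fields with the updated ones via asDict(), which matches fields by name rather than by position. The pattern in isolation, with hypothetical field values:

from pyspark.sql import Row

row = Row(path='/tmp/part-00000', vec_format='svmrank')   # hypothetical values
updated = Row(**dict(row.asDict(), vec_format='xgboost', path=row.path + '.xgb'))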
Example 9: test_lf_applier_spark_preprocessor_memoized
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example 10: test_decorator_mapper_memoized_none
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_decorator_mapper_memoized_none(self) -> None:
    square_hit_tracker = SquareHitTracker()

    @lambda_mapper(memoize=True)
    def square(x: DataPoint) -> DataPoint:
        fields = x.asDict()
        fields["num_squared"] = square_hit_tracker(x.num)
        if x.num == 21:
            return None
        return Row(**fields)

    x21 = self._get_x(21)
    x21_mapped = square(x21)
    self.assertIsNone(x21_mapped)
    self.assertEqual(square_hit_tracker.n_hits, 1)
    x21_mapped = square(x21)
    self.assertIsNone(x21_mapped)
    self.assertEqual(square_hit_tracker.n_hits, 1)
Example 11: test_string_indexer_handle_invalid
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
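The assertEqual calls compare lists of Row objects; Row inherits tuple equality, so values are compared positionally. A standalone sketch (no Spark session needed):

from pyspark.sql import Row

assert Row(id=0, indexed=0.0) == Row(id=0, indexed=0.0)
assert Row(id=0, indexed=0.0) != Row(id=1, indexed=0.0)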
Example 12: test_infer_schema
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def test_infer_schema(self):
    rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1),
                               Row(label=0.0, features=self.sv1)])
    df = rdd.toDF()
    schema = df.schema
    field = [f for f in schema.fields if f.name == "features"][0]
    self.assertEqual(field.dataType, self.udt)
    vectors = df.rdd.map(lambda p: p.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        if isinstance(v, SparseVector):
            self.assertEqual(v, self.sv1)
        elif isinstance(v, DenseVector):
            self.assertEqual(v, self.dv1)
        else:
            raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
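dv1, sv1, and udt are fixtures of the original test class. A hedged sketch of how such rows might be built, assuming pyspark.ml.linalg (the original test may use pyspark.mllib.linalg instead):

from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

dv1 = Vectors.dense([1.0, 2.0, 3.0])        # hypothetical values
sv1 = Vectors.sparse(3, {0: 1.0, 2: 3.0})   # hypothetical values
rows = [Row(label=1.0, features=dv1), Row(label=0.0, features=sv1)]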
Example 13: approx_count_distinct
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc)
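A hedged end-to-end version of the docstring example, assuming a SparkSession named spark:

from pyspark.sql import Row
from pyspark.sql.functions import approx_count_distinct

df = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
# [Row(distinct_ages=2)]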
Example 14: monotonically_increasing_id
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def monotonically_increasing_id():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    .. note:: The function is non-deterministic because its result depends on partition IDs.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonically_increasing_id())
Example 15: randn
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import Row [as alias]
def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    .. note:: The function is non-deterministic in general case.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
     Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc)