This article collects typical usage examples of the Python method pyspark.sql.functions.pandas_udf. If you have been wondering what functions.pandas_udf does, how to call it, or what real-world uses look like, the hand-picked code examples below may help. You can also explore further examples from the module that contains it, pyspark.sql.functions.
The following presents 9 code examples of functions.pandas_udf, sorted by popularity.
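Before the excerpts, here is a minimal, self-contained sketch of the basic pattern (the column name, the +1 transformation, and the SparkSession setup are illustrative assumptions, not taken from the examples below): a scalar pandas UDF receives each batch of a column as a pandas Series and must return a Series of the same length.

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.range(5)  # a single bigint column named "id"

# A scalar pandas UDF: receives the column as a pandas Series per batch
# and returns a Series of the same length.
@pandas_udf('long', PandasUDFType.SCALAR)
def plus_one(s):
    return s + 1

df.select(plus_one('id').alias('id_plus_one')).show()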
Example 1: _cumprod

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def _cumprod(self, skipna, part_cols=()):
    from pyspark.sql.functions import pandas_udf

    def cumprod(scol):
        @pandas_udf(returnType=self.spark.data_type)
        def negative_check(s):
            assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                "values should be bigger than 0: %s" % s
            )
            return s

        # Cumulative product computed as exp of the cumulative sum of logs,
        # since Spark has no native cumulative-product aggregate.
        return F.sum(F.log(negative_check(scol)))

    kser = self._cum(cumprod, skipna, part_cols)
    return kser._with_new_scol(F.exp(kser.spark.column)).rename(self.name)
# ----------------------------------------------------------------------
# Accessor Methods
# ----------------------------------------------------------------------
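The trick in Example 1 is worth seeing in isolation: because Spark has no cumulative-product aggregate, the method rewrites cumprod(x) as exp(cumsum(log(x))). A minimal pandas/numpy sketch of the identity (illustrative only, not part of the Koalas source):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])

# Cumulative product expressed as exp of the cumulative sum of logs,
# which is what the method above computes with F.sum(F.log(...)) over a window.
via_logs = np.exp(np.log(s).cumsum())

assert np.allclose(via_logs, s.cumprod())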
Example 2: _transform_batch

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def _transform_batch(self, func, return_schema):
    from databricks.koalas.series import Series
    from databricks import koalas as ks

    if isinstance(func, np.ufunc):
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_schema is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        #  because it returns a series from a different DataFrame and it has a different
        #  anchor. We should fix this to allow the shortcut or only allow to infer
        #  schema.
        # Infer the Spark return type by applying func to a small pandas sample.
        limit = ks.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)
        spark_return_type = kser.spark.data_type
    else:
        spark_return_type = return_schema

    # Wrap func in a scalar pandas UDF and apply it to the underlying Spark column.
    pudf = pandas_udf(func, returnType=spark_return_type, functionType=PandasUDFType.SCALAR)
    return self._kser._with_new_scol(scol=pudf(self._kser.spark.column)).rename(self._kser.name)
Example 3: create_udf_node

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def create_udf_node(self, udf_func):
    """Create a new UDF node type and add a corresponding compile rule.

    Parameters
    ----------
    udf_func : function
        Should be the result of calling pyspark.sql.functions.udf or
        pyspark.sql.functions.pandas_udf on the user-specified func

    Returns
    -------
    result : type
        A new SparkUDFNode or SparkUDAFNode subclass
    """
    name = udf_func.__name__
    definition = next(_udf_name_cache[name])
    external_name = '{}_{:d}'.format(name, definition)

    UDFNode = type(
        external_name,
        (self.base_class,),
        {
            'signature': sig.TypeSignature.from_dtypes(self.input_type),
            'return_type': self.output_type,
        },
    )

    # Add udf_func as a property. If added to the class namespace dict, it
    # would be incorrectly used as a bound method, i.e.
    # udf_func(t.column) would be a call to bound method func with t.column
    # interpreted as self.
    UDFNode.udf_func = property(lambda self, udf_func=udf_func: udf_func)

    @compiles(UDFNode)
    def compiles_udf_node(t, expr):
        return '{}({})'.format(
            UDFNode.__name__, ', '.join(map(t.translate, expr.op().args))
        )

    return UDFNode
Example 4: pyspark_udf

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def pyspark_udf(self, func):
    # `f` is the module alias for pyspark.sql.functions.
    return f.pandas_udf(func, self.spark_output_type, self.pandas_udf_type)
Example 5: compile_strftime

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def compile_strftime(t, expr, scope, **kwargs):
    op = expr.op()
    format_str = op.format_str.op().value

    # The format string is captured in the closure of the scalar pandas UDF.
    @pandas_udf('string', PandasUDFType.SCALAR)
    def strftime(timestamps):
        return timestamps.dt.strftime(format_str)

    src_column = t.translate(op.arg, scope)
    return strftime(src_column)
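Outside of an ibis compile rule, the same pattern — capturing a Python value in a closure and using pandas .dt accessors inside a scalar pandas UDF — can be applied directly to a Spark DataFrame. A hedged sketch (the SparkSession, column names, and sample data are assumptions, not from the example above):

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2024-01-15 08:30:00",)], ["ts_str"]) \
    .selectExpr("CAST(ts_str AS TIMESTAMP) AS ts")

format_str = "%Y/%m/%d"

# The format string is captured in the closure, as in compile_strftime.
@pandas_udf('string', PandasUDFType.SCALAR)
def strftime(timestamps):
    return timestamps.dt.strftime(format_str)

df.select(strftime('ts').alias('formatted')).show()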
Example 6: compile_day_of_week_index

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def compile_day_of_week_index(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('short', PandasUDFType.SCALAR)
    def day_of_week(s):
        return s.dt.dayofweek

    src_column = t.translate(op.arg, scope)
    return day_of_week(src_column.cast('timestamp'))
Example 7: compiles_day_of_week_name

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def compiles_day_of_week_name(t, expr, scope, **kwargs):
    op = expr.op()

    @pandas_udf('string', PandasUDFType.SCALAR)
    def day_name(s):
        return s.dt.day_name()

    src_column = t.translate(op.arg, scope)
    return day_name(src_column.cast('timestamp'))
Example 8: test_array_field

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def test_array_field(spark_test_ctx):
    @pandas_udf('array<float>')
    def gen_array(v):
        return v.map(lambda x: np.random.rand(10))

    df1 = spark_test_ctx.spark.range(10).withColumn('v', gen_array('id')).repartition(2)
    cv1 = make_spark_converter(df1)
    # we can auto infer one-dim array shape
    with cv1.make_tf_dataset(batch_size=4, num_epochs=1) as dataset:
        tf_iter = dataset.make_one_shot_iterator()
        next_op = tf_iter.get_next()
        with tf.Session() as sess:
            batch1 = sess.run(next_op)
        assert batch1.v.shape == (4, 10)
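The converter and session objects above come from petastorm and TensorFlow 1.x; the pandas_udf part can be exercised on its own. A minimal sketch (assuming only pyspark, pyarrow, and numpy; the assertion is illustrative) that materializes the array<float> column directly:

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()

# Each input value maps to a fixed-length float array; Spark stores the
# result as an array<float> column.
@pandas_udf('array<float>')
def gen_array(v):
    return v.map(lambda x: np.random.rand(10))

rows = spark.range(10).withColumn('v', gen_array('id')).collect()
assert all(len(row.v) == 10 for row in rows)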
Example 9: _spark_group_map_apply

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import pandas_udf [as alias]
def _spark_group_map_apply(kdf, func, groupkeys_scols, return_schema, retain_index):
    output_func = GroupBy._make_pandas_df_builder_func(kdf, func, return_schema, retain_index)
    # Wrap the pandas-DataFrame-building function in a grouped-map pandas UDF.
    grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(output_func)
    sdf = kdf._internal.spark_frame.drop(*HIDDEN_COLUMNS)
    return sdf.groupby(*groupkeys_scols).apply(grouped_map_func)
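For context, the grouped-map flavor used above can be reproduced in a few lines of plain PySpark. A hedged sketch (the schema, the sample data, and the demeaning function are illustrative assumptions, not part of the Koalas internals): the UDF receives each group as a pandas DataFrame and returns a pandas DataFrame matching the declared schema.

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 10.0), (1, 20.0), (2, 3.0), (2, 5.0)], ['key', 'value'])

# A grouped-map pandas UDF: one pandas DataFrame in, one pandas DataFrame out,
# per group, with columns matching the declared schema.
@pandas_udf('key long, value double', PandasUDFType.GROUPED_MAP)
def demean(pdf):
    return pdf.assign(value=pdf.value - pdf.value.mean())

df.groupby('key').apply(demean).show()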