This article collects typical code examples of the Python method pyspark.sql.functions.sum. If you have been wondering what functions.sum does, how to call it, or what real uses of it look like, the curated examples below should help. You can also read more about the module the method belongs to, pyspark.sql.functions.
The following 15 code examples of functions.sum are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
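Before the excerpts, here is a minimal, self-contained sketch of the two most common ways F.sum is used: as a grouped aggregate and as a window aggregate. The data and column names are invented for illustration.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Toy data; the schema is made up for this sketch.
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["key", "value"])

# 1) Grouped aggregate: one output row per key.
df.groupBy("key").agg(F.sum("value").alias("total")).show()

# 2) Window aggregate: a running total per key, keeping every input row.
w = (
    Window.partitionBy("key")
    .orderBy("value")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.withColumn("running_total", F.sum("value").over(w)).show()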
Example 1: sum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self):
    """Compute the sum for each group."""
    if self._can_use_new_school():
        # Fast path: delegate to a Spark SQL aggregation.
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.sum)
    # Fallback: combine per-partition pandas groupby sums.
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        return pd.concat([x, create_combiner(y)])

    def merge_combiner(x, y):
        return x + y

    rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx)
Example 2: compile_sum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def compile_sum(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.sum, context, **kwargs)
Example 3: run
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
    df = i[Employment]
    return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP"))
Example 4: run
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
    df = i[_DEP_NAME_]
    return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP"))
Example 5: run
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
    df = i[inputdata.Employment]
    return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP"))
Example 6: sum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self):
    def sum(scol):
        # Emit the windowed sum only once at least `_min_periods` rows
        # have been seen; earlier rows get null.
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.sum(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(sum)
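The same min_periods gating can be reproduced in plain PySpark with two window specs: a bounded rolling frame for the sum and an unbounded frame for counting rows seen so far. A sketch, assuming a single ordering column and a three-row rolling window (names and data invented):
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(i, float(i)) for i in range(5)], ["idx", "x"])

min_periods = 2
# Rolling frame: the current row plus the two rows before it.
# (No partitionBy here, so everything goes to one partition; fine for a sketch.)
w = Window.orderBy("idx").rowsBetween(-2, Window.currentRow)
# Unbounded frame, used only to number the rows seen so far.
w_unbounded = Window.orderBy("idx").rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)

rolling_sum = F.when(
    F.row_number().over(w_unbounded) >= min_periods,
    F.sum("x").over(w),
).otherwise(F.lit(None))

df.withColumn("rolling_sum", rolling_sum).show()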
Example 7: agg_sum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def agg_sum(field_name, alias=None, expression=None):
    field_alias = get_alias(field_name, alias, "sum")
    # Aggregate the raw column unless a custom expression was supplied.
    field_expression = expression
    if field_expression is None:
        field_expression = field_name
    return F.sum(field_expression).alias(field_alias)
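A plausible way to call this helper inside an aggregation is sketched below. It assumes get_alias falls back to a derived name such as "salary_sum" when no alias is given; that behavior is inferred from the signature, not confirmed by the excerpt.
# Hypothetical usage; the DataFrame and column names are invented.
aggregated = df.groupBy("department").agg(
    agg_sum("salary"),                                # alias derived by get_alias
    agg_sum("salary", alias="total_payroll"),         # explicit alias
    agg_sum("bonus", expression=F.col("bonus") * 2),  # aggregate a derived expression
)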
Example 8: sum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self, axis=None, numeric_only=True):
    """
    Return the sum of the values.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This parameter
        is mainly for pandas compatibility.

    Returns
    -------
    sum : scalar for a Series, and a Series for a DataFrame.

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
    ...                   columns=['a', 'b'])

    On a DataFrame:

    >>> df.sum()
    a    6.0
    b    0.6
    Name: 0, dtype: float64

    >>> df.sum(axis=1)
    0    1.1
    1    2.2
    2    3.3
    3    0.0
    Name: 0, dtype: float64

    On a Series:

    >>> df['a'].sum()
    6.0
    """
    return self._reduce_for_stat_function(
        F.sum, name="sum", numeric_only=numeric_only, axis=axis
    )
Example 9: cummax
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cummax(self):
    """
    Cumulative max for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cummax
    DataFrame.cummax

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the maximum in each column.

    >>> df.groupby("A").cummax().sort_index()
          B  C
    0   NaN  4
    1   0.1  4
    2  20.0  4
    3  10.0  1

    It works as below in Series.

    >>> df.C.groupby(df.A).cummax().sort_index()
    0    4
    1    4
    2    4
    3    1
    Name: C, dtype: int64
    """
    return self._apply_series_op(
        lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols),
        should_resolve=True,
    )
Example 10: cummin
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cummin(self):
    """
    Cumulative min for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cummin
    DataFrame.cummin

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the minimum in each column.

    >>> df.groupby("A").cummin().sort_index()
          B  C
    0   NaN  4
    1   0.1  3
    2   0.1  2
    3  10.0  1

    It works as below in Series.

    >>> df.B.groupby(df.A).cummin().sort_index()
    0     NaN
    1     0.1
    2     0.1
    3    10.0
    Name: B, dtype: float64
    """
    return self._apply_series_op(
        lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols),
        should_resolve=True,
    )
Example 11: cumprod
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cumprod(self):
    """
    Cumulative product for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cumprod
    DataFrame.cumprod

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the product in each column.

    >>> df.groupby("A").cumprod().sort_index()
          B     C
    0   NaN   4.0
    1   0.1  12.0
    2   2.0  24.0
    3  10.0   1.0

    It works as below in Series.

    >>> df.B.groupby(df.A).cumprod().sort_index()
    0     NaN
    1     0.1
    2     2.0
    3    10.0
    Name: B, dtype: float64
    """
    return self._apply_series_op(
        lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols), should_resolve=True
    )
Example 12: cumsum
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cumsum(self):
    """
    Cumulative sum for each group.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    Series.cumsum
    DataFrame.cumsum

    Examples
    --------
    >>> df = ks.DataFrame(
    ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
    ...     columns=list('ABC'))
    >>> df
       A     B  C
    0  1   NaN  4
    1  1   0.1  3
    2  1  20.0  2
    3  4  10.0  1

    By default, iterates over rows and finds the sum in each column.

    >>> df.groupby("A").cumsum().sort_index()
          B  C
    0   NaN  4
    1   0.1  7
    2  20.1  9
    3  10.0  1

    It works as below in Series.

    >>> df.B.groupby(df.A).cumsum().sort_index()
    0     NaN
    1     0.1
    2    20.1
    3    10.0
    Name: B, dtype: float64
    """
    return self._apply_series_op(
        lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols),
        should_resolve=True,
    )
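For reference, the grouped cumulative sum that this method implements can also be written directly against the PySpark API with F.sum over a window partitioned by the group keys. A minimal sketch with invented column names:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("a", 1, 4), ("a", 2, 3), ("b", 3, 2)], ["grp", "ord", "val"]
)

# Running sum of `val` within each `grp`, in `ord` order.
w = (
    Window.partitionBy("grp")
    .orderBy("ord")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.withColumn("cumsum_val", F.sum("val").over(w)).show()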
Example 13: normalize_keyword_aggregation
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def normalize_keyword_aggregation(kwargs):
    """
    Normalize user-provided kwargs.

    Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
    to the old ``OrderedDict[str, List[scalar]]``.

    Parameters
    ----------
    kwargs : dict

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    order : List[Tuple[str, str]]
        Pairs of the input and output column names.

    Examples
    --------
    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
    """
    # this is due to python version issue, not sure the impact on koalas
    PY36 = sys.version_info >= (3, 6)
    if not PY36:
        kwargs = OrderedDict(sorted(kwargs.items()))

    # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
    aggspec = OrderedDict()
    order = []
    columns, pairs = list(zip(*kwargs.items()))

    for column, aggfunc in pairs:
        if column in aggspec:
            aggspec[column].append(aggfunc)
        else:
            aggspec[column] = [aggfunc]
        order.append((column, aggfunc))

    # For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be
    # flattened to ('y', 'A', 'max'), it won't do anything on normal Index.
    if isinstance(order[0][0], tuple):
        order = [(*levs, method) for levs, method in order]

    return aggspec, columns, order
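For multi-level (MultiIndex) columns, the relabeling keys are tuples, which the final branch flattens. A quick, hypothetical illustration of calling the helper directly:
# Inputs are invented; the expected results follow from the code above.
aggspec, columns, order = normalize_keyword_aggregation(
    {"out": (("y", "A"), "max"), "total": (("y", "B"), "sum")}
)
# aggspec -> OrderedDict([(('y', 'A'), ['max']), (('y', 'B'), ['sum'])])
# columns -> ('out', 'total')
# order   -> [('y', 'A', 'max'), ('y', 'B', 'sum')]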
Example 14: aggregate_addons
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def aggregate_addons(df):
    """
    Aggregates add-on indicators by client, channel, version and locale.

    The result is a DataFrame with the additional aggregate columns:
        n_self_installed_addons (int)
        n_shield_addons (int)
        n_foreign_installed_addons (int)
        n_system_addons (int)
        n_web_extensions (int)
        first_addon_install_date (str %Y%m%d)
        profile_creation_date (str %Y%m%d)
    for each of the above facets.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    addon_aggregates = (
        df.distinct()
        .groupBy("client_id", "normalized_channel", "app_version", "locale")
        .agg(
            fun.sum("is_self_install").alias("n_self_installed_addons"),
            fun.sum("is_shield_addon").alias("n_shield_addons"),
            fun.sum("is_foreign_install").alias("n_foreign_installed_addons"),
            fun.sum("is_system").alias("n_system_addons"),
            fun.sum("is_web_extension").alias("n_web_extensions"),
            fun.min(
                fun.when(
                    df.is_self_install == 1,
                    fun.date_format(
                        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
                        "yyyyMMdd",
                    ),
                ).otherwise(None)
            ).alias("first_addon_install_date"),
            fun.date_format(
                fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
                "yyyyMMdd",
            ).alias("profile_creation_date"),
        )
    )
    return addon_aggregates
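The key idiom here is that fun.sum over 0/1 indicator columns turns per-row flags into per-group counts. A stripped-down, self-contained version of that idea, with invented data and column names:
from pyspark.sql import SparkSession, functions as fun

spark = SparkSession.builder.master("local[*]").getOrCreate()

# One row per (client, add-on) with 0/1 indicator flags.
rows = [
    ("c1", 1, 0),
    ("c1", 0, 1),
    ("c2", 1, 1),
]
df = spark.createDataFrame(rows, ["client_id", "is_self_install", "is_system"])

# Summing the indicators per client yields per-client counts.
counts = df.groupBy("client_id").agg(
    fun.sum("is_self_install").alias("n_self_installed_addons"),
    fun.sum("is_system").alias("n_system_addons"),
)
counts.show()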
Example 15: transform
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def transform(spark):
    """Create the bookmark problem and summary tables."""
    query = """
    SELECT s.app_build_id,
           s.app_version,
           s.app_display_version,
           s.app_name,
           s.app_channel,
           s.uid,
           s.device_id AS device_id,
           s.submission_date_s3 AS submission_day,
           date_format(from_unixtime(s.when / 1000), 'yyyyMMdd') AS sync_day,
           s.when,
           s.status,
           e.name AS engine_name,
           e.status AS engine_status,
           e.failure_reason AS engine_failure_reason,
           e.validation.problems IS NOT NULL AS engine_has_problems,
           e.validation.version AS engine_validation_version,
           e.validation.checked AS engine_validation_checked,
           e.validation.took AS engine_validation_took,
           p.name AS engine_validation_problem_name,
           p.count AS engine_validation_problem_count
    FROM sync_summary s
    LATERAL VIEW explode(s.engines) AS e
    LATERAL VIEW OUTER explode(e.validation.problems) AS p
    WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )
    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day")
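Since the function only registers temporary views rather than returning DataFrames, downstream code would presumably read the results back through Spark SQL, along these lines. The queries below are illustrative assumptions, not taken from the original project:
# Hypothetical follow-up queries against the views registered by transform(spark).
transform(spark)

daily_totals = spark.sql(
    "SELECT submission_day, total_bookmarks_checked FROM bmk_total_per_day"
)
problem_rows = spark.sql("SELECT * FROM bmk_validation_problems LIMIT 20")
daily_totals.show()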