

Python functions.sum method code examples

This article collects typical usage examples of the pyspark.sql.functions.sum method in Python. If you are wondering what functions.sum does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also browse further usage examples from the pyspark.sql.functions module.


The 15 code examples of the functions.sum method shown below are ordered by popularity by default.
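
Before the examples, here is a minimal, self-contained sketch of pyspark.sql.functions.sum used as a grouped aggregation. The SparkSession setup, the sample data, and the column names (dept, salary) are illustrative assumptions, not taken from any example below.

# Minimal sketch: grouped sum with pyspark.sql.functions.sum
# (sample data and column names are assumed for illustration)
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("sum_demo").getOrCreate()

df = spark.createDataFrame(
    [("eng", 10), ("eng", 20), ("ops", 5)],
    ["dept", "salary"],
)

# F.sum aggregates the numeric column within each group
df.groupBy("dept").agg(F.sum("salary").alias("total_salary")).show()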

Example 1: sum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self):
        """Compute the sum for each group."""
        if self._can_use_new_school():
            self._prep_spark_sql_groupby()
            import pyspark.sql.functions as func
            return self._use_aggregation(func.sum)
        self._prep_pandas_groupby()
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pd.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return DataFrame.fromDataFrameRDD(rddOfSum, self.sql_ctx) 
Developer: sparklingpandas, Project: sparklingpandas, Lines: 26, Source: groupby.py

Example 2: compile_sum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def compile_sum(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.sum, context, **kwargs) 
Developer: ibis-project, Project: ibis, Lines: 4, Source: compiler.py

Example 3: run

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
        df = i[Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Developer: TresAmigosSD, Project: SMV, Lines: 5, Source: employment.py

Example 4: run

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
        df = i[_DEP_NAME_]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Developer: TresAmigosSD, Project: SMV, Lines: 5, Source: employment.py

Example 5: run

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def run(self, i):
        df = i[inputdata.Employment]
        return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP")) 
Developer: TresAmigosSD, Project: SMV, Lines: 5, Source: employment.py

Example 6: sum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self):
        def sum(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum) 
Developer: databricks, Project: koalas, Lines: 10, Source: window.py
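
The sum above is the internal implementation behind Koalas' rolling/expanding windows. A hedged usage sketch of the public rolling API follows; the sample values and the window size of 2 are assumptions for illustration.

# Usage sketch for a Koalas rolling sum (sample data is assumed)
import databricks.koalas as ks

s = ks.Series([1, 2, 3, 4, 5])
# With window=2, each output is the sum of the current and previous value;
# the first position is null because fewer than min_periods values are available.
print(s.rolling(2).sum())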

Example 7: agg_sum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def agg_sum(field_name, alias=None, expression=None):
    field_alias = get_alias(field_name, alias, "sum")
    field_expression = expression
    if field_expression is None:
        field_expression = field_name
    return F.sum(field_expression).alias(field_alias) 
Developer: mozilla, Project: python_mozetl, Lines: 8, Source: fields.py
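
agg_sum returns a Spark Column expression, so it plugs directly into a groupBy().agg() call. A brief usage sketch follows; the DataFrame and its columns (country, revenue) are hypothetical, and the exact alias depends on get_alias, which is defined elsewhere in fields.py.

# Hypothetical usage: sum revenue per country under the alias derived by get_alias
per_country = df.groupBy("country").agg(agg_sum("revenue"))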

Example 8: sum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def sum(self, axis=None, numeric_only=True):
        """
        Return the sum of the values.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        numeric_only : bool, default True
            Include only float, int, boolean columns. False is not supported. This parameter
            is mainly for pandas compatibility.

        Returns
        -------
        sum : scalar for a Series, and a Series for a DataFrame.

        Examples
        --------

        >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
        ...                   columns=['a', 'b'])

        On a DataFrame:

        >>> df.sum()
        a    6.0
        b    0.6
        Name: 0, dtype: float64

        >>> df.sum(axis=1)
        0    1.1
        1    2.2
        2    3.3
        3    0.0
        Name: 0, dtype: float64

        On a Series:

        >>> df['a'].sum()
        6.0
        """
        return self._reduce_for_stat_function(
            F.sum, name="sum", numeric_only=numeric_only, axis=axis
        ) 
Developer: databricks, Project: koalas, Lines: 46, Source: generic.py

Example 9: cummax

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cummax(self):
        """
        Cumulative max for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummax
        DataFrame.cummax

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, iterates over rows and finds the maximum in each column.

        >>> df.groupby("A").cummax().sort_index()
              B  C
        0   NaN  4
        1   0.1  4
        2  20.0  4
        3  10.0  1

        On a Series, it works as follows.

        >>> df.C.groupby(df.A).cummax().sort_index()
        0    4
        1    4
        2    4
        3    1
        Name: C, dtype: int64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.max, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Developer: databricks, Project: koalas, Lines: 50, Source: groupby.py

Example 10: cummin

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cummin(self):
        """
        Cumulative min for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cummin
        DataFrame.cummin

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, iterates over rows and finds the minimum in each column.

        >>> df.groupby("A").cummin().sort_index()
              B  C
        0   NaN  4
        1   0.1  3
        2   0.1  2
        3  10.0  1

        On a Series, it works as follows.

        >>> df.B.groupby(df.A).cummin().sort_index()
        0     NaN
        1     0.1
        2     0.1
        3    10.0
        Name: B, dtype: float64
        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.min, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Developer: databricks, Project: koalas, Lines: 49, Source: groupby.py

Example 11: cumprod

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cumprod(self):
        """
        Cumulative product for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumprod
        DataFrame.cumprod

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, iterates over rows and finds the product in each column.

        >>> df.groupby("A").cumprod().sort_index()
              B     C
        0   NaN   4.0
        1   0.1  12.0
        2   2.0  24.0
        3  10.0   1.0

        On a Series, it works as follows.

        >>> df.B.groupby(df.A).cumprod().sort_index()
        0     NaN
        1     0.1
        2     2.0
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cumprod(True, part_cols=sg._groupkeys_scols), should_resolve=True
        ) 
Developer: databricks, Project: koalas, Lines: 49, Source: groupby.py

Example 12: cumsum

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def cumsum(self):
        """
        Cumulative sum for each group.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        Series.cumsum
        DataFrame.cumsum

        Examples
        --------
        >>> df = ks.DataFrame(
        ...     [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],
        ...     columns=list('ABC'))
        >>> df
           A     B  C
        0  1   NaN  4
        1  1   0.1  3
        2  1  20.0  2
        3  4  10.0  1

        By default, iterates over rows and finds the sum in each column.

        >>> df.groupby("A").cumsum().sort_index()
              B  C
        0   NaN  4
        1   0.1  7
        2  20.1  9
        3  10.0  1

        On a Series, it works as follows.

        >>> df.B.groupby(df.A).cumsum().sort_index()
        0     NaN
        1     0.1
        2    20.1
        3    10.0
        Name: B, dtype: float64

        """
        return self._apply_series_op(
            lambda sg: sg._kser._cum(F.sum, True, part_cols=sg._groupkeys_scols),
            should_resolve=True,
        ) 
Developer: databricks, Project: koalas, Lines: 50, Source: groupby.py

Example 13: normalize_keyword_aggregation

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def normalize_keyword_aggregation(kwargs):
    """
    Normalize user-provided kwargs.

    Transforms from the new ``Dict[str, NamedAgg]`` style kwargs
    to the old OrderedDict[str, List[scalar]].

    Parameters
    ----------
    kwargs : dict

    Returns
    -------
    aggspec : dict
        The transformed kwargs.
    columns : List[str]
        The user-provided keys.
    order : List[Tuple[str, str]]
        Pairs of the input and output column names.

    Examples
    --------
    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
    """
    # This is due to a Python version issue (dicts are not insertion-ordered before 3.6); impact on Koalas is unclear.
    PY36 = sys.version_info >= (3, 6)
    if not PY36:
        kwargs = OrderedDict(sorted(kwargs.items()))

    # TODO(Py35): When we drop python 3.5, change this to defaultdict(list)
    aggspec = OrderedDict()
    order = []
    columns, pairs = list(zip(*kwargs.items()))

    for column, aggfunc in pairs:
        if column in aggspec:
            aggspec[column].append(aggfunc)
        else:
            aggspec[column] = [aggfunc]

        order.append((column, aggfunc))
    # For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be
    # flattened to ('y', 'A', 'max'), it won't do anything on normal Index.
    if isinstance(order[0][0], tuple):
        order = [(*levs, method) for levs, method in order]
    return aggspec, columns, order 
Developer: databricks, Project: koalas, Lines: 49, Source: groupby.py

Example 14: aggregate_addons

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def aggregate_addons(df):
    """
    Aggregates add-on indicators by client, channel, version and locale.
    The result is a DataFrame with the additional aggregate columns:

    n_self_installed_addons (int)
    n_shield_addons (int)
    n_foreign_installed_addons (int)
    n_system_addons (int)
    n_web_extensions (int)
    first_addon_install_date (str %Y%m%d)
    profile_creation_date (str %Y%m%d)

    for each of the above facets.

    :param df: an exploded instance of main_summary by active_addons
               with various additional indicator columns
    :return SparkDF: an aggregated dataset with each of the above columns
    """
    addon_aggregates = (
        df.distinct()
        .groupBy("client_id", "normalized_channel", "app_version", "locale")
        .agg(
            fun.sum("is_self_install").alias("n_self_installed_addons"),
            fun.sum("is_shield_addon").alias("n_shield_addons"),
            fun.sum("is_foreign_install").alias("n_foreign_installed_addons"),
            fun.sum("is_system").alias("n_system_addons"),
            fun.sum("is_web_extension").alias("n_web_extensions"),
            fun.min(
                fun.when(
                    df.is_self_install == 1,
                    fun.date_format(
                        fun.from_unixtime(fun.col("install_day") * 60 * 60 * 24),
                        "yyyyMMdd",
                    ),
                ).otherwise(None)
            ).alias("first_addon_install_date"),
            fun.date_format(
                fun.from_unixtime(fun.min("profile_creation_date") * 60 * 60 * 24),
                "yyyyMMdd",
            ).alias("profile_creation_date"),
        )
    )
    return addon_aggregates 
Developer: mozilla, Project: python_mozetl, Lines: 46, Source: addon_aggregates.py
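
aggregate_addons assumes the exploded main_summary rows already carry the 0/1 indicator columns it sums. A miniature, hypothetical input sketch follows; the column values, the in-memory DataFrame, and the spark session variable are assumptions for illustration only.

# Hypothetical miniature input for aggregate_addons (schema and values assumed)
from pyspark.sql import Row

tiny = spark.createDataFrame([
    Row(client_id="c1", normalized_channel="release", app_version="70.0", locale="en-US",
        is_self_install=1, is_shield_addon=0, is_foreign_install=0,
        is_system=0, is_web_extension=1,
        install_day=18000, profile_creation_date=17900),
])
aggregate_addons(tiny).show()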

Example 15: transform

# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import sum [as alias]
def transform(spark):
    """Create the bookmark problem and summary tables."""

    query = """
    SELECT s.app_build_id,
           s.app_version,
           s.app_display_version,
           s.app_name,
           s.app_channel,
           s.uid,
           s.device_id AS device_id,
           s.submission_date_s3 AS submission_day,
           date_format(from_unixtime(s.when / 1000), 'YYYYMMdd') AS sync_day,
           s.when,
           s.status,
           e.name AS engine_name,
           e.status AS engine_status,
           e.failure_reason AS engine_failure_reason,
           e.validation.problems IS NOT NULL AS engine_has_problems,
           e.validation.version AS engine_validation_version,
           e.validation.checked AS engine_validation_checked,
           e.validation.took AS engine_validation_took,
           p.name AS engine_validation_problem_name,
           p.count AS engine_validation_problem_count
    FROM sync_summary s
    LATERAL VIEW explode(s.engines) AS e
    LATERAL VIEW OUTER explode(e.validation.problems) AS p
    WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )

    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day") 
Developer: mozilla, Project: python_mozetl, Lines: 56, Source: bookmark_validation.py


Note: The pyspark.sql.functions.sum examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors. Please follow each project's License when distributing or using the code, and do not reproduce without permission.