This article collects typical usage examples of the Python method pyspark.sql.functions.countDistinct. If you have been wondering what functions.countDistinct does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples from the pyspark.sql.functions module.
The sections below present 6 code examples of functions.countDistinct, ordered by popularity by default.
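Before diving in, here is a minimal, self-contained sketch of what countDistinct computes; the DataFrame and column names are made up for illustration. It counts the distinct non-null values of a column, or distinct combinations when given several columns.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 2), ("b", None)], ["key", "value"])

# countDistinct ignores nulls; with several arguments it counts distinct combinations.
df.select(
    F.countDistinct("key").alias("distinct_keys"),            # 2
    F.countDistinct("key", "value").alias("distinct_pairs"),  # 3 (the row with a null is skipped)
).show()

The later sketches in this article reuse this spark session and the F alias.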
Example 1: has_duplicates

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])
    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
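The trick here is a single select that compares the plain count against the distinct count of the same column. A minimal plain-PySpark sketch of the same comparison, on a made-up DataFrame instead of the internal Koalas frame (reuses spark and F from the intro sketch):

sdf = spark.createDataFrame([("a",), ("a",), ("c",)], ["idx"])
has_dups = sdf.select(F.count("idx") != F.countDistinct("idx")).first()[0]
print(has_dups)  # True: 3 rows but only 2 distinct values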
Example 2: levshape

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def levshape(self):
    """
    A tuple with the length of each level.

    Examples
    --------
    >>> midx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    >>> midx  # doctest: +SKIP
    MultiIndex([('a', 'x'),
                ('b', 'y'),
                ('c', 'z')],
               )

    >>> midx.levshape
    (3, 3)
    """
    result = self._internal.spark_frame.agg(
        *(F.countDistinct(c) for c in self._internal.index_spark_columns)
    ).collect()[0]
    return tuple(result)
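levshape issues one countDistinct per index column inside a single agg, so the lengths of all levels come back in one row. Roughly the same thing on a plain DataFrame with two hypothetical index columns (reuses spark and F from the intro sketch):

sdf = spark.createDataFrame([("a", "x"), ("b", "y"), ("c", "z")], ["level_0", "level_1"])
row = sdf.agg(*(F.countDistinct(c) for c in sdf.columns)).collect()[0]
print(tuple(row))  # (3, 3): each level has three distinct values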
Example 3: _nunique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
# Also used: from functools import partial
def _nunique(self, dropna=True, approx=False, rsd=0.05):
    colname = self._internal.data_spark_column_names[0]
    count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
    if dropna:
        return count_fn(self.spark.column).alias(colname)
    else:
        return (
            count_fn(self.spark.column)
            + F.when(
                F.count(F.when(self.spark.column.isNull(), 1).otherwise(None)) >= 1, 1
            ).otherwise(0)
        ).alias(colname)
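With dropna=False the helper adds a null correction: countDistinct ignores nulls, so the expression adds 1 whenever at least one null is present (the approx branch simply swaps in F.approx_count_distinct with the given rsd). A sketch of the corrected expression on a toy column; the data and column name are illustrative (reuses spark and F):

sdf = spark.createDataFrame([(1,), (2,), (2,), (None,)], "v int")
nunique_keep_nulls = (
    F.countDistinct("v")
    + F.when(F.count(F.when(F.col("v").isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
)
sdf.select(nunique_keep_nulls.alias("nunique")).show()  # 3: the values 1 and 2, plus the null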
Example 4: is_unique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def is_unique(self):
    """
    Return boolean if values in the object are unique.

    Returns
    -------
    is_unique : boolean

    >>> ks.Series([1, 2, 3]).is_unique
    True
    >>> ks.Series([1, 2, 2]).is_unique
    False
    >>> ks.Series([1, 2, 3, None]).is_unique
    True
    """
    scol = self.spark.column

    # Here we check two things:
    #   1. the count of non-null values equals their distinct count, and
    #   2. there is at most one null value, so null contributes at most one distinct value.
    #
    # This workaround computes the distinct count including nulls in a single pass.
    # Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.
    return self._internal.spark_frame.select(
        (F.count(scol) == F.countDistinct(scol))
        & (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1)
    ).collect()[0][0]
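The same single-pass idea works on any Spark column: the non-null values are unique when their count equals their distinct count, and nulls stay unique as long as there is at most one of them. A toy sketch (reuses spark and F):

sdf = spark.createDataFrame([(1,), (2,), (None,)], "v int")
col = F.col("v")
is_unique = sdf.select(
    (F.count(col) == F.countDistinct(col))
    & (F.count(F.when(col.isNull(), 1).otherwise(None)) <= 1)
).collect()[0][0]
print(is_unique)  # True: distinct non-null values and at most one null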
Example 5: nunique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def nunique(self, dropna=True):
    """
    Return DataFrame with number of distinct observations per group for each column.

    Parameters
    ----------
    dropna : boolean, default True
        Don’t include NaN in the counts.

    Returns
    -------
    nunique : DataFrame

    Examples
    --------
    >>> df = ks.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
    ...                           'ham', 'ham'],
    ...                    'value1': [1, 5, 5, 2, 5, 5],
    ...                    'value2': list('abbaxy')}, columns=['id', 'value1', 'value2'])
    >>> df
         id  value1 value2
    0  spam       1      a
    1   egg       5      b
    2   egg       5      b
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y

    >>> df.groupby('id').nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
          id  value1  value2
    id
    egg    1       1       1
    ham    1       1       2
    spam   1       2       1

    >>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
    id
    egg     1
    ham     1
    spam    2
    Name: value1, dtype: int64
    """
    if dropna:
        stat_function = lambda col: F.countDistinct(col)
    else:
        stat_function = lambda col: (
            F.countDistinct(col)
            + F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
        )

    should_include_groupkeys = isinstance(self, DataFrameGroupBy)
    return self._reduce_for_stat_function(
        stat_function, only_numeric=False, should_include_groupkeys=should_include_groupkeys
    )
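For the default dropna=True path this reduces to a groupBy with one countDistinct per value column. A plain-PySpark sketch that reproduces the df.groupby('id').nunique() numbers from the docstring, apart from the id column that Koalas also counts (reuses spark and F):

pdf = spark.createDataFrame(
    [("spam", 1, "a"), ("egg", 5, "b"), ("egg", 5, "b"),
     ("spam", 2, "a"), ("ham", 5, "x"), ("ham", 5, "y")],
    ["id", "value1", "value2"],
)
pdf.groupBy("id").agg(
    F.countDistinct("value1").alias("value1"),
    F.countDistinct("value2").alias("value2"),
).orderBy("id").show()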
Example 6: transform

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def transform(spark):
    """Create the bookmark problem and summary tables."""
    query = """
        SELECT s.app_build_id,
               s.app_version,
               s.app_display_version,
               s.app_name,
               s.app_channel,
               s.uid,
               s.device_id AS device_id,
               s.submission_date_s3 AS submission_day,
               date_format(from_unixtime(s.when / 1000), 'YYYYMMdd') AS sync_day,
               s.when,
               s.status,
               e.name AS engine_name,
               e.status AS engine_status,
               e.failure_reason AS engine_failure_reason,
               e.validation.problems IS NOT NULL AS engine_has_problems,
               e.validation.version AS engine_validation_version,
               e.validation.checked AS engine_validation_checked,
               e.validation.took AS engine_validation_took,
               p.name AS engine_validation_problem_name,
               p.count AS engine_validation_problem_count
        FROM sync_summary s
        LATERAL VIEW explode(s.engines) AS e
        LATERAL VIEW OUTER explode(e.validation.problems) AS p
        WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )
    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day")
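In the aggregation above, countDistinct("uid", "device_id", "when") counts distinct (uid, device_id, when) combinations, i.e. one per validation run, while countDistinct("uid") counts distinct users. A toy illustration with made-up sync rows, not real telemetry (reuses spark and F):

rows = [("u1", "d1", 10), ("u1", "d1", 10), ("u1", "d2", 20), ("u2", "d1", 30)]
toy = spark.createDataFrame(rows, ["uid", "device_id", "when"])
toy.agg(
    F.countDistinct("uid", "device_id", "when").alias("total_bookmark_validations"),  # 3
    F.countDistinct("uid").alias("total_validated_users"),                            # 2
).show()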