This article collects typical usage examples of the Python method pyspark.sql.functions.countDistinct. If you have been wondering what functions.countDistinct does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples from the pyspark.sql.functions module.
The sections below present 6 code examples of functions.countDistinct, ordered by popularity by default.
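Before diving in, here is a minimal, self-contained sketch of what countDistinct computes; the DataFrame and column names are made up for illustration. It counts the distinct non-null values of a column, or distinct combinations when given several columns.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 2), ("b", None)], ["key", "value"])

# countDistinct ignores nulls; with several arguments it counts distinct combinations.
df.select(
    F.countDistinct("key").alias("distinct_keys"),            # 2
    F.countDistinct("key", "value").alias("distinct_pairs"),  # 3 (the row with a null is skipped)
).show()

The later sketches in this article reuse this spark session and the F alias.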
Example 1: has_duplicates

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    sdf = self._internal.spark_frame.select(self.spark.column)
    scol = scol_for(sdf, sdf.columns[0])
    return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
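The trick here is a single select that compares the plain count against the distinct count of the same column. A minimal plain-PySpark sketch of the same comparison, on a made-up DataFrame instead of the internal Koalas frame (reuses spark and F from the intro sketch):

sdf = spark.createDataFrame([("a",), ("a",), ("c",)], ["idx"])
has_dups = sdf.select(F.count("idx") != F.countDistinct("idx")).first()[0]
print(has_dups)  # True: 3 rows but only 2 distinct values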
Example 2: levshape

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def levshape(self):
    """
    A tuple with the length of each level.

    Examples
    --------
    >>> midx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    >>> midx  # doctest: +SKIP
    MultiIndex([('a', 'x'),
                ('b', 'y'),
                ('c', 'z')],
               )

    >>> midx.levshape
    (3, 3)
    """
    result = self._internal.spark_frame.agg(
        *(F.countDistinct(c) for c in self._internal.index_spark_columns)
    ).collect()[0]
    return tuple(result)
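levshape issues one countDistinct per index column inside a single agg, so the lengths of all levels come back in one row. Roughly the same thing on a plain DataFrame with two hypothetical index columns (reuses spark and F from the intro sketch):

sdf = spark.createDataFrame([("a", "x"), ("b", "y"), ("c", "z")], ["level_0", "level_1"])
row = sdf.agg(*(F.countDistinct(c) for c in sdf.columns)).collect()[0]
print(tuple(row))  # (3, 3): each level has three distinct values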
Example 3: _nunique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
# Also used: from functools import partial
def _nunique(self, dropna=True, approx=False, rsd=0.05):
    colname = self._internal.data_spark_column_names[0]
    count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
    if dropna:
        return count_fn(self.spark.column).alias(colname)
    else:
        return (
            count_fn(self.spark.column)
            + F.when(
                F.count(F.when(self.spark.column.isNull(), 1).otherwise(None)) >= 1, 1
            ).otherwise(0)
        ).alias(colname)
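With dropna=False the helper adds a null correction: countDistinct ignores nulls, so the expression adds 1 whenever at least one null is present (the approx branch simply swaps in F.approx_count_distinct with the given rsd). A sketch of the corrected expression on a toy column; the data and column name are illustrative (reuses spark and F):

sdf = spark.createDataFrame([(1,), (2,), (2,), (None,)], "v int")
nunique_keep_nulls = (
    F.countDistinct("v")
    + F.when(F.count(F.when(F.col("v").isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
)
sdf.select(nunique_keep_nulls.alias("nunique")).show()  # 3: the values 1 and 2, plus the null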
Example 4: is_unique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def is_unique(self):
    """
    Return boolean if values in the object are unique.

    Returns
    -------
    is_unique : boolean

    >>> ks.Series([1, 2, 3]).is_unique
    True
    >>> ks.Series([1, 2, 2]).is_unique
    False
    >>> ks.Series([1, 2, 3, None]).is_unique
    True
    """
    scol = self.spark.column

    # Here we check two things:
    #   1. the count of non-null values equals their distinct count, and
    #   2. there is at most one null value, so null contributes at most one distinct value.
    #
    # This workaround computes the distinct count including nulls in a single pass.
    # Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.
    return self._internal.spark_frame.select(
        (F.count(scol) == F.countDistinct(scol))
        & (F.count(F.when(scol.isNull(), 1).otherwise(None)) <= 1)
    ).collect()[0][0]
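The same single-pass idea works on any Spark column: the non-null values are unique when their count equals their distinct count, and nulls stay unique as long as there is at most one of them. A toy sketch (reuses spark and F):

sdf = spark.createDataFrame([(1,), (2,), (None,)], "v int")
col = F.col("v")
is_unique = sdf.select(
    (F.count(col) == F.countDistinct(col))
    & (F.count(F.when(col.isNull(), 1).otherwise(None)) <= 1)
).collect()[0][0]
print(is_unique)  # True: distinct non-null values and at most one null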
Example 5: nunique

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def nunique(self, dropna=True):
    """
    Return DataFrame with number of distinct observations per group for each column.

    Parameters
    ----------
    dropna : boolean, default True
        Don’t include NaN in the counts.

    Returns
    -------
    nunique : DataFrame

    Examples
    --------
    >>> df = ks.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
    ...                           'ham', 'ham'],
    ...                    'value1': [1, 5, 5, 2, 5, 5],
    ...                    'value2': list('abbaxy')}, columns=['id', 'value1', 'value2'])
    >>> df
         id  value1 value2
    0  spam       1      a
    1   egg       5      b
    2   egg       5      b
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y

    >>> df.groupby('id').nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
          id  value1  value2
    id
    egg    1       1       1
    ham    1       1       2
    spam   1       2       1

    >>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
    id
    egg     1
    ham     1
    spam    2
    Name: value1, dtype: int64
    """
    if dropna:
        stat_function = lambda col: F.countDistinct(col)
    else:
        stat_function = lambda col: (
            F.countDistinct(col)
            + F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
        )

    should_include_groupkeys = isinstance(self, DataFrameGroupBy)
    return self._reduce_for_stat_function(
        stat_function, only_numeric=False, should_include_groupkeys=should_include_groupkeys
    )
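For the default dropna=True path this reduces to a groupBy with one countDistinct per value column. A plain-PySpark sketch that reproduces the df.groupby('id').nunique() numbers from the docstring, apart from the id column that Koalas also counts (reuses spark and F):

pdf = spark.createDataFrame(
    [("spam", 1, "a"), ("egg", 5, "b"), ("egg", 5, "b"),
     ("spam", 2, "a"), ("ham", 5, "x"), ("ham", 5, "y")],
    ["id", "value1", "value2"],
)
pdf.groupBy("id").agg(
    F.countDistinct("value1").alias("value1"),
    F.countDistinct("value2").alias("value2"),
).orderBy("id").show()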
Example 6: transform

# Required import: from pyspark.sql import functions [as F]
# Or: from pyspark.sql.functions import countDistinct [as alias]
def transform(spark):
    """Create the bookmark problem and summary tables."""
    query = """
        SELECT s.app_build_id,
               s.app_version,
               s.app_display_version,
               s.app_name,
               s.app_channel,
               s.uid,
               s.device_id AS device_id,
               s.submission_date_s3 AS submission_day,
               date_format(from_unixtime(s.when / 1000), 'YYYYMMdd') AS sync_day,
               s.when,
               s.status,
               e.name AS engine_name,
               e.status AS engine_status,
               e.failure_reason AS engine_failure_reason,
               e.validation.problems IS NOT NULL AS engine_has_problems,
               e.validation.version AS engine_validation_version,
               e.validation.checked AS engine_validation_checked,
               e.validation.took AS engine_validation_took,
               p.name AS engine_validation_problem_name,
               p.count AS engine_validation_problem_count
        FROM sync_summary s
        LATERAL VIEW explode(s.engines) AS e
        LATERAL VIEW OUTER explode(e.validation.problems) AS p
        WHERE s.failure_reason IS NULL
    """
    engine_validations = spark.sql(query)

    bookmark_validations = engine_validations.where(
        F.col("engine_name").isin("bookmarks", "bookmarks-buffered")
    )
    bookmark_validation_problems = bookmark_validations.where(
        F.col("engine_has_problems")
    )

    # Generate aggregates over all bookmarks
    bookmark_aggregates = (
        bookmark_validations.where(F.col("engine_validation_checked").isNotNull())
        # see bug 1410963 for submission date vs sync date
        .groupBy("submission_day").agg(
            F.countDistinct("uid", "device_id", "when").alias(
                "total_bookmark_validations"
            ),
            F.countDistinct("uid").alias("total_validated_users"),
            F.sum("engine_validation_checked").alias("total_bookmarks_checked"),
        )
    )

    bookmark_validation_problems.createOrReplaceTempView("bmk_validation_problems")
    bookmark_aggregates.createOrReplaceTempView("bmk_total_per_day")
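In the aggregation above, countDistinct("uid", "device_id", "when") counts distinct (uid, device_id, when) combinations, i.e. one per validation run, while countDistinct("uid") counts distinct users. A toy illustration with made-up sync rows, not real telemetry (reuses spark and F):

rows = [("u1", "d1", 10), ("u1", "d1", 10), ("u1", "d2", 20), ("u2", "d1", 30)]
toy = spark.createDataFrame(rows, ["uid", "device_id", "when"])
toy.agg(
    F.countDistinct("uid", "device_id", "when").alias("total_bookmark_validations"),  # 3
    F.countDistinct("uid").alias("total_validated_users"),                            # 2
).show()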