This article collects typical usage examples of the Python method pyspark.sql.functions.mean. If you are wondering exactly how functions.mean works, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples for the module that contains this method, pyspark.sql.functions.
The following shows 14 code examples of functions.mean, sorted by popularity by default.
Example 1: test_window
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def test_window(client):
    import pyspark.sql.functions as F
    from pyspark.sql.window import Window

    table = client.table('basic_table')
    w = ibis.window()
    result = table.mutate(
        grouped_demeaned=table['id'] - table['id'].mean().over(w)
    ).compile()

    spark_window = Window.partitionBy()
    spark_table = table.compile()
    expected = spark_table.withColumn(
        'grouped_demeaned',
        spark_table['id'] - F.mean(spark_table['id']).over(spark_window),
    )

    tm.assert_frame_equal(result.toPandas(), expected.toPandas())
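For comparison, the same demeaning pattern in plain PySpark, without ibis, looks roughly like the sketch below. The toy DataFrame and local SparkSession are hypothetical and not part of the example above.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["id"])

w = Window.partitionBy()  # one global partition, like the empty ibis.window() above
demeaned = df.withColumn("grouped_demeaned", F.col("id") - F.mean("id").over(w))
demeaned.show()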
Example 2: transform
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def transform(self, dataframe):
    """Applies standardization to the specified columns.

    # Arguments
        dataframe: dataframe. Spark Dataframe.
    """
    # Compute the means of the specified columns.
    means = [mean(x) for x in self.columns]
    means = dataframe.select(means).collect()[0].asDict()
    self.means = self.clean_mean_keys(means)
    # Compute the standard deviation of the specified columns.
    stddevs = [stddev_pop(x) for x in self.columns]
    stddevs = dataframe.select(stddevs).collect()[0].asDict()
    self.stddevs = self.clean_stddev_keys(stddevs)
    # For every feature, add a new column to the dataframe.
    for column in self.columns:
        self.current_column = column
        dataframe = dataframe.rdd.map(self._transform).toDF()

    return dataframe
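The loop above rebuilds the DataFrame through an RDD map once per column. As a rough sketch, the same standardization can be expressed with pure DataFrame operations; the column names below are hypothetical and df stands for any Spark DataFrame containing them.

from pyspark.sql import functions as F

columns = ["x1", "x2"]  # hypothetical feature columns
stats = df.select(
    *[F.mean(c).alias(c + "_mean") for c in columns],
    *[F.stddev_pop(c).alias(c + "_stddev") for c in columns],
).collect()[0].asDict()

for c in columns:
    df = df.withColumn(
        c + "_normalized", (F.col(c) - stats[c + "_mean"]) / stats[c + "_stddev"]
    )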
Example 3: mad
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def mad(self):
    """
    Return the mean absolute deviation of values.

    Examples
    --------
    >>> s = ks.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    Name: 0, dtype: int64

    >>> s.mad()
    1.0
    """
    sdf = self._internal.spark_frame
    spark_column = self.spark.column

    avg = unpack_scalar(sdf.select(F.avg(spark_column)))
    mad = unpack_scalar(sdf.select(F.avg(F.abs(spark_column - avg))))

    return mad
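Outside the koalas internals, the same two-pass mean absolute deviation can be computed directly with pyspark.sql.functions; this is a minimal sketch assuming a DataFrame sdf with a numeric column "v".

from pyspark.sql import functions as F

avg = sdf.select(F.mean("v")).first()[0]
mad = sdf.select(F.mean(F.abs(F.col("v") - F.lit(avg)))).first()[0]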
Example 4: compile_mean
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def compile_mean(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.mean, context, **kwargs)
Example 5: clean_mean_keys
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def clean_mean_keys(self, means):
    """Cleans the keys of the specified dictionary (mean)."""
    new_means = {}
    for k in means:
        new_means[k[4:-1]] = means[k]

    return new_means
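The k[4:-1] slice works because selecting F.mean(col) without an alias produces a key such as "avg(col)"; dropping the leading "avg(" and the trailing ")" recovers the plain column name. A small sketch with a hypothetical "age" column:

from pyspark.sql import functions as F

row = df.select(F.mean("age")).collect()[0].asDict()
print(list(row))           # e.g. ['avg(age)']
print(list(row)[0][4:-1])  # 'age'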
Example 6: _transform
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def _transform(self, row):
    """Take the column, and normalize it with the computed means and std devs."""
    mean = self.means[self.current_column]
    stddev = self.stddevs[self.current_column]
    x = row[self.current_column]
    x_normalized = (x - mean) / stddev
    output_column = self.current_column + self.column_suffix
    new_row = new_dataframe_row(row, output_column, x_normalized)

    return new_row
Example 7: mean
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
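Stripped of the koalas wrappers, the guarded rolling mean corresponds roughly to the plain PySpark below; the DataFrame df with an ordering column "t" and a value column "v" is hypothetical.

from pyspark.sql import functions as F
from pyspark.sql.window import Window

min_periods = 2
unbounded = Window.orderBy("t").rowsBetween(Window.unboundedPreceding, Window.currentRow)
rolling = Window.orderBy("t").rowsBetween(-2, Window.currentRow)  # trailing window of 3 rows

df = df.withColumn(
    "rolling_mean",
    F.when(
        F.row_number().over(unbounded) >= min_periods,  # enough rows seen so far?
        F.mean("v").over(rolling),
    ).otherwise(F.lit(None)),
)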
Example 8: _compute_stats
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def _compute_stats(data, colname, whis, precision):
    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    pdf = data._kdf._internal.resolved_copy.spark_frame.agg(
        *[
            F.expr(
                "approx_percentile({}, {}, {})".format(colname, q, int(1.0 / precision))
            ).alias("{}_{}%".format(colname, int(q * 100)))
            for q in [0.25, 0.50, 0.75]
        ],
        F.mean(colname).alias("{}_mean".format(colname))
    ).toPandas()

    # Computes IQR and Tukey's fences
    iqr = "{}_iqr".format(colname)
    p75 = "{}_75%".format(colname)
    p25 = "{}_25%".format(colname)
    pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25]
    pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr]
    pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr]

    qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"]
    col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]]
    col_summ.columns = qnames
    lfence, ufence = col_summ["lfence"], col_summ["ufence"]

    stats = {
        "mean": col_summ["mean"].values[0],
        "med": col_summ["50%"].values[0],
        "q1": col_summ["25%"].values[0],
        "q3": col_summ["75%"].values[0],
    }

    return stats, (lfence.values[0], ufence.values[0])
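The quartiles-plus-mean aggregation can also be written as one approx_percentile call that returns an array, as in this sketch on a plain DataFrame (the "price" column is hypothetical):

from pyspark.sql import functions as F

summary = df.agg(
    F.expr("approx_percentile(price, array(0.25, 0.5, 0.75), 100)").alias("quartiles"),
    F.mean("price").alias("mean"),
).first()

q1, med, q3 = summary["quartiles"]
iqr = q3 - q1
lfence, ufence = q1 - 1.5 * iqr, q3 + 1.5 * iqr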
Example 9: mean
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.mean)
    self._prep_pandas_groupby()
    return DataFrame.fromDataFrameRDD(
        self._regroup_mergedRDD().values().map(
            lambda x: x.mean()), self.sql_ctx)
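The "new school" branch ultimately reduces to an ordinary grouped aggregation; in plain PySpark that is simply the following (hypothetical "dept" and "salary" columns):

import pyspark.sql.functions as F

df.groupBy("dept").agg(F.mean("salary").alias("mean_salary")).show()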
Example 10: agg_mean
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def agg_mean(field_name, alias=None):
    field_alias = get_alias(field_name, alias, "mean")
    return F.mean(field_name).alias(field_alias)
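A hypothetical call site for this helper, assuming get_alias falls back to a name such as "revenue_mean" when no alias is supplied:

result = df.groupBy("country").agg(
    agg_mean("revenue"),                     # aliased via get_alias, e.g. "revenue_mean"
    agg_mean("clicks", alias="avg_clicks"),  # explicit alias
)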
Example 11: ndcg
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from wiki id to its ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
Example 12: mean
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def mean(self, axis=None, numeric_only=True):
    """
    Return the mean of the values.

    Parameters
    ----------
    axis : {index (0), columns (1)}
        Axis for the function to be applied on.
    numeric_only : bool, default True
        Include only float, int, boolean columns. False is not supported. This parameter
        is mainly for pandas compatibility.

    Returns
    -------
    mean : scalar for a Series, and a Series for a DataFrame.

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
    ...                   columns=['a', 'b'])

    On a DataFrame:

    >>> df.mean()
    a    2.0
    b    0.2
    Name: 0, dtype: float64

    >>> df.mean(axis=1)
    0    0.55
    1    1.10
    2    1.65
    3     NaN
    Name: 0, dtype: float64

    On a Series:

    >>> df['a'].mean()
    2.0
    """
    return self._reduce_for_stat_function(
        F.mean, name="mean", numeric_only=numeric_only, axis=axis
    )
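Under the hood this delegates to F.mean per numeric column; the equivalent aggregate directly on a Spark DataFrame is simply the sketch below (hypothetical columns "a" and "b"):

from pyspark.sql import functions as F

df.select(F.mean("a").alias("a"), F.mean("b").alias("b")).show()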
Example 13: _compute_plot_data
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def _compute_plot_data(self):
    colname = self.data.name
    data = self.data

    # Updates all props with the rc defaults from matplotlib
    self.kwds.update(KoalasBoxPlot.rc_defaults(**self.kwds))

    # Gets some important kwds
    showfliers = self.kwds.get("showfliers", False)
    whis = self.kwds.get("whis", 1.5)
    labels = self.kwds.get("labels", [colname])

    # This one is Koalas specific to control precision for approx_percentile
    precision = self.kwds.get("precision", 0.01)

    # Computes mean, median, Q1 and Q3 with approx_percentile and precision
    col_stats, col_fences = KoalasBoxPlot._compute_stats(data, colname, whis, precision)

    # Creates a column to flag rows as outliers or not
    outliers = KoalasBoxPlot._outliers(data, colname, *col_fences)

    # Computes min and max values of non-outliers - the whiskers
    whiskers = KoalasBoxPlot._calc_whiskers(colname, outliers)

    if showfliers:
        fliers = KoalasBoxPlot._get_fliers(colname, outliers)
    else:
        fliers = []

    # Builds bxpstats dict
    stats = []
    item = {
        "mean": col_stats["mean"],
        "med": col_stats["med"],
        "q1": col_stats["q1"],
        "q3": col_stats["q3"],
        "whislo": whiskers[0],
        "whishi": whiskers[1],
        "fliers": fliers,
        "label": labels[0],
    }
    stats.append(item)

    self.data = {labels[0]: stats}
Example 14: _rank
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import mean [as alias]
def _rank(self, method="average", ascending=True, part_cols=()):
    if method not in ["average", "min", "max", "first", "dense"]:
        msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
        raise ValueError(msg)

    if len(self._internal.index_spark_column_names) > 1:
        raise ValueError("rank do not support index now")

    if ascending:
        asc_func = lambda scol: scol.asc()
    else:
        asc_func = lambda scol: scol.desc()

    if method == "first":
        window = (
            Window.orderBy(
                asc_func(self.spark.column), asc_func(F.col(NATURAL_ORDER_COLUMN_NAME)),
            )
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.row_number().over(window)
    elif method == "dense":
        window = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        scol = F.dense_rank().over(window)
    else:
        if method == "average":
            stat_func = F.mean
        elif method == "min":
            stat_func = F.min
        elif method == "max":
            stat_func = F.max
        window1 = (
            Window.orderBy(asc_func(self.spark.column))
            .partitionBy(*part_cols)
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        )
        window2 = Window.partitionBy([self.spark.column] + list(part_cols)).rowsBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )
        scol = stat_func(F.row_number().over(window1)).over(window2)
    kser = self._with_new_scol(scol).rename(self.name)
    return kser.astype(np.float64)
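The "average" branch composes two windows: row_number assigns distinct positions, and F.mean over a partition keyed by the value collapses ties to the mean of their positions. Written as two explicit steps on a plain DataFrame (a sketch; the column "v" is hypothetical):

from pyspark.sql import functions as F
from pyspark.sql.window import Window

w1 = Window.orderBy(F.col("v").asc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)
w2 = Window.partitionBy("v").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

ranked = (
    df.withColumn("rn", F.row_number().over(w1))      # positional rank, ties broken arbitrarily
      .withColumn("avg_rank", F.mean("rn").over(w2))  # ties collapse to the mean of their positions
)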