This article collects typical usage examples of the pyspark.sql.functions.max method in Python. If you have been wondering how functions.max is used in practice, what it is for, or what real code calling it looks like, the curated examples below should help. You can also explore further usage examples for the containing module, pyspark.sql.functions.
The following shows 15 code examples of functions.max, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: compile_aggregator
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def compile_aggregator(t, expr, scope, fn, context=None, **kwargs):
    op = expr.op()
    src_col = t.translate(op.arg, scope)

    if getattr(op, 'where', None) is not None:
        condition = t.translate(op.where, scope)
        src_col = F.when(condition, src_col)

    col = fn(src_col)
    if context is None:
        # We are trying to compile an expr such as some_col.max()
        # to a Spark expression.
        # Here we get the root table df of that column and compile
        # the expr to:
        # df.select(max(some_col))
        return t.translate(expr.op().arg.op().table, scope).select(col)
    elif context == AggregationContext.WINDOW:
        window = kwargs['window']
        return col.over(window)
    else:
        return col
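The F.when(condition, src_col) trick above is how a filtered aggregate is expressed without an explicit WHERE clause: rows that fail the condition become NULL, and F.max simply ignores NULLs. A minimal, self-contained sketch of that pattern (the DataFrame and column names are invented for illustration, not taken from the project above):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 5), ("b", 3)], ["key", "value"])

# max(value) restricted to rows where key == 'a'; unmatched rows become NULL
filtered_max = df.select(F.max(F.when(F.col("key") == "a", F.col("value"))).alias("max_a"))
filtered_max.show()  # expected single value: 5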
Example 2: compile_notany
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def compile_notany(t, expr, scope, *, context=None, window=None, **kwargs):
    # The code here is a little ugly because the translation differs
    # depending on context.
    # When translating col.notany() (context is None), we return the dataframe,
    # so we need to negate the aggregator, i.e., df.select(~F.max(col))
    # When translating col.notany().over(w), we need to negate the result
    # after the window translation, i.e., ~(F.max(col).over(w))
    if context is None:

        def fn(col):
            return ~(F.max(col))

        return compile_aggregator(t, expr, scope, fn, context, **kwargs)
    else:
        return ~compile_any(
            t, expr, scope, context=context, window=window, **kwargs
        )
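As a side note on why F.max doubles as an "any" aggregate here: in Spark SQL, True sorts above False, so the max of a boolean column is True exactly when some row is True, and negating it gives "not any". A small illustration (column name is made up):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(True,), (False,), (False,)], ["flag"])

df.select(
    F.max("flag").alias("any_flag"),         # True if any row is True
    (~F.max("flag")).alias("not_any_flag"),  # the negated aggregate, as in compile_notany
).show()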
Example 3: get_latest_dataframe_id
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def get_latest_dataframe_id(dataframe_metadata_df):
    """ Get the dataframe id of the dataframe on which the model has been trained.

        Args:
            dataframe_metadata_df (dataframe): Refer to listenbrainz_spark.schema.dataframe_metadata_schema

        Returns:
            dataframe id
    """
    # get timestamp of the most recently saved dataframe.
    timestamp = dataframe_metadata_df.select(func.max('dataframe_created').alias('recent_dataframe_timestamp')).take(1)[0]
    # get dataframe id corresponding to the most recent timestamp.
    df = dataframe_metadata_df.select('dataframe_id') \
                              .where(func.col('dataframe_created') == timestamp.recent_dataframe_timestamp).take(1)[0]
    return df.dataframe_id
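The two-step lookup above (aggregate the maximum timestamp, then filter on it) can be tried on a toy frame; the rows and ids below are hypothetical and only mirror the shape of dataframe_metadata_df:

from datetime import datetime
from pyspark.sql import SparkSession, functions as func

spark = SparkSession.builder.getOrCreate()
dataframe_metadata_df = spark.createDataFrame(
    [("df-1", datetime(2020, 1, 1)), ("df-2", datetime(2020, 6, 1))],
    ["dataframe_id", "dataframe_created"],
)

timestamp = dataframe_metadata_df.select(
    func.max("dataframe_created").alias("recent_dataframe_timestamp")
).take(1)[0]
latest = dataframe_metadata_df.select("dataframe_id") \
    .where(func.col("dataframe_created") == timestamp.recent_dataframe_timestamp).take(1)[0]
print(latest.dataframe_id)  # expected: 'df-2'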
Example 4: get_most_recent_model_id
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def get_most_recent_model_id():
    """ Get the model id of the most recently created model.

        Returns:
            model_id (str): Model identification string.
    """
    try:
        model_metadata = utils.read_files_from_HDFS(path.MODEL_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    latest_ts = model_metadata.select(func.max('model_created').alias('model_created')).take(1)[0].model_created
    model_id = model_metadata.select('model_id') \
                             .where(col('model_created') == latest_ts).take(1)[0].model_id

    return model_id
Example 5: get_dates_to_generate_candidate_sets
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def get_dates_to_generate_candidate_sets(mapped_df, recommendation_generation_window):
    """ Get the date window of listens to fetch for generating candidate sets.

        Args:
            mapped_df (dataframe): listens mapped with msid_mbid_mapping. Refer to candidate_sets.py
                                   for dataframe columns.
            recommendation_generation_window (int): number of days of listening history to use when
                                                    generating recommendations.

        Returns:
            from_date (datetime): date from which to start fetching listens.
            to_date (datetime): date up to which to fetch listens.
    """
    # get timestamp of latest listen in HDFS
    to_date = mapped_df.select(func.max('listened_at').alias('listened_at')).collect()[0].listened_at
    from_date = stats.adjust_days(to_date, recommendation_generation_window).replace(hour=0, minute=0, second=0)
    return from_date, to_date
Example 6: is_multi_agg_with_relabel
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def is_multi_agg_with_relabel(**kwargs):
    """
    Check whether the kwargs passed to .agg look like multi-agg with relabeling.

    Parameters
    ----------
    **kwargs : dict

    Returns
    -------
    bool

    Examples
    --------
    >>> is_multi_agg_with_relabel(a='max')
    False
    >>> is_multi_agg_with_relabel(a_max=('a', 'max'),
    ...                           a_min=('a', 'min'))
    True
    >>> is_multi_agg_with_relabel()
    False
    """
    if not kwargs:
        return False
    return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values())
Example 7: stats
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def stats(self, columns):
    """Compute the stats for each column provided in columns.

    Parameters
    ----------
    columns : list of str, contains all columns to compute stats on.
    """
    assert (not isinstance(columns, basestring)), "columns should be a " \
                                                  "list of strs, " \
                                                  "not a str!"
    assert isinstance(columns, list), "columns should be a list!"

    from pyspark.sql import functions as F
    functions = [F.min, F.max, F.avg, F.count]
    aggs = list(
        self._flatmap(lambda column: map(lambda f: f(column), functions),
                      columns))
    return PStats(self.from_schema_rdd(self._schema_rdd.agg(*aggs)))
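The same idea, one aggregate expression per function/column pair evaluated in a single agg call, can be written directly against a plain PySpark DataFrame; the names below are illustrative only:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (2, 30.0), (3, 20.0)], ["id", "score"])

functions = [F.min, F.max, F.avg, F.count]
columns = ["id", "score"]
aggs = [f(c) for c in columns for f in functions]

df.agg(*aggs).show()  # min/max/avg/count for both columns in a single pass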
Example 8: max
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def max(self):
    """Compute the max for each group."""
    if self._can_use_new_school():
        self._prep_spark_sql_groupby()
        import pyspark.sql.functions as func
        return self._use_aggregation(func.max)
    self._prep_pandas_groupby()
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        return x.groupby(*myargs, **mykwargs).max()

    def merge_value(x, y):
        return x.append(create_combiner(y)).max()

    def merge_combiner(x, y):
        return x.append(y).max(level=0)

    rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner,
        merge_value,
        merge_combiner)).values()
    return DataFrame.fromDataFrameRDD(rddOfMax, self.sql_ctx)
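For orientation, the "new school" branch above delegates the work to Spark SQL's own grouped aggregation. In plain PySpark, a grouped max looks like the following sketch; the column names are invented and this is only an approximation of what _use_aggregation(func.max) performs under the hood:

from pyspark.sql import SparkSession
import pyspark.sql.functions as func

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 4), ("b", 2)], ["key", "value"])

df.groupBy("key").agg(func.max("value").alias("max_value")).show()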
Example 9: to_pandas
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def to_pandas(self, kind='hist'):
    """Returns a pandas dataframe from the Histogram object.

    This function computes the histogram in Spark if it has not been computed yet.

    Args:
        :kind: (:obj:`str`, optional):
            'hist' or 'density'. When using 'hist' this returns the histogram
            as a pandas dataframe. When using 'density' the index contains the bin centers, and the values
            in the dataframe are the scaled values. Defaults to 'hist'.

    Returns:
        A pandas DataFrame from the Histogram object.
    """
    self.build()
    if kind == 'hist':
        return pd.DataFrame(self.hist_dict).set_index([self._get_col_names()])
    elif kind == 'density':
        result = pd.DataFrame(self.hist_dict).set_index([self._get_bin_centers()])
        return result.apply(lambda x: x / x.max(), axis=0)
Example 10: compile_any
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def compile_any(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.max, context, **kwargs)
Example 11: compile_max
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def compile_max(t, expr, scope, context=None, **kwargs):
    return compile_aggregator(t, expr, scope, F.max, context, **kwargs)
Example 12: test_aggregation
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def test_aggregation(client):
    import pyspark.sql.functions as F

    table = client.table('basic_table')
    result = table.aggregate(table['id'].max()).compile()
    expected = table.compile().agg(F.max('id').alias('max'))

    tm.assert_frame_equal(result.toPandas(), expected.toPandas())
Example 13: make_daily_temperature_highs
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def make_daily_temperature_highs(_, weather_samples: DataFrame) -> DataFrame:
    '''Computes the temperature high for each day'''
    valid_date = f.to_date(weather_samples['valid']).alias('valid_date')
    return weather_samples.groupBy(valid_date).agg(f.max('tmpf').alias('max_tmpf'))
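A toy invocation of make_daily_temperature_highs; the weather rows below are invented, and the leading `_` parameter, which the function ignores, is simply passed as None:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
weather_samples = spark.createDataFrame(
    [("2021-07-01 09:00:00", 71.0),
     ("2021-07-01 15:00:00", 88.5),
     ("2021-07-02 14:00:00", 80.2)],
    ["valid", "tmpf"],
)

make_daily_temperature_highs(None, weather_samples).show()
# expected: one row per valid_date with that day's max_tmpf (88.5 and 80.2)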
Example 14: max
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def max(self):
    def max(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.max(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(max)
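Stripped of the min_periods guard, the rolling max above is just F.max over a row-based window frame. A minimal illustration of that core, with a three-row trailing window chosen arbitrarily:

from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 3), (2, 1), (3, 5), (4, 2)], ["ts", "value"])

w = Window.orderBy("ts").rowsBetween(-2, Window.currentRow)
df.withColumn("rolling_max", F.max("value").over(w)).show()
# each row gets the max of itself and the two preceding rows (ordered by ts)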
Example 15: min
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import max [as alias]
def min(self):
    """
    Return the minimum value of the Index.

    Returns
    -------
    scalar
        Minimum value.

    See Also
    --------
    Index.max : Return the maximum value of the object.
    Series.min : Return the minimum value in a Series.
    DataFrame.min : Return the minimum values in a DataFrame.

    Examples
    --------
    >>> idx = ks.Index([3, 2, 1])
    >>> idx.min()
    1

    >>> idx = ks.Index(['c', 'b', 'a'])
    >>> idx.min()
    'a'

    For a MultiIndex, the minimum is determined lexicographically.

    >>> idx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])
    >>> idx.min()
    ('a', 'x', 1)
    """
    sdf = self._internal.spark_frame
    min_row = sdf.select(F.min(F.struct(self._internal.index_spark_columns))).head()
    result = tuple(min_row[0])
    return result if len(result) > 1 else result[0]
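The F.min(F.struct(...)) call is what makes the MultiIndex case lexicographic: structs compare field by field, so the minimum struct is the tuple-wise smallest row. A small stand-alone illustration with made-up columns:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("b", 1), ("a", 9), ("a", 2)], ["lvl1", "lvl2"])

min_row = sdf.select(F.min(F.struct("lvl1", "lvl2"))).head()
print(tuple(min_row[0]))  # expected: ('a', 2)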