This article collects typical usage examples of the Python method pyspark.sql.functions.row_number: what functions.row_number does, how to call it, and what it looks like in real projects. The curated code examples below should help, and you can also explore the containing module, pyspark.sql.functions, for further usage examples.
The following presents 15 code examples of functions.row_number, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code examples.
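Before the individual examples, here is a minimal, hedged sketch (not taken from any of the projects below; the column names are invented) of the pattern most of them share: numbering rows within a partition with row_number() and keeping only the top-k rows per group.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("alice", "artist_a", 3), ("alice", "artist_b", 7), ("bob", "artist_a", 5)],
    ["user", "artist", "listens"],
)

# Number rows per user, highest listen count first, then keep the single top row.
w = Window.partitionBy("user").orderBy(F.col("listens").desc())
df.withColumn("rank", F.row_number().over(w)).where(F.col("rank") <= 1).show()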
Example 1: get_top_artists
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def get_top_artists(mapped_listens_subset, top_artist_limit):
    """ Get top artists listened to by users who have a listening history in
        the past X days where X = RECOMMENDATION_GENERATION_WINDOW.

        Args:
            mapped_listens_subset (dataframe): A subset of mapped_df containing user history.
            top_artist_limit (int): number of top artists to calculate.

        Returns:
            top_artists_df (dataframe): Top Y artists listened to by a user for all users where
                Y = TOP_ARTISTS_LIMIT.
    """
    df = mapped_listens_subset.select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .groupBy('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .agg(func.count('mb_artist_credit_id').alias('count'))

    window = Window.partitionBy('user_name').orderBy(col('count').desc())

    top_artists_df = df.withColumn('rank', row_number().over(window)) \
                       .where(col('rank') <= top_artist_limit) \
                       .select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name')

    return top_artists_df
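A quick, hedged way to exercise get_top_artists with a hand-built frame. Only the three columns the function actually touches are included; the real mapped_listens_subset has more, and the function's module is assumed to already import func, col, row_number and Window as hinted above.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
mapped_listens_subset = spark.createDataFrame(
    [(1, "artist a", "rob"), (1, "artist a", "rob"), (2, "artist b", "rob")],
    ["mb_artist_credit_id", "msb_artist_credit_name_matchable", "user_name"],
)
# With top_artist_limit=1, only the most-listened artist per user survives.
get_top_artists(mapped_listens_subset, top_artist_limit=1).show()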
Example 2: _get_relevant_items_by_timestamp
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=PREDICTION_COL,
    k=DEFAULT_K
):
    """Get relevant items for each customer, defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example 3: compile_row_number
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def compile_row_number(t, expr, scope, *, window, **kwargs):
    return F.row_number().over(window).cast('long') - 1

# -------------------------- Temporal Operations ----------------------------
# Ibis value to PySpark value
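row_number() is 1-based, so the compiler above subtracts one to match Ibis' 0-based row numbering. The same trick in plain PySpark, as a hedged stand-alone sketch (toy DataFrame, invented column names):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(10,), (20,), (30,)], ["id"])

# row_number() starts at 1; the trailing `- 1` turns it into a 0-based index.
df.withColumn("idx", F.row_number().over(Window.orderBy("id")).cast("long") - 1).show()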
Example 4: sum
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def sum(self):
    def sum(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.sum(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(sum)
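Examples 4 through 8 (and example 14 further down) all use the same Koalas pattern: the expanding aggregate is only emitted once at least self._min_periods rows have accumulated, which is checked by comparing row_number() over an unbounded window with min_periods. Roughly the same idea in plain PySpark, as a hedged sketch that does not reproduce the Koalas internals (window specs and column names are assumptions):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 10.0)], ["ord", "val"])

min_periods = 2
expanding = Window.orderBy("ord").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# The expanding sum stays null until at least min_periods rows have been seen.
expanding_sum = F.when(
    F.row_number().over(Window.orderBy("ord")) >= min_periods,
    F.sum("val").over(expanding),
).otherwise(F.lit(None))
sdf.withColumn("expanding_sum", expanding_sum).show()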
Example 5: min
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def min(self):
    def min(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.min(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(min)
Example 6: max
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def max(self):
    def max(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.max(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(max)
Example 7: mean
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def mean(self):
    def mean(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.mean(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(mean)
Example 8: std
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def std(self):
    def std(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.stddev(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(std)
Example 9: attach_sequence_column
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def attach_sequence_column(sdf, column_name):
    scols = [scol_for(sdf, column) for column in sdf.columns]
    sequential_index = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
    return sdf.select(sequential_index.alias(column_name), *scols)
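attach_sequence_column uses F.monotonically_increasing_id() only to define a stable ordering; the visible, gap-free index comes from row_number() - 1. A hedged stand-alone equivalent without the Koalas scol_for helper (the global, unpartitioned window pulls all rows to one partition, so this only suits small frames):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("x",), ("y",), ("z",)], ["value"])

# 0-based, gap-free sequence column prepended to the existing columns.
seq = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
sdf.select(seq.alias("sequence"), *sdf.columns).show()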
Example 10: _get_top_k_items
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=PREDICTION_COL,
    k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for ratings of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
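As the inline comment warns, row_number() breaks ties arbitrarily, so when two items share the same rating only one of them is guaranteed to survive the top-k cut, and which one is not deterministic. If ties matter, F.rank() (or an extra tie-breaking sort column) is a common alternative; a small hedged comparison on invented data:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [("u1", "i1", 5.0), ("u1", "i2", 5.0), ("u1", "i3", 3.0)],
    ["user", "item", "rating"],
)
w = Window.partitionBy("user").orderBy(F.col("rating").desc())

# row_number: exactly one of the tied items gets 1 (which one is arbitrary).
# rank: both tied items get 1, so a `rank <= 1` filter keeps them both.
ratings.select(
    "user", "item", "rating",
    F.row_number().over(w).alias("row_number"),
    F.rank().over(w).alias("rank"),
).show()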
Example 11: transform
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example 12: ndcg
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name containing the wiki identifier used to group results
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Map from wiki to its mean ndcg@k value, each value always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
Example 13: get_top_similar_artists
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def get_top_similar_artists(top_artists_df, artists_relation_df, similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artists_df: Dataframe containing top artists listened to by users.
            artists_relation_df: Dataframe containing artists and similar artists.
                For columns refer to artist_relation_schema in listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate.

        Returns:
            top_similar_artists_df (dataframe): Top Z artists similar to top artists where
                Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_0]

    df1 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_0').alias('top_artist_credit_id'),
                                col('name_0').alias('top_artist_name'),
                                col('id_1').alias('similar_artist_credit_id'),
                                col('name_1').alias('similar_artist_name'),
                                'score',
                                'user_name')

    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_1]

    df2 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_1').alias('top_artist_credit_id'),
                                col('name_1').alias('top_artist_name'),
                                col('id_0').alias('similar_artist_credit_id'),
                                col('name_0').alias('similar_artist_name'),
                                'score',
                                'user_name')

    similar_artists_df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name') \
                   .orderBy(col('score').desc())

    top_similar_artists_df = similar_artists_df.withColumn('rank', row_number().over(window)) \
                                               .where(col('rank') <= similar_artist_limit) \
                                               .select('top_artist_credit_id', 'top_artist_name',
                                                       'similar_artist_credit_id', 'similar_artist_name',
                                                       'score', 'user_name')

    return top_similar_artists_df
Example 14: count
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def count(self):
    """
    The expanding count of any non-NaN observations inside the window.

    .. note:: the current implementation of this API uses Spark's Window without
        specifying partition specification. This moves all data into a single
        partition on a single machine and could cause serious performance
        degradation. Avoid this method against very large datasets.

    Returns
    -------
    Series or DataFrame
        Returned object type is determined by the caller of the expanding
        calculation.

    See Also
    --------
    Series.expanding : Calling object with Series data.
    DataFrame.expanding : Calling object with DataFrames.
    Series.count : Count of the full Series.
    DataFrame.count : Count of the full DataFrame.

    Examples
    --------
    >>> s = ks.Series([2, 3, float("nan"), 10])

    >>> s.expanding().count()
    0    1.0
    1    2.0
    2    2.0
    3    3.0
    Name: 0, dtype: float64

    >>> s.to_frame().expanding().count()
         0
    0  1.0
    1  2.0
    2  2.0
    3  3.0
    """
    def count(scol):
        return F.when(
            F.row_number().over(self._unbounded_window) >= self._min_periods,
            F.count(scol).over(self._window),
        ).otherwise(F.lit(None))

    return self._apply_as_series_or_frame(count).astype("float64")
Example 15: nsmallest
# Required module import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def nsmallest(self, n=5):
    """
    Return the first n rows ordered by columns in ascending order in group.

    Return the first n rows with the smallest values in columns, in ascending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nsmallest
    databricks.koalas.DataFrame.nsmallest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  0    1
    2  3    2
    3  6    3
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nsmallest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))
# TODO: add keep parameter