This article collects typical usage examples of the Python method pyspark.sql.functions.collect_list. If you are unsure what functions.collect_list does, how to call it, or what it looks like in real code, the curated examples below may help. You can also read further about the enclosing module, pyspark.sql.functions.
The following shows 15 code examples of functions.collect_list, listed by popularity by default.
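Before the examples, here is a minimal, self-contained sketch of what collect_list does. The toy data, column names, and local SparkSession are illustrative only and are not taken from any example below:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("alice", "a"), ("alice", "b"), ("bob", "c")],
    ["user", "item"],
)

# collect_list gathers every value of a group into an array column.
# Unlike collect_set it keeps duplicates, and element order is not guaranteed.
df.groupBy("user").agg(F.collect_list("item").alias("items")).show()
# Expected (row order may vary):
# +-----+------+
# | user| items|
# +-----+------+
# |alice|[a, b]|
# |  bob|   [c]|
# +-----+------+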
Example 1: _calculate_metrics
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def _calculate_metrics(self):
    """Calculate ranking metrics."""
    self._items_for_user_pred = self.rating_pred

    self._items_for_user_true = (
        self.rating_true
        .groupBy(self.col_user)
        .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
        .select(self.col_user, "ground_truth")
    )

    self._items_for_user_all = self._items_for_user_pred.join(
        self._items_for_user_true, on=self.col_user
    ).drop(self.col_user)

    return RankingMetrics(self._items_for_user_all.rdd)
Example 2: _ndcg_at
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but i can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            return 0
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
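The UDF returned here is later applied to two array-of-struct columns built with collect_list (see example 12 below). Worth noting is that such a column arrives inside a Python UDF as a list of Row objects. A minimal self-contained sketch of that calling pattern, using toy data and hypothetical column names:

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("q1", 3, 1), ("q1", 1, 2)], ["query", "label", "rn"])

collected = df.groupBy("query").agg(
    F.collect_list(F.struct("label", "rn")).alias("hits"))

# Inside a Python UDF an array<struct> column arrives as a list of Row
# objects, so fields can be read as row.label or row["label"], exactly as
# ndcg_at_k does above.
first_label = F.udf(
    lambda hits: sorted(hits, key=lambda r: r.rn)[0]["label"],
    T.LongType())
collected.select("query", first_label("hits").alias("top_label")).show()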
Example 3: cluster_within_norm_query_groups
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
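The pattern used here, collecting rows into an array of structs, transforming the array in a UDF, then exploding it back into rows, can be reduced to a small self-contained sketch. The toy data and the trivial grouping UDF below are illustrative stand-ins, not the real _make_query_groups:

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("enwiki", "apple", "apples"), ("enwiki", "apple", "aple")],
    ["wikiid", "norm_query", "query"],
)

# Toy stand-in for _make_query_groups: put every query in group 0.
make_groups = F.udf(
    lambda source: [(row["query"], 0) for row in source],
    T.ArrayType(T.StructType([
        T.StructField("query", T.StringType(), nullable=False),
        T.StructField("norm_query_group_id", T.IntegerType(), nullable=False),
    ])))

(df
 .groupBy("wikiid", "norm_query")
 .agg(F.collect_list(F.struct("query")).alias("source"))   # rows -> array<struct>
 .select("wikiid", "norm_query", F.explode(make_groups("source")).alias("group"))
 .select("wikiid", "norm_query", "group.query", "group.norm_query_group_id")
 .show())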
Example 4: with_unique_cluster_id
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id'))
Example 5: compile_group_concat
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def compile_group_concat(t, expr, scope, context=None, **kwargs):
    sep = expr.op().sep.op().value

    def fn(col):
        return F.concat_ws(sep, F.collect_list(col))

    return compile_aggregator(t, expr, scope, fn, context)
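Outside of the ibis compiler context shown above, the same concat_ws plus collect_list combination is a common way to emulate SQL's GROUP_CONCAT/STRING_AGG in PySpark. A minimal sketch with toy data and hypothetical column names:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("alice", "a"), ("alice", "b"), ("bob", "c")], ["user", "item"])

# concat_ws joins the collected array into a single delimited string.
df.groupBy("user").agg(
    F.concat_ws(",", F.collect_list("item")).alias("items_csv")
).show()
# Expected (row order may vary): alice -> "a,b", bob -> "c"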
Example 6: compile_array_collect
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def compile_array_collect(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.collect_list(src_column)


# --------------------------- Null Operations -----------------------------
Example 7: get_listening_activity
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
            """)
    result_without_zero_days.createOrReplaceTempView('result_without_zero_days')

    # Add the time ranges which have zero listens to the previous dataframe
    result = run_query("""
            SELECT dist_user_name.user_name
                 , time_range.time_range
                 , to_unix_timestamp(time_range.start) as from_ts
                 , to_unix_timestamp(time_range.end) as to_ts
                 , ifnull(result_without_zero_days.listen_count, 0) as listen_count
              FROM (SELECT DISTINCT user_name FROM listens) dist_user_name
        CROSS JOIN time_range
         LEFT JOIN result_without_zero_days
                ON result_without_zero_days.user_name = dist_user_name.user_name
               AND result_without_zero_days.time_range = time_range.time_range
            """)

    # Create a table with a list of time ranges and corresponding listen count for each user
    iterator = result \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    return iterator
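The sort_array(collect_list(struct(...))) idiom above is how this stat imposes a deterministic order: collect_list alone gives no ordering guarantee, while sort_array orders the collected structs field by field, so putting from_ts first sorts each user's activity chronologically. A small self-contained sketch of the idiom with toy data:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("alice", 200, 5), ("alice", 100, 2)],
    ["user_name", "from_ts", "listen_count"],
)

# Structs are compared field by field, so the first struct field (from_ts)
# drives the ordering regardless of the order collect_list happened to use.
(df
 .withColumn("activity", F.struct("from_ts", "listen_count"))
 .groupBy("user_name")
 .agg(F.sort_array(F.collect_list("activity")).alias("activity"))
 .show(truncate=False))
# alice -> [{100, 2}, {200, 5}]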
Example 8: get_listening_activity_all_time
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each year starting from LAST_FM_FOUNDING_YEAR (2002). """
    current_app.logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year + 1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        year_df = run_query("""
                    SELECT user_name,
                           count(user_name) as listen_count
                      FROM listens
                  GROUP BY user_name
                    """)
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        result_without_zero_years = result_without_zero_years.union(year_df) if result_without_zero_years else year_df

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")

    return messages
Example 9: _get_top_k_items
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=PREDICTION_COL,
    k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
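The window plus row_number plus collect_list recipe above is the standard PySpark way to build a top-k list per group. A minimal, self-contained sketch with toy data, k=2, and hypothetical column names:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[2]").getOrCreate()
ratings = spark.createDataFrame(
    [("u1", "i1", 5.0), ("u1", "i2", 3.0), ("u1", "i3", 4.0), ("u2", "i1", 2.0)],
    ["user", "item", "rating"],
)

k = 2
w = Window.partitionBy("user").orderBy(F.col("rating").desc())
top_k = (
    ratings
    .withColumn("rank", F.row_number().over(w))   # rank 1 = best-rated item per user
    .where(F.col("rank") <= k)
    .groupBy("user")
    .agg(F.collect_list("item").alias("prediction"))
)
top_k.show()
# u1 -> its two best-rated items, u2 -> [i1]; note that collect_list does not
# guarantee the order of elements inside each list.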
Example 10: _get_relevant_items_by_threshold
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def _get_relevant_items_by_threshold(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=PREDICTION_COL,
    threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
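Unlike the previous example, this one uses collect_list as a window function rather than a groupBy aggregation: every row of a partition receives the full per-user list, which is why the dropDuplicates() call is needed afterwards. A minimal sketch of that difference, with toy data and hypothetical names:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [("u1", "i1"), ("u1", "i2"), ("u2", "i3")], ["user", "item"])

# As a window function, collect_list attaches the per-user list to every row,
# so the distinct user/list pairs must be deduplicated explicitly.
windowed = (
    df.withColumn("prediction", F.collect_list("item").over(Window.partitionBy("user")))
      .select("user", "prediction")
      .dropDuplicates()
)
windowed.show(truncate=False)
# Same result as df.groupBy("user").agg(F.collect_list("item").alias("prediction"))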
Example 11: _get_relevant_items_by_timestamp
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=PREDICTION_COL,
    k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example 12: ndcg
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    float
        The ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
Example 13: get_releases
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def get_releases(table):
    """
    Get release information (release_name, release_mbid etc) for every user
    ordered by listen count (number of times a user has listened to tracks
    which belong to a particular release).

    Args:
        table: name of the temporary table

    Returns:
        iterator (iter): an iterator over result
            {
                'user1': [{
                    'release_name': str,
                    'release_msid': str,
                    'release_mbid': str,
                    'artist_name': str,
                    'artist_msid': str,
                    'artist_mbids': list(str),
                    'listen_count': int
                }],
                'user2': [{...}],
            }
    """
    result = run_query("""
            SELECT user_name
                 , nullif(release_name, '') as release_name
                 , CASE
                     WHEN release_mbid IS NOT NULL AND release_mbid != '' THEN NULL
                     ELSE nullif(release_msid, '')
                   END as release_msid
                 , nullif(release_mbid, '') as release_mbid
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(release_name) as listen_count
              FROM {}
             WHERE release_name IS NOT NULL AND release_name != ''
          GROUP BY user_name
                 , release_name
                 , release_msid
                 , release_mbid
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table))

    iterator = result \
        .withColumn("releases", struct("listen_count", "release_name", "release_msid", "release_mbid",
                                       "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("releases"), asc=False).alias("releases")) \
        .toLocalIterator()

    return iterator
Example 14: get_artists
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def get_artists(table):
    """ Get artist information (artist_name, artist_msid etc) for every user
    ordered by listen count

    Args:
        table (str): name of the temporary table.

    Returns:
        iterator (iter): an iterator over result
            {
                user1: [{
                    'artist_name': str,
                    'artist_msid': str,
                    'artist_mbids': list(str),
                    'listen_count': int
                }],
                user2: [{...}],
            }
    """
    result = run_query("""
            SELECT user_name
                 , artist_name
                 , CASE
                     WHEN cardinality(artist_mbids) > 0 THEN NULL
                     ELSE nullif(artist_msid, '')
                   END as artist_msid
                 , artist_mbids
                 , count(artist_name) as listen_count
              FROM {table}
          GROUP BY user_name
                 , artist_name
                 , artist_msid
                 , artist_mbids
            """.format(table=table))

    iterator = result \
        .withColumn("artists", struct("listen_count", "artist_name", "artist_msid", "artist_mbids")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("artists"), asc=False).alias("artists")) \
        .toLocalIterator()

    return iterator
Example 15: transform
# Required import: from pyspark.sql import functions [as an alias]
# Or: from pyspark.sql.functions import collect_list [as an alias]
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed
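As a closing note, the combine_and_map_cols UDF in this last example mainly packs two columns into a struct. A hedged sketch, with toy data and hypothetical names, of doing the same with F.struct, which keeps the work in the JVM instead of round-tripping each pair through Python:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
# Toy stand-in for the (key_addon, coinstalled_addon, count) frame built above.
addon_co_installations = spark.createDataFrame(
    [("addonA", "addonB", 3), ("addonA", "addonC", 1)],
    ["key_addon", "coinstalled_addon", "count"],
)

# Build the (id, n) struct natively instead of with a Python UDF.
collapsed = (
    addon_co_installations
    .select(
        "key_addon",
        F.struct(
            F.col("coinstalled_addon").alias("id"),
            F.col("count").alias("n"),
        ).alias("id_n"),
    )
    .groupBy("key_addon")
    .agg(F.collect_list("id_n").alias("coinstallation_counts"))
)
collapsed.show(truncate=False)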