This article collects typical usage examples of the Python method pyspark.sql.Window.partitionBy. If you are unsure what Window.partitionBy does, how to call it, or what real-world usage looks like, the curated code samples below should help. You can also explore further usage examples of the containing class, pyspark.sql.Window.
The following presents 14 code examples of Window.partitionBy, sorted by popularity by default.
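Before diving into the examples, here is a minimal, self-contained sketch of what Window.partitionBy does; the DataFrame and column names (category, score) are made up for illustration and do not come from any example below.

# Minimal illustrative sketch: rank rows within each category by descending score.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("a", 3), ("b", 2)], ["category", "score"])
w = Window.partitionBy("category").orderBy(F.col("score").desc())
df.withColumn("rank", F.row_number().over(w)).show()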
Example 1: test_at_least_n_distinct
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def test_at_least_n_distinct(spark_context):
    df = spark_context.parallelize([
        ('foo', 'bar', 'baz'),
        ('foo', 'bar', 'bang'),
        ('foo', 'bar', 'bang'),
        ('foo', 'test', 'test'),
        ('foo', 'test', 'test'),
        ('fizz', 'bang', 'boom'),
    ]).toDF(['a', 'b', 'c'])
    w = Window.partitionBy('a', 'b')
    res = df.withColumn('z', mjolnir.spark.at_least_n_distinct('c', 2).over(w)).collect()
    expect = [
        ('foo', 'bar', 'baz', True),
        ('foo', 'bar', 'bang', True),
        ('foo', 'bar', 'bang', True),
        ('foo', 'test', 'test', False),
        ('foo', 'test', 'test', False),
        ('fizz', 'bang', 'boom', False),
    ]
    assert sorted(map(tuple, res)) == sorted(expect)
Example 2: filter_min_sessions_per_norm_query
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def filter_min_sessions_per_norm_query(min_sessions: int) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        w = Window.partitionBy('wikiid', 'norm_query')
        return (
            df.withColumn(
                'has_min_sessions',
                at_least_n_distinct('session_id', min_sessions).over(w))
            .where(F.col('has_min_sessions'))
            .drop('has_min_sessions'))
    return transform
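at_least_n_distinct is a mjolnir helper, so the snippet below is only a hedged sketch of how the same filter could be expressed with built-in PySpark functions, counting distinct session ids per partition with collect_set. The DataFrame df and its columns follow the example above; min_sessions is assumed to be an int.

# Hedged sketch of an equivalent filter using only built-in window functions.
from pyspark.sql import Window
from pyspark.sql import functions as F

def filter_min_sessions_builtin(df, min_sessions):
    w = Window.partitionBy('wikiid', 'norm_query')
    return (
        df.withColumn(
            'n_sessions', F.size(F.collect_set('session_id').over(w)))
        .where(F.col('n_sessions') >= min_sessions)
        .drop('n_sessions'))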
Example 3: process_file
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove leftover parquet folders if they still exist; expand the glob in
    # Python because subprocess.call does not go through a shell
    stale_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if stale_parquet:
        subprocess.call(['rm', '-rf'] + stale_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
Example 4: _is_locally_monotonic_spark_column
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def _is_locally_monotonic_spark_column(self, order):
    window = (
        Window.partitionBy(F.col("__partition_id"))
        .orderBy(NATURAL_ORDER_COLUMN_NAME)
        .rowsBetween(-1, -1)
    )

    if order == "increasing":
        return (F.col("__origin") >= F.lag(F.col("__origin"), 1).over(window)) & F.col(
            "__origin"
        ).isNotNull()
    else:
        return (F.col("__origin") <= F.lag(F.col("__origin"), 1).over(window)) & F.col(
            "__origin"
        ).isNotNull()
Example 5: _shift
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def _shift(self, periods, fill_value, part_cols=()):
    if not isinstance(periods, int):
        raise ValueError("periods should be an int; however, got [%s]" % type(periods))
    col = self.spark.column
    window = (
        Window.partitionBy(*part_cols)
        .orderBy(NATURAL_ORDER_COLUMN_NAME)
        .rowsBetween(-periods, -periods)
    )
    lag_col = F.lag(col, periods).over(window)
    col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
    return self._with_new_scol(col).rename(self.name)

# TODO: Update Documentation for Bins Parameter when it's supported
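Outside of Koalas internals, the same shift idea can be written directly against a PySpark DataFrame. A hedged sketch, assuming a DataFrame df with an illustrative grouping column group_key, ordering column ts, and value column value:

# Hedged sketch: shift `value` down by one row within each group, like a pandas shift(1).
from pyspark.sql import Window
from pyspark.sql import functions as F

w = Window.partitionBy("group_key").orderBy("ts")
shifted = df.withColumn("value_shifted", F.lag("value", 1).over(w))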
Example 6: _get_top_k_items
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
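The inline comment notes that row_number() breaks ties arbitrarily when several items share the same rating. One hedged workaround, not part of the original library, is to add a deterministic secondary sort key such as the item column:

# Hedged sketch: deterministic tie-breaking via a secondary sort key.
# `ratings` and its column names are illustrative, not library defaults.
from pyspark.sql import Window
from pyspark.sql import functions as F

window_spec = Window.partitionBy("userID").orderBy(
    F.col("rating").desc(), F.col("itemID").asc())
top_k = (
    ratings.withColumn("rank", F.row_number().over(window_spec))
    .where(F.col("rank") <= 10)
    .drop("rank"))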
Example 7: _get_relevant_items_by_threshold
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
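The docstring mentions that the threshold may be a per-user statistic such as the median. A hedged sketch of computing such a per-user threshold with plain PySpark (illustrative DataFrame and column names, approximate median via percentile_approx):

# Hedged sketch: per-user approximate median used as the relevance threshold.
from pyspark.sql import functions as F

thresholds = ratings.groupBy("userID").agg(
    F.expr("percentile_approx(rating, 0.5)").alias("threshold"))
relevant = (
    ratings.join(thresholds, on="userID")
    .where(F.col("rating") >= F.col("threshold")))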
Example 8: _get_relevant_items_by_timestamp
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k most recently seen items
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example 9: transform
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example 10: save
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def save(submission_date, bucket, prefix, df):
    # Note: this example uses DataFrameWriter.partitionBy (partitioned output
    # layout on disk), not pyspark.sql.Window.partitionBy.
    path = "s3://{}/{}/{}/submission_date_s3={}".format(
        bucket, prefix, "v3", submission_date
    )
    (
        df.write.partitionBy("namespace", "doc_type", "doc_version").json(
            path, mode="overwrite"
        )
    )
Example 11: ndcg
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name identifying the wiki; the result is reported per wiki
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Maps each wiki to its mean ndcg@k, a value between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))

    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))

    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()}
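For reference, a common formulation of ndcg@k that the helper _ndcg_at presumably implements is sketched below for a single query's labels in display order; the exact gain and discount used by mjolnir may differ.

# Hedged sketch of a standard ndcg@k for one result list; labels are relevance grades
# in the order shown to the user. mjolnir's _ndcg_at may use a different formulation.
import math

def ndcg_at_k(labels, k):
    def dcg(ordered):
        return sum((2 ** lab - 1) / math.log2(i + 2)
                   for i, lab in enumerate(ordered[:k]))
    ideal = dcg(sorted(labels, reverse=True))
    return dcg(labels) / ideal if ideal > 0 else 0.0

print(ndcg_at_k([3, 2, 3, 0, 1, 2], k=5))  # a value between 0 and 1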
Example 12: compile_window_op
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def compile_window_op(t, expr, scope, **kwargs):
    op = expr.op()
    window = op.window
    operand = op.expr

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else t.translate(key, scope)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    ordering_keys = [
        key.to_expr().get_name()
        for key in map(operator.methodcaller('op'), order_by)
    ]

    context = AggregationContext.WINDOW

    pyspark_window = Window.partitionBy(grouping_keys).orderBy(ordering_keys)

    # If the operand is a shift op (e.g. lead, lag), Spark will set the window
    # bounds. Only set window bounds here if not a shift operation.
    if not isinstance(operand.op(), ops.ShiftBase):
        start = (
            -window.preceding
            if window.preceding is not None
            else Window.unboundedPreceding
        )
        end = (
            window.following
            if window.following is not None
            else Window.unboundedFollowing
        )
        pyspark_window = pyspark_window.rowsBetween(start, end)

    result = t.translate(
        operand, scope, window=pyspark_window, context=context
    )

    return result
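The bounds logic above maps Ibis window frames onto rowsBetween. As a hedged illustration of the kind of PySpark window it produces (the DataFrame df and column names here are made up):

# Hedged sketch: a cumulative sum over all rows from the start of each group
# to the current row, i.e. rowsBetween(unboundedPreceding, currentRow).
from pyspark.sql import Window
from pyspark.sql import functions as F

w = (
    Window.partitionBy("group_key")
    .orderBy("order_key")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
with_running_total = df.withColumn("running_total", F.sum("value").over(w))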
Example 13: nsmallest
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def nsmallest(self, n=5):
    """
    Return the first n rows ordered by columns in ascending order in group.
    Return the first n rows with the smallest values in columns, in ascending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nsmallest
    databricks.koalas.DataFrame.nsmallest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  0    1
    2  3    2
    3  6    3
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nsmallest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add keep parameter
Example 14: nlargest
# Required import: from pyspark.sql import Window [as alias]
# Or alternatively: from pyspark.sql.Window import partitionBy [as alias]
def nlargest(self, n=5):
    """
    Return the first n rows ordered by columns in descending order in group.
    Return the first n rows with the largest values in columns, in descending order.
    The columns that are not specified are returned as well, but not used for ordering.

    Parameters
    ----------
    n : int
        Number of items to retrieve.

    See Also
    --------
    databricks.koalas.Series.nlargest
    databricks.koalas.DataFrame.nlargest

    Examples
    --------
    >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

    >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
    a
    1  1    2
    2  4    3
    3  7    4
    Name: b, dtype: int64
    """
    if len(self._kdf._internal.index_names) > 1:
        raise ValueError("nlargest do not support multi-index now")

    sdf = self._kdf._internal.spark_frame
    name = self._agg_columns[0]._internal.data_spark_column_names[0]
    window = Window.partitionBy(self._groupkeys_scols).orderBy(
        self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
    )
    sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

    internal = InternalFrame(
        spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
        index_map=OrderedDict(
            [
                (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                for s in self._groupkeys
            ]
            + list(self._kdf._internal.index_map.items())
        ),
        data_spark_columns=[scol_for(sdf, name)],
    )
    return first_series(DataFrame(internal))

# TODO: add bins, normalize parameter