

Python Window.partitionBy Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.Window.partitionBy. If you are unsure what Window.partitionBy does or how to call it, the curated examples below should help; you can also explore further usage examples of pyspark.sql.Window.


The following presents 14 code examples of the Window.partitionBy method, ordered by popularity.
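
Before the project-specific examples, here is a minimal, self-contained sketch of the basic Window.partitionBy pattern (the column names and data are illustrative assumptions, not taken from any of the projects below):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('a', 1), ('a', 3), ('b', 2)], ['key', 'value'])

# Rank rows within each 'key' partition by descending 'value'.
w = Window.partitionBy('key').orderBy(F.col('value').desc())
df.withColumn('rank', F.row_number().over(w)).show()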

Example 1: test_at_least_n_distinct

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def test_at_least_n_distinct(spark_context):
    df = spark_context.parallelize([
        ('foo', 'bar', 'baz'),
        ('foo', 'bar', 'bang'),
        ('foo', 'bar', 'bang'),
        ('foo', 'test', 'test'),
        ('foo', 'test', 'test'),
        ('fizz', 'bang', 'boom'),
    ]).toDF(['a', 'b', 'c'])

    w = Window.partitionBy('a', 'b')
    res = df.withColumn('z', mjolnir.spark.at_least_n_distinct('c', 2).over(w)).collect()
    expect = [
        ('foo', 'bar', 'baz', True),
        ('foo', 'bar', 'bang', True),
        ('foo', 'bar', 'bang', True),
        ('foo', 'test', 'test', False),
        ('foo', 'test', 'test', False),
        ('fizz', 'bang', 'boom', False),
    ]
    assert sorted(map(tuple, res)) == sorted(expect) 
Author: wikimedia | Project: search-MjoLniR | Lines: 23 | Source: test_spark.py
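
mjolnir.spark.at_least_n_distinct is a helper specific to the MjoLniR project. As a rough stand-in using only built-in functions (an approximation for illustration, not the project's actual implementation), the same flag can be computed like this:

from pyspark.sql import Window, functions as F

w = Window.partitionBy('a', 'b')
# True when the ('a', 'b') partition contains at least 2 distinct values of 'c'.
df = df.withColumn('z', F.size(F.collect_set('c').over(w)) >= 2)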

Example 2: filter_min_sessions_per_norm_query

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def filter_min_sessions_per_norm_query(min_sessions: int) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        w = Window.partitionBy('wikiid', 'norm_query')
        return (
            df.withColumn(
                'has_min_sessions',
                at_least_n_distinct('session_id', min_sessions).over(w))
            .where(F.col('has_min_sessions'))
            .drop('has_min_sessions'))
    return transform 
Author: wikimedia | Project: search-MjoLniR | Lines: 12 | Source: norm_query_clustering.py

Example 3: process_file

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously generated parquet folders if they still exist
    # (the glob must be expanded in Python; subprocess does not expand wildcards)
    existing_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if existing_parquet:
        subprocess.call(['rm', '-rf'] + existing_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite') 
Author: titipata | Project: pubmed_parser | Lines: 35 | Source: medline_spark.py

Example 4: _is_locally_monotonic_spark_column

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def _is_locally_monotonic_spark_column(self, order):
        window = (
            Window.partitionBy(F.col("__partition_id"))
            .orderBy(NATURAL_ORDER_COLUMN_NAME)
            .rowsBetween(-1, -1)
        )

        if order == "increasing":
            return (F.col("__origin") >= F.lag(F.col("__origin"), 1).over(window)) & F.col(
                "__origin"
            ).isNotNull()
        else:
            return (F.col("__origin") <= F.lag(F.col("__origin"), 1).over(window)) & F.col(
                "__origin"
            ).isNotNull() 
Author: databricks | Project: koalas | Lines: 17 | Source: base.py

Example 5: _shift

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def _shift(self, periods, fill_value, part_cols=()):
        if not isinstance(periods, int):
            raise ValueError("periods should be an int; however, got [%s]" % type(periods))

        col = self.spark.column
        window = (
            Window.partitionBy(*part_cols)
            .orderBy(NATURAL_ORDER_COLUMN_NAME)
            .rowsBetween(-periods, -periods)
        )
        lag_col = F.lag(col, periods).over(window)
        col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
        return self._with_new_scol(col).rename(self.name)

    # TODO: Update Documentation for Bins Parameter when its supported 
Author: databricks | Project: koalas | Lines: 17 | Source: base.py
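
Outside of Koalas internals, the same shift idea can be expressed directly in PySpark with F.lag over a partitioned, ordered window. A sketch with assumed column names 'group', 'ts' and 'value':

from pyspark.sql import Window, functions as F

w = Window.partitionBy('group').orderBy('ts')
# Previous row's 'value' within each 'group', ordered by 'ts'; NULL for the first row.
df = df.withColumn('prev_value', F.lag('value', 1).over(w))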

Example 6: _get_top_k_items

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: for implicit ratings, append a column of constants to serve as the ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuples).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # Note: row_number breaks ties arbitrarily, so items with equal ratings may be ordered inconsistently.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user 
Author: Azure-Samples | Project: azure-python-labs | Lines: 43 | Source: spark_evaluation.py
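
A hypothetical call, assuming a ratings DataFrame named ratings_df with the default column names used above:

# Top 5 rated items per user, collected into a list column.
top_k = _get_top_k_items(ratings_df, k=5)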

Example 7: _get_relevant_items_by_threshold

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user 
Author: Azure-Samples | Project: azure-python-labs | Lines: 43 | Source: spark_evaluation.py

Example 8: _get_relevant_items_by_timestamp

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to be selected by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user 
Author: Azure-Samples | Project: azure-python-labs | Lines: 42 | Source: spark_evaluation.py

Example 9: transform

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df 
Author: mozilla | Project: python_mozetl | Lines: 36 | Source: sampler.py

Example 10: save

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def save(submission_date, bucket, prefix, df):
    path = "s3://{}/{}/{}/submission_date_s3={}".format(
        bucket, prefix, "v3", submission_date
    )
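    # Note: this uses DataFrameWriter.partitionBy (partitioned output directories),
    # not the Window.partitionBy window specification shown in the other examples.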
    (
        df.write.partitionBy("namespace", "doc_type", "doc_version").json(
            path, mode="overwrite"
        )
    ) 
Author: mozilla | Project: python_mozetl | Lines: 11 | Source: sampler.py

Example 11: ndcg

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name identifying the wiki; results are aggregated per wiki
    query_cols : list of str
        Column names to group by, which indicate a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from wiki identifier to mean ndcg@k, each value between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))
    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))
    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()} 
Author: wikimedia | Project: search-MjoLniR | Lines: 52 | Source: metrics.py

Example 12: compile_window_op

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def compile_window_op(t, expr, scope, **kwargs):
    op = expr.op()
    window = op.window
    operand = op.expr

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else t.translate(key, scope)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    ordering_keys = [
        key.to_expr().get_name()
        for key in map(operator.methodcaller('op'), order_by)
    ]

    context = AggregationContext.WINDOW
    pyspark_window = Window.partitionBy(grouping_keys).orderBy(ordering_keys)

    # If the operand is a shift op (e.g. lead, lag), Spark will set the window
    # bounds. Only set window bounds here if not a shift operation.
    if not isinstance(operand.op(), ops.ShiftBase):
        start = (
            -window.preceding
            if window.preceding is not None
            else Window.unboundedPreceding
        )
        end = (
            window.following
            if window.following is not None
            else Window.unboundedFollowing
        )
        pyspark_window = pyspark_window.rowsBetween(start, end)

    result = t.translate(
        operand, scope, window=pyspark_window, context=context
    )

    return result 
Author: ibis-project | Project: ibis | Lines: 46 | Source: compiler.py
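
For context, an Ibis expression along the lines of the following sketch (table and column names are assumptions) would be routed through compile_window_op when compiled with the PySpark backend:

import ibis

t = ibis.table([('key', 'string'), ('ts', 'int64'), ('value', 'double')], name='t')
w = ibis.window(group_by=t.key, order_by=t.ts, preceding=2, following=0)
# Rolling mean of 'value' over the previous two rows and the current row, per key.
expr = t.mutate(rolling_mean=t.value.mean().over(w))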

Example 13: nsmallest

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def nsmallest(self, n=5):
        """
        Return the first n rows ordered by columns in ascending order in group.

        Return the first n rows with the smallest values in columns, in ascending order.
        The columns that are not specified are returned as well, but not used for ordering.

        Parameters
        ----------
        n : int
            Number of items to retrieve.

        See Also
        --------
        databricks.koalas.Series.nsmallest
        databricks.koalas.DataFrame.nsmallest

        Examples
        --------
        >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

        >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
        a
        1  0    1
        2  3    2
        3  6    3
        Name: b, dtype: int64
        """
        if len(self._kdf._internal.index_names) > 1:
            raise ValueError("nsmallest do not support multi-index now")

        sdf = self._kdf._internal.spark_frame
        name = self._agg_columns[0]._internal.data_spark_column_names[0]
        window = Window.partitionBy(self._groupkeys_scols).orderBy(
            self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
        )
        sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

        internal = InternalFrame(
            spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
            index_map=OrderedDict(
                [
                    (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                    for s in self._groupkeys
                ]
                + list(self._kdf._internal.index_map.items())
            ),
            data_spark_columns=[scol_for(sdf, name)],
        )
        return first_series(DataFrame(internal))

    # TODO: add keep parameter 
Author: databricks | Project: koalas | Lines: 55 | Source: groupby.py

Example 14: nlargest

# Required module import: from pyspark.sql import Window [as alias]
# Or: from pyspark.sql.Window import partitionBy [as alias]
def nlargest(self, n=5):
        """
        Return the first n rows ordered by columns in descending order in group.

        Return the first n rows with the largest values in columns, in descending order.
        The columns that are not specified are returned as well, but not used for ordering.

        Parameters
        ----------
        n : int
            Number of items to retrieve.

        See Also
        --------
        databricks.koalas.Series.nlargest
        databricks.koalas.DataFrame.nlargest

        Examples
        --------
        >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

        >>> df.groupby(['a'])['b'].nlargest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
        a
        1  1    2
        2  4    3
        3  7    4
        Name: b, dtype: int64
        """
        if len(self._kdf._internal.index_names) > 1:
            raise ValueError("nlargest do not support multi-index now")

        sdf = self._kdf._internal.spark_frame
        name = self._agg_columns[0]._internal.data_spark_column_names[0]
        window = Window.partitionBy(self._groupkeys_scols).orderBy(
            self._agg_columns[0].spark.column.desc(), NATURAL_ORDER_COLUMN_NAME
        )
        sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

        internal = InternalFrame(
            spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
            index_map=OrderedDict(
                [
                    (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                    for s in self._groupkeys
                ]
                + list(self._kdf._internal.index_map.items())
            ),
            data_spark_columns=[scol_for(sdf, name)],
        )
        return first_series(DataFrame(internal))

    # TODO: add bins, normalize parameter 
Author: databricks | Project: koalas | Lines: 55 | Source: groupby.py


Note: The pyspark.sql.Window.partitionBy examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by many developers; copyright of the source code belongs to the original authors. Refer to each project's license for distribution and use; do not reproduce without permission.