

Python functions.row_number method code examples

This article collects typical usage examples of the pyspark.sql.functions.row_number method in Python. If you are wondering what exactly functions.row_number does, how to use it, or are looking for concrete examples of it in practice, the curated code samples below should help. You can also explore further usage examples from the pyspark.sql.functions module.


A total of 15 code examples of the functions.row_number method are shown below, sorted by popularity by default.
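
Before the examples, here is a minimal, self-contained sketch of the typical row_number() pattern: rank rows within each partition by some ordering and keep the top-ranked rows per group. The DataFrame and column names below are invented for illustration and do not come from any of the projects listed.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("row_number_demo").getOrCreate()

listens = spark.createDataFrame(
    [("alice", "song_a", 3), ("alice", "song_b", 7), ("bob", "song_c", 5)],
    ["user", "item", "plays"],
)

# row_number() assigns 1, 2, 3, ... within each partition, following the orderBy.
window = Window.partitionBy("user").orderBy(F.col("plays").desc())
top_per_user = (
    listens.withColumn("rank", F.row_number().over(window))
           .where(F.col("rank") == 1)
           .drop("rank")
)
top_per_user.show()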

Example 1: get_top_artists

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def get_top_artists(mapped_listens_subset, top_artist_limit):
    """ Get top artists listened to by users who have a listening history in
        the past X days where X = RECOMMENDATION_GENERATION_WINDOW.

        Args:
            mapped_listens_subset (dataframe): A subset of mapped_df containing user history.
            top_artist_limit (int): number of top artists to calculate

        Returns:
            top_artists_df (dataframe): Top Y artists listened to by a user for all users where
                                        Y = TOP_ARTISTS_LIMIT
    """
    df = mapped_listens_subset.select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .groupBy('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name') \
                              .agg(func.count('mb_artist_credit_id').alias('count'))

    window = Window.partitionBy('user_name').orderBy(col('count').desc())

    top_artists_df = df.withColumn('rank', row_number().over(window)) \
                       .where(col('rank') <= top_artist_limit) \
                       .select('mb_artist_credit_id', 'msb_artist_credit_name_matchable', 'user_name')

    return top_artists_df 
Developer: metabrainz, Project: listenbrainz-server, Lines of code: 25, Source: candidate_sets.py

Example 2: _get_relevant_items_by_timestamp

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to keep for each user.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user 
Developer: Azure-Samples, Project: azure-python-labs, Lines of code: 42, Source: spark_evaluation.py

Example 3: compile_row_number

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def compile_row_number(t, expr, scope, *, window, **kwargs):
    return F.row_number().over(window).cast('long') - 1


# -------------------------- Temporal Operations ----------------------------

# Ibis value to PySpark value 
Developer: ibis-project, Project: ibis, Lines of code: 9, Source: compiler.py
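
The `- 1` above exists because Spark's row_number() is 1-based, while Ibis exposes a 0-based row index. A hypothetical snippet (not part of the ibis codebase) showing both values side by side:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("c",)], ["letter"])

# An unpartitioned, ordered window; fine for a tiny demo, expensive on real data.
w = Window.orderBy("letter")
df.select(
    "letter",
    F.row_number().over(w).alias("one_based"),
    (F.row_number().over(w).cast("long") - 1).alias("zero_based"),
).show()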

Example 4: sum

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def sum(self):
        def sum(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.sum(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(sum) 
Developer: databricks, Project: koalas, Lines of code: 10, Source: window.py

Example 5: min

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def min(self):
        def min(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.min(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(min) 
Developer: databricks, Project: koalas, Lines of code: 10, Source: window.py

Example 6: max

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def max(self):
        def max(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.max(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(max) 
Developer: databricks, Project: koalas, Lines of code: 10, Source: window.py

Example 7: mean

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def mean(self):
        def mean(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.mean(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(mean) 
Developer: databricks, Project: koalas, Lines of code: 10, Source: window.py

Example 8: std

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def std(self):
        def std(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.stddev(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(std) 
Developer: databricks, Project: koalas, Lines of code: 10, Source: window.py
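
Examples 4 through 8 above all follow the same Koalas pattern: compute the aggregate over an expanding window, but mask rows whose position (given by row_number() over an unbounded window) has not yet reached min_periods. Here is a standalone sketch of that pattern in plain PySpark; the DataFrame and window names are illustrative, not the Koalas internals.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 4.0), (4, 10.0)], ["idx", "value"])

min_periods = 2
ordering = Window.orderBy("idx")
expanding = ordering.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Emulate pandas' expanding(min_periods=2).sum(): null until enough rows have been seen.
result = df.withColumn(
    "expanding_sum",
    F.when(
        F.row_number().over(ordering) >= min_periods,
        F.sum("value").over(expanding),
    ).otherwise(F.lit(None)),
)
result.show()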

Example 9: attach_sequence_column

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def attach_sequence_column(sdf, column_name):
        scols = [scol_for(sdf, column) for column in sdf.columns]
        sequential_index = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
        return sdf.select(sequential_index.alias(column_name), *scols) 
Developer: databricks, Project: koalas, Lines of code: 6, Source: internal.py
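
A hedged usage sketch of the same idea outside the Koalas internals: attach a consecutive 0-based index to an arbitrary DataFrame. monotonically_increasing_id() produces increasing but non-consecutive ids, and row_number() over that ordering turns them into 0, 1, 2, ... (at the cost of pulling all rows into a single partition for the window).

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("x",), ("y",), ("z",)], ["value"])

# 0-based sequential index over the whole DataFrame.
sequence = F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1
sdf.select(sequence.alias("sequence"), "*").show()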

Example 10: _get_top_k_items

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # NOTE: row_number() breaks ties arbitrarily, so the result is not deterministic when ratings are equal.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user 
Developer: Azure-Samples, Project: azure-python-labs, Lines of code: 43, Source: spark_evaluation.py
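
The comment above points out that row_number() breaks ties arbitrarily when several items share the same rating, so the selected top k is not deterministic. A hedged variant (not part of the original repository) that adds a secondary sort key to make the cut reproducible:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [("u1", "i1", 5.0), ("u1", "i2", 5.0), ("u1", "i3", 3.0)],
    ["user", "item", "rating"],
)

k = 2
# Ties on rating are resolved by item id, so repeated runs return the same top k.
window_spec = Window.partitionBy("user").orderBy(F.col("rating").desc(), F.col("item").asc())
top_k = (
    ratings.withColumn("rank", F.row_number().over(window_spec))
           .where(F.col("rank") <= k)
           .groupby("user")
           .agg(F.collect_list("item").alias("prediction"))
)
top_k.show(truncate=False)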

Example 11: transform

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df 
Developer: mozilla, Project: python_mozetl, Lines of code: 36, Source: sampler.py

Example 12: ndcg

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def ndcg(df, k, label_col='label', position_col='hit_position', wiki_col='wikiid',
         query_cols=['wikiid', 'query', 'session_id']):
    """
    Calculate ndcg@k for the provided dataframe

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe to calculate against
    k : int
        Cutoff for ndcg calculation
    label_col : str
        Column name containing integer label, higher is better, of the hit
    position_col : str
        Column name containing order displayed to user, lowest first, of the hit
    wiki_col : str
        Column name containing the wiki identifier; results are reported per wiki
    query_cols : list of str
        Column names to group by, which together identify a unique query displayed to a user

    Returns
    -------
    dict
        Mapping from each wiki identifier to its mean ndcg@k value, always between 0 and 1
    """
    if wiki_col not in query_cols:
        query_cols = query_cols + [wiki_col]

    # ideal results per labels
    w = Window.partitionBy(*query_cols).orderBy(F.col(label_col).desc())
    topAtK = (
        df
        .select(label_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('topAtK')))
    # top k results shown to user
    w = Window.partitionBy(*query_cols).orderBy(F.col(position_col).asc())
    predictedTopAtK = (
        df
        .select(label_col, position_col, *query_cols)
        .withColumn('rn', F.row_number().over(w))
        .where(F.col('rn') <= k)
        .groupBy(*query_cols)
        .agg(F.collect_list(F.struct(label_col, 'rn')).alias('predictedTopAtK')))
    return {row[wiki_col]: row.ndcgAtK for row in topAtK
            .join(predictedTopAtK, query_cols, how='inner')
            .select(wiki_col, _ndcg_at(k, label_col)('predictedTopAtK', 'topAtK').alias('ndcgAtK'))
            .groupBy(wiki_col)
            .agg(F.mean('ndcgAtK').alias('ndcgAtK'))
            .collect()} 
Developer: wikimedia, Project: search-MjoLniR, Lines of code: 52, Source: metrics.py

Example 13: get_top_similar_artists

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def get_top_similar_artists(top_artists_df, artists_relation_df, similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artists_df (dataframe): Dataframe containing top artists listened to by users.
            artists_relation_df (dataframe): Dataframe containing artists and similar artists.
                                             For columns refer to artist_relation_schema in listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate

        Returns:
            top_similar_artists_df (dataframe): Top Z artists similar to top artists where
                                                Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_0]

    df1 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_0').alias('top_artist_credit_id'),
                                col('name_0').alias('top_artist_name'),
                                col('id_1').alias('similar_artist_credit_id'),
                                col('name_1').alias('similar_artist_name'),
                                'score',
                                'user_name')

    condition = [top_artists_df.mb_artist_credit_id == artists_relation_df.id_1]

    df2 = top_artists_df.join(artists_relation_df, condition, 'inner') \
                        .select(col('id_1').alias('top_artist_credit_id'),
                                col('name_1').alias('top_artist_name'),
                                col('id_0').alias('similar_artist_credit_id'),
                                col('name_0').alias('similar_artist_name'),
                                'score',
                                'user_name')

    similar_artists_df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name')\
                   .orderBy(col('score').desc())

    top_similar_artists_df = similar_artists_df.withColumn('rank', row_number().over(window)) \
                                               .where(col('rank') <= similar_artist_limit)\
                                               .select('top_artist_credit_id', 'top_artist_name',
                                                       'similar_artist_credit_id', 'similar_artist_name',
                                                       'score', 'user_name')

    return top_similar_artists_df 
Developer: metabrainz, Project: listenbrainz-server, Lines of code: 47, Source: candidate_sets.py

Example 14: count

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def count(self):
        """
        The expanding count of any non-NaN observations inside the window.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method on very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the expanding
            calculation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        Series.count : Count of the full Series.
        DataFrame.count : Count of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 3, float("nan"), 10])
        >>> s.expanding().count()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        Name: 0, dtype: float64

        >>> s.to_frame().expanding().count()
             0
        0  1.0
        1  2.0
        2  2.0
        3  3.0
        """

        def count(scol):
            return F.when(
                F.row_number().over(self._unbounded_window) >= self._min_periods,
                F.count(scol).over(self._window),
            ).otherwise(F.lit(None))

        return self._apply_as_series_or_frame(count).astype("float64") 
Developer: databricks, Project: koalas, Lines of code: 49, Source: window.py

Example 15: nsmallest

# Required import: from pyspark.sql import functions [as alias]
# Alternatively: from pyspark.sql.functions import row_number [as alias]
def nsmallest(self, n=5):
        """
        Return the first n rows ordered by columns in ascending order in group.

        Return the first n rows with the smallest values in columns, in ascending order.
        The columns that are not specified are returned as well, but not used for ordering.

        Parameters
        ----------
        n : int
            Number of items to retrieve.

        See Also
        --------
        databricks.koalas.Series.nsmallest
        databricks.koalas.DataFrame.nsmallest

        Examples
        --------
        >>> df = ks.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        ...                    'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])

        >>> df.groupby(['a'])['b'].nsmallest(1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
        a
        1  0    1
        2  3    2
        3  6    3
        Name: b, dtype: int64
        """
        if len(self._kdf._internal.index_names) > 1:
            raise ValueError("nsmallest does not support multi-index now")

        sdf = self._kdf._internal.spark_frame
        name = self._agg_columns[0]._internal.data_spark_column_names[0]
        window = Window.partitionBy(self._groupkeys_scols).orderBy(
            self._agg_columns[0].spark.column, NATURAL_ORDER_COLUMN_NAME
        )
        sdf = sdf.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= n)

        internal = InternalFrame(
            spark_frame=sdf.drop(NATURAL_ORDER_COLUMN_NAME),
            index_map=OrderedDict(
                [
                    (s._internal.data_spark_column_names[0], s._internal.column_labels[0])
                    for s in self._groupkeys
                ]
                + list(self._kdf._internal.index_map.items())
            ),
            data_spark_columns=[scol_for(sdf, name)],
        )
        return first_series(DataFrame(internal))

    # TODO: add keep parameter 
Developer: databricks, Project: koalas, Lines of code: 55, Source: groupby.py
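
A plain-PySpark sketch of the same "n smallest rows per group" idea, without the Koalas internal frame bookkeeping (data and column names are illustrative):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 1), (1, 2), (1, 2), (2, 2), (2, 3), (2, 3), (3, 3), (3, 4), (3, 4)],
    ["a", "b"],
)

n = 1
# Rank rows within each group of 'a' by ascending 'b' and keep the n smallest.
window = Window.partitionBy("a").orderBy(F.col("b").asc())
smallest = (
    df.withColumn("rank", F.row_number().over(window))
      .filter(F.col("rank") <= n)
      .drop("rank")
)
smallest.show()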


Note: The pyspark.sql.functions.row_number method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Refer to each project's License before distributing or using the code; do not reproduce without permission.