当前位置: 首页>>代码示例>>Python>>正文


Python Window.partitionBy方法代码示例

本文整理汇总了Python中pyspark.sql.window.Window.partitionBy方法的典型用法代码示例。如果您正苦于以下问题:Python Window.partitionBy方法的具体用法?Python Window.partitionBy怎么用?Python Window.partitionBy使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark.sql.window.Window的用法示例。


在下文中一共展示了Window.partitionBy方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_window_functions

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
    def test_window_functions(self):
        """Exercise aggregate and ranking functions over a partitioned, ordered window."""
        from pyspark.sql import functions as F

        df = self.sqlCtx.createDataFrame(
            [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        window_spec = Window.partitionBy("value").orderBy("key")

        # Spark-1.x function names (rowNumber/denseRank) are kept on purpose:
        # this snippet targets that API generation.
        projections = [
            df.value,
            df.key,
            F.max("key").over(window_spec.rowsBetween(0, 1)),
            F.min("key").over(window_spec.rowsBetween(0, 1)),
            F.count("key").over(window_spec.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(window_spec),
            F.rank().over(window_spec),
            F.denseRank().over(window_spec),
            F.ntile(2).over(window_spec),
        ]
        actual = sorted(df.select(*projections).collect())

        want = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for row, expected_row in zip(actual, want):
            self.assertEqual(tuple(row), expected_row[: len(row)])
开发者ID:kmarquardsen,项目名称:spark,代码行数:27,代码来源:tests.py

示例2: train

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def train(self, df, featureCols):
     """Learn per-feature n-tile (quantile-bucket) boundaries from *df*.

     For each column in *featureCols*, splits the whole column into
     ``self._n`` ordered buckets via ``ntile`` over a global window and
     records the maximum value of each bucket.

     Side effects: sets ``self.ntiles_`` (numpy array of per-bucket rows),
     ``self.columns_`` (list of column names) and ``self._is_trained``.
     """
     ntiles = []
     for col in featureCols:
         # Unpartitioned window ordered by the feature: ntile assigns each
         # row to one of self._n equal-sized, value-ordered buckets.
         w = Window.partitionBy().orderBy(col)
         aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
         ntiles.append(list(aux.groupby('ntile').max(col).collect()))

     self.ntiles_ = np.array(ntiles)
     # list(...) around map: under Python 3 a bare map() is a one-shot
     # iterator, so columns_ would be exhausted after a single use and
     # neither indexable nor len()-able.  Identical result under Python 2.
     self.columns_ = list(map(str, featureCols))
     self._is_trained = True
开发者ID:elmi-gemini,项目名称:pyspark_utils,代码行数:13,代码来源:outlier_remover.py

示例3: compute

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def compute(day):
    """Compute a trending-pages score for *day* and dump the top 10 to CSV.

    Uses the 30 days preceding *day* (day-30 .. day-1): day-over-day view
    deltas are weighted by 1/(day - d) so recent growth counts more, then
    normalised by sqrt(total views).  Writes the result to filename(day)
    and clears the Spark cache.

    NOTE(review): relies on module-level globals defined elsewhere in the
    original file -- `wikipediadata` (hourly page-view DataFrame), `hc`
    (Hive/SQL context), `F`, `pd`, and `filename()`.  Verify they are in
    scope before reuse.
    """
    # Keep only days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Small test subset (kept for debugging)
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Collapse the hourly rows into one daily total per (project, page)
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache: `sums` is reused below for both the diffs and the final join
    sums.cache()

    # Window := the single previous day within each (project, page) series
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # Day-over-day difference: views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # Per-day weights: the more recent the day, the larger the coefficient
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # Weighted per-day contribution to the score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalise by the square root of the total number of views,
    # then keep the 10 best-scoring pages
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    # Re-attach the daily view counts for the winning pages only
    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    # Drop the join's duplicated (projectcode, page) columns, then pivot to
    # one column per day
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # Drop the cached data
    hc.clearCache()
开发者ID:cygilbert,项目名称:projetnosql,代码行数:54,代码来源:Req30j.py

示例4: runOtherFunctions

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def runOtherFunctions(spark, personDf):
    """Demonstrate assorted column functions: array, desc/asc, split, length,
    and the row_number/rank window functions."""
    trio = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array(): pack several columns into one array column
    trio.select(trio.c1, trio.c2, trio.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc / asc ordering
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # NOTE: pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first, asc_nulls_last

    # split and length (columns can be referenced as df["col"] or df.col)
    sentences = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    sentences.select(sentences.value, split(sentences.value, " "), length(sentences.value)).show(truncate=False)

    # row_number and rank over windows
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("product", StringType(), True),
        StructField("amount", IntegerType(), True),
    ])

    purchases = [
        ("2017-12-25 12:01:00", "note", 1000),
        ("2017-12-25 12:01:10", "pencil", 3500),
        ("2017-12-25 12:03:20", "pencil", 23000),
        ("2017-12-25 12:05:00", "note", 1500),
        ("2017-12-25 12:05:07", "note", 2000),
        ("2017-12-25 12:06:25", "note", 1000),
        ("2017-12-25 12:08:00", "pencil", 500),
        ("2017-12-25 12:09:45", "note", 30000),
    ]

    sales = spark.createDataFrame(purchases, schema)
    per_product = Window.partitionBy("product").orderBy("amount")
    overall = Window.orderBy("amount")
    sales.select(sales.product, sales.amount,
                 functions.row_number().over(per_product).alias("rownum"),
                 functions.rank().over(overall).alias("rank")).show()
开发者ID:oopchoi,项目名称:spark,代码行数:38,代码来源:dataframe_sample.py

示例5: with_window_column

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def with_window_column(df):
     """Return *df* with a new column "r": the percent rank of 'forecast'
     within each 'id' partition."""
     from pyspark.sql.functions import percent_rank
     from pyspark.sql.window import Window

     spec = Window.partitionBy(df['id']).orderBy(df['forecast'])
     return df.withColumn("r", percent_rank().over(spec))
开发者ID:mattomatic,项目名称:flint,代码行数:7,代码来源:test_partition_preserve.py

示例6: unpartitioned_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def unpartitioned_window(self):
     """Return a window specification with no partitioning (one global frame)."""
     spec = Window.partitionBy()
     return spec
开发者ID:git-prodigy,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py

示例7: ordered_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def ordered_window(self):
     """Return a window partitioned by 'id' and ordered by 'v'."""
     spec = Window.partitionBy('id')
     return spec.orderBy('v')
开发者ID:git-prodigy,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py

示例8: unbounded_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def unbounded_window(self):
     """Return an 'id'-partitioned window spanning every row of its partition."""
     spec = Window.partitionBy('id')
     return spec.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
开发者ID:git-prodigy,项目名称:spark,代码行数:5,代码来源:test_pandas_udf_window.py

示例9: cal_mat_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def cal_mat_window(sc, sqlContext, dfSC, window):
    """Print symbol/date/close rows together with a `window`-wide moving
    average of the close price, computed per symbol.

    Parameters:
        sc, sqlContext: Spark contexts (unused here; kept for the caller's
            calling convention).
        dfSC: DataFrame with at least `symbol`, `date`, `close` columns.
        window: size of the moving-average range window.
    """
    # Range window over `date` covering [current - (window-1), current + 1].
    # NOTE(review): the upper bound of 1 (instead of 0) also pulls in the
    # next value -- confirm this is the intended moving-average definition.
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window+1, 1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    # print(...) instead of the bare Python-2 `print` statement, which is a
    # SyntaxError on Python 3; output is identical on both versions for a
    # single argument.
    print(dfSC.collect())
开发者ID:hongbin0908,项目名称:bintrade,代码行数:7,代码来源:mat_close.py

示例10: sliding_range_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def sliding_range_window(self):
     """Return an 'id'-partitioned window over the value range [v-2, v+4]."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rangeBetween(-2, 4)
开发者ID:Brett-A,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py

示例11: sliding_row_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def sliding_row_window(self):
     """Return an 'id'-partitioned window covering rows [current-2, current+1]."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rowsBetween(-2, 1)
开发者ID:Brett-A,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py

示例12: shrinking_range_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def shrinking_range_window(self):
     """Return a range window from v-3 through the end of the partition."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rangeBetween(-3, Window.unboundedFollowing)
开发者ID:Brett-A,项目名称:spark,代码行数:5,代码来源:test_pandas_udf_window.py

示例13: shrinking_row_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def shrinking_row_window(self):
     """Return a row window from two rows back through the end of the partition."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rowsBetween(-2, Window.unboundedFollowing)
开发者ID:Brett-A,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py

示例14: growing_range_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def growing_range_window(self):
     """Return a range window from the partition start up to v+4."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rangeBetween(Window.unboundedPreceding, 4)
开发者ID:Brett-A,项目名称:spark,代码行数:5,代码来源:test_pandas_udf_window.py

示例15: growing_row_window

# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
 def growing_row_window(self):
     """Return a row window from the partition start to three rows ahead."""
     ordered = Window.partitionBy('id').orderBy('v')
     return ordered.rowsBetween(Window.unboundedPreceding, 3)
开发者ID:Brett-A,项目名称:spark,代码行数:4,代码来源:test_pandas_udf_window.py


注:本文中的pyspark.sql.window.Window.partitionBy方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。