本文整理汇总了Python中pyspark.sql.window.Window.partitionBy方法的典型用法代码示例。如果您正苦于以下问题：Python Window.partitionBy方法的具体用法？Python Window.partitionBy怎么用？Python Window.partitionBy使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.window.Window的用法示例。
在下文中一共展示了Window.partitionBy方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_window_functions
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def test_window_functions(self):
    """Check aggregate and ranking functions over a partitioned, ordered window.

    Builds a four-row DataFrame with columns ``key`` and ``value``, evaluates
    several window expressions over ``partitionBy("value").orderBy("key")``,
    and compares the sorted, collected rows with the expected tuples.
    """
    from pyspark.sql import functions as F

    df = self.sqlCtx.createDataFrame(
        [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]
    )
    w = Window.partitionBy("value").orderBy("key")
    sel = df.select(
        df.value,
        df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        # Use the snake_case names: the camelCase aliases rowNumber/denseRank
        # were deprecated and are not available in later Spark releases.
        F.row_number().over(w),
        F.rank().over(w),
        F.dense_rank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    # zip() stops at the shorter sequence, so a short result set would
    # otherwise pass without every expected row being checked.
    self.assertEqual(len(rs), len(expected))
    for r, ex in zip(rs, expected):
        # Compare only as many fields as the collected row actually has.
        self.assertEqual(tuple(r), ex[: len(r)])
示例2: train
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def train(self, df, featureCols):
    """Collect per-feature ntile boundaries from ``df`` and mark the instance trained.

    For every column in ``featureCols`` the rows are ordered by that column,
    bucketed with ``F.ntile(self._n)``, and the maximum column value of each
    bucket is collected.

    Args:
        df: DataFrame providing the feature columns.
        featureCols: iterable of column names to process.

    Sets ``self.ntiles_`` (array built from the collected rows),
    ``self.columns_`` (list of column names as ``str``) and
    ``self._is_trained``.
    """
    # Materialise once: the columns are iterated twice below, which would
    # silently yield nothing the second time for a one-shot iterator.
    feature_cols = list(featureCols)
    ntiles = []
    for col in feature_cols:
        w = Window.partitionBy().orderBy(col)
        aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
        ntiles.append(list(aux.groupby('ntile').max(col).collect()))
    self.ntiles_ = np.array(ntiles)
    # A real list (not a lazy ``map`` object) so it can be read repeatedly
    # on Python 3 as well as Python 2.
    self.columns_ = [str(col) for col in feature_cols]
    self._is_trained = True
示例3: compute
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def compute(day):
    """Score pages over the 30 days before ``day`` and write the top 10 to CSV.

    Reads the module-level ``wikipediadata`` DataFrame, uses ``hc`` to create
    DataFrames and clear the cache, and writes to the path returned by
    ``filename(day)``.
    """
    # Keep only days day-30 through day-1.
    in_range = (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1)
    recent = wikipediadata.where(in_range)
    # Test subset (disabled):
    #recent = recent.where((recent.page == 'Cadillac_Brougham') | ((recent.page == 'Roald_Dahl') & (recent.projectcode == 'fr')))

    # Sum the hourly figures of each day.
    daily = recent.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for the reuse further down.
    daily.cache()
    # NOTE(review): the code below reads ``daily.views`` although the
    # aggregate column is presumably named 'SUM(views)' (the naming relied on
    # for the later totals) -- confirm this resolves against the real data.

    # Window frame := the previous day's row only.
    previous_day = (
        Window.partitionBy(daily.projectcode, daily.page)
        .orderBy(daily.day.asc())
        .rowsBetween(-1, -1)
    )
    # Difference views(d) - views(d-1).
    previous_views = F.sum(daily.views).over(previous_day)
    diffs = daily.withColumn('diff', daily.views - previous_views)

    # Coefficient applied to each day: 1 / (day - d).
    coef_table = pd.DataFrame({'day': range(day-30, day)})
    coef_table['coef'] = 1. / (day - coef_table.day)
    coefs = hc.createDataFrame(coef_table)
    weighted = diffs.join(coefs, 'day')

    # Per-day score.
    weighted = weighted.withColumn('sub_score', weighted.diff * weighted.coef)
    grouped = weighted.groupby('projectcode', 'page').sum('views', 'sub_score')

    # Normalise by the square root of the summed views, keep the 10 best.
    normalised = grouped['SUM(sub_score)'] / F.sqrt(grouped['SUM(views)'])
    scored = grouped.withColumn('score', normalised)
    ranked = scored.orderBy(F.desc('score'))
    totals = ranked.withColumnRenamed('SUM(views)', 'total_views').limit(10)

    same_page = (
        (totals.projectcode == daily.projectcode) & (totals.page == daily.page)
    )
    views = daily.select('projectcode', 'page', 'day', 'views').join(
        totals.select('projectcode', 'page', 'total_views', 'score'),
        same_page,
        'right_outer',
    )

    summary = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    per_day = views.toPandas()
    # Drop the first two columns (presumably the left side's duplicate
    # projectcode/page produced by the join -- verify).
    per_day = per_day.iloc[:, 2:]
    per_day = per_day.pivot_table(
        values='views',
        columns=['day'],
        index=['projectcode', 'page'],
        fill_value=0,
    )
    summary = summary.merge(
        per_day, left_on=['projectcode', 'page'], right_index=True
    )
    summary.to_csv(filename(day), index=False)

    # Empty the cache.
    hc.clearCache()
示例4: runOtherFunctions
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def runOtherFunctions(spark, personDf):
    """Print small demonstrations of several column and window functions.

    Args:
        spark: session object used to create the example DataFrames.
        personDf: DataFrame with ``age`` and ``name`` columns, shown and sorted.
    """
    # array
    sample = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])
    packed = array("c1", "c2", "c3").alias("newCol")
    sample.select(sample.c1, sample.c2, sample.c3, packed).show(truncate=False)

    # desc, asc
    personDf.show()
    by_age_then_name = personDf.sort(functions.desc("age"), functions.asc("name"))
    by_age_then_name.show()
    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first, asc_nulls_last.

    # split, length (in pyspark a column can be written as df["col"] or df.col)
    text_df = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    text_df.select(
        text_df.value,
        split(text_df.value, " "),
        length(text_df.value),
    ).show(truncate=False)

    # rownum, rank
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("product", StringType(), True),
        StructField("amount", IntegerType(), True),
    ])
    sales = [
        ("2017-12-25 12:01:00", "note", 1000),
        ("2017-12-25 12:01:10", "pencil", 3500),
        ("2017-12-25 12:03:20", "pencil", 23000),
        ("2017-12-25 12:05:00", "note", 1500),
        ("2017-12-25 12:05:07", "note", 2000),
        ("2017-12-25 12:06:25", "note", 1000),
        ("2017-12-25 12:08:00", "pencil", 500),
        ("2017-12-25 12:09:45", "note", 30000),
    ]
    dd = spark.createDataFrame(sales, schema)
    per_product = Window.partitionBy("product").orderBy("amount")
    overall = Window.orderBy("amount")
    dd.select(
        dd.product,
        dd.amount,
        functions.row_number().over(per_product).alias("rownum"),
        functions.rank().over(overall).alias("rank"),
    ).show()
示例5: with_window_column
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def with_window_column(df):
    """Return ``df`` with an extra column ``r``.

    ``r`` is ``percent_rank()`` evaluated over a window partitioned by the
    ``id`` column and ordered by the ``forecast`` column.
    """
    from pyspark.sql.functions import percent_rank
    from pyspark.sql.window import Window

    per_id = Window.partitionBy(df['id'])
    by_forecast = per_id.orderBy(df['forecast'])
    rank_column = percent_rank().over(by_forecast)
    return df.withColumn("r", rank_column)
示例6: unpartitioned_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def unpartitioned_window(self):
    """Return a window specification built with no partitioning columns."""
    spec = Window.partitionBy()
    return spec
示例7: ordered_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def ordered_window(self):
    """Return a window partitioned by ``id`` and ordered by ``v``."""
    per_id = Window.partitionBy('id')
    return per_id.orderBy('v')
示例8: unbounded_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def unbounded_window(self):
    """Return a window partitioned by ``id`` whose row frame is unbounded on both sides."""
    per_id = Window.partitionBy('id')
    return per_id.rowsBetween(
        Window.unboundedPreceding,
        Window.unboundedFollowing,
    )
示例9: cal_mat_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def cal_mat_window(sc, sqlContext, dfSC, window):
    """Print symbol, date, close and a windowed average of ``close`` for ``dfSC``.

    Args:
        sc: unused here; kept so existing callers keep working.
        sqlContext: unused here; kept so existing callers keep working.
        dfSC: DataFrame with ``symbol``, ``date`` and ``close`` columns.
        window: integer used to derive the lower bound of the range frame.

    Returns:
        None. The collected rows are printed.
    """
    # Range frame over the ``date`` ordering within each symbol, running
    # from (1 - window) to +1.
    windowSpec = (
        Window.partitionBy("symbol")
        .orderBy("date")
        .rangeBetween(-1 * window + 1, 1)
    )
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    # Call form of print: valid syntax on Python 3 and, with a single
    # argument, prints the same text on Python 2.
    print(dfSC.collect())
示例10: sliding_range_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def sliding_range_window(self):
    """Return a window over ``id`` ordered by ``v`` with a range frame from -2 to 4."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rangeBetween(-2, 4)
示例11: sliding_row_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def sliding_row_window(self):
    """Return a window over ``id`` ordered by ``v`` with a row frame from -2 to 1."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rowsBetween(-2, 1)
示例12: shrinking_range_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def shrinking_range_window(self):
    """Return a window over ``id`` ordered by ``v`` with a range frame from -3 to unbounded following."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rangeBetween(-3, Window.unboundedFollowing)
示例13: shrinking_row_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def shrinking_row_window(self):
    """Return a window over ``id`` ordered by ``v`` with a row frame from -2 to unbounded following."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rowsBetween(-2, Window.unboundedFollowing)
示例14: growing_range_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def growing_range_window(self):
    """Return a window over ``id`` ordered by ``v`` with a range frame from unbounded preceding to 4."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rangeBetween(Window.unboundedPreceding, 4)
示例15: growing_row_window
# 需要导入模块: from pyspark.sql.window import Window [as 别名]
# 或者: from pyspark.sql.window.Window import partitionBy [as 别名]
def growing_row_window(self):
    """Return a window over ``id`` ordered by ``v`` with a row frame from unbounded preceding to 3."""
    ordered = Window.partitionBy('id').orderBy('v')
    return ordered.rowsBetween(Window.unboundedPreceding, 3)