This article collects typical usage examples of the Python method pyspark.sql.window.Window.orderBy. If you have been wondering what Window.orderBy does, how to use it, or what it looks like in practice, the curated code samples below may help. You can also explore further usage examples of the containing class, pyspark.sql.window.Window.
Three code examples of Window.orderBy are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
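Before the examples, here is a minimal, self-contained sketch of the basic pattern (assuming PySpark 2.x or later; the SparkSession setup, DataFrame, and column names are illustrative assumptions, not taken from the examples below):

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 3), ("a", 1), ("b", 2)], ["key", "value"])

# Window.orderBy defines the ordering used by window functions; with no
# partitionBy, the whole DataFrame is treated as a single partition.
w = Window.orderBy("value")
df.withColumn("rn", row_number().over(w)).show()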
Example 1: test_window_functions_without_partitionBy
# Required module: from pyspark.sql.window import Window [as alias]
# Or: from pyspark.sql.window.Window import orderBy [as alias]
def test_window_functions_without_partitionBy(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.orderBy("key", df.value)
    from pyspark.sql import functions as F
    sel = df.select(
        df.value,
        df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 4, 1, 1, 1, 1),
        ("2", 1, 1, 1, 4, 2, 2, 2, 1),
        ("2", 1, 2, 1, 4, 3, 2, 2, 2),
        ("2", 2, 2, 2, 4, 4, 4, 3, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[: len(r)])
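Note that this test comes from an older PySpark release: the SQLContext is exposed as self.sqlCtx, and F.rowNumber and F.denseRank are the pre-1.6 names of these window functions. On later releases the equivalents are F.row_number and F.dense_rank; the camelCase variants were deprecated in Spark 1.6 and removed in 2.0, so those substitutions are needed to run the example on a current version.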
Example 2: runOtherFunctions
# Required module: from pyspark.sql.window import Window [as alias]
# Or: from pyspark.sql.window.Window import orderBy [as alias]
# Additional imports used by this example (besides Window from the header above):
from pyspark.sql import functions
from pyspark.sql.functions import array, split, length
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()
    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first, or asc_nulls_last.

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)
    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)

    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
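As a follow-up, here is an illustrative sketch (not part of the original example) of how the same partitioned-window pattern gives a top-N per group; it reuses the dd DataFrame and the functions module from Example 2:

    # Keep the top 2 rows per product by amount, using a window
    # partitioned by product and ordered descending.
    top_w = Window.partitionBy("product").orderBy(functions.desc("amount"))
    (dd.withColumn("rownum", functions.row_number().over(top_w))
       .filter("rownum <= 2")
       .drop("rownum")
       .show())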
Example 3: collect_numeric_metric
# Required module: from pyspark.sql.window import Window [as alias]
# Or: from pyspark.sql.window.Window import orderBy [as alias]
# cume_dist and row_number come from pyspark.sql.functions:
from pyspark.sql.functions import cume_dist, row_number

def collect_numeric_metric(metric, df, population):
    # NOTE: dataset_id, DEBUG_SQL, cursor, and conn are module-level names
    # defined elsewhere in the original source.
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0, 'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0, 'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0, 'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0, 'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()
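To make the cume_dist / row_number step above easier to follow, here is a minimal standalone sketch (the SparkSession, toy data, and column name are assumptions for illustration). cume_dist() over a window ordered by 'bucket' yields, for each row, the fraction of rows whose value is less than or equal to it, i.e. an empirical CDF, which is what the 'c' column holds in Example 3; row_number() gives the 1-based position used for the sparse sampling filter.

    from pyspark.sql import SparkSession
    from pyspark.sql.window import Window
    from pyspark.sql.functions import cume_dist, row_number

    spark = SparkSession.builder.getOrCreate()
    toy = spark.createDataFrame([(v,) for v in [1.0, 2.0, 2.0, 5.0]], ["bucket"])

    ws = Window.orderBy("bucket")
    toy.select(
        "bucket",
        cume_dist().over(ws).alias("c"),   # fraction of rows <= current bucket
        row_number().over(ws).alias("i"),  # 1-based position in the ordering
    ).show()
    # Expected: c = 0.25 for bucket 1.0, 0.75 for both 2.0 rows, 1.0 for 5.0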