This page collects typical usage examples of the Python function pyspark.sql.functions.col. If you are wondering what col does in practice, how to call it, or simply want to see it used in real code, the curated examples below should help.
A total of 15 code examples of col are shown, sorted by popularity by default.
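Before the collected examples, here is a minimal, self-contained sketch of what col itself does. The column names and sample rows below are made up purely for illustration and do not come from any of the examples that follow.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").appName("col_demo").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# col("...") returns a lazy Column expression that can be combined with operators,
# renamed with alias(), and used in filter/select/sort without touching any data yet
df.filter(col("value") == "a") \
  .select((col("id") + 1).alias("id_plus_one")) \
  .show()

The examples below show the same pattern in real test suites and applications: col builds Column expressions that are passed to select, agg, withColumn, filter, and window or UDF calls.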
Example 1: test_basic
def test_basic(self):
    df = self.data
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    # Groupby one column and aggregate one UDF with literal
    result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
    expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

    # Groupby one expression and aggregate one UDF with literal
    result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
        .sort(df.id + 1)
    expected2 = df.groupby((col('id') + 1))\
        .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

    # Groupby one column and aggregate one UDF without literal
    result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
    expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

    # Groupby one expression and aggregate one UDF without literal
    result4 = df.groupby((col('id') + 1).alias('id'))\
        .agg(weighted_mean_udf(df.v, df.w))\
        .sort('id')
    expected4 = df.groupby((col('id') + 1).alias('id'))\
        .agg(mean(df.v).alias('weighted_mean(v, w)'))\
        .sort('id')
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example 2: test_vectorized_udf_dates
def test_vectorized_udf_dates(self):
    schema = StructType().add("idx", LongType()).add("date", DateType())
    data = [(0, date(1969, 1, 1),),
            (1, date(2012, 2, 2),),
            (2, None,),
            (3, date(2100, 4, 4),),
            (4, date(2262, 4, 12),)]
    df = self.spark.createDataFrame(data, schema=schema)

    date_copy = pandas_udf(lambda t: t, returnType=DateType())
    df = df.withColumn("date_copy", date_copy(col("date")))

    @pandas_udf(returnType=StringType())
    def check_data(idx, date, date_copy):
        import pandas as pd
        msgs = []
        is_equal = date.isnull()
        for i in range(len(idx)):
            if (is_equal[i] and data[idx[i]][1] is None) or \
                    date[i] == data[idx[i]][1]:
                msgs.append(None)
            else:
                msgs.append(
                    "date values are not equal (date='%s': data[%d][1]='%s')"
                    % (date[i], idx[i], data[idx[i]][1]))
        return pd.Series(msgs)

    result = df.withColumn("check_data",
                           check_data(col("idx"), col("date"), col("date_copy"))).collect()

    self.assertEquals(len(data), len(result))
    for i in range(len(result)):
        self.assertEquals(data[i][1], result[i][1])  # "date" col
        self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
        self.assertIsNone(result[i][3])  # "check_data" col
Example 3: test_vectorized_udf_string_in_udf
def test_vectorized_udf_string_in_udf(self):
    import pandas as pd

    df = self.spark.range(10)
    str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
    actual = df.select(str_f(col('id')))
    expected = df.select(col('id').cast('string'))
    self.assertEquals(expected.collect(), actual.collect())
Example 4: test_mixed_sql_and_udf
def test_mixed_sql_and_udf(self):
    df = self.data
    w = self.unbounded_window
    ow = self.ordered_window
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
    expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

    # Test mixing sql window function and window udf in the same expression
    result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
    expected2 = expected1

    # Test chaining sql aggregate function and udf
    result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('min_v', min(df['v']).over(w)) \
        .withColumn('v_diff', col('max_v') - col('min_v')) \
        .drop('max_v', 'min_v')
    expected3 = expected1

    # Test mixing sql window function and udf
    result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))
    expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example 5: data
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))) \
        .drop('vs') \
        .withColumn('w', lit(1.0))
Example 6: test_column_getitem
def test_column_getitem(self):
    from pyspark.sql.functions import col

    self.assertIsInstance(col("foo")[1:3], Column)
    self.assertIsInstance(col("foo")[0], Column)
    self.assertIsInstance(col("foo")["bar"], Column)
    self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
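A hedged usage sketch of the indexing forms this test exercises; the SparkSession, DataFrame, and column names here are hypothetical:

from pyspark.sql.functions import col

# Hypothetical data: a string column plus a map column
df = spark.createDataFrame([("Alice", {"city": "NYC"})], ["name", "info"])
df.select(
    col("name")[1:3].alias("name_part"),   # slicing a string Column delegates to substr()
    col("info")["city"].alias("city"),     # key/field access delegates to getItem()
).show()

Step slicing such as col("foo")[0:10:2] is not supported, which is why the test expects a ValueError.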
Example 7: scalar_pandas_udf_example
def scalar_pandas_udf_example(spark):
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
Example 8: generateExpr
def generateExpr(columnName, listIntervals):
    if (len(listIntervals) == 1):
        return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
    else:
        return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                     len(listIntervals) - 1)
                .otherwise(generateExpr(columnName, listIntervals[1:])))
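A hedged usage sketch for the helper above; the DataFrame df, the column name "age", and the interval list are hypothetical and only illustrate the call pattern (when and col come from pyspark.sql.functions):

# Hypothetical bins: the first interval is labelled len(intervals) - 1, the last one 0,
# and values that fall outside every interval become NULL
intervals = [(0, 18), (18, 65), (65, 120)]
df_binned = df.withColumn("age_bucket", generateExpr("age", intervals))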
Example 9: setup_method
def setup_method(self, method):
    sparkConf = create_spark_conf().setMaster("local[4]")\
        .setAppName("test wide and deep")
    self.sc = init_nncontext(sparkConf)
    self.sqlContext = SQLContext(self.sc)
    data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")

    categorical_gender_udf = udf(lambda gender:
                                 categorical_from_vocab_list(gender, ["F", "M"], start=1))
    bucket_udf = udf(lambda feature1, feature2:
                     hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))

    self.data_in = self.sqlContext.read.parquet(data_path) \
        .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
        .withColumn("occupation-gender",
                    bucket_udf(col("occupation"), col("gender")).cast("int"))

    self.column_info = ColumnFeatureInfo(
        wide_base_cols=["occupation", "gender"],
        wide_base_dims=[21, 3],
        wide_cross_cols=["occupation-gender"],
        wide_cross_dims=[100],
        indicator_cols=["occupation", "gender"],
        indicator_dims=[21, 3],
        embed_cols=["userId", "itemId"],
        embed_in_dims=[100, 100],
        embed_out_dims=[20, 20],
        continuous_cols=["age"])
Example 10: test_smvPlusDateTime
def test_smvPlusDateTime(self):
    df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")

    r1 = df.select(col("t").smvPlusDays(-10).alias("ts"))
    r2 = df.select(col("t").smvPlusMonths(1).alias("ts"))
    r3 = df.select(col("t").smvPlusWeeks(3).alias("ts"))
    r4 = df.select(col("t").smvPlusYears(2).alias("ts"))
    r5 = df.select(col("t").smvPlusYears(4).alias("ts"))

    s = "ts: Timestamp[yyyy-MM-dd hh:mm:ss.S]"
    e1 = self.createDF(
        s,
        "1976-01-21 00:00:00.0;" +
        "2012-02-19 00:00:00.0")
    e2 = self.createDF(
        s,
        "1976-02-29 00:00:00.0;" +
        "2012-03-29 00:00:00.0")
    e3 = self.createDF(
        s,
        "1976-02-21 00:00:00.0;" +
        "2012-03-21 00:00:00.0")
    e4 = self.createDF(
        s,
        "1978-01-31 00:00:00.0;" +
        "2014-02-28 00:00:00.0")
    e5 = self.createDF(
        s,
        "1980-01-31 00:00:00.0;" +
        "2016-02-29 00:00:00.0")

    self.should_be_same(e1, r1)
    self.should_be_same(e2, r2)
    self.should_be_same(e3, r3)
    self.should_be_same(e4, r4)
    self.should_be_same(e5, r5)
Example 11: test_udf_with_filter_function
def test_udf_with_filter_function(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import BooleanType

    my_filter = udf(lambda a: a < 2, BooleanType())
    sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2"))
    self.assertEqual(sel.collect(), [Row(key=1, value='1')])
Example 12: get_latest_data
def get_latest_data(self):
    from pyspark.sql import SparkSession
    import config
    import pandas as pd

    # initialise sparkContext
    spark1 = SparkSession.builder \
        .master(config.sp_master) \
        .appName(config.sp_appname) \
        .config('spark.executor.memory', config.sp_memory) \
        .config("spark.cores.max", config.sp_cores) \
        .getOrCreate()
    sc = spark1.sparkContext

    # using SQLContext to read parquet file
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)

    from datetime import datetime
    t1 = datetime.now()
    df = sqlContext.read.parquet(config.proj_path + '/datas/appid_datapoint_parquet1')
    df2 = sqlContext.read.parquet(config.proj_path + '/datas/appid_attribute_parquet')
    df2 = df2[['attribute_id', 'source', 'target_address', 'location']]

    # renaming the columns
    from pyspark.sql.functions import col
    df2 = df2.select(col("attribute_id").alias("target_attribute_id"),
                     col("source").alias("source_y"),
                     col("target_address").alias("target_address_y"),
                     col("location").alias("location"),
                     )

    # merging the dfs
    df_merge = df.join(df2, how='left', on='target_attribute_id')

    # Needed data extraction
    t1 = datetime.now()
    data = df_merge.registerTempTable('dummy')
    data = sqlContext.sql('select sum(byte_count) as byte_count_sum, time_stamp, location from dummy group by location, time_stamp')
    data = data[data.byte_count_sum > 0]

    # data cleaning
    self.p7_df = data.toPandas()
    t2 = datetime.now()
    time_to_fetch = str(t2 - t1)

    self.p7_df['bw'] = self.p7_df['byte_count_sum'] / (8 * 3600)
    self.p7_df = self.p7_df.sort_values(by='location', ascending=True)
    dates_outlook = pd.to_datetime(pd.Series(self.p7_df.time_stamp), unit='ms')
    self.p7_df.index = dates_outlook
    self.p7_df['date'] = self.p7_df.index.date
    self.p7_df = self.p7_df.sort_values(by='time_stamp')

    t2 = datetime.now()
    time_to_fetch = str(t2 - t1)
Example 13: test_smvRenameField_preserve_meta_for_unrenamed_fields
def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
    df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
    desc = "c description"

    res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
        .smvDesc(("c", desc))
    self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

    res2 = res1.smvRenameField(("a", "d"))
    self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
Example 14: test_cast_to_string_with_udt
def test_cast_to_string_with_udt(self):
    from pyspark.sql.functions import col

    row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
    schema = StructType([StructField("point", ExamplePointUDT(), False),
                         StructField("pypoint", PythonOnlyUDT(), False)])
    df = self.spark.createDataFrame([row], schema)

    result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head()
    self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]'))
Example 15: test_smvDayMonth70
def test_smvDayMonth70(self):
    df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")

    r1 = df.select(col("t").smvDay70().alias("t_day70"))
    r2 = df.select(col("t").smvMonth70().alias("t_month70"))

    e1 = self.createDF("t_day70: Integer", "2221;15399")
    e2 = self.createDF("t_month70: Integer", "72;505")

    self.should_be_same(e1, r1)
    self.should_be_same(e2, r2)