This article collects typical usage examples of the Python function pyspark.sql.functions.udf. If you are wondering what udf does, how it is called, and what real code that uses it looks like, the curated examples below should help.
A total of 15 code examples of the udf function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
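Before the numbered examples, here is a minimal sketch (not taken from the list below; the SparkSession setup, DataFrame, and column names are illustrative) showing the two common ways to register a udf: wrapping a plain callable with an explicit return type, and the decorator form.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[2]").appName("udf-demo").getOrCreate()
df = spark.createDataFrame([("alice",), ("bob",)], ["name"])

# Function style: wrap a Python callable and declare the Spark return type.
name_len = udf(lambda s: len(s), IntegerType())
df.withColumn("name_len", name_len(col("name"))).show()

# Decorator style: the decorated function itself becomes the column expression.
@udf(returnType=IntegerType())
def double_len(s):
    return 2 * len(s)

df.withColumn("double_len", double_len(col("name"))).show()

Either form yields a column expression that executes the Python function row by row on the executors; built-in Spark SQL functions are generally faster when they can express the same logic.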
Example 1: setup_method
def setup_method(self, method):
    sparkConf = create_spark_conf().setMaster("local[4]") \
        .setAppName("test wide and deep")
    self.sc = init_nncontext(sparkConf)
    self.sqlContext = SQLContext(self.sc)
    data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")
    categorical_gender_udf = udf(lambda gender:
                                 categorical_from_vocab_list(gender, ["F", "M"], start=1))
    bucket_udf = udf(lambda feature1, feature2:
                     hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))
    self.data_in = self.sqlContext.read.parquet(data_path) \
        .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
        .withColumn("occupation-gender",
                    bucket_udf(col("occupation"), col("gender")).cast("int"))
    self.column_info = ColumnFeatureInfo(
        wide_base_cols=["occupation", "gender"],
        wide_base_dims=[21, 3],
        wide_cross_cols=["occupation-gender"],
        wide_cross_dims=[100],
        indicator_cols=["occupation", "gender"],
        indicator_dims=[21, 3],
        embed_cols=["userId", "itemId"],
        embed_in_dims=[100, 100],
        embed_out_dims=[20, 20],
        continuous_cols=["age"])
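A note on Example 1: both udfs above are created without an explicit return type, so they produce string columns (StringType is the default for udf), which is why each call is followed by .cast("int"). A hedged variant, assuming categorical_from_vocab_list already returns an integer index, declares the type up front instead:

# Sketch only: same lookup as in Example 1, with the return type declared at
# registration time so the extra .cast("int") is not needed.
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

categorical_gender_udf = udf(
    lambda gender: categorical_from_vocab_list(gender, ["F", "M"], start=1),
    IntegerType())
# data_in = data_in.withColumn("gender", categorical_gender_udf(col("gender")))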
Example 2: test_pandas_udf_arrow_overflow
def test_pandas_udf_arrow_overflow(self):
    from distutils.version import LooseVersion
    import pandas as pd
    import pyarrow as pa

    df = self.spark.range(0, 1)

    @pandas_udf(returnType="byte")
    def udf(column):
        return pd.Series([128])

    # Arrow 0.11.0+ allows enabling or disabling safe type check.
    if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
        # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast.
        with self.sql_conf({
                "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
            with self.assertRaisesRegexp(Exception,
                                         "Exception thrown when converting pandas.Series"):
                df.withColumn('udf', udf('id')).collect()

        # Disabling safe type check, let Arrow do the cast anyway.
        with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
            df.withColumn('udf', udf('id')).collect()
    else:
        # The SQL config `arrowSafeTypeConversion` does not matter for older Arrow;
        # the overflow cast causes an error either way.
        with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
            with self.assertRaisesRegexp(Exception,
                                         "Integer value out of bounds"):
                df.withColumn('udf', udf('id')).collect()
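Examples 2 and 10 toggle the Arrow safe-conversion flag through the test helper self.sql_conf. Outside the test suite, the same switch can be flipped on a live session as an ordinary runtime SQL conf; the sketch below copies the conf key verbatim from the example, and the key may be named differently in newer Spark releases.

# Sketch: enabling/disabling the Arrow safe type check on a SparkSession named
# `spark`. The conf key is taken from Example 2 and may vary by Spark version.
spark.conf.set("spark.sql.execution.pandas.arrowSafeTypeConversion", True)
# ... pandas_udf workloads now fail fast on overflowing or lossy casts ...
spark.conf.set("spark.sql.execution.pandas.arrowSafeTypeConversion", False)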
Example 3: extract_pre
def extract_pre(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("text", StringType())
    ])
    first_ex = udf(extract_first_sentence, StringType())
    inf_ex2 = udf(extract_names, StringType())
    url_ex = udf(extract_urlpattern, StringType())
    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles.csv')
    # df = df.repartition(100)
    df = df.withColumn("first_sentence", first_ex(df.text))
    df = df.withColumn('infoboxnames', inf_ex2(df.text))
    df = df.withColumn("urlwords", url_ex(df.title))
    df['title', 'first_sentence', 'infoboxnames', 'urlwords'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated_pre')
Example 4: cat2Num
def cat2Num(self, df, indices):
    '''sbaronia - extract the categorical data and make df out of it
    so oneHotEncoding can be run on them'''
    protocol_ind0 = df.select(df.id, df.rawFeatures[indices[0]].alias("features0")).cache()
    protocol_ind1 = df.select(df.id, df.rawFeatures[indices[1]].alias("features1")).cache()

    ind0_enc = self.oneHotEncoding(protocol_ind0, "features0").cache()
    ind1_enc = self.oneHotEncoding(protocol_ind1, "features1").cache()

    '''sbaronia - add those hot encoded features columns to original df'''
    int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache()
    int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache()

    '''sbaronia - now create a new column features which has
    converted vector form and drop rest columns'''
    comb_udf = udf(replaceCat2Num, StringType())
    int_join_2 = int_join_2.select(int_join_2.id, int_join_2.rawFeatures,
                                   comb_udf(int_join_2.rawFeatures,
                                            int_join_2.num_features0,
                                            int_join_2.num_features1).alias("features")).cache()

    '''sbaronia - convert list of numerical features to DenseVector
    so they can be used in KMeans'''
    dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT())
    feat = int_join_2.select(int_join_2.id, int_join_2.rawFeatures,
                             dense_udf(int_join_2.features).alias("features")).cache()

    return feat
Example 5: text_features
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2",
                   "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)
    return p_df
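A small design note on Example 5: diff_len wraps a plain subtraction in a Python udf applied over array("len_q1", "len_q2"). Simple arithmetic like this can stay in native Spark SQL and avoid the Python serialization round trip; a hedged one-liner using the columns already defined above:

# Sketch: native column arithmetic instead of the diff_len udf from Example 5.
from pyspark.sql.functions import col
p_df = p_df.withColumn("diff_len", col("len_q1") - col("len_q2"))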
Example 6: extract_nlp
def extract_nlp(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("first_sentence", StringType()),
        StructField("infoboxnames", StringType()),
        StructField("urlwords", StringType())
    ])
    noun_ex = udf(extract_nouns, StringType())
    pos_ex = udf(extract_hypernyms, StringType())
    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles_annotated_pre.csv')
    df = df.withColumn("nouns", noun_ex(df.first_sentence))
    df = df.withColumn("pos_hyps", pos_ex(df.first_sentence))
    df['title', 'urlwords', 'infoboxnames', 'nouns', 'pos_hyps'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated')
Example 7: test_udf_in_generate
def test_udf_in_generate(self):
    from pyspark.sql.functions import udf, explode
    df = self.spark.range(5)
    f = udf(lambda x: list(range(x)), ArrayType(LongType()))
    row = df.select(explode(f(*df))).groupBy().sum().first()
    self.assertEqual(row[0], 10)

    df = self.spark.range(3)
    res = df.select("id", explode(f(df.id))).collect()
    self.assertEqual(res[0][0], 1)
    self.assertEqual(res[0][1], 0)
    self.assertEqual(res[1][0], 2)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 2)
    self.assertEqual(res[2][1], 1)

    range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
    res = df.select("id", explode(range_udf(df.id))).collect()
    self.assertEqual(res[0][0], 0)
    self.assertEqual(res[0][1], -1)
    self.assertEqual(res[1][0], 0)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 1)
    self.assertEqual(res[2][1], 0)
    self.assertEqual(res[3][0], 1)
    self.assertEqual(res[3][1], 1)
Example 8: test_udf_wrapper
def test_udf_wrapper(self):
    from pyspark.sql.functions import udf
    from pyspark.sql.types import IntegerType

    def f(x):
        """Identity"""
        return x

    return_type = IntegerType()
    f_ = udf(f, return_type)
    self.assertTrue(f.__doc__ in f_.__doc__)
    self.assertEqual(f, f_.func)
    self.assertEqual(return_type, f_.returnType)

    class F(object):
        """Identity"""
        def __call__(self, x):
            return x

    f = F()
    return_type = IntegerType()
    f_ = udf(f, return_type)
    self.assertTrue(f.__doc__ in f_.__doc__)
    self.assertEqual(f, f_.func)
    self.assertEqual(return_type, f_.returnType)

    f = functools.partial(f, x=1)
    return_type = IntegerType()
    f_ = udf(f, return_type)
    self.assertTrue(f.__doc__ in f_.__doc__)
    self.assertEqual(f, f_.func)
    self.assertEqual(return_type, f_.returnType)
Example 9: make_prediction
def make_prediction(event, df):
    event_timestamp, event_dayofweek, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, event_passenger_count = \
        event[0], event[1], event[2], event[3], event[4], event[5], event[6]

    udf_diff_timeofday = udf(utils.diff_timeofday, IntegerType())
    udf_shortest_distance = udf(utils.shortest_distance, FloatType())

    df = df.withColumn("diff_timeofday", udf_diff_timeofday(df.pickup, lit(event_timestamp))).filter("`diff_timeofday` < 30")
    df = df.withColumn("event_sum_distance",
                       udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon)) +
                       udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))) \
        .filter("`event_sum_distance` < 2")
    df = df.sort('event_sum_distance')

    if df.count() < 10:
        return [0, 0]

    a = pd.DataFrame(df.take(50))
    a.columns = df.columns
    speed_array = a.as_matrix(["avg_speed"])
    dist_sf_array = a.as_matrix(["dist_sf"])
    distance_array = a["trip_distance"].tolist()
    fare_array = a["total_notip"].tolist()
    time_array = a["trip_time_in_secs"].tolist()

    # set initial parameter values
    x0 = [0.5, 0.5, 3.0, 3.0]
    bnds = ((0.25, 0.75), (0.25, 0.75), (0.1, 20), (0, 10))

    # perform the fit
    res = optimize.minimize(func_to_optimize, x0,
                            args=(distance_array, time_array, fare_array),
                            method='TNC', bounds=bnds)

    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    # get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3])

    if res.success:
        return [fare_pred, time_pred]
    else:
        return [0, 0]
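One portability note on Example 9: DataFrame.as_matrix was removed in pandas 1.0, so on a modern pandas the two lines that build speed_array and dist_sf_array need the to_numpy() equivalent sketched below (the column names are the ones used in the example).

# Sketch: pandas >= 1.0 replacement for the as_matrix calls in Example 9.
speed_array = a[["avg_speed"]].to_numpy()
dist_sf_array = a[["dist_sf"]].to_numpy()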
Example 10: test_pandas_udf_detect_unsafe_type_conversion
def test_pandas_udf_detect_unsafe_type_conversion(self):
    from distutils.version import LooseVersion
    import pandas as pd
    import numpy as np
    import pyarrow as pa

    values = [1.0] * 3
    pdf = pd.DataFrame({'A': values})
    df = self.spark.createDataFrame(pdf).repartition(1)

    @pandas_udf(returnType="int")
    def udf(column):
        return pd.Series(np.linspace(0, 1, 3))

    # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
    if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
        with self.sql_conf({
                "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
            with self.assertRaisesRegexp(Exception,
                                         "Exception thrown when converting pandas.Series"):
                df.select(['A']).withColumn('udf', udf('A')).collect()

    # Disabling Arrow safe type check.
    with self.sql_conf({
            "spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
        df.select(['A']).withColumn('udf', udf('A')).collect()
Example 11: test_nondeterministic_udf
def test_nondeterministic_udf(self):
    # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
    import random
    udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
    self.assertEqual(udf_random_col.deterministic, False)

    df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
    udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
    [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
    self.assertEqual(row[0] + 10, row[1])
Example 12: select_prediction_udf
def select_prediction_udf(self, column):
    if column not in self.get_output_names():
        raise ValueError("Column '" + column + "' is not defined as the output column in MOJO Pipeline.")

    if self.get_named_mojo_output_columns():
        func = udf(lambda d: d, DoubleType())
        return func("prediction." + column).alias(column)
    else:
        idx = self.get_output_names().index(column)
        func = udf(lambda arr: arr[idx], DoubleType())
        return func("prediction.preds").alias(column)
Example 13: cat2Num
def cat2Num(self, df, indices):
    unique_values = []
    for i in indices:
        d = udf(lambda r: r[i], StringType())
        dt = df.select(d(df.rawFeatures)).distinct().collect()
        unique_values.extend(dt)
    unique_count = len(unique_values)

    convertUDF = udf(lambda r: to_onehot(r, indices, unique_values, unique_count),
                     ArrayType(DoubleType()))
    newdf = df.withColumn("features", convertUDF(df.rawFeatures))
    return newdf
Example 14: processMSC
def processMSC():
    """
    Parses MSC records as per defined rules
    :return: Records returned in pipe-delimited format
    """
    # Assumption: MSC folder under the provided input path
    inputDir = os.path.join(args.inputdir, "INPUT")
    lines = sc.textFile(inputDir)

    # Call the parsing function
    parsedMSCLines = lines.map(parseMSCRecords)

    # The schema is encoded in a string.
    schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType"
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    schemaData = sqlContext.createDataFrame(parsedMSCLines, schema)

    modify_phone_number_udf = udf(mod_number, StringType())
    ph_num_mod = schemaData.select(
        schemaData.RecordType,
        modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'),
        modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'),
        schemaData.CallDate,
        schemaData.CallHour,
        schemaData.Duration,
        schemaData.StartTower,
        schemaData.StartLAC,
        schemaData.CallType)

    get_phone_type_udf = udf(get_phone_type, StringType())
    first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum))
    sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum))

    final_df = sec_ph_type.select(
        sec_ph_type.RecordType,
        sec_ph_type.FirstNum,
        sec_ph_type.SecondNum,
        sec_ph_type.CallDate,
        sec_ph_type.CallHour,
        sec_ph_type.Duration,
        sec_ph_type.StartTower,
        sec_ph_type.StartLAC,
        sec_ph_type.CallType,
        F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"])
               & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National")
         .otherwise("International").alias('PhoneType'))

    # show() already prints the DataFrame and returns None, so no print() wrapper is needed.
    final_df.show()
Example 15: test_complex_nested_udt_in_df
def test_complex_nested_udt_in_df(self):
    from pyspark.sql.functions import udf

    schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT())
    df = self.spark.createDataFrame(
        [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)],
        schema=schema)
    df.collect()

    gd = df.groupby("key").agg({"val": "collect_list"})
    gd.collect()

    udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema))
    gd.select(udf(*gd)).collect()
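A final readability note on Example 15: the assignment udf = udf(...) rebinds the local name udf and shadows the imported pyspark.sql.functions.udf, which is harmless in a short test but easy to trip over in larger code. A hedged rewrite of the last two lines with a distinct name:

# Sketch: same behaviour as the end of Example 15, without shadowing the
# imported udf function.
nested_udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema))
gd.select(nested_udf(*gd)).collect()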