This article collects typical usage examples of the pyspark.sql.functions.udf method in Python. If you have been wondering what functions.udf does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the module it belongs to, pyspark.sql.functions.
The following presents 15 code examples of functions.udf, sorted by popularity by default.
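Before diving into the examples, here is a minimal, self-contained sketch of the basic udf pattern. It is purely illustrative; the session setup and column names are assumptions, not taken from any of the examples below.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("udf-demo").getOrCreate()
df = spark.createDataFrame([("alice",), ("bob",)], ["name"])

# Wrap a plain Python function as a column expression, declaring its return type explicitly.
name_length_udf = udf(lambda s: len(s), IntegerType())

df.withColumn("name_length", name_length_udf(df["name"])).show()

Declaring the return type matters: if the Python function returns a value whose type does not match the declared type, the resulting column silently contains null.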
Example 1: test_featurizer_in_pipeline
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

    lrModel = pipeline.fit(train_df)

    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
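A brief usage note (new_image_df below is a hypothetical DataFrame, not part of the original test): once fitted, the resulting PipelineModel can score any DataFrame that has the same "image" input column.

# Hypothetical scoring call on a fresh DataFrame with the same "image" column.
predictions = lrModel.transform(new_image_df).select("image", "prediction")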
Example 2: _decodeOutputAsPredictions
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
Example 3: _convertOutputToImage
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
    assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
    height = int(output_shape[1])
    width = int(output_shape[2])

    def to_image(orig_image, numeric_data):
        # Assume the returned image has float pixels but same #channels as input
        mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
        data = bytearray(np.array(numeric_data).astype(np.float32).tobytes())
        nChannels = orig_image.nChannels
        return Row(
            origin="",
            mode=mode.ord,
            height=height,
            width=width,
            nChannels=nChannels,
            data=data)

    to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
    resDf = df.withColumn(self.getOutputCol(),
                          to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
    return resDf.drop(tfs_output_col)
Example 4: loadImagesInternal
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def loadImagesInternal(self, dataframe, inputCol):
    """
    Load image files specified in dataset as image format specified in `sparkdl.image.imageIO`.
    """
    # plan 1: udf(loader() + convert from np.array to imageSchema) -> call TFImageTransformer
    # plan 2: udf(loader()) ... we don't support np.array as a dataframe column type...
    loader = self.getImageLoader()

    # Loading from external resources can fail, so we should allow None to be returned
    def load_image_uri_impl(uri):
        try:
            return imageArrayToStruct(_reverseChannels(loader(uri)))
        except BaseException:  # pylint: disable=bare-except
            return None

    load_udf = udf(load_image_uri_impl, ImageSchema.imageSchema['image'].dataType)
    return dataframe.withColumn(self._loadedImageCol(), load_udf(dataframe[inputCol]))
Example 5: _ndcg_at
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but I can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            # Return a float: returning an int from a DoubleType UDF would yield null.
            return 0.
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
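Note that k is never used inside ndcg_at_k, so the predicted and actual lists are presumably already truncated to the top k upstream. A hedged usage sketch of the returned UDF (the DataFrame and column names below are assumptions, not part of the original code): after a groupBy plus collect_list that gathers, per query, arrays of structs carrying rn and the label column, the UDF is applied to those two array columns.

# Hypothetical: "predicted" and "actual" are array<struct> columns with "rn" and "label" fields.
ndcg_udf = _ndcg_at(10, "label")
scored_df = grouped_df.withColumn("ndcg_at_10", ndcg_udf("predicted", "actual"))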
Example 6: append_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    # cols arrives as a tuple; convert it before concatenating with the metadata list
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
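A short usage sketch, assuming df already carries a 'features' vector column with the mjolnir feature-name metadata (the extra column names are hypothetical):

# Hypothetical call: appends two numeric columns to the feature vector and
# extends the feature-name metadata to match.
df_with_extra = append_features(df, "clicks", "dwell_time")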
Example 7: zero_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
Example 8: explode_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def explode_features(df, features=None):
    """Convert feature vector into individual columns.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
    return df.select('*', *cols)
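A minimal usage sketch (df is assumed to carry the same feature-name metadata as in the previous examples):

# Hypothetical: produces one FloatType column per named feature alongside the original columns.
wide_df = explode_features(df)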
Example 9: __call__
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def __call__(self, func):
    """Define a UDF (user-defined function) that operates element wise
    on a Spark DataFrame.

    Parameters
    ----------
    input_type : List[ibis.expr.datatypes.DataType]
        A list of the types found in :mod:`~ibis.expr.datatypes`. The
        length of this list must match the number of arguments to the
        function. Variadic arguments are not yet supported.
    output_type : ibis.expr.datatypes.DataType
        The return type of the function.

    Examples
    --------
    >>> import ibis
    >>> import ibis.expr.datatypes as dt
    >>> from ibis.spark.udf import udf
    >>> @udf.elementwise(input_type=[dt.string], output_type=dt.int64)
    ... def my_string_length(x):
    ...     return len(x) * 2
    """
    return SparkUDF(self._input_type, self._output_type)(func)
Example 10: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 11: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sparkCt.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 12: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = StreamingKMeans(k=7, decayFactor=1.0).setRandomCenters(4, 1.0, 0)
    # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 13: _join_results_multi
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _join_results_multi(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, dec):
        mol = usc.join(scaff, dec)
        if mol:
            return usc.to_smiles(mol)

    def _format_attachment_point(smi, num):
        smi = usc.add_first_attachment_point_number(smi, num)
        return usc.to_smiles(uc.to_mol(smi))  # canonicalize

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())

    return sampled_df.join(scaffolds_df, on="id")\
        .withColumn("decoration", format_attachment_point_udf("decoration_smi", psf.col("attachment_points")[0]))\
        .select(
            join_scaffold_udf("smiles", "decoration").alias("smiles"),
            psf.map_concat(
                psf.create_map(psf.col("attachment_points")[0],
                               SampleScaffolds.cleanup_decoration_udf("decoration")),
                "decorations",
            ).alias("decorations"),
            "scaffold")
Example 14: _join_results_single
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _join_results_single(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
    create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
Example 15: cosineSimilarity
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def cosineSimilarity(s1, s2):
    (m1, v1) = Base64ToFloatArray(s1)
    (m2, v2) = Base64ToFloatArray(s2)
    if (m1 == 0) or (m2 == 0):
        return 0.0
    else:
        return sum(x * y for x, y in zip(v1, v2)) / (m1 * m2)

# Register udf functions so that they can be used in dataframes
#
# Perform same computation as cosineSimilarity()
#
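The listing is cut off before the registration itself; below is a minimal sketch of what such a registration typically looks like. The UDF name and the DoubleType return type are assumptions, not the original code.

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Assumed registration: wraps cosineSimilarity so it can be applied to two
# base64-encoded vector columns of a DataFrame.
cosineSimilarityUDF = udf(cosineSimilarity, DoubleType())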