This article collects typical usage examples of the pyspark.sql.functions.udf method in Python. If you have been wondering what functions.udf does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the module it belongs to, pyspark.sql.functions.
The following presents 15 code examples of functions.udf, sorted by popularity by default.
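Before diving into the examples, here is a minimal, self-contained sketch of the basic udf pattern. It is purely illustrative; the session setup and column names are assumptions, not taken from any of the examples below.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("udf-demo").getOrCreate()
df = spark.createDataFrame([("alice",), ("bob",)], ["name"])

# Wrap a plain Python function as a column expression, declaring its return type explicitly.
name_length_udf = udf(lambda s: len(s), IntegerType())

df.withColumn("name_length", name_length_udf(df["name"])).show()

Declaring the return type matters: if the Python function returns a value whose type does not match the declared type, the resulting column silently contains null.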
Example 1: test_featurizer_in_pipeline
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

    lrModel = pipeline.fit(train_df)

    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)
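A brief usage note (new_image_df below is a hypothetical DataFrame, not part of the original test): once fitted, the resulting PipelineModel can score any DataFrame that has the same "image" input column.

# Hypothetical scoring call on a fresh DataFrame with the same "image" column.
predictions = lrModel.transform(new_image_df).select("image", "prediction")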
Example 2: _decodeOutputAsPredictions
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
Example 3: _convertOutputToImage
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
    assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
    height = int(output_shape[1])
    width = int(output_shape[2])

    def to_image(orig_image, numeric_data):
        # Assume the returned image has float pixels but same #channels as input
        mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
        data = bytearray(np.array(numeric_data).astype(np.float32).tobytes())
        nChannels = orig_image.nChannels
        return Row(
            origin="",
            mode=mode.ord,
            height=height,
            width=width,
            nChannels=nChannels,
            data=data)

    to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
    resDf = df.withColumn(self.getOutputCol(),
                          to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
    return resDf.drop(tfs_output_col)
Example 4: loadImagesInternal
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def loadImagesInternal(self, dataframe, inputCol):
    """
    Load image files specified in dataset as image format specified in `sparkdl.image.imageIO`.
    """
    # plan 1: udf(loader() + convert from np.array to imageSchema) -> call TFImageTransformer
    # plan 2: udf(loader()) ... we don't support np.array as a dataframe column type...
    loader = self.getImageLoader()

    # Loading from external resources can fail, so we should allow None to be returned
    def load_image_uri_impl(uri):
        try:
            return imageArrayToStruct(_reverseChannels(loader(uri)))
        except BaseException:  # pylint: disable=bare-except
            return None

    load_udf = udf(load_image_uri_impl, ImageSchema.imageSchema['image'].dataType)
    return dataframe.withColumn(self._loadedImageCol(), load_udf(dataframe[inputCol]))
Example 5: _ndcg_at
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _ndcg_at(k, label_col):
    def ndcg_at_k(predicted, actual):
        # TODO: Taking in rn and then re-sorting might not be necessary, but I can't
        # find any real guarantee that they would come in order after a groupBy + collect_list,
        # since they were only ordered within the window function.
        predicted = [row[label_col] for row in sorted(predicted, key=lambda r: r.rn)]
        actual = [row[label_col] for row in sorted(actual, key=lambda r: r.rn)]
        dcg = 0.
        for i, label in enumerate(predicted):
            # This form is used to match EvalNDCG in xgboost
            dcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        idcg = 0.
        for i, label in enumerate(actual):
            idcg += ((1 << label) - 1) / math.log(i + 2.0, 2)
        if idcg == 0:
            # Return a float: returning an int from a DoubleType UDF would yield null.
            return 0.
        else:
            return dcg / idcg
    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
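Note that k is never used inside ndcg_at_k, so the predicted and actual lists are presumably already truncated to the top k upstream. A hedged usage sketch of the returned UDF (the DataFrame and column names below are assumptions, not part of the original code): after a groupBy plus collect_list that gathers, per query, arrays of structs carrying rn and the label column, the UDF is applied to those two array columns.

# Hypothetical: "predicted" and "actual" are array<struct> columns with "rn" and "label" fields.
ndcg_udf = _ndcg_at(10, "label")
scored_df = grouped_df.withColumn("ndcg_at_10", ndcg_udf("predicted", "actual"))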
Example 6: append_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    # cols arrives as a tuple; convert it before concatenating with the metadata list
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
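A short usage sketch, assuming df already carries a 'features' vector column with the mjolnir feature-name metadata (the extra column names are hypothetical):

# Hypothetical call: appends two numeric columns to the feature vector and
# extends the feature-name metadata to match.
df_with_extra = append_features(df, "clicks", "dwell_time")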
Example 7: zero_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
Example 8: explode_features
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def explode_features(df, features=None):
    """Convert feature vector into individual columns.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
    return df.select('*', *cols)
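A minimal usage sketch (df is assumed to carry the same feature-name metadata as in the previous examples):

# Hypothetical: produces one FloatType column per named feature alongside the original columns.
wide_df = explode_features(df)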
Example 9: __call__
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def __call__(self, func):
    """Define a UDF (user-defined function) that operates element wise
    on a Spark DataFrame.

    Parameters
    ----------
    input_type : List[ibis.expr.datatypes.DataType]
        A list of the types found in :mod:`~ibis.expr.datatypes`. The
        length of this list must match the number of arguments to the
        function. Variadic arguments are not yet supported.
    output_type : ibis.expr.datatypes.DataType
        The return type of the function.

    Examples
    --------
    >>> import ibis
    >>> import ibis.expr.datatypes as dt
    >>> from ibis.spark.udf import udf
    >>> @udf.elementwise(input_type=[dt.string], output_type=dt.int64)
    ... def my_string_length(x):
    ...     return len(x) * 2
    """
    return SparkUDF(self._input_type, self._output_type)(func)
Example 10: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 11: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sparkCt.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 12: detect
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = StreamingKMeans(k=7, decayFactor=1.0).setRandomCenters(4, 1.0, 0)
    # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example 13: _join_results_multi
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _join_results_multi(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, dec):
        mol = usc.join(scaff, dec)
        if mol:
            return usc.to_smiles(mol)

    def _format_attachment_point(smi, num):
        smi = usc.add_first_attachment_point_number(smi, num)
        return usc.to_smiles(uc.to_mol(smi))  # canonicalize

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())

    return sampled_df.join(scaffolds_df, on="id")\
        .withColumn("decoration", format_attachment_point_udf("decoration_smi", psf.col("attachment_points")[0]))\
        .select(
            join_scaffold_udf("smiles", "decoration").alias("smiles"),
            psf.map_concat(
                psf.create_map(psf.col("attachment_points")[0],
                               SampleScaffolds.cleanup_decoration_udf("decoration")),
                "decorations",
            ).alias("decorations"),
            "scaffold")
Example 14: _join_results_single
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def _join_results_single(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
    create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
Example 15: cosineSimilarity
# Required module import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import udf [as alias]
def cosineSimilarity(s1, s2):
    (m1, v1) = Base64ToFloatArray(s1)
    (m2, v2) = Base64ToFloatArray(s2)
    if (m1 == 0) or (m2 == 0):
        return 0.0
    else:
        return sum(x * y for x, y in zip(v1, v2)) / (m1 * m2)

# Register udf functions so that they can be used in dataframes
#
# Perform same computation as cosineSimilarity()
#
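The listing is cut off before the registration itself; below is a minimal sketch of what such a registration typically looks like. The UDF name and the DoubleType return type are assumptions, not the original code.

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Assumed registration: wraps cosineSimilarity so it can be applied to two
# base64-encoded vector columns of a DataFrame.
cosineSimilarityUDF = udf(cosineSimilarity, DoubleType())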