

Python functions.udf code examples

This article collects typical usage examples of the Python pyspark.sql.functions.udf function. If you are wondering exactly what udf does, how to call it, or what real-world usage looks like, the curated examples below should help.


The sections below show 15 code examples of the udf function, sorted by popularity by default.
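
Before diving into the examples, here is a minimal sketch of the basic udf pattern they all share: wrap an ordinary Python function together with a Spark return type, then apply the resulting column expression in select or withColumn. The session setup and sample data below are illustrative only and do not come from any of the projects cited later.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import IntegerType

    spark = SparkSession.builder.master("local[2]").appName("udf-demo").getOrCreate()
    df = spark.createDataFrame([("Alice",), ("Bob",)], ["name"])

    # wrap a plain Python function; the return type tells Spark how to interpret the result
    name_len = udf(lambda s: len(s), IntegerType())

    # apply the UDF like any other column expression: name_len is 5 for Alice and 3 for Bob
    df.withColumn("name_len", name_len(col("name"))).show()

The examples that follow repeat this pattern with string, array, vector, and pandas-based return types.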

Example 1: setup_method

 def setup_method(self, method):
     sparkConf = create_spark_conf().setMaster("local[4]")\
         .setAppName("test wide and deep")
     self.sc = init_nncontext(sparkConf)
     self.sqlContext = SQLContext(self.sc)
     data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")
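     # wrap the vocabulary-lookup and hash-bucket helper functions as Spark UDFs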
     categorical_gender_udf = udf(lambda gender:
                                  categorical_from_vocab_list(gender, ["F", "M"], start=1))
     bucket_udf = udf(lambda feature1, feature2:
                      hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))
     self.data_in = self.sqlContext.read.parquet(data_path) \
         .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
         .withColumn("occupation-gender",
                     bucket_udf(col("occupation"), col("gender")).cast("int"))
     self.column_info = ColumnFeatureInfo(
         wide_base_cols=["occupation", "gender"],
         wide_base_dims=[21, 3],
         wide_cross_cols=["occupation-gender"],
         wide_cross_dims=[100],
         indicator_cols=["occupation", "gender"],
         indicator_dims=[21, 3],
         embed_cols=["userId", "itemId"],
         embed_in_dims=[100, 100],
         embed_out_dims=[20, 20],
         continuous_cols=["age"])
Author: ru003ar, Project: analytics-zoo, Lines: 25, Source: test_wideanddeep.py

Example 2: test_pandas_udf_arrow_overflow

    def test_pandas_udf_arrow_overflow(self):
        from distutils.version import LooseVersion
        import pandas as pd
        import pyarrow as pa

        df = self.spark.range(0, 1)

        @pandas_udf(returnType="byte")
        def udf(column):
            return pd.Series([128])

        # Arrow 0.11.0+ allows enabling or disabling safe type check.
        if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
            # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast.
            with self.sql_conf({
                    "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
                with self.assertRaisesRegexp(Exception,
                                             "Exception thrown when converting pandas.Series"):
                    df.withColumn('udf', udf('id')).collect()

            # Disabling safe type check, let Arrow do the cast anyway.
            with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
                df.withColumn('udf', udf('id')).collect()
        else:
            # SQL config `arrowSafeTypeConversion` has no effect with older Arrow versions.
            # Overflow cast causes an error.
            with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
                with self.assertRaisesRegexp(Exception,
                                             "Integer value out of bounds"):
                    df.withColumn('udf', udf('id')).collect()
Author: Brett-A, Project: spark, Lines: 30, Source: test_pandas_udf.py

Example 3: extract_pre

def extract_pre(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("text", StringType())
    ])
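    # wrap the article-text extraction helpers as string-returning UDFs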
    first_ex = udf(extract_first_sentence, StringType())
    inf_ex2 = udf(extract_names, StringType())
    url_ex = udf(extract_urlpattern, StringType())

    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles.csv')
    #df = df.repartition(100)
    df = df.withColumn("first_sentence", first_ex(df.text))
    df = df.withColumn('infoboxnames', inf_ex2(df.text))
    df = df.withColumn("urlwords", url_ex(df.title))
    df['title', 'first_sentence', 'infoboxnames', 'urlwords'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated_pre')
Author: softlang, Project: wikionto, Lines: 26, Source: extract_features.py

Example 4: cat2Num

    def cat2Num(self, df, indices):
        '''sbaronia - extract the categorical data and make df out of it
        so oneHotEncoding can be run on them'''
        protocol_ind0 = df.select(df.id,df.rawFeatures[indices[0]].alias("features0")).cache()
        protocol_ind1 = df.select(df.id,df.rawFeatures[indices[1]].alias("features1")).cache()

        ind0_enc = self.oneHotEncoding(protocol_ind0,"features0").cache()
        ind1_enc = self.oneHotEncoding(protocol_ind1,"features1").cache()
        
        '''sbaronia - add those hot encoded features columns to original df'''
        int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache()
        int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache()

        '''sbaronia - now create a new column features which has 
        converted vector form and drop rest columns'''
        comb_udf = udf(replaceCat2Num,StringType())
        int_join_2 = int_join_2.select(int_join_2.id,int_join_2.rawFeatures, \
                                       comb_udf(int_join_2.rawFeatures, \
                                       int_join_2.num_features0, \
                                       int_join_2.num_features1).alias("features")).cache()
        
        '''sbaronia - convert list of numerical features to DenseVector
        so they can be used in KMeans'''
        dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT())
        feat = int_join_2.select(int_join_2.id,int_join_2.rawFeatures,dense_udf(int_join_2.features).alias("features")).cache()
      
        return feat
Author: gitofsid, Project: MyBigDataCode, Lines: 27, Source: anomaly_detection.py

Example 5: text_features

def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.  
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
Author: rhasan, Project: machine-learning, Lines: 25, Source: Quora.py

Example 6: extract_nlp

def extract_nlp(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("first_sentence", StringType()),
        StructField("infoboxnames", StringType()),
        StructField("urlwords", StringType())
    ])

    noun_ex = udf(extract_nouns, StringType())
    pos_ex = udf(extract_hypernyms, StringType())

    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles_annotated_pre.csv')
    df = df.withColumn("nouns", noun_ex(df.first_sentence))
    df = df.withColumn("pos_hyps", pos_ex(df.first_sentence))
    df['title', 'urlwords', 'infoboxnames', 'nouns', 'pos_hyps'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated')
Author: softlang, Project: wikionto, Lines: 26, Source: extract_features.py

Example 7: test_udf_in_generate

    def test_udf_in_generate(self):
        from pyspark.sql.functions import udf, explode
        df = self.spark.range(5)
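        # the UDF returns [0, 1, ..., x-1]; exploding it over ids 0-4 and summing every element gives 0 + 1 + 3 + 6 = 10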
        f = udf(lambda x: list(range(x)), ArrayType(LongType()))
        row = df.select(explode(f(*df))).groupBy().sum().first()
        self.assertEqual(row[0], 10)

        df = self.spark.range(3)
        res = df.select("id", explode(f(df.id))).collect()
        self.assertEqual(res[0][0], 1)
        self.assertEqual(res[0][1], 0)
        self.assertEqual(res[1][0], 2)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 2)
        self.assertEqual(res[2][1], 1)

        range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
        res = df.select("id", explode(range_udf(df.id))).collect()
        self.assertEqual(res[0][0], 0)
        self.assertEqual(res[0][1], -1)
        self.assertEqual(res[1][0], 0)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 1)
        self.assertEqual(res[2][1], 0)
        self.assertEqual(res[3][0], 1)
        self.assertEqual(res[3][1], 1)
Author: drewrobb, Project: spark, Lines: 26, Source: test_udf.py

Example 8: test_udf_wrapper

    def test_udf_wrapper(self):
        from pyspark.sql.functions import udf
        from pyspark.sql.types import IntegerType

        def f(x):
            """Identity"""
            return x

        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)

        class F(object):
            """Identity"""
            def __call__(self, x):
                return x

        f = F()
        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)

        f = functools.partial(f, x=1)
        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)
Author: drewrobb, Project: spark, Lines: 35, Source: test_udf.py

Example 9: make_prediction

def make_prediction(event, df):
    event_timestamp, event_dayofweek, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, event_passenger_count = event[0], event[1], event[2], event[3], event[4], event[5], event[6]
    udf_diff_timeofday=udf(utils.diff_timeofday, IntegerType())
    udf_shortest_distance=udf(utils.shortest_distance, FloatType())
    df = df.withColumn("diff_timeofday", udf_diff_timeofday(df.pickup, lit(event_timestamp))).filter("`diff_timeofday` < 30")
    df = df.withColumn("event_sum_distance",
        udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon))+udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))).filter("`event_sum_distance` < 2")
    df = df.sort('event_sum_distance')
    if df.count() < 10:
        return [0,0]
    a = pd.DataFrame(df.take(50))
    a.columns = df.columns

    speed_array = a.as_matrix(["avg_speed"])
    dist_sf_array = a.as_matrix(["dist_sf"])
    distance_array = a["trip_distance"].tolist()
    fare_array = a["total_notip"].tolist()
    time_array = a["trip_time_in_secs"].tolist()

    #set initial parameter values
    x0 = [0.5, 0.5, 3.0, 3.0]
    bnds = ((0.25, 0.75), (0.25, 0.75), (0.1,20), (0,10))
    
    #perform the fit
    res = optimize.minimize(func_to_optimize, x0, args=(distance_array, time_array, fare_array), method='TNC', bounds=bnds)
    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    #get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3])
    if res.success == True:
        return [fare_pred, time_pred]
    else:
        return [0,0]
Author: jgran, Project: TaxiPredict, Lines: 34, Source: local_fare.py

Example 10: test_pandas_udf_detect_unsafe_type_conversion

    def test_pandas_udf_detect_unsafe_type_conversion(self):
        from distutils.version import LooseVersion
        import pandas as pd
        import numpy as np
        import pyarrow as pa

        values = [1.0] * 3
        pdf = pd.DataFrame({'A': values})
        df = self.spark.createDataFrame(pdf).repartition(1)

        @pandas_udf(returnType="int")
        def udf(column):
            return pd.Series(np.linspace(0, 1, 3))

        # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
        if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
            with self.sql_conf({
                    "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
                with self.assertRaisesRegexp(Exception,
                                             "Exception thrown when converting pandas.Series"):
                    df.select(['A']).withColumn('udf', udf('A')).collect()

        # Disabling Arrow safe type check.
        with self.sql_conf({
                "spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
            df.select(['A']).withColumn('udf', udf('A')).collect()
Author: Brett-A, Project: spark, Lines: 26, Source: test_pandas_udf.py

Example 11: test_nondeterministic_udf

 def test_nondeterministic_udf(self):
     # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
     import random
     udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
     self.assertEqual(udf_random_col.deterministic, False)
     df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
     udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
     [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
     self.assertEqual(row[0] + 10, row[1])
Author: amolthacker, Project: spark, Lines: 9, Source: test_udf.py

Example 12: select_prediction_udf

    def select_prediction_udf(self, column):
        if column not in self.get_output_names():
            raise ValueError("Column '" + column + "' is not defined as the output column in MOJO Pipeline.")

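        # with named MOJO output columns, the prediction struct field can be read directly by name;
        # otherwise the value is looked up by its positional index in the prediction.preds array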
        if self.get_named_mojo_output_columns():
            func = udf(lambda d: d, DoubleType())
            return func("prediction." + column).alias(column)
        else:
            idx = self.get_output_names().index(column)
            func = udf(lambda arr: arr[idx], DoubleType())
            return func("prediction.preds").alias(column)
Author: h2oai, Project: sparkling-water, Lines: 11, Source: models.py

Example 13: cat2Num

    def cat2Num(self, df, indices):
        unique_values = []
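        # collect the distinct values found at each categorical index of rawFeatures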
        for i in indices:
            d = udf(lambda r: r[i], StringType())
            dt = df.select(d(df.rawFeatures)).distinct().collect()
            unique_values.extend(dt)

        unique_count = len(unique_values)
        convertUDF = udf(lambda r: to_onehot(r, indices, unique_values, unique_count), ArrayType(DoubleType()))
        newdf = df.withColumn("features", convertUDF(df.rawFeatures))

        return newdf
Author: Veterun, Project: SparkPythonHanhan, Lines: 12, Source: anomalies_detection.py

Example 14: processMSC

def processMSC():
    """
    Parses MSC records as per defined rules
    :return: Records returned in pipe-delimited format
    """
    # Assumption: MSC folder under the provided input path
    inputDir = os.path.join(args.inputdir, "INPUT")
    lines = sc.textFile(inputDir)

    # Call the parsing function
    parsedMSCLines = lines.map(parseMSCRecords)

    # The schema is encoded in a string.
    schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType"
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    schemaData = sqlContext.createDataFrame(parsedMSCLines, schema)
    
    modify_phone_number_udf = udf(mod_number, StringType())
    ph_num_mod = schemaData.select(
        schemaData.RecordType,
        modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'),
        modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'),
        schemaData.CallDate,
        schemaData.CallHour,
        schemaData.Duration,
        schemaData.StartTower,
        schemaData.StartLAC,
        schemaData.CallType)

    get_phone_type_udf = udf(get_phone_type, StringType())

    first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum))

    sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum))

    final_df = sec_ph_type.select(
        sec_ph_type.RecordType,
        sec_ph_type.FirstNum,
        sec_ph_type.SecondNum,
        sec_ph_type.CallDate,
        sec_ph_type.CallHour,
        sec_ph_type.Duration,
        sec_ph_type.StartTower,
        sec_ph_type.StartLAC,
        sec_ph_type.CallType,
        F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"])
               & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National")
            .otherwise("International").alias('PhoneType'))

    final_df.show()
Author: binal165, Project: spark-dataframes, Lines: 53, Source: parsing-pyspark-new.py

Example 15: test_complex_nested_udt_in_df

    def test_complex_nested_udt_in_df(self):
        from pyspark.sql.functions import udf

        schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT())
        df = self.spark.createDataFrame(
            [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)],
            schema=schema)
        df.collect()

        gd = df.groupby("key").agg({"val": "collect_list"})
        gd.collect()
        udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema))
        gd.select(udf(*gd)).collect()
Author: JingchengDu, Project: spark, Lines: 13, Source: test_types.py


Note: The pyspark.sql.functions.udf examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective developers; copyright remains with the original authors, and any distribution or use should follow each project's license. Do not reproduce without permission.