

Python functions.col Function Code Examples

This article collects typical usage examples of the Python pyspark.sql.functions.col function. If you are wondering how col is used in practice, how to call it, or what real col code looks like, the curated examples below should help.


A total of 15 col code examples are shown below, ordered by popularity by default.
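Before the collected examples, here is a minimal, self-contained sketch of the most common col patterns (selecting a column, renaming it with alias, building an arithmetic expression, and filtering). The SparkSession setup, the toy DataFrame, and the column names "name"/"age" are made up for illustration and are not taken from the examples below.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    # Hypothetical local session and toy DataFrame, used only for demonstration.
    spark = SparkSession.builder.master("local[1]").appName("col_demo").getOrCreate()
    df = spark.createDataFrame([("Alice", 34), ("Bob", 36)], ["name", "age"])

    # col("age") returns a Column expression; it can be selected, renamed with
    # .alias(), combined arithmetically, and used as a filter predicate.
    df.select(col("name"), (col("age") + 1).alias("age_next_year")) \
      .filter(col("age") > 34) \
      .show()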

Example 1: test_basic

    def test_basic(self):
        df = self.data
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        # Groupby one column and aggregate one UDF with literal
        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

        # Groupby one expression and aggregate one UDF with literal
        result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
            .sort(df.id + 1)
        expected2 = df.groupby((col('id') + 1))\
            .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

        # Groupby one column and aggregate one UDF without literal
        result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
        expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

        # Groupby one expression and aggregate one UDF without literal
        result4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(weighted_mean_udf(df.v, df.w))\
            .sort('id')
        expected4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(mean(df.v).alias('weighted_mean(v, w)'))\
            .sort('id')
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Author: Brett-A, project: spark, lines: 29, source: test_pandas_udf_grouped_agg.py

Example 2: test_vectorized_udf_dates

    def test_vectorized_udf_dates(self):
        schema = StructType().add("idx", LongType()).add("date", DateType())
        data = [(0, date(1969, 1, 1),),
                (1, date(2012, 2, 2),),
                (2, None,),
                (3, date(2100, 4, 4),),
                (4, date(2262, 4, 12),)]
        df = self.spark.createDataFrame(data, schema=schema)

        date_copy = pandas_udf(lambda t: t, returnType=DateType())
        df = df.withColumn("date_copy", date_copy(col("date")))

        @pandas_udf(returnType=StringType())
        def check_data(idx, date, date_copy):
            msgs = []
            is_equal = date.isnull()
            for i in range(len(idx)):
                if (is_equal[i] and data[idx[i]][1] is None) or \
                        date[i] == data[idx[i]][1]:
                    msgs.append(None)
                else:
                    msgs.append(
                        "date values are not equal (date='%s': data[%d][1]='%s')"
                        % (date[i], idx[i], data[idx[i]][1]))
            return pd.Series(msgs)

        result = df.withColumn("check_data",
                               check_data(col("idx"), col("date"), col("date_copy"))).collect()

        self.assertEquals(len(data), len(result))
        for i in range(len(result)):
            self.assertEquals(data[i][1], result[i][1])  # "date" col
            self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
            self.assertIsNone(result[i][3])  # "check_data" col
Author: q977734161, project: spark, lines: 34, source: test_pandas_udf_scalar.py

Example 3: test_vectorized_udf_string_in_udf

    def test_vectorized_udf_string_in_udf(self):
        import pandas as pd
        df = self.spark.range(10)
        str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
        actual = df.select(str_f(col('id')))
        expected = df.select(col('id').cast('string'))
        self.assertEquals(expected.collect(), actual.collect())
Author: Brett-A, project: spark, lines: 7, source: test_pandas_udf_scalar.py

Example 4: test_mixed_sql_and_udf

    def test_mixed_sql_and_udf(self):
        df = self.data
        w = self.unbounded_window
        ow = self.ordered_window
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
        expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

        # Test mixing sql window function and window udf in the same expression
        result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
        expected2 = expected1

        # Test chaining sql aggregate function and udf
        result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('min_v', min(df['v']).over(w)) \
                    .withColumn('v_diff', col('max_v') - col('min_v')) \
                    .drop('max_v', 'min_v')
        expected3 = expected1

        # Test mixing sql window function and udf
        result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('rank', rank().over(ow))
        expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('rank', rank().over(ow))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Author: git-prodigy, project: spark, lines: 31, source: test_pandas_udf_window.py

Example 5: data

    def data(self):
        from pyspark.sql.functions import array, explode, col, lit
        return self.spark.range(10).toDF('id') \
            .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
            .withColumn("v", explode(col('vs'))) \
            .drop('vs') \
            .withColumn('w', lit(1.0))
Author: JingchengDu, project: spark, lines: 7, source: test_pandas_udf_grouped_agg.py

Example 6: test_column_getitem

    def test_column_getitem(self):
        from pyspark.sql.functions import col

        self.assertIsInstance(col("foo")[1:3], Column)
        self.assertIsInstance(col("foo")[0], Column)
        self.assertIsInstance(col("foo")["bar"], Column)
        self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
Author: Brett-A, project: spark, lines: 7, source: test_column.py

Example 7: scalar_pandas_udf_example

def scalar_pandas_udf_example(spark):
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
Author: BaiBenny, project: spark, lines: 26, source: arrow.py

Example 8: generateExpr

    def generateExpr(columnName, listIntervals):
        if (len(listIntervals) == 1):
            return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
        else:
            return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                         len(listIntervals) - 1)
                    .otherwise(generateExpr(columnName, listIntervals[1:])))
Author: mood-agency, project: optimus, lines: 7, source: DfAnalizer.py

Example 9: setup_method

    def setup_method(self, method):
        sparkConf = create_spark_conf().setMaster("local[4]")\
            .setAppName("test wide and deep")
        self.sc = init_nncontext(sparkConf)
        self.sqlContext = SQLContext(self.sc)
        data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")
        categorical_gender_udf = udf(lambda gender:
                                     categorical_from_vocab_list(gender, ["F", "M"], start=1))
        bucket_udf = udf(lambda feature1, feature2:
                         hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))
        self.data_in = self.sqlContext.read.parquet(data_path) \
            .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
            .withColumn("occupation-gender",
                        bucket_udf(col("occupation"), col("gender")).cast("int"))
        self.column_info = ColumnFeatureInfo(
            wide_base_cols=["occupation", "gender"],
            wide_base_dims=[21, 3],
            wide_cross_cols=["occupation-gender"],
            wide_cross_dims=[100],
            indicator_cols=["occupation", "gender"],
            indicator_dims=[21, 3],
            embed_cols=["userId", "itemId"],
            embed_in_dims=[100, 100],
            embed_out_dims=[20, 20],
            continuous_cols=["age"])
Author: ru003ar, project: analytics-zoo, lines: 25, source: test_wideanddeep.py

Example 10: test_smvPlusDateTime

    def test_smvPlusDateTime(self):
        df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
        r1 = df.select(col("t").smvPlusDays(-10).alias("ts"))
        r2 = df.select(col("t").smvPlusMonths(1).alias("ts"))
        r3 = df.select(col("t").smvPlusWeeks(3).alias("ts"))
        r4 = df.select(col("t").smvPlusYears(2).alias("ts"))
        r5 = df.select(col("t").smvPlusYears(4).alias("ts"))

        s = "ts: Timestamp[yyyy-MM-dd hh:mm:ss.S]"
        e1 = self.createDF(
            s,
            "1976-01-21 00:00:00.0;" +
            "2012-02-19 00:00:00.0")
        e2 = self.createDF(
            s,
            "1976-02-29 00:00:00.0;" +
            "2012-03-29 00:00:00.0")
        e3 = self.createDF(
            s,
            "1976-02-21 00:00:00.0;" +
            "2012-03-21 00:00:00.0")
        e4 = self.createDF(
            s,
            "1978-01-31 00:00:00.0;" +
            "2014-02-28 00:00:00.0")
        e5 = self.createDF(
            s,
            "1980-01-31 00:00:00.0;" +
            "2016-02-29 00:00:00.0")

        self.should_be_same(e1, r1)
        self.should_be_same(e2, r2)
        self.should_be_same(e3, r3)
        self.should_be_same(e4, r4)
        self.should_be_same(e5, r5)
Author: jacobdr, project: SMV, lines: 35, source: testColumnHelper.py

Example 11: test_udf_with_filter_function

    def test_udf_with_filter_function(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql.functions import udf, col
        from pyspark.sql.types import BooleanType

        my_filter = udf(lambda a: a < 2, BooleanType())
        sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2"))
        self.assertEqual(sel.collect(), [Row(key=1, value='1')])
Author: drewrobb, project: spark, lines: 8, source: test_udf.py

Example 12: get_latest_data

    def get_latest_data(self):
        from pyspark.sql import SparkSession
        import config
        import pandas as pd

        # initialise a SparkSession (and its SparkContext)
        spark1 = SparkSession.builder \
            .master(config.sp_master) \
            .appName(config.sp_appname) \
            .config('spark.executor.memory', config.sp_memory) \
            .config("spark.cores.max", config.sp_cores) \
            .getOrCreate()

        sc = spark1.sparkContext

        # use SQLContext to read the parquet files
        from pyspark.sql import SQLContext
        sqlContext = SQLContext(sc)

        from datetime import datetime
        t1 = datetime.now()
        df = sqlContext.read.parquet(config.proj_path + '/datas/appid_datapoint_parquet1')
        df2 = sqlContext.read.parquet(config.proj_path + '/datas/appid_attribute_parquet')
        df2 = df2[['attribute_id', 'source', 'target_address', 'location']]

        # rename the attribute columns
        from pyspark.sql.functions import col

        df2 = df2.select(col("attribute_id").alias("target_attribute_id"),
                         col("source").alias("source_y"),
                         col("target_address").alias("target_address_y"),
                         col("location").alias("location"))

        # merge the two DataFrames
        df_merge = df.join(df2, how='left', on='target_attribute_id')

        # extract the needed data via Spark SQL
        t1 = datetime.now()
        df_merge.registerTempTable('dummy')
        data = sqlContext.sql('select sum(byte_count) as byte_count_sum, time_stamp, location from dummy group by location, time_stamp')
        data = data[data.byte_count_sum > 0]

        # data cleaning
        self.p7_df = data.toPandas()

        self.p7_df['bw'] = self.p7_df['byte_count_sum'] / (8 * 3600)
        self.p7_df = self.p7_df.sort_values(by='location', ascending=True)
        dates_outlook = pd.to_datetime(pd.Series(self.p7_df.time_stamp), unit='ms')
        self.p7_df.index = dates_outlook

        self.p7_df['date'] = self.p7_df.index.date
        self.p7_df = self.p7_df.sort_values(by='time_stamp')

        t2 = datetime.now()
        time_to_fetch = str(t2 - t1)
Author: abhoopathi, project: friendly-lamp, lines: 58, source: p7_api.py

Example 13: test_smvRenameField_preserve_meta_for_unrenamed_fields

    def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
        df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
        desc = "c description"
        res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
                 .smvDesc(("c", desc))
        self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

        res2 = res1.smvRenameField(("a", "d"))
        self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
Author: TresAmigosSD, project: SMV, lines: 9, source: testDataFrameHelper.py

Example 14: test_cast_to_string_with_udt

    def test_cast_to_string_with_udt(self):
        from pyspark.sql.functions import col
        row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
        schema = StructType([StructField("point", ExamplePointUDT(), False),
                             StructField("pypoint", PythonOnlyUDT(), False)])
        df = self.spark.createDataFrame([row], schema)

        result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head()
        self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]'))
Author: JingchengDu, project: spark, lines: 9, source: test_types.py

Example 15: test_smvDayMonth70

    def test_smvDayMonth70(self):
        df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
        r1 = df.select(col("t").smvDay70().alias("t_day70"))
        r2 = df.select(col("t").smvMonth70().alias("t_month70"))

        e1 = self.createDF("t_day70: Integer", "2221;15399")
        e2 = self.createDF("t_month70: Integer", "72;505")

        self.should_be_same(e1, r1)
        self.should_be_same(e2, r2)
Author: jacobdr, project: SMV, lines: 10, source: testColumnHelper.py


Note: The pyspark.sql.functions.col examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors; consult each project's license before using or distributing the code, and do not reproduce this article without permission.