This article collects typical usage examples of the Python method pyspark.SQLContext.registerFunction. If you have been asking yourself how exactly SQLContext.registerFunction is used in Python, the curated example code below may help. You can also explore further usage examples of the class it belongs to, pyspark.SQLContext.
The following shows 8 code examples of SQLContext.registerFunction, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
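Before the excerpts, here is a minimal self-contained sketch of the pattern they all share. registerFunction(name, f, returnType=StringType()) exposes a plain Python function to Spark SQL on a Spark 1.x SQLContext (on Spark 2.x+ the equivalent entry point is spark.udf.register); the table name and data below are made up for illustration:

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.types import IntegerType

conf = SparkConf().setMaster("local[2]").setAppName("register_function_demo")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Build a tiny DataFrame and expose it to SQL
df = sqlContext.createDataFrame([Row(name="spark", year=2010), Row(name="python", year=1991)])
df.registerTempTable("projects")

# Register a Python function as a SQL UDF; the return type defaults to StringType
sqlContext.registerFunction("name_len", lambda s: len(s), IntegerType())
sqlContext.sql("SELECT name, name_len(name) AS n FROM projects").show()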
Example 1: SparkConf
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

def name_place(name, place, price, evaluation):
    # (function body truncated in the source excerpt; the UDF tags each book, e.g. as "good")
    return name + "," + "good"

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    json_path = os.path.abspath("../doc/book.json")
    # Read the JSON file and register it as a temp table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")
    # Register the user-defined function (UDF)
    sqlContext.registerFunction("name_place", name_place)
    evalRDD = sqlContext.sql("SELECT name_place(name, place, price, evaluation) AS book_eval FROM json_book")
    # bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))
    evalRDD.show()
    # Map over the query results
    bookMap = evalRDD.map(lambda books: books.book_eval)
    general_list = []
    good_list = []
    for book in bookMap.collect():
        book = book.encode("utf-8").split(',')
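The excerpt ends mid-loop. A plausible completion, assuming (based on the two lists above) that the UDF tags each book as either good or general:

        # Hypothetical completion: route each "name,tag" pair into the matching list
        name, tag = book[0], book[1]
        if tag == "good":
            good_list.append(name)
        else:
            general_list.append(name)
    print("good: %s" % good_list)
    print("general: %s" % general_list)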
Example 2: str
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
# (the snippet begins mid-statement: a map() building per-user tuples from split log lines)
           p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
           p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
           p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))
schema_string = "id gw_id supp_id user_id user_type " \
"user_name login_time logout_time mac ip " \
"user_agent download_flow upload_flow os browser " \
"ratio batch_no hos_id"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")
# Register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())
lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)
# group by day,hosid,(mac),2, 5, 10, 30, 60
#repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line:line.split('\t')).filter(lambda x:len(x)==8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))
schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_repeat_list = sql_context.applySchema(repeat_list, schema)
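A note for readers on newer Spark: applySchema is the Spark 1.2-era API and was deprecated in 1.3. A minimal equivalent with the same RDD-plus-schema inputs:

# On Spark 1.3+, replace applySchema(rdd, schema) with createDataFrame
schema_repeat_list = sql_context.createDataFrame(repeat_list, schema)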
Example 3: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
master = "spark://hadoop:7077"
appName = "spark_loginflowlog"
#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
input = '/input/loginfowlog/*'
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true")
        )
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
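The normal_mac helper wrapped by the to_mac UDF is not part of the excerpt. A plausible sketch of such a normalizer (only the name comes from the source; the implementation is an assumption):

import re

def normal_mac(mac):
    # Hypothetical helper: strip separators and normalize to AA:BB:CC:DD:EE:FF
    if mac is None:
        return ""
    hexdigits = re.sub(r"[^0-9A-Fa-f]", "", mac).upper()
    if len(hexdigits) != 12:
        return ""
    return ":".join(hexdigits[i:i + 2] for i in range(0, 12, 2))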
Example 4: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
master = "local[*]"
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
# logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day
logFile = "/input/loginfowlog/02*"
conf = (SparkConf()
        .setMaster(master)
        .setAppName("loginflowlog2mysql")
        # .set("spark.kryoserializer.buffer.mb", "256")
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType())
df = sqlContext.read.parquet(logFile)
rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')
fields = [
    StructField('logintype', StringType(), True),
    StructField('logtype', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('suppid', StringType(), True),
    StructField('logtime', LongType(), True),
    StructField('usermac', StringType(), True)
]
schema = StructType(fields)
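The excerpt stops once the schema is built. A plausible continuation, assuming the selected columns are re-wrapped with the explicit schema and then queried through the to_datestr UDF registered above:

# Hypothetical continuation: apply the explicit schema, then aggregate per day
log_df = sqlContext.createDataFrame(rdd.rdd, schema)
sqlContext.registerDataFrameAsTable(log_df, "loginflowlog")
day_uv = sqlContext.sql("select to_datestr(logtime) day, count(distinct usermac) uv "
                        "from loginflowlog group by to_datestr(logtime)")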
Example 5: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
# -- set the processing date (DAY_OFFSET days back)
DAY_OFFSET = 1
now = datetime.datetime.now()
pro_time = now - datetime.timedelta(days=DAY_OFFSET)
day = pro_time.strftime("%Y%m%d")
master = "spark://hadoop:7077"
appName = "spark_pageflow_outflow"
input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
sc = SparkContext(master, appName)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")
_sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
"from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
"group by to_str(url),to_day(createtime)" % day
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
list = []  # note: shadows Python's built-in list
for r in rs:
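Neither mill_date_str nor bytearray_str appears in the excerpt; plausible sketches under the obvious reading of the calls above (both implementations are assumptions):

import datetime

def mill_date_str(millis):
    # Hypothetical: epoch milliseconds -> "YYYYMMDD" day string
    return datetime.datetime.fromtimestamp(int(millis) / 1000.0).strftime("%Y%m%d")

def bytearray_str(value):
    # Hypothetical: Parquet binary columns come back as bytearray; decode to text
    return bytes(value).decode("utf-8") if value is not None else ""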
Example 6: analysis_email
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

def analysis_email(email):
    """
    Extract the mail provider from an email address.
    """
    return email.split("@")[1].split(".")[0]

if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Register the user-defined function (UDF)
    sqlContext.registerFunction("analysis_email", analysis_email)
    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)
    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))
    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # Cache the table (optional)
    # sqlContext.cacheTable("information")
    # sqlContext.uncacheTable("information")
    """
Example 7: dateformat
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
_adLoadDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823569766},
    {'uid': '3', 'adid': 'c', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823550766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '133', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823268766},
]).registerAsTable("adload")
_adPlayDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'createtime': 1450823569766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'createtime': 1450823268766},
]).registerAsTable("adplay")
_adClickDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823580766},
]).registerAsTable("adclick")
'''
sqlContext.registerFunction("dateformat", lambda x: longTime2str(x), StringType())
adLoadDf = sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                          'from adload where createtime is not null and dateformat(createtime)=%s '
                          'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")
adPlayDf = sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                          'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                          'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")
# sqlContext.sql('select sum(pv) from radplay').foreach(printx)
adClick = sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                         'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                         'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")
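longTime2str, which backs dateformat here and to_datestr in Example 4, is not shown either. A sketch under the same epoch-milliseconds-to-day-string assumption as in Example 5:

import datetime

def longTime2str(millis):
    # Hypothetical: epoch millis -> "YYYYMMDD", matching the dateformat(createtime)=%s comparisons
    return datetime.datetime.fromtimestamp(int(millis) / 1000.0).strftime("%Y%m%d")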
Example 8: SQLContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
sqlContext = SQLContext(sc)
# path to hillary/enron avro
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)
# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")
# register UDF
sqlContext.registerFunction(
    "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)
# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
"e.contents e_mail, h.contents h_mail, "
"getCos(e.contents, h.contents) as cos_sim "
"from hillary as h join enron as e order by cos_sim "
"desc limit 1000")
# write back out to s3
out.save("s3n://datasets-396316040607/cos_sim/", format="json")
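DataFrame.save was the Spark 1.3 writer shortcut and was deprecated in 1.4; on later versions the write API is the equivalent:

# Equivalent on Spark 1.4+
out.write.format("json").save("s3n://datasets-396316040607/cos_sim/")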