

Python HiveContext.registerFunction method code examples

This article collects typical usage examples of the Python method pyspark.sql.HiveContext.registerFunction. If you are wondering what HiveContext.registerFunction does, how to call it, or where to find working examples, the curated snippets below should help. You can also explore further usage examples of pyspark.sql.HiveContext, the class this method belongs to.


The sections below present 9 code examples of HiveContext.registerFunction, ordered roughly by popularity.
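Before diving into the examples, here is a minimal sketch of the typical call pattern. It assumes a Spark 1.x build with Hive support; the app name, table name, and UDF name are illustrative and not taken from any of the examples below.

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType

conf = SparkConf().setAppName("registerFunction_demo")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Build a tiny temporary table to query against.
people = sc.parallelize([Row(name="alice"), Row(name="bob")])
hc.inferSchema(people).registerTempTable("people")

# Register a Python callable as a SQL UDF; the optional third argument
# declares the return type (StringType is assumed when it is omitted).
hc.registerFunction("name_len", lambda s: len(s), IntegerType())

# The registered name can now be used directly in HiveQL.
print(hc.sql("select name, name_len(name) from people").collect())

sc.stop()

Every example that follows is a variation on this pattern: create a HiveContext, register a temporary table, register one or more Python functions with registerFunction, and reference them from SQL.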

Example 1: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
Author: datafibers, Project: BigData-Analytics, Lines: 29, Source file: TwitterAnalytics.py

Example 2: Row

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
"""
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)

mytable = hc.inferSchema(convertRDD)

mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")


hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()
Author: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_cache.py

Example 3: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("spark_sql_udf")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

lines = sc.parallelize(["a", "b", "c"])

people = lines.map(lambda value: Row(name=value))

peopleSchema = hc.inferSchema(people)

peopleSchema.registerTempTable("people")


def myfunc(value):
    return value.upper()

hc.registerFunction("myfunc", myfunc, StringType())

rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()

sc.stop()

for row in rows:
    print row, type(row[0])
Author: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_udf.py

Example 4: parseCDN

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
    table = hc.applySchema(rows, schema)

    table.registerTempTable("temp_table")

    def parseCDN(video_cdn):
        if not video_cdn:
            return ""

        words = video_cdn.split("s=")

        if len(words) >= 2:
            return words[1].split(",")[0]

        return ""

    hc.registerFunction("parseCDN", parseCDN)

    def cal_buffer_num(set):
        buffer_count = 0
        buffer_t_sum = 0
        buffer_smaller_500ms_count = 0
        buffer_bigger_2min_count = 0

        if set is None:
            pass
        else:
            list = set

            for s in list:
                if s >= 500 and s <= 120000:
                    buffer_count = buffer_count + 1
Author: Leaderman, Project: pyspark, Lines: 33, Source file: app_picserversweibof6vwt_wapvideodownload.py

Example 5: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from __future__ import absolute_import, print_function, division, unicode_literals

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType

if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)

    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')

    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())

    print('### Schema ###')
    inputs.printSchema()
    print()

    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT
            r.id,
            r.alphabet
        FROM
            restaurants r
        WHERE
            r.pref_id = '13'
        AND r.alphabet <> ''
Author: pippobaudos, Project: spark-examples-1, Lines: 33, Source file: restaurants_parquet.py

Example 6: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
sc = SparkContext(conf=conf)

hc = HiveContext(sc)


def split_idc(idc):
    if idc is None or idc == '' or (not isinstance(idc, basestring)):
        return ''
    else:
        words = idc.split('.')
        if len(words) >= 2:
            return words[0] + '.' + words[1]
        else:
            return ''

hc.registerFunction("temp_split_idc", split_idc)

#--------------------------2.0 RDD-----------------------
spark_sql = '''select '1' as job_date,cdn,province,isp,ua,idc,play_process_group,version,init_timetag,buffer_count,
             sum(sum_play_process) as sum_play_process,
             sum(sum_video_init_duration) as sum_video_init_duration,
             sum(sum_buffer_t_sum) as sum_buffer_t_sum,
             sum(num) as num
             from(
             select cdn,province,isp,ua,play_process_group,version,init_timetag,buffer_count,sum_play_process,sum_video_init_duration,sum_buffer_t_sum,num,
             temp_split_idc(idc) as idc
             from datacubic.app_picserversweibof6vwt_wapvideodownload
             where log_dir= '20151012110000' and version>='5.4.5' limit 10
             )a
             group by cdn,province,isp,ua,idc,play_process_group,version,init_timetag,buffer_count'''
Author: Leaderman, Project: pyspark, Lines: 32, Source file: wapvideodownload_cdn_province_isp_idc.py

Example 7: if_in_top_10_domain

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")

hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    if domain is None or domain == "" or len(domain) < 3:
        return "no"
    else:
        if domain in top_domain_dict:
            return top_domain_dict[domain]
        else:
            return "no"


hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)

spark_sql = """select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (
                select domain,
                split(request,'\\\\?')[0] as url,
                body_bytes_sent
                from temp_schema
                where body_bytes_sent>0 and temp_if_in_top_10_domain(domain)!='no'
                )A
           group by domain,url limit 100
"""

rows_temp = hc.sql(spark_sql).map(lambda row: ((row.domain, if_in_top_10_domain(row.domain), row.url, row.flow), None))


def partitionFunc(key):
Author: Leaderman, Project: pyspark, Lines: 33, Source file: sae_nginx_top_url.py

Example 8: len

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("Usage: hive input file")
        exit(-1)

    path = sys.argv[1]

    conf = SparkConf().setAppName("spark_sql_hive")

    sc = SparkContext(conf=conf)

    hc = HiveContext(sc)

    # Create the table
    hc.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    # Load the data
    hc.sql("LOAD DATA INPATH '%s' INTO TABLE src" % path)
    # Register the UDF
    hc.registerFunction("myfunc", lambda name: name.upper())

    rows = hc.sql("select key, myfunc(value) from src").take(5)

    for row in rows:
        print row

    sc.stop()

Author: 2221758805, Project: SparkDemo, Lines: 30, Source file: hive_sql.py

Example 9: HiveContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

sc = SparkContext()

hc = HiveContext(sc)

source = sc.parallelize([("value",)])

schema = StructType([StructField("col", StringType(), False)])

table = hc.applySchema(source, schema)

table.registerTempTable("temp_table")


def func_string():
    return "abc"

hc.registerFunction("func_string", func_string)

rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())

rows = hc.sql("select func_int() from temp_table").collect()


def func_array():
    # list or tuple
    return [1, 2, 3]
Author: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_udf.py


Note: The pyspark.sql.HiveContext.registerFunction examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and any use or redistribution should follow the corresponding project's License. Please do not reproduce this article without permission.