This article collects typical usage examples of the Python method pyspark.sql.HiveContext.registerFunction. If you have been wondering what HiveContext.registerFunction does, how to call it, or where to find working samples, the curated examples below should help. You can also read further about the class it belongs to, pyspark.sql.HiveContext.
The sections below show 9 code examples of HiveContext.registerFunction, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python samples.
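As a quick orientation before the examples: registerFunction(name, f, returnType) exposes a plain Python callable to SQL statements executed through the same context, and the return type defaults to StringType when omitted. Below is a minimal, self-contained sketch of the pattern (Spark 1.x API; the app, table, and UDF names here are illustrative, not taken from the examples that follow):

from pyspark import SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType

sc = SparkContext(appName="udf_demo")
hc = HiveContext(sc)
# expose a Python lambda to SQL under the name "str_len"
hc.registerFunction("str_len", lambda s: len(s), IntegerType())
words = hc.inferSchema(sc.parallelize([Row(word="spark"), Row(word="hive")]))
words.registerTempTable("words")
print hc.sql("SELECT word, str_len(word) FROM words").collect()
sc.stop()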
Example 1: SparkConf
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys
if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)  # pass conf so the app name takes effect
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # apply the UDF to the text column, not to the string literal 'text'
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
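The strong_people table registered above is never actually queried in this snippet. A natural continuation (an assumed addition, to be placed before sc.stop()) would be:

happyPeople = hiveCtx.sql("SELECT name, favouriteBeverage FROM strong_people")
print happyPeople.collect()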
Example 2: Row
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
def convert(val):
    return val.upper()

# no returnType is given, so the UDF's results default to StringType
hc.registerFunction("temp_convert", convert)
convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")
convertRDD.registerTempTable("temp_mytable")  # registerAsTable is a deprecated alias
hc.cacheTable("temp_mytable")

def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()
printRows(datas)
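An equivalent registration with the return type spelled out explicitly would be (same behavior, just clearer):

from pyspark.sql.types import StringType
hc.registerFunction("temp_convert", convert, StringType())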
Example 3: SparkConf
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType
conf = SparkConf().setAppName("spark_sql_udf")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
lines = sc.parallelize(["a", "b", "c"])
people = lines.map(lambda value: Row(name=value))
peopleSchema = hc.inferSchema(people)
peopleSchema.registerTempTable("people")
def myfunc(value):
    return value.upper()

hc.registerFunction("myfunc", myfunc, StringType())
# Row objects subclass tuple, so this filter keeps every row
rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()
for row in rows:
    print row, type(row[0])
sc.stop()  # stop the context only after the collected rows have been printed
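The isinstance(row, tuple) filter above keeps every result because pyspark.sql.Row is a subclass of tuple; a quick standalone check:

from pyspark.sql import Row
r = Row(name="abc")
print isinstance(r, tuple), r[0]  # prints: True abc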
Example 4: parseCDN
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
# "hc", "rows", and "schema" are assumed to be defined earlier in the original program
table = hc.applySchema(rows, schema)
table.registerTempTable("temp_table")
def parseCDN(video_cdn):
    # pull the value of the "s=" parameter out of the CDN string
    if not video_cdn:
        return ""
    words = video_cdn.split("s=")
    if len(words) >= 2:
        return words[1].split(",")[0]
    return ""

hc.registerFunction("parseCDN", parseCDN)
def cal_buffer_num(buffer_times):  # parameter renamed from "set", which shadows a builtin
    buffer_count = 0
    buffer_t_sum = 0
    buffer_smaller_500ms_count = 0
    buffer_bigger_2min_count = 0
    if buffer_times is None:
        pass
    else:
        for s in buffer_times:
            if s >= 500 and s <= 120000:
                buffer_count = buffer_count + 1
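The snippet is cut off inside the loop. A hedged reconstruction of the whole helper, assuming each counter tracks the obvious bucket (under 500 ms, between 500 ms and 2 minutes, over 2 minutes) and that all four tallies are returned, might look like this; the bucket assignments and the return shape are guesses, not from the original:

def cal_buffer_num_full(buffer_times):
    buffer_count = 0
    buffer_t_sum = 0
    buffer_smaller_500ms_count = 0
    buffer_bigger_2min_count = 0
    if buffer_times is not None:
        for s in buffer_times:
            if 500 <= s <= 120000:
                buffer_count += 1
                buffer_t_sum += s
            elif s < 500:
                buffer_smaller_500ms_count += 1
            else:  # s > 120000, i.e. longer than 2 minutes
                buffer_bigger_2min_count += 1
    return (buffer_count, buffer_t_sum,
            buffer_smaller_500ms_count, buffer_bigger_2min_count)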
Example 5: SparkConf
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from __future__ import absolute_import, print_function, division, unicode_literals
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType  # IntegerType lives in pyspark.sql.types
if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)
    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')
    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())
    print('### Schema ###')
    inputs.printSchema()
    print()
    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT
            r.id,
            r.alphabet
        FROM
            restaurants r
        WHERE
            r.pref_id = '13'
            AND r.alphabet <> ''
    """)  # the original snippet is cut off here; the query is closed minimally
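Consuming the result is then the usual collect-and-print (an assumed continuation, not part of the original snippet):

    for row in restaurants_in_tokyo.collect():
        print(row.id, row.alphabet)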
Example 6: SparkContext
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("wapvideodownload_stats")  # app name is illustrative, not from the original
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
def split_idc(idc):
    # keep only the first two dot-separated parts, e.g. "bx.cn.example" -> "bx.cn"
    if idc is None or idc == '' or not isinstance(idc, basestring):
        return ''
    words = idc.split('.')
    if len(words) >= 2:
        return words[0] + '.' + words[1]
    return ''

hc.registerFunction("temp_split_idc", split_idc)
#--------------------------2.0 RDD-----------------------
spark_sql = '''select '1' as job_date, cdn, province, isp, ua, idc, play_process_group, version, init_timetag, buffer_count,
    sum(sum_play_process) as sum_play_process,
    sum(sum_video_init_duration) as sum_video_init_duration,
    sum(sum_buffer_t_sum) as sum_buffer_t_sum,
    sum(num) as num
from (
    select cdn, province, isp, ua, play_process_group, version, init_timetag, buffer_count, sum_play_process, sum_video_init_duration, sum_buffer_t_sum, num,
        temp_split_idc(idc) as idc
    from datacubic.app_picserversweibof6vwt_wapvideodownload
    where log_dir = '20151012110000' and version >= '5.4.5' limit 10
) a
group by cdn, province, isp, ua, idc, play_process_group, version, init_timetag, buffer_count'''
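The snippet stops after building spark_sql; the assumed continuation simply runs it through the same context:

rows = hc.sql(spark_sql).collect()
for row in rows:
    print row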
Example 7: if_in_top_10_domain
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")
hc.registerRDDAsTable(jsonRDD, "temp_schema")
def if_in_top_10_domain(domain):
    # "top_domain_dict" is assumed to be built earlier in the original program
    if domain is None or domain == "" or len(domain) < 3:
        return "no"
    if domain in top_domain_dict:  # dict.has_key() is Python 2 only and deprecated
        return top_domain_dict[domain]
    return "no"

hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)
spark_sql = """select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (
select domain,
split(request,'\\\\?')[0] as url,
body_bytes_sent
from temp_schema
where body_bytes_sent>0 and temp_if_in_top_10_domain(domain)!='no'
)A
group by domain,url limit 100
"""
rows_temp = hc.sql(spark_sql).map(lambda row: ((row.domain, if_in_top_10_domain(row.domain), row.url, row.flow), None))
def partitionFunc(key):
    # the original snippet ends at the def line; a minimal body that
    # partitions by domain (the first element of the key tuple) could be:
    return hash(key[0])
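A hedged sketch of how such a custom partitioner is typically applied to the pair RDD built above (the partition count of 10 is illustrative):

partitioned = rows_temp.partitionBy(10, partitionFunc)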
Example 8: len
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: spark_sql_hive <input file>")
        exit(-1)
    path = sys.argv[1]
    conf = SparkConf().setAppName("spark_sql_hive")
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    # create the table
    hc.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    # load the data
    hc.sql("LOAD DATA INPATH '%s' INTO TABLE src" % path)
    # register the UDF (no returnType, so results default to StringType)
    hc.registerFunction("myfunc", lambda name: name.upper())
    rows = hc.sql("select key, myfunc(value) from src").take(5)
    for row in rows:
        print row
    sc.stop()
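On Spark 2.x and later, HiveContext and registerFunction are deprecated in favor of SparkSession and spark.udf.register. A minimal equivalent sketch (assuming the same src table exists):

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName("spark_sql_hive").enableHiveSupport().getOrCreate()
spark.udf.register("myfunc", lambda name: name.upper(), StringType())
rows = spark.sql("select key, myfunc(value) from src").take(5)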
Example 9: HiveContext
# Required imports: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import registerFunction [as alias]
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# "sc" is assumed to be an existing SparkContext
hc = HiveContext(sc)
source = sc.parallelize([("value",)])
schema = StructType([StructField("col", StringType(), False)])
table = hc.applySchema(source, schema)
table.registerTempTable("temp_table")

def func_string():
    return "abc"

hc.registerFunction("func_string", func_string)
rows = hc.sql("select func_string() from temp_table").collect()

def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())
rows = hc.sql("select func_int() from temp_table").collect()
def func_array():
    # list or tuple
    return [1, 2, 3]
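The snippet ends before func_array is registered; by analogy with the two registrations above, the continuation presumably looks like the following (the ArrayType element type is an assumption):

from pyspark.sql.types import ArrayType

hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))
rows = hc.sql("select func_array() from temp_table").collect()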