This article collects typical usage examples of the Python method pyspark.SQLContext.registerFunction. If you have been asking yourself how exactly SQLContext.registerFunction is used in Python, the curated example code below may help. You can also explore further usage examples of the class it belongs to, pyspark.SQLContext.
The following shows 8 code examples of SQLContext.registerFunction, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
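Before the excerpts, here is a minimal self-contained sketch of the pattern they all share. registerFunction(name, f, returnType=StringType()) exposes a plain Python function to Spark SQL on a Spark 1.x SQLContext (on Spark 2.x+ the equivalent entry point is spark.udf.register); the table name and data below are made up for illustration:

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.types import IntegerType

conf = SparkConf().setMaster("local[2]").setAppName("register_function_demo")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Build a tiny DataFrame and expose it to SQL
df = sqlContext.createDataFrame([Row(name="spark", year=2010), Row(name="python", year=1991)])
df.registerTempTable("projects")

# Register a Python function as a SQL UDF; the return type defaults to StringType
sqlContext.registerFunction("name_len", lambda s: len(s), IntegerType())
sqlContext.sql("SELECT name, name_len(name) AS n FROM projects").show()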
Example 1: SparkConf
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

def name_place(name, place, price, evaluation):
    # (function body truncated in the source excerpt; the UDF tags each book, e.g. as "good")
    return name + "," + "good"

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    json_path = os.path.abspath("../doc/book.json")
    # Read the JSON file and register it as a temp table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")
    # Register the user-defined function (UDF)
    sqlContext.registerFunction("name_place", name_place)
    evalRDD = sqlContext.sql("SELECT name_place(name, place, price, evaluation) AS book_eval FROM json_book")
    # bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))
    evalRDD.show()
    # Map over the query results
    bookMap = evalRDD.map(lambda books: books.book_eval)
    general_list = []
    good_list = []
    for book in bookMap.collect():
        book = book.encode("utf-8").split(',')
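The excerpt ends mid-loop. A plausible completion, assuming (based on the two lists above) that the UDF tags each book as either good or general:

        # Hypothetical completion: route each "name,tag" pair into the matching list
        name, tag = book[0], book[1]
        if tag == "good":
            good_list.append(name)
        else:
            general_list.append(name)
    print("good: %s" % good_list)
    print("general: %s" % general_list)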
Example 2: str
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
# (the snippet begins mid-statement: a map() building per-user tuples from split log lines)
           p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
           p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
           p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))
schema_string = "id gw_id supp_id user_id user_type " \
"user_name login_time logout_time mac ip " \
"user_agent download_flow upload_flow os browser " \
"ratio batch_no hos_id"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")
# Register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())
lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)
# group by day,hosid,(mac),2, 5, 10, 30, 60
#repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line:line.split('\t')).filter(lambda x:len(x)==8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))
schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_repeat_list = sql_context.applySchema(repeat_list, schema)
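A note for readers on newer Spark: applySchema is the Spark 1.2-era API and was deprecated in 1.3. A minimal equivalent with the same RDD-plus-schema inputs:

# On Spark 1.3+, replace applySchema(rdd, schema) with createDataFrame
schema_repeat_list = sql_context.createDataFrame(repeat_list, schema)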
Example 3: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
master = "spark://hadoop:7077"
appName = "spark_loginflowlog"
#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
input = '/input/loginfowlog/*'
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true")
        )
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
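The normal_mac helper wrapped by the to_mac UDF is not part of the excerpt. A plausible sketch of such a normalizer (only the name comes from the source; the implementation is an assumption):

import re

def normal_mac(mac):
    # Hypothetical helper: strip separators and normalize to AA:BB:CC:DD:EE:FF
    if mac is None:
        return ""
    hexdigits = re.sub(r"[^0-9A-Fa-f]", "", mac).upper()
    if len(hexdigits) != 12:
        return ""
    return ":".join(hexdigits[i:i + 2] for i in range(0, 12, 2))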
Example 4: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
master = "local[*]"
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
# logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day
logFile = "/input/loginfowlog/02*"
conf = (SparkConf()
        .setMaster(master)
        .setAppName("loginflowlog2mysql")
        # .set("spark.kryoserializer.buffer.mb", "256")
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType())
df = sqlContext.read.parquet(logFile)
rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')
fields = [
    StructField('logintype', StringType(), True),
    StructField('logtype', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('suppid', StringType(), True),
    StructField('logtime', LongType(), True),
    StructField('usermac', StringType(), True)
]
schema = StructType(fields)
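The excerpt stops once the schema is built. A plausible continuation, assuming the selected columns are re-wrapped with the explicit schema and then queried through the to_datestr UDF registered above:

# Hypothetical continuation: apply the explicit schema, then aggregate per day
log_df = sqlContext.createDataFrame(rdd.rdd, schema)
sqlContext.registerDataFrameAsTable(log_df, "loginflowlog")
day_uv = sqlContext.sql("select to_datestr(logtime) day, count(distinct usermac) uv "
                        "from loginflowlog group by to_datestr(logtime)")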
Example 5: SparkContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
# -- set the processing date (DAY_OFFSET days back)
DAY_OFFSET = 1
now = datetime.datetime.now()
pro_time = now - datetime.timedelta(days=DAY_OFFSET)
day = pro_time.strftime("%Y%m%d")
master = "spark://hadoop:7077"
appName = "spark_pageflow_outflow"
input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
sc = SparkContext(master, appName)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")
_sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
"from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
"group by to_str(url),to_day(createtime)" % day
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
list = []  # note: shadows Python's built-in list
for r in rs:
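Neither mill_date_str nor bytearray_str appears in the excerpt; plausible sketches under the obvious reading of the calls above (both implementations are assumptions):

import datetime

def mill_date_str(millis):
    # Hypothetical: epoch milliseconds -> "YYYYMMDD" day string
    return datetime.datetime.fromtimestamp(int(millis) / 1000.0).strftime("%Y%m%d")

def bytearray_str(value):
    # Hypothetical: Parquet binary columns come back as bytearray; decode to text
    return bytes(value).decode("utf-8") if value is not None else ""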
Example 6: analysis_email
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

def analysis_email(email):
    """
    Extract the mail provider from an email address.
    """
    return email.split("@")[1].split(".")[0]

if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Register the user-defined function (UDF)
    sqlContext.registerFunction("analysis_email", analysis_email)
    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)
    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))
    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # Cache the table (optional)
    # sqlContext.cacheTable("information")
    # sqlContext.uncacheTable("information")
    """
Example 7: dateformat
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
_adLoadDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823569766},
    {'uid': '3', 'adid': 'c', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823550766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '133', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823268766},
]).registerAsTable("adload")
_adPlayDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'createtime': 1450823569766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'createtime': 1450823268766},
]).registerAsTable("adplay")
_adClickDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823580766},
]).registerAsTable("adclick")
'''
sqlContext.registerFunction("dateformat", lambda x: longTime2str(x), StringType())
adLoadDf = sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                          'from adload where createtime is not null and dateformat(createtime)=%s '
                          'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")
adPlayDf = sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                          'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                          'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")
# sqlContext.sql('select sum(pv) from radplay').foreach(printx)
adClick = sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                         'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                         'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")
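longTime2str, which backs dateformat here and to_datestr in Example 4, is not shown either. A sketch under the same epoch-milliseconds-to-day-string assumption as in Example 5:

import datetime

def longTime2str(millis):
    # Hypothetical: epoch millis -> "YYYYMMDD", matching the dateformat(createtime)=%s comparisons
    return datetime.datetime.fromtimestamp(int(millis) / 1000.0).strftime("%Y%m%d")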
Example 8: SQLContext
# Module to import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import registerFunction [as alias]
sqlContext = SQLContext(sc)
# path to hillary/enron avro
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)
# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")
# register UDF
sqlContext.registerFunction(
    "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)
# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
"e.contents e_mail, h.contents h_mail, "
"getCos(e.contents, h.contents) as cos_sim "
"from hillary as h join enron as e order by cos_sim "
"desc limit 1000")
# write back out to s3
out.save("s3n://datasets-396316040607/cos_sim/", format="json")
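DataFrame.save was the Spark 1.3 writer shortcut and was deprecated in 1.4; on later versions the write API is the equivalent:

# Equivalent on Spark 1.4+
out.write.format("json").save("s3n://datasets-396316040607/cos_sim/")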