本文整理汇总了Python中pyspark.SQLContext.applySchema方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.applySchema方法的具体用法?Python SQLContext.applySchema怎么用?Python SQLContext.applySchema使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.SQLContext
的用法示例。
在下文中一共展示了SQLContext.applySchema方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: str
# Required import: from pyspark import SQLContext
# (method under review: SQLContext.applySchema)
# NOTE(review): this snippet starts mid-function — gw_id/hos_id come from an
# enclosing loop that is not visible here, as do lines/logger/sql_context/sc.
gwid_hosid_dict[gw_id] = hos_id
logger.debug('-->gwid_hosid:' + str(gwid_hosid_dict.__len__()))
# Split each raw record into exactly 17 comma-separated fields (others are
# dropped by the filter), strip whitespace from every field, and append the
# hos_id resolved from gw_id (p[1]) — empty string when the gateway is unknown.
users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
.map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(), \
p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))
# 18 space-separated column names: the 17 parsed fields plus the appended hos_id.
schema_string = "id gw_id supp_id user_id user_type " \
"user_name login_time logout_time mac ip " \
"user_agent download_flow upload_flow os browser " \
"ratio batch_no hos_id"
# Every column is declared as a nullable string.
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
# applySchema is the pre-1.3 Spark SQL API (deprecated in favour of
# SQLContext.createDataFrame).
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")
# Register UDFs used by the SQL executed below.
# NOTE(review): DateUtil is a project helper — exact semantics not visible here.
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())
lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)
# group by day,hosid,(mac),2, 5, 10, 30, 60
#repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line:line.split('\t')).filter(lambda x:len(x)==8)
repeat_list = sc.parallelize(lines_list).map(lambda line:line.split('\t'))
# NOTE(review): the source page truncated the snippet here — the string
# continuation on the next line is cut off mid-statement.
schema_string = "day hos_id mac t2 t5 " \
示例2: StructField
# Required import: from pyspark import SQLContext
# (method under review: SQLContext.applySchema)

# Dominant hotspot per user: group the (mac, hos_id) pairs, keep each user's
# hos_id list in key order, then let topcount pick the winner.
top_hosid_by_user = (hosid_count.groupByKey()
                     .mapValues(list)
                     .sortByKey()
                     .map(topcount))
# sample element: (u'00:66:4B:9B:0F:C9', u'')

# Distinct login days per user, folded into a stage mark by compute_mark.
mark_by_user = (parts.map(convert_day)
                .groupByKey()
                .mapValues(set)
                .map(compute_mark))
# sample element: (u'95:15:DF:EE:41:E9', u'\u5b558\u6708|\u5b55\u524d\u671f', u'2014-04-06')

# Join the two RDDs on mac and reshape each record via generate_ret
# into (mac, mark, hosid).
profile_rows = mark_by_user.join(top_hosid_by_user).map(generate_ret)

# Four nullable string columns for the profile table.
profile_schema = StructType([
    StructField('user', StringType(), True),
    StructField('stage', StringType(), True),
    StructField('conceive', StringType(), True),
    StructField('area', StringType(), True),
])
dest = sqlContext.applySchema(profile_rows, profile_schema)
dest.registerTempTable("user_profile")
# Shrink to 10 partitions before writing so the parquet output is not
# fragmented into many tiny files.
dest.coalesce(10).write.parquet(output, 'overwrite')
sc.stop()