This page collects typical usage examples of the Python method pyspark.SQLContext.applySchema. If you are wondering what SQLContext.applySchema does in practice or how to call it, the curated code samples below may help; you can also read further about the containing class, pyspark.SQLContext.
Two code examples of SQLContext.applySchema are shown below, ordered by popularity.
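Before the full examples, here is a minimal, self-contained sketch of the basic call pattern. It assumes Spark 1.x (where applySchema exists); the RDD contents, field names, and table name are invented for illustration:

from pyspark import SparkContext
from pyspark.sql import SQLContext, StructType, StructField, StringType

sc = SparkContext(appName="apply_schema_demo")
sql_context = SQLContext(sc)

# Hypothetical input: an RDD of tuples whose positions match the schema below.
rows = sc.parallelize([("1", "Alice"), ("2", "Bob")])
schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
])

# applySchema pairs an RDD of tuples with an explicit schema and returns a
# queryable SchemaRDD. (Deprecated since Spark 1.3 in favor of
# sql_context.createDataFrame(rows, schema); on 1.3+ the types are imported
# from pyspark.sql.types instead of pyspark.sql.)
people = sql_context.applySchema(rows, schema)
people.registerTempTable("people")
print(sql_context.sql("SELECT name FROM people WHERE id = '1'").collect())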
Example 1: str
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import applySchema [as alias]
# (Excerpt starts mid-function: the loop above this line builds
# gwid_hosid_dict, a gw_id -> hos_id lookup table.)
gwid_hosid_dict[gw_id] = hos_id
logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))
# Keep only well-formed 17-field records, trim whitespace from every field,
# and append the hos_id looked up from the gateway id (empty string if unknown).
users = lines.map(lambda x: x[1].split(',')) \
    .filter(lambda x: len(x) == 17) \
    .map(lambda p: tuple(f.strip() for f in p) + (gwid_hosid_dict.get(p[1].strip(), ''),))
logger.debug('-->users:' + str(users.count()))
schema_string = "id gw_id supp_id user_id user_type " \
"user_name login_time logout_time mac ip " \
"user_agent download_flow upload_flow os browser " \
"ratio batch_no hos_id"
# split() (rather than split(' ')) tolerates stray double spaces in the concatenated literal.
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split()]
schema = StructType(fields)
# Pair the cleaned RDD with the schema and expose it to Spark SQL.
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")
# register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())
lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)
# group by day, hos_id, (mac) and the repeat windows 2, 5, 10, 30, 60
# repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line: line.split('\t')).filter(lambda x: len(x) == 8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))
# The schema string was truncated in the source; the trailing fields below are
# inferred from the len(x) == 8 filter and the repeat windows listed above.
schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
Example 2: StructField
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import applySchema [as alias]
# (Excerpt starts mid-script: hosid_count is presumably an RDD of (mac, hos_id) pairs.)
# hosid_count.foreach(print_str)
# For every user (mac), collect the hos_ids seen and keep the top one.
user_top_hosid = hosid_count.groupByKey().mapValues(list).sortByKey() \
    .map(topcount)
# (u'00:66:4B:9B:0F:C9', u'')
# user_top_hosid.foreach(print_str)
# user, days, count
days_count = parts.map(convert_day).groupByKey().mapValues(set).map(compute_mark)
# e.g. (u'95:15:DF:EE:41:E9', u'\u5b558\u6708|\u5b55\u524d\u671f', u'2014-04-06')
# days_count.foreach(print_str)
# join: mac, mark, hosid
mac_mark_hosid = days_count.join(user_top_hosid).map(generate_ret)
# mac_mark_hosid.take(10)
# mac_mark_hosid.foreach(print_str)
fields = [
StructField('user', StringType(), True),
StructField('stage', StringType(), True),
StructField('conceive', StringType(), True),
StructField('area', StringType(), True)
]
schema = StructType(fields)
dest = sqlContext.applySchema(mac_mark_hosid, schema)
dest.registerTempTable("user_profile")
# Coalesce to 10 partitions so the output is not scattered across tiny files.
# (The DataFrame writer API used here arrived in Spark 1.4, where applySchema
# was already deprecated, so this example targets Spark 1.4-1.6.)
dest.coalesce(10).write.parquet(output, 'overwrite')
sc.stop()
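Finally, on Spark 1.3 and later applySchema is deprecated, and the pairing step in both examples translates mechanically to createDataFrame. A minimal sketch reusing Example 2's names:

# Same RDD and schema as in Example 2, assumed already defined.
dest = sqlContext.createDataFrame(mac_mark_hosid, schema)
dest.registerTempTable("user_profile")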