This article collects typical usage examples of the Python method pyspark.sql.HiveContext.jsonRDD. If you have been wondering what HiveContext.jsonRDD does, how to call it, or what real-world uses look like, the curated examples below should help. You can also read more about the containing class, pyspark.sql.HiveContext.
Three code examples of HiveContext.jsonRDD are shown below, sorted by popularity by default.
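All three examples share the same call pattern: jsonRDD takes an RDD of JSON strings, infers a schema from the documents, and returns a queryable table. As a quick orientation, here is a minimal sketch of that pattern (the app name and JSON payload are illustrative only; jsonRDD belongs to the Spark 1.x API and was later replaced by spark.read.json):

# Minimal sketch of the jsonRDD call pattern (Spark 1.x API);
# the app name and JSON payload are illustrative, not from the examples.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(conf=SparkConf().setAppName("jsonRDD_minimal"))
hc = HiveContext(sc)
rdd = sc.parallelize(['{"name": "a", "value": 1}'])  # RDD of JSON strings
df = hc.jsonRDD(rdd)   # schema is inferred from the JSON documents
df.printSchema()
print(df.collect())
sc.stop()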
Example 1: run
# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import jsonRDD [as alias]
# check_log, parse_data, rename_column, confirm_row, write_log and now()
# are helper functions assumed to be defined elsewhere in the same module.
import gc
import time
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

def run(inpath, outpath, mode='append'):
    gc.disable()
    print("===== Checking if Log Exists =====")
    check_log(inpath)
    print("===== Passed Log Checking =====")
    # initialize SparkContext and HiveContext
    conf = SparkConf().setAppName("Forgate Log Parser")
    sc = SparkContext(conf=conf)
    sqlCtx = HiveContext(sc)
    start_time = time.time()
    print("===== INPUT FILE PATH: %s =====" % (str(inpath)))
    print("===== OUTPUT FILE PATH: %s =====" % (str(outpath)))
    print("===== %s Reading Data From HDFS" % (now()))
    distFile = sc.textFile(inpath)
    cnt_raw = distFile.count()
    print("===== Count of Input Data: %s =====" % (str(cnt_raw)))
    print("===== %s Parsing Data" % (now()))
    parsedData = parse_data(sc, distFile)
    print("===== Count of Parsed Data: %s =====" % (str(parsedData.count())))
    print("===== %s Saving Data" % (now()))
    # parse the RDD of JSON strings into a DataFrame
    jsonData = sqlCtx.jsonRDD(parsedData)
    # rename columns before writing; 'dt' becomes the partition column
    old_col = ['time', 'date']
    new_col = ['time_', 'dt']
    jsonData = rename_column(jsonData, old_col, new_col)
    jsonData.write.partitionBy('dt').parquet(outpath, mode=mode)
    print("===== %s Checking Data" % (now()))
    confirm_row(sqlCtx, outpath)
    write_log(inpath)
    print("--- Total took %s seconds ---" % (time.time() - start_time))
    sc.stop()
    gc.enable()
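The rename_column helper used above is not shown in the source. A plausible minimal sketch, assuming it simply maps old names to new ones with the standard DataFrame.withColumnRenamed method:

# Hypothetical sketch of the rename_column helper; the real
# implementation is not part of the source snippet.
def rename_column(df, old_cols, new_cols):
    for old, new in zip(old_cols, new_cols):
        df = df.withColumnRenamed(old, new)  # standard DataFrame API
    return df

Renaming 'date' to 'dt' before the write matters because the partition column name becomes part of the Parquet output directory layout (dt=.../).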
Example 2: SparkConf
# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import jsonRDD [as alias]
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_cache")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# an RDD of JSON strings, one document per element
source = sc.parallelize([
    '{"col1": "row1_col1", "col2": "row1_col2", "col3": "row1_col3"}',
    '{"col1": "row2_col1", "col2": "row2_col2", "col3": "row2_col3"}',
    '{"col1": "row3_col1", "col2": "row3_col2", "col3": "row3_col3"}'])
sourceRDD = hc.jsonRDD(source)
sourceRDD.registerTempTable("temp_source")

def convert(row):
    # upper-case col1 while preserving the other fields
    mydict = row.asDict()
    mydict["col1"] = mydict["col1"].upper()
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)
# inferSchema builds a DataFrame from an RDD of Rows
# (deprecated since Spark 1.3 in favor of createDataFrame)
mytable = hc.inferSchema(convertRDD)
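The snippet stops after inferSchema. A natural continuation, not in the source, would be to register the converted table and inspect it:

# Hypothetical continuation: register the converted table and query it
mytable.registerTempTable("temp_converted")
print(hc.sql("select col1 from temp_converted").collect())
sc.stop()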
Example 3: SparkConf
# Required module: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import jsonRDD [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("spark_sql_json")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# an RDD of JSON strings, one document per element
source = sc.parallelize([
    '{"col1": "row1_col1", "col2": "row1_col2", "col3": "row1_col3"}',
    '{"col1": "row2_col1", "col2": "row2_col2", "col3": "row2_col3"}',
    '{"col1": "row3_col1", "col2": "row3_col2", "col3": "row3_col3"}'])
table = hc.jsonRDD(source)
# registerAsTable is the older alias for registerTempTable
table.registerAsTable("temp_mytable")
datas = hc.sql("select * from temp_mytable").collect()
sc.stop()

if datas:
    for data in datas:
        # Python 2 print statement, matching the Spark 1.x era of this API
        print data.col1, data.col2, data.col3
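One variant worth knowing (not shown in the source): jsonRDD also accepts an explicit schema as its second argument, which skips the sampling-and-inference pass over the data. A sketch reusing source and hc from Example 3, placed before sc.stop():

# Variant: supply an explicit schema instead of inferring one
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([StructField(c, StringType(), True)
                     for c in ("col1", "col2", "col3")])
typed_table = hc.jsonRDD(source, schema)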