本文整理汇总了Python中pyspark.sql.HiveContext.jsonFile方法的典型用法代码示例。如果您正苦于以下问题:Python HiveContext.jsonFile方法的具体用法?Python HiveContext.jsonFile怎么用?Python HiveContext.jsonFile使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.HiveContext的用法示例。
在下文中一共展示了HiveContext.jsonFile方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SparkConf
# Required import: from pyspark.sql import HiveContext
# Method demonstrated below: pyspark.sql.HiveContext.jsonFile
from pyspark import SparkContext,SparkConf
from pyspark.sql import HiveContext,SQLContext,Row
import json
# Build the Spark application context for this job.
conf = SparkConf().setAppName("Task")
sc = SparkContext(conf=conf)
# HiveContext layers HiveQL support on top of SQLContext.
hc = HiveContext(sc)
# Limit the number of shuffle partitions for this job.
hc.sql("set spark.sql.shuffle.partitions=10")
# jsonFile reads JSON records and infers a schema
# (a Spark 1.x API; newer versions use hc.read.json).
artists = hc.jsonFile("artists_en.json")
movies = hc.jsonFile("movies_en.json")
# Register the DataFrames as temp tables so they can be queried via SQL.
movies.registerTempTable("movies")
artists.registerTempTable("artists")
# Helper passed to foreach to print each element of an RDD.
def printx(x):
    """Print a single element and return None.

    Intended as the side-effect callback for RDD.foreach.
    """
    # Fix: the body was not indented in the original, which is a
    # syntax error; re-indent it under the def.
    print(x)
#Question 2 solution :
# Project only the columns of interest from the movies table.
movies_clean = hc.sql("select id,title,year,director,genre,country,actors from movies")
#Question 3 solution :
# Group movie titles by release year, producing (year, [titles...]) pairs.
mUs_movies = hc.sql("select year,title from movies") \
.map(lambda row : (row.year,row.title)).groupByKey() \
.mapValues(lambda data : [title for title in data])
# Question 4 solution :
# NOTE(review): the next statement is truncated in this excerpt
# (it ends with a line continuation and no right-hand side follows).
mUs_directors = hc.sql("select director,title from movies") \
示例2: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import jsonFile [as 别名]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    # Fix: the body of this guard was not indented in the original,
    # which is a syntax error; re-indent the whole script under it.
    # Path of the JSON tweet file, given as the first CLI argument.
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    # Fix: the SparkConf was created but never passed to the context,
    # so the app name was silently dropped.
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    # Fix: use print as a function (Python 2 print statements are a
    # syntax error under Python 3; the file's other examples already
    # use the function form).
    print("Loading tweets from " + inputFile)
    # jsonFile infers a schema from the JSON records (Spark 1.x API).
    # Renamed from `input`, which shadowed the builtin of that name.
    tweets = hiveCtx.jsonFile(inputFile)
    tweets.registerTempTable("tweets")
    # Ten most-retweeted tweets.
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print(topTweets.collect())
    topTweetText = topTweets.map(lambda row: row.text)
    print(topTweetText.collect())
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    # NOTE(review): 'text' is a string literal here, so the UDF measures
    # the 4-character literal on every row rather than the text column —
    # possibly intended as strLenPython(text); kept as written.
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print(lengthSchemaRDD.collect())
    sc.stop()
示例3: if_in_top_10_domain
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import jsonFile [as 别名]
u"www.backgrounds.sinaapp.com": 7,
u"liukebin.sinaapp.com": 13,
}
"""
i = 1
for domain in top_domain_list:
top_domain_dict[domain[0]] = i
i = i + 1
print top_domain_dict
"""
# Load one hour of raw nginx access-log records (JSON lines) from HDFS
# and register them as a temp table so they can be queried with SQL.
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")
hc.registerRDDAsTable(jsonRDD, "temp_schema")
def if_in_top_10_domain(domain):
    """Map a domain to its top-10 rank, or the string "no" if absent.

    Looks the domain up in the module-level top_domain_dict. Note the
    mixed return types (rank value vs. "no") mirror the original
    behavior that the Hive UDF registration relies on.
    """
    # Guard: reject missing or implausibly short domains. len(domain) < 3
    # also covers the empty string, so the separate == "" test is gone.
    # Fix: compare to None with `is`, not `==`.
    if domain is None or len(domain) < 3:
        return "no"
    # Fix: dict.has_key() is Python-2-only; dict.get with a default
    # replaces the has_key/else branch in one lookup.
    return top_domain_dict.get(domain, "no")
# Expose the Python function as a UDF callable from hc.sql queries.
hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)