本文整理汇总了Python中pyspark.SQLContext.jsonFile方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.jsonFile方法的具体用法?Python SQLContext.jsonFile怎么用?Python SQLContext.jsonFile使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.SQLContext
的用法示例。
在下文中一共展示了SQLContext.jsonFile方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark import SQLContext [as 别名]
# 或者: from pyspark.SQLContext import jsonFile [as 别名]
def main():
    """Batch layer: load raw Reddit comment JSON from S3 with Spark,
    write per-comment rows into Cassandra, and build a user-relationship
    graph (users linked by commenting on the same post).

    Reads either a single month (``smallBatch``) or every monthly dump in
    ``RAW_JSON_REDDIT_BUCKET`` starting from ``FROM_YEAR_MONTH``.
    Relies on module-level config/helpers: SPARK_ADDRESS, urlTitlePool,
    AWS credentials, insert_into_cassandra, insert_graph, makeAscOrder.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)
    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair drawn at random from the broadcast pool."""
        # ~150,000 comments / 3000 posts = avg 50 comments per topic.
        # randint is inclusive on BOTH ends, so the upper bound must be
        # len(pool) - 1 (the original randint(0, 3000) could index one
        # element past the end of a 3000-entry pool).
        pool = bcURL.value
        onePst = pool[randint(0, len(pool) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    if smallBatch:
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        # NOTE(review): jsonFile is deprecated in Spark >= 1.4; the
        # commented line below is the modern equivalent.
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # NOTE(review): the file above is 2007-10 but month is set to 12
        # — looks like a mislabel; confirm intended partition key.
        year = 2007
        month = 12
        # Row layout:
        # 0=author 1=year_month 2=created_utc 3=subreddit 4=id
        # 5=body 6=score 7=ups 8=controversiality 9=title 10=url
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # Build the user relationship graph from (URL, user) tuples.
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph = post2user.join(post2user)\
                         .filter(lambda x: x[1][0] != x[1][1])\
                         .map(makeAscOrder)\
                         .distinct()\
                         .map(lambda x: (x[1], 1))\
                         .reduceByKey(lambda x, y: x + y)\
                         .map(lambda x: (x[0][0], x[1], x[0][1]))
        # self-join pairs users who commented on the same post; the
        # filter drops self-pairs, makeAscOrder + distinct de-duplicates
        # the mutual relationship, reduceByKey counts shared posts, and
        # the final map flattens to (userA, count, userB) for the table.
        graph.foreachPartition(insert_graph)
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            # Dump names look like .../RC_YYYY-MM: recover year and month
            # from the hyphen-separated pieces.
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            # Skip months earlier than the configured starting point.
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # Row layout: 0=author 1=year_month 2=created_utc 3=subreddit
            # 4=id 5=body 6=score 7=ups 8=controversiality 9=title 10=url
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # Same graph construction as the smallBatch branch, per month.
            post2user = users_row.map(lambda x: (x[10], x[0]))
            graph = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x + y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
                             #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)
    sc.stop()
示例2: main
# 需要导入模块: from pyspark import SQLContext [as 别名]
# 或者: from pyspark.SQLContext import jsonFile [as 别名]
def main():
    """Diagnostic batch job: load Reddit comment JSON from S3 with Spark
    and print the number of distinct (non-deleted) comment authors.

    Mirrors the ingestion pipeline of the batch layer but stops after
    counting unique users. Relies on module-level config/helpers:
    SPARK_ADDRESS, urlTitlePool, AWS credentials, FROM_YEAR_MONTH, pp.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)
    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        """Append a (title, url) pair drawn at random from the broadcast pool."""
        # ~150,000 comments / 3000 posts = avg 50 comments per topic.
        # randint is inclusive on BOTH ends, so the upper bound must be
        # len(pool) - 1 (the original randint(0, 3000) could index one
        # element past the end of a 3000-entry pool).
        pool = bcURL.value
        onePst = pool[randint(0, len(pool) - 1)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    if smallBatch:
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #logFile = 's3a://reddit-comments/2012/RC_2012-12'
        # NOTE(review): jsonFile is deprecated in Spark >= 1.4; the
        # commented line below is the modern equivalent.
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        # Author name is element 0 of each row tuple.
        post2user = users_row.map(lambda x: x[0])
        nUsers = post2user.distinct().count()
        pp.pprint("distinct user number:" + str(nUsers) + "\n")
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            # Dump names look like .../RC_YYYY-MM: recover year and month
            # from the hyphen-separated pieces.
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            # Skip months earlier than the configured starting point.
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            # Author name is element 0 of each row tuple.
            post2user = users_row.map(lambda x: x[0])
            nUsers = post2user.distinct()\
                              .count()
            pp.pprint("distinct user number:" + str(nUsers) + "\n")
    sc.stop()