This article collects typical usage examples of the Python method pyspark.SQLContext.jsonFile. If you have been wondering what SQLContext.jsonFile does and how to use it, the curated examples below may help. You can also read further about its enclosing class, pyspark.SQLContext.
Two code examples of SQLContext.jsonFile are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
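For orientation, here is a minimal, self-contained sketch of the method itself. It assumes a Spark 1.x installation: SQLContext.jsonFile was deprecated in Spark 1.4 and removed in Spark 2.0 in favor of sqlContext.read.json (the replacement already appears, commented out, in both examples below). The path is illustrative.

from pyspark import SparkContext, SQLContext

sc = SparkContext("local[*]", appName="jsonFileDemo")
sqlContext = SQLContext(sc)

# jsonFile reads newline-delimited JSON, infers the schema,
# and returns a DataFrame.
df = sqlContext.jsonFile("path/to/records.json")
df.printSchema()

# Spark 1.4+ equivalent:
df = sqlContext.read.json("path/to/records.json")

sc.stop()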
Example 1: main
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import jsonFile [as alias]
def main():
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 / 3,000 = avg 50 comments/topic
        # (randint is inclusive on both ends, so this assumes urlTitlePool
        # holds at least 3001 entries)
        onePst = bcURL.value[randint(0, 3000)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate the user relationship graph
        post2user = users_row.map(lambda x: (x[10], x[0]))  # (url, user) tuple
        graph = (post2user.join(post2user)                  # self join to find user relationships by post
                 .filter(lambda x: x[1][0] != x[1][1])      # remove all self-linked relationships
                 .map(makeAscOrder)                         # put each pair in ascending order by user name
                 .distinct()                                # the relationship is mutual, drop duplicate pairs
                 .map(lambda x: (x[1], 1))                  # ready to count the number of common edges
                 .reduceByKey(lambda x, y: x + y)           # total count for every edge/relationship
                 .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten, ready to write to the table
        graph.foreachPartition(insert_graph)
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # tuple indices: 0 author, 1 year_month, 2 created_utc, 3 subreddit,
            # 4 id, 5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate the user relationship graph
            post2user = users_row.map(lambda x: (x[10], x[0]))  # (url, user) tuple
            graph = (post2user.join(post2user)                  # self join to find user relationships by post
                     .filter(lambda x: x[1][0] != x[1][1])      # remove all self-linked relationships
                     .map(makeAscOrder)                         # put each pair in ascending order by user name
                     .distinct()                                # the relationship is mutual, drop duplicate pairs
                     .map(lambda x: (x[1], 1))                  # ready to count the number of common edges
                     .reduceByKey(lambda x, y: x + y)           # total count for every edge/relationship
                     .map(lambda x: (x[0][0], x[1], x[0][1])))  # flatten, ready to write to the table
                     #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)
    sc.stop()
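The graph step above is a plain RDD self-join keyed on the post URL. The following self-contained sketch runs the same pattern on toy data; makeAscOrder is reimplemented here under the assumption, matching its use above, that it puts each user pair into ascending order so mutual edges collapse into one.

from pyspark import SparkContext

sc = SparkContext("local[*]", appName="coCommentGraphDemo")

# (url, user) pairs: users commenting under the same url become linked
post2user = sc.parallelize([("u1", "alice"), ("u1", "bob"), ("u1", "carol"),
                            ("u2", "alice"), ("u2", "bob")])

def makeAscOrder(x):
    # assumed behavior: canonical ordering so (a, b) and (b, a) are one edge
    url, (a, b) = x
    return (url, (a, b)) if a <= b else (url, (b, a))

graph = (post2user.join(post2user)                  # all user pairs per url
         .filter(lambda x: x[1][0] != x[1][1])      # drop self pairs
         .map(makeAscOrder)
         .distinct()                                # one edge per (url, pair)
         .map(lambda x: (x[1], 1))
         .reduceByKey(lambda x, y: x + y)           # co-comment count per pair
         .map(lambda x: (x[0][0], x[1], x[0][1])))  # (userA, count, userB)

print(sorted(graph.collect()))
# [('alice', 1, 'carol'), ('alice', 2, 'bob'), ('bob', 1, 'carol')]
sc.stop()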
Example 2: main
# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import jsonFile [as alias]
def main():
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 / 3,000 = avg 50 comments/topic
        onePst = bcURL.value[randint(0, 3000)]
        return cmtTuple + (onePst[0], onePst[1])  # append title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #logFile = 's3a://reddit-comments/2012/RC_2012-12'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
                             #.repartition(REPARTITION_SIZE)

        # count distinct commenting users (author is tuple index 0)
        post2user = users_row.map(lambda x: x[0])
        nUsers = post2user.distinct().count()
        pp.pprint("distinct user number: " + str(nUsers) + "\n")
    else:
        for key in bucket.list():
            if '-' not in key.name.encode('utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            # tuple indices: 0 author, 1 year_month, 2 created_utc, 3 subreddit,
            # 4 id, 5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
                                 #.repartition(REPARTITION_SIZE)

            # count distinct commenting users (author is tuple index 0)
            post2user = users_row.map(lambda x: x[0])
            nUsers = post2user.distinct().count()
            pp.pprint("distinct user number: " + str(nUsers) + "\n")
    sc.stop()
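Both examples target the Spark 1.x API throughout (SQLContext, jsonFile, DataFrame.map). As a rough guide, the core of Example 2 on Spark 2.x+ would read as below; this is a sketch, not part of the original code, and the column name author matches the Reddit comment dumps used above.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RedditBatchLayer").getOrCreate()

df = spark.read.json("s3a://reddit-comments/2007/RC_2007-10")
nUsers = (df.filter(df["author"] != "[deleted]")
          .select("author").distinct().count())
print("distinct user number: " + str(nUsers))

spark.stop()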