本文整理汇总了Python中pyspark.sql.SQLContext.registerDataFrameAsTable方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.registerDataFrameAsTable方法的具体用法?Python SQLContext.registerDataFrameAsTable怎么用?Python SQLContext.registerDataFrameAsTable使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext
的用法示例。
在下文中一共展示了SQLContext.registerDataFrameAsTable方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: save_pslist
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
def save_pslist(sContext):
    """Run the Volatility 'pslist' plugin over stored image info.

    Reads the 'Volatility/imageinfo' parquet data through a SQL table,
    executes the plugin on every row, and appends the output to the
    'Volatility/pslist' parquet store.
    """
    plugin = utils.SparkVolatility('pslist')
    sql_ctx = SQLContext(sContext)
    # Expose the imageinfo data as a SQL table, then read it back in full.
    info_df = sql_ctx.load('Volatility/imageinfo')
    sql_ctx.registerDataFrameAsTable(info_df, "imageinfo")
    image_rows = sql_ctx.sql("SELECT * FROM imageinfo")
    # Execute the plugin per row; cache the intermediate RDD before the
    # DataFrame conversion and save.
    results = image_rows.map(plugin.Execute)
    results.cache()
    results.toDF().save('Volatility/pslist', 'parquet', 'append')
示例2: travelTime
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
def travelTime(startInt, endInt, routeIndex):
# Get route info
# [route_short_name] [start_lat] [start_lon] [end_lat] [end_lon]
curs.execute(BIXISQL.getRouteRow(routeIndex))
route = curs.fetchall()
if route[0][0] != 24: return # REMOVE
# Construct SQL Query for ttc raw data table
initQuery = BIXISQL.getTTCRawRouteSQL(tableName, startInt, endInt, route[0][0])
#print "Initial Query: " + initQuery
# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)
# Get tables from SQL Database
#print "BQ"
ttcRawTable = sqlContext.load(None, "jdbc", None, url=BIXISQL.getSrcUrl(), dbtable=initQuery, driver=BIXISQL.getDriverName())
sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")
#print ttcRawTable.count()
if ttcRawTable.count() < 1:
sc.stop()
return
#print "AQ"
#routeTable = sqlContext.load(None, "jdbc", None, url=urlDest, dbtable=routeQuery, driver=driverName)
# change into accessible array
#route = routeTable.collect()
#idList = sqlContext.sql("SELECT DISTINCT(vehicle_id) FROM rawData").sample(False, sampleRate).collect()
#print "idList: " + str(len(idList)) + " [" + str(route[0][0]) + "]"
#for row in idList:
#print row
# print "vehicle_id: " + str(row.vehicle_id)
# tempTable = sqlContext.sql("SELECT dateTime, dirTag FROM rawData WHERE vehicle_id=" + str(row.vehicle_id))
# print "Count: " + str(tempTable.count())
# print "start: "
# tempTable.sort(asc('dateTime')).show(n=1)
# print "end: "
# tempTable.sort(desc('dateTime')).show(n=1)
curTime = startInt
#print "route: " + str(route[i].route_short_name)
# Get the upper and lower bounds for the start location of the route
#startLatUpper = round(float(str(route[0][1])), Prec) + Tol
#startLatLower = round(float(str(route[0][1])), Prec) - Tol
#startLonUpper = round(float(str(route[0][2])), Prec) + Tol
#startLonLower = round(float(str(route[0][2])), Prec) - Tol
#endLatUpper = round(float(str(route[0][3])), Prec) + Tol
#endLatLower = round(float(str(route[0][3])), Prec) - Tol
#endLonUpper = round(float(str(route[0][4])), Prec) + Tol
#endLonLower = round(float(str(route[0][4])), Prec) - Tol
#print "start: " + str(startLatLower) + " " + str(startLatUpper) + " " + str(startLonLower) + " " + str(startLonUpper)
#print "end: " + str(endLatLower) + " " + str(endLatUpper) + " " + str(endLonLower) + " " + str(endLonUpper)
# Select a sample list of bus ids
idList = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(startInt) + "' AND dateTime<'" + str(startInt + timeInt) + "' ORDER BY nbBikes ASC").limit(maxSampleSize)
sqlContext.registerDataFrameAsTable(idList, "idTable")
curTime = startInt + timeInt
while curTime < endInt:
temp = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "' ORDER BY nbBikes").limit(maxSampleSize)
idList = idList.unionAll(temp)
curTime += timeInt
idList.show()
idList = idList.distinct().collect()
# Loop through bus id list to calculate travel time
print "Route: " + str(route[0][0])
trvSchm = ['startTime', 'trvTime']
trvTimeList = sqlContext.createDataFrame([('00:00:00',0)], trvSchm).limit(0)
#newRow = sqlContext.createDataFrame([('00:00:01',1)], schema)
#trvTimeList = trvTimeList.unionAll(newRow)
for busrow in idList:
print busrow.nbBikes
temp = sqlContext.sql("SELECT dateTime FROM rawData WHERE station_id=" + str(busrow.station_id) + " ORDER BY dateTime ASC").collect()
rangeSize = len(temp)
print str(temp[0].dateTime) + " " + str(temp[0].dirTag)
print "List Size: " + str(rangeSize)
trvStart = temp[0].dateTime
trvCount = 0
trvSum = 0
trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
for i in range(1, rangeSize):
#print temp[i]
if temp[i].dirTag != temp[i-1].dirTag:
trvEnd = temp[i-1].dateTime #DT.datetime.strptime(temp[i-1].dateTime, "%Y-%m-%d %H:%M:%S")
tempTrip = (trvEnd - trvStart).total_seconds() / 60 # caculate travel time in minutes
if tempTrip > minTravel:
trvSum += tempTrip
trvCount += 1
#trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
#newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
#trvTimeList = trvTimeList.unionAll(newRow)
#print "new: " + str(trvStart.hour) + " " + str(trvInt) + " " + str(tempTrip)
trvStart = temp[i].dateTime
if (int(trvStart.hour / timeIntHr) * timeIntHr != trvInt) and (trvCount != 0):
# print "trvInt: " + str(trvInt) + " " + str(trvStart.hour / timeIntHr)
#.........这里部分代码省略.........
示例3: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
from pyspark.sql import SQLContext, Row
if __name__ == '__main__':
    # Spark configuration: app name plus Cassandra connection credentials.
    conf = (SparkConf()
            .setAppName("spark_demo")
            .set("spark.cassandra.connection.host", "CASSANDRA_DOCKER_IP")
            .set("spark.cassandra.auth.username", "username")
            .set("spark.cassandra.auth.password", "password"))
    # Cassandra keyspace and column family to query.
    keyspace = "spark_demo"
    table = "person_data"
    spark_context = SparkContext(conf=conf)
    sql_context = SQLContext(spark_context)
    # Load the Cassandra table through the spark-cassandra connector.
    data_frame = (sql_context.read.format("org.apache.spark.sql.cassandra")
                  .options(keyspace=keyspace, table=table)
                  .load())
    sql_context.registerDataFrameAsTable(data_frame, table)
    # Most common first names, descending by frequency.
    query = sql_context.sql("""SELECT first_name, count(*) as count from person_data group by first_name order by count desc""")
    # Show the top 25 rows of the result.
    query.show(25)
    spark_context.stop()
示例4: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext, Row
# Spark setup: 5G of driver memory for the song-recommendation challenge job.
conf = (SparkConf().set("spark.driver.memory", "5G"))
sc = SparkContext(conf=conf, appName="Challenge-2")
sql = SQLContext(sc)
# Loading Train and Test Data: each line is <user>\t<song>\t<play count>.
train_triplets = sc.textFile('file:///users/jeevan4/challenge2/hash_train_triplets.txt').map(lambda x: x.split('\t')).map(lambda x: Row(user=x[0], song=x[1], counts=int(x[2])))
train_visible = sc.textFile('file:///users/jeevan4/challenge2/hash_year1_*visible.txt').map(lambda x: x.split('\t')).map(lambda x: Row(user=x[0], song=x[1], counts=int(x[2])))
test_hidden = sc.textFile('file:///users/jeevan4/challenge2/hash_year1_*hidden.txt').map(lambda x: x.split('\t')).map(lambda x: Row(user=x[0], song=x[1], counts=int(x[2])))
train_data = train_triplets.union(train_visible)
# total_data = train_data.union(test_hidden)
# Creating dataframes so the triplet RDDs can be queried in a structured way.
train_df = sql.createDataFrame(train_data)
test_df = sql.createDataFrame(test_hidden)
# Creating logical tables from the dataframes.
sql.registerDataFrameAsTable(train_df, 'train_data_table')
sql.registerDataFrameAsTable(test_df, 'test_data_table')
# Normalize each test play count by that user's maximum count in the train set.
test_norm_data = sql.sql('Select x.user,song,x.counts/y.maximum from test_data_table x INNER JOIN (select user,max(counts) as maximum from train_data_table group by user) y on x.user = y.user')
# Save the normalized test dataset to the filesystem as a single partition.
# BUG FIX: the original line never closed the map(...) call -- the closing
# paren of str(x.c2) was misplaced so .coalesce(1).saveAsTextFile(...) was
# chained onto the str result and the statement was a SyntaxError. The
# RDD methods must be chained onto the result of map().
# NOTE(review): x.c2 assumes Spark auto-names the computed column 'c2';
# verify against the Spark version in use.
test_norm_data.map(lambda x: x.user + ',' + x.song + ',' + str(x.c2)).coalesce(1).saveAsTextFile('file:///users/jeevan4/challenge2/test_norm')
示例5: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# --- CLI arguments: end of the time window, station and output file --------
# NOTE(review): startTime is used below but not defined in this excerpt --
# presumably parsed from sys.argv[1] just above this fragment; confirm.
endTime = DT.datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
station_id = sys.argv[3]
fileName = sys.argv[4]
# JDBC connection settings come from environment variables.
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]
# Start Spark SQL Context
sc = SparkContext("local", "TTCData")
sqlContext = SQLContext(sc)
# Pushdown query: lat/lon samples for one station inside [startTime, endTime).
initQuery = "(SELECT dateTime, nbBikes, lat, lon FROM BIXI WHERE dateTime>='" + str(startTime) + "' AND dateTime<'" + str(endTime) + \
"' AND station_id=" + str(station_id) + ") AS T"
routeTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName)
sqlContext.registerDataFrameAsTable(routeTable, "rawData")
# Query the route's start stop lat and lon.
# NOTE(review): routeTag, Prec and Tol are also undefined in this excerpt --
# presumably module-level constants / CLI arguments; confirm.
initQuery = "(SELECT start_lat, start_lon FROM BIXI_ROUTES WHERE route_short_name=" + str(routeTag) + ") AS T"
routeLoc = sqlContext.load(None, "jdbc", None, url=dbDestUrl, dbtable=initQuery, driver=driverName).collect()
# Calculate lat and lon acceptance ranges around the route start point,
# rounded to Prec digits and widened by the tolerance Tol.
startLatUpper = round(float(str(routeLoc[0].start_lat)), Prec) + Tol
startLatLower = round(float(str(routeLoc[0].start_lat)), Prec) - Tol
startLonUpper = round(float(str(routeLoc[0].start_lon)), Prec) + Tol
startLonLower = round(float(str(routeLoc[0].start_lon)), Prec) - Tol
# Accumulators for the per-interval loop that follows this excerpt.
curTime = startTime
freqList = []
freqCount = {}
示例6:
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
)
# display the schema of the dataframe.
df.show()
# display the schema of the dataframe.
df.printSchema()
# a data frame is equivalent to a relational table in Spark SQL. To select a column from the dataframe:
print df.select(df.name).collect()
# registering data frame as a temporary table.
# df.registerTempTable("person")
# register dataframe as a sqlContext table.
sqlContext.registerDataFrameAsTable(df, "person")
# perform sql select query upon the registered context table person.
d = sqlContext.sql("select name from person")
# creating a new RDD by applying a function to each row.
names = d.map(lambda p: "name: " + p.name)
# to print new row data.
for name in names.collect():
print name
# to print list of tables in the current database.
print sqlContext.tableNames()
# creating new data frame containing union of rows in two dataframes.
示例7: findAgeCoded
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
age2 = findAgeCoded(row2['age'], listCoded, listOfRows)
resultAge = abs(age2-age1)
resultGender = antiSimOfGender(row1['gender'],row2['gender'])
resultOccupation = antiSimOfOccupation(row1['occupation'],row2['occupation'])
resultZipCode = antiSimOfZipCode(row1['zipCode'],row2['zipCode'])
return (resultGender+resultOccupation+resultZipCode+resultAge)/4
# Spark job: load the user base and build an ordinal code for the age column.
conf = SparkConf().setAppName('rawDataToOutlineData').setMaster('spark://HP-Pavilion:7077')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Load the user base from parquet and inspect its content/schema.
dfUser = sqlContext.read.parquet('user_base')
dfUser.show()
print(dfUser.dtypes)
print(dfUser.schema)
sqlContext.registerDataFrameAsTable(dfUser,'user_base')
"""对序数性属性age进行编码"""
# (The bare string above means: "encode the ordinal attribute 'age'".)
# Count users per distinct age, ordered ascending by age.
dfUserGrouped = dfUser.groupBy(dfUser.age).count()
listOfRows = dfUserGrouped.orderBy('age').collect()
print(listOfRows)
num = len(listOfRows)
# Build one {age: code} dict per distinct age. The code is 1 plus the
# number of users with a strictly smaller age (cumulative-count ranking),
# carried forward from the previous entry in l.
l = []
for index in range(num):
    temp = {}
    if(index == 0):
        # Smallest age gets code 1.
        temp[listOfRows[index]['age']] = 1
    else:
        # Previous code plus the previous age's user count.
        M = l[index-1][listOfRows[index-1]['age']]+listOfRows[index-1]['count']
        temp[listOfRows[index]['age']] = M
    l.append(temp)
print(l)
示例8: map
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# Replace ' ' by '_' in the crimes frame's column names so they are legal
# SQL identifiers in the queries below.
col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)
f_crimes.names = col_names
# Parse the raw Date column into derived date fields, then drop the original.
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")
# Expose the H2O frames as Spark DataFrames.
df_weather = h2oContext.as_spark_frame(f_weather)
df_census = h2oContext.as_spark_frame(f_census)
df_crimes = h2oContext.as_spark_frame(f_crimes)
# Use Spark SQL to join the datasets:
# register the three DataFrames as tables in the SQL context.
sqlContext.registerDataFrameAsTable(df_weather, "chicagoWeather")
sqlContext.registerDataFrameAsTable(df_census, "chicagoCensus")
sqlContext.registerDataFrameAsTable(df_crimes, "chicagoCrime")
crimeWithWeather = sqlContext.sql("""SELECT
a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,
a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,
a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,
b.minTemp, b.maxTemp, b.meanTemp,
c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,
c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,
c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
FROM chicagoCrime a
JOIN chicagoWeather b
ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day
示例9: count
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# Count rows in the joined table that are missing average_stars.
full['average_stars'].isnull().sum()
###############################################################################################################
#### Data Exploration for each Dataset
###############################################################################################################
# Work with the training splits under shorter aliases.
business = train_bus
user = train_usr
review = train_rvw
checkin = train_check
################# Summary of User Table ###########################
user.describe().show()
sqlContext.registerDataFrameAsTable(user, "user")
# Observations from the summary:
# no duplicate records
# no duplicate on userid column
# only one column has zero average_stars --- zero average_stars means no rating given by user but the user has given review
# average of average_stars = 3.75
user.printSchema()
# Confirm user_id uniqueness by de-duplicating and counting.
t = user.dropDuplicates(['user_id'])
t.count()
# Users that have a review_count of zero.
df2 = sqlContext.sql("SELECT count(review_count) as count from user where review_count = 0.0 ")
# For test data
test_usr.describe().show()
# min review count is one
test_usr.printSchema()
t = test_usr.dropDuplicates(['user_id'])
示例10: StructField
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# Build the 'stores' DataFrame from a tab-separated file: id | name.
# Ids are kept as plain strings for simplicity.
store_rows = sc.textFile(base_path + "stores.txt").map(lambda line: line.split('\t'))
store_schema = StructType([
    StructField('id', StringType(), False),    # name, type, nullable
    StructField('name', StringType(), False),
])
stores = sqlCtx.createDataFrame(store_rows, store_schema)
# Build the 'products' DataFrame: id | name | category (category nullable).
product_rows = sc.textFile(base_path + "products.txt").map(lambda line: line.split('\t'))
product_schema = StructType([
    StructField('id', StringType(), False),
    StructField('name', StringType(), False),
    StructField('category', StringType(), True),
])
products = sqlCtx.createDataFrame(product_rows, product_schema)
# Register all three frames so plain SQL (joins, GROUP BY, ...) works on them.
for frame, alias in ((sales, "sales"), (stores, "stores"), (products, "products")):
    sqlCtx.registerDataFrameAsTable(frame, alias)
# Join sales against stores and products via their string ids.
sqlCtx.sql("SELECT * FROM sales sa join stores st on sa.store=st.id").show()
sqlCtx.sql("SELECT * FROM sales s join products p on s.product=p.id").show()
# .explain / .agg are also available on DataFrames.
# DataFrame-API equivalent of a SQL aggregate query.
from pyspark.sql import functions as funcs
sales.groupBy('day').agg(funcs.min('store').alias('MinStore'), funcs.max('quantity').alias('MaxQty')).show()
示例11: countBus
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
def countBus(startInt, endInt):
    """Count bus frequency per route between startInt and endInt and POST the result.

    For every route returned by the CVST portal, raw vehicle records near the
    route's start location are filtered and the number of distinct passes is
    estimated from gaps larger than timeInt between consecutive timestamps.
    """
    #if startInt < DT.datetime.strptime("2015-03-23 12:00:00", "%Y-%m-%d %H:%M:%S"): return # ATTENTION: TAKE OUT
    # Start Spark SQL Context
    sc = SparkContext("local", "BIXIData")
    sqlContext = SQLContext(sc)
    # Query route list
    routeData = requests.get('http://portal.cvst.ca/api/0.1/ttc/routes')
    routeTable = routeData.json()
    routeSize = len(routeTable)
    # Count bus freq for list of routes
    for route in routeTable:
        #if route.route_short_name != 85:
        # continue
        # Get the upper and lower bounds for the start location of the route.
        # NOTE(review): routeTable comes from .json(), i.e. dicts -- attribute
        # access like route.start_lat would raise AttributeError; this likely
        # should be route['start_lat'] etc. Confirm the payload shape.
        # NOTE(review): Prec, Tol and timeInt are module globals not visible
        # in this excerpt.
        trgLat = route.start_lat
        trgLon = route.start_lon
        latUpper = round(trgLat, Prec) + Tol
        latLower = round(trgLat, Prec) - Tol
        lonUpper = round(trgLon, Prec) + Tol
        lonLower = round(trgLon, Prec) - Tol
        # Scale factor for route with same start and end location
        rpFactor = 1
        if (route.end_lat >= latLower) and (route.end_lat <= latUpper) and (route.end_lon >= lonLower) and (route.end_lon <= lonUpper):
            rpFactor = 2
        else:
            rpFactor = 1
        # get ttc raw data table
        # NOTE(review): startTime/endTime are used as bare names for dict keys
        # here (and in freq_table below) -- unless they are module globals this
        # raises NameError; string keys were probably intended.
        Period = {startTime:startInt,endTime:endInt}
        ttcRawTable = requests.get('http://portal.cvst.ca/api/0.1/ttc/routetag/'+route.route_short_name,params=Period)
        # NOTE(review): ttcRawTable is a requests.Response, not a DataFrame --
        # registerDataFrameAsTable cannot work on it; the response must first
        # be converted (e.g. .json() -> createDataFrame). Confirm intent.
        sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")
        # Filter data with matching location
        resTable = sqlContext.sql("SELECT nbBikes, dateTime FROM rawData WHERE " +\
        "lat>=" + str(latLower) + " AND lat<=" + str(latUpper) +\
        " AND lon>=" + str(lonLower) + " AND lon<=" + str(lonUpper))
        sqlContext.registerDataFrameAsTable(resTable, "resData")
        # Select unique buses in matched data
        idList = sqlContext.sql("SELECT nbBikes FROM resData").collect()
        # Find freq of each unique bus: a gap larger than timeInt between
        # consecutive sightings counts as a separate pass.
        freq = 0
        for idRow in idList:
            # NOTE(review): idRow only has an nbBikes field (selected above);
            # idRow.station_id would fail. Confirm the intended column.
            finList = sqlContext.sql("SELECT dateTime FROM resData WHERE station_id=" + str(idRow.station_id) + " ORDER BY dateTime ASC").collect()
            for i in range(0, len(finList) - 1):
                if (finList[i + 1].dateTime - finList[i].dateTime) > timeInt:
                    freq += 1
            freq += 1
        #insert into db
        freq_table = {routeTag:route.route_short_name, startTime:startInt, freq: int(round(freq/rpFactor))}
        r = requests.post('http://portal.cvst.ca/api/0.1/ttc_freq/store',data=freq_table)
    # Stop Spark
    sc.stop()
    return
示例12: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# JDBC connection settings from the environment.
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]
#print sys.argv[1] + " " + sys.argv[2] + ' ' + sys.argv[3] + ' ' + sys.argv[4]
# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)
# Pushdown query: lat/lon samples for one station inside [startTime, endTime).
initQuery = ("(SELECT dateTime, lat, lon FROM BIXI WHERE dateTime<'" + str(endTime)
             + "' AND dateTime>='" + str(startTime) + "'"
             + " AND station_id=" + str(station_id) + ") AS T")
#print initQuery
lineTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName).sample(False, sampleRate)
sqlContext.registerDataFrameAsTable(lineTable, "lineTbl")
# Collect once, then compute standard deviations and means of both coordinates.
sampledRows = lineTable.collect()
latList = [row.lat for row in sampledRows]
lonList = [row.lon for row in sampledRows]
latDev = numpy.std(latList)
lonDev = numpy.std(lonList)
latMean = numpy.mean(latList)
lonMean = numpy.mean(lonList)
# Compute valid ranges of lat and lon based on <stdDevOut>
示例13: they
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# Define the SQL and conf objects for Spark.
# Note: these need not be defined if the code is run on a Databricks cluster,
# because SC and CONF are provided there by default.
conf = SparkConf().setAppName("Dota2")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Path should be changed to where the data resides.
# In our case we are using a Databricks Spark cluster, so this path is a
# Databricks table whose data files reside on AWS S3 storage.
# Register the match-history JSON as a dataframe.
df = sqlContext.read.json("/FileStore/tables/3v7vab741462231829765/my_match_history.json")
df.printSchema()
# Register the dataframe as a table.
sqlContext.registerDataFrameAsTable(df, "my_full_table")
# Perform ETL and field profiling using queries:
# hero_summary and item_summary ETL and profiling.
hero_summary = sqlContext.read.json("/FileStore/tables/wbn4nbby1461991735655/hero_summary.json")
sqlContext.registerDataFrameAsTable(hero_summary, "hero_summary")
hero_summary.printSchema()
item_summary = sqlContext.read.json("/FileStore/tables/vvzkk2qc1462214278912/item_summmary.json")
sqlContext.registerDataFrameAsTable(item_summary, "item_summary")
item_summary.printSchema()
# Queries for changing the data values - profiling.
# NOTE(review): Spark SQL does not support UPDATE statements against
# registered temp tables -- these three queries will fail at parse time;
# SELECTs with JOINs were presumably intended. Confirm against the Spark
# version in use.
hero_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.hero_id = hero_summary.heroes.localized_name from hero_summary where my_full_table.reslt.players.hero_id = hero_summary.heroes.hero_id")
item0_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.item_0 = item_summary.items.localized_name from item_summary where my_full_table.reslt.players.item_0 = item_summary.items.id")
item1_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.item_1 = item_summary.items.localized_name from item_summary where my_full_table.reslt.players.item_1 = item_summary.items.id")
示例14: SparkContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
# Public DNS of the Hadoop namenode, taken from the environment.
master_public_dns = os.environ['master_public_dns']
# setting SparkContext and SQLContext
# NOTE(review): master_ip is referenced here but not defined in this
# excerpt -- presumably read from the environment just above; confirm.
sc = SparkContext("spark://" + master_ip + ":7077", "2013_events")
sqlContext = SQLContext(sc)
# reading events data for 2013 from HDFS
df13 = sqlContext.jsonFile("hdfs://" + master_public_dns + ":9000/data_jan2015/2013-*.*")
# filtering rows with just the three relevant events
df13_watch = df13.filter("type='WatchEvent'")
df13_commit = df13.filter("type='CommitCommentEvent'")
df13_fork = df13.filter("type='ForkEvent'")
# registering dataframes as tables to be able to select just the three relevant columns
sqlContext.registerDataFrameAsTable(df13_watch, "df13_watch_table")
sqlContext.registerDataFrameAsTable(df13_commit, "df13_commit_table")
sqlContext.registerDataFrameAsTable(df13_fork, "df13_fork_table")
# creating new dataframes with just the relevant columns
df13_watch_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_watch_table")
df13_commit_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_commit_table")
df13_fork_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_fork_table")
# registering dataframes as tables to get a union of all
sqlContext.registerDataFrameAsTable(df13_watch_altered, "df13_watch_altered_table")
sqlContext.registerDataFrameAsTable(df13_commit_altered, "df13_commit_altered_table")
sqlContext.registerDataFrameAsTable(df13_fork_altered, "df13_fork_altered_table")
# unifying tables with filtered events and columns
df13_altered_union = sqlContext.sql("SELECT * from df13_watch_altered_table UNION ALL SELECT * from df13_commit_altered_table UNION ALL SELECT * from df13_fork_altered_table")
示例15: __repr__
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import registerDataFrameAsTable [as 别名]
def __repr__(self):
    """Return a printable representation of this user record.

    BUG FIX: the original returned a tuple, but ``__repr__`` must return a
    str -- calling repr() on an instance raised TypeError. The same visible
    content is kept by rendering the tuple as its repr string.
    """
    return repr((self.username, self.follows))
# connecting to "watch_events" keyspace in Cassandra and creating new Spark Context
connection.setup(['127.0.0.1', '52.8.41.216'], "watch_events")
sc = SparkContext("spark://ip-172-31-2-89:7077", "watchevents")
sqlContext = SQLContext(sc)
# reading data from HDFS and filtering with 'WatchEvent', 'CommitEvent', 'PushEvent'
df = sqlContext.jsonFile("hdfs://ec2-52-8-127-252.us-west-1.compute.amazonaws.com:9000/data_jan2015/2015-01-01-*")
df_watch = df.filter("type='WatchEvent'")
df_push = df.filter("type='PushEvent'")
df_commit = df.filter("type='CommitEvent'")
# registering data frames as tables to unite all rows
sqlContext.registerDataFrameAsTable(df_watch, "df_watch_table")
sqlContext.registerDataFrameAsTable(df_push, "df_push_table")
sqlContext.registerDataFrameAsTable(df_commit, "df_commit_table")
df_union = sqlContext.sql("SELECT * from df_watch_table UNION ALL SELECT * from df_push_table")
# spark job to group all repos for a given username considering the three filtered events above
userrepomap = df_union.map(lambda p: (p.actor.login, tuple((p.repo.name, datetime.strptime(p.created_at[:-1], "%Y-%m-%dT%H:%M:%S"))))).groupByKey()
userrepocoll = userrepomap.map(lambda x: (x[0], dict(x[1]))).collect()
# getting users->followers from HDFS
# reverse mapping it to users->following
dffoll = sqlContext.jsonFile("hdfs://ec2-52-8-127-252.us-west-1.compute.amazonaws.com:9000/user/2015-01-01-0_followers.json")
collfoll = dffoll.map(lambda x : (x.login, x.following)).collect()
follr = []
# follr is a list of dicts with users and people who these users follow