

Python SQLContext.registerDataFrameAsTable Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.SQLContext.registerDataFrameAsTable. If you are wondering what SQLContext.registerDataFrameAsTable does, or how to use it in practice, the curated code examples below should help. You can also explore further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


Below are 15 code examples of SQLContext.registerDataFrameAsTable, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
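
For quick orientation, here is a minimal, self-contained sketch (not taken from the examples below) of what registerDataFrameAsTable does: it registers a DataFrame as a temporary table in the SQLContext catalog so that it can then be queried with sqlContext.sql. The "people" table name and the sample rows are illustrative assumptions.

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local", "register-table-demo")
sqlContext = SQLContext(sc)

# Build a small DataFrame and register it as the temporary table "people".
df = sqlContext.createDataFrame([Row(name="Alice", age=34), Row(name="Bob", age=29)])
sqlContext.registerDataFrameAsTable(df, "people")

# The registered table name can now be used in SQL statements.
sqlContext.sql("SELECT name FROM people WHERE age > 30").show()

sc.stop()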

Example 1: save_pslist

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
def save_pslist(sContext):
    volatility = utils.SparkVolatility('pslist')
    sqlContext = SQLContext(sContext)
    df = sqlContext.load('Volatility/imageinfo')
    sqlContext.registerDataFrameAsTable(df, "imageinfo")
    imginfo = sqlContext.sql("SELECT * FROM imageinfo")
    pslist = imginfo.map(volatility.Execute)
    pslist.cache()
    df = pslist.toDF()
    df.save('Volatility/pslist', 'parquet', 'append')
Developer: jleaniz, Project: bdsa, Lines of code: 12, Source file: pslist.py
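
Example 1 uses the pre-1.4 SQLContext.load / DataFrame.save API, which was later deprecated in favour of the reader/writer API. Below is a hedged sketch of the same pipeline on a more recent PySpark, keeping the paths and the project-specific utils.SparkVolatility helper from the original and assuming (as the default load format implies) that 'Volatility/imageinfo' is a parquet dataset.

def save_pslist(sContext):
    volatility = utils.SparkVolatility('pslist')
    sqlContext = SQLContext(sContext)
    # read.parquet replaces sqlContext.load for the default parquet source
    df = sqlContext.read.parquet('Volatility/imageinfo')
    sqlContext.registerDataFrameAsTable(df, "imageinfo")
    imginfo = sqlContext.sql("SELECT * FROM imageinfo")
    # newer DataFrames no longer expose map() directly, so go through .rdd
    pslist = imginfo.rdd.map(volatility.Execute)
    pslist.cache()
    # write.mode('append').parquet replaces DataFrame.save
    pslist.toDF().write.mode('append').parquet('Volatility/pslist')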

Example 2: travelTime

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
def travelTime(startInt, endInt, routeIndex):
    # Get route info
    # [route_short_name] [start_lat] [start_lon] [end_lat] [end_lon]
    curs.execute(BIXISQL.getRouteRow(routeIndex))
    route = curs.fetchall()

    if route[0][0] != 24: return # REMOVE

    # Construct SQL Query for ttc raw data table
    initQuery = BIXISQL.getTTCRawRouteSQL(tableName, startInt, endInt, route[0][0])
    #print "Initial Query: " + initQuery

    # Start Spark SQL Context
    sc = SparkContext("local", "BIXIData")
    sqlContext = SQLContext(sc)

    # Get tables from SQL Database
    #print "BQ"
    ttcRawTable = sqlContext.load(None, "jdbc", None, url=BIXISQL.getSrcUrl(), dbtable=initQuery, driver=BIXISQL.getDriverName())
    sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")
    #print ttcRawTable.count()
    if ttcRawTable.count() < 1:
        sc.stop()
        return
    #print "AQ"
    #routeTable = sqlContext.load(None, "jdbc", None, url=urlDest, dbtable=routeQuery, driver=driverName)

    # change into accessible array
    #route = routeTable.collect()

    #idList = sqlContext.sql("SELECT DISTINCT(vehicle_id) FROM rawData").sample(False, sampleRate).collect()
    #print "idList: " + str(len(idList)) + " [" + str(route[0][0]) + "]"
    #for row in idList:
        #print row
    #    print "vehicle_id: " + str(row.vehicle_id)
    #    tempTable = sqlContext.sql("SELECT dateTime, dirTag FROM rawData WHERE vehicle_id=" + str(row.vehicle_id))
    #    print "Count: " + str(tempTable.count())
    #    print "start: "
    #    tempTable.sort(asc('dateTime')).show(n=1)
    #    print "end: "
    #    tempTable.sort(desc('dateTime')).show(n=1)

    curTime = startInt
    #print "route: " + str(route[i].route_short_name)

    # Get the upper and lower bounds for the start location of the route
    #startLatUpper = round(float(str(route[0][1])), Prec) + Tol
    #startLatLower = round(float(str(route[0][1])), Prec) - Tol
    #startLonUpper = round(float(str(route[0][2])), Prec) + Tol
    #startLonLower = round(float(str(route[0][2])), Prec) - Tol
    #endLatUpper = round(float(str(route[0][3])), Prec) + Tol
    #endLatLower = round(float(str(route[0][3])), Prec) - Tol
    #endLonUpper = round(float(str(route[0][4])), Prec) + Tol
    #endLonLower = round(float(str(route[0][4])), Prec) - Tol
    #print "start: " + str(startLatLower) + " " + str(startLatUpper) + " " + str(startLonLower) + " " + str(startLonUpper)
    #print "end: " + str(endLatLower) + " " + str(endLatUpper) + " " + str(endLonLower) + " " + str(endLonUpper) 

    # Select a sample list of bus ids
    idList = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(startInt) + "' AND dateTime<'" + str(startInt + timeInt) + "' ORDER BY nbBikes ASC").limit(maxSampleSize)
    sqlContext.registerDataFrameAsTable(idList, "idTable")
    curTime = startInt + timeInt
    while curTime < endInt:
        temp = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "' ORDER BY nbBikes").limit(maxSampleSize)
        idList = idList.unionAll(temp)
        curTime += timeInt
    idList.show()
    idList = idList.distinct().collect()

    # Loop through bus id list to calculate travel time
    print "Route: " + str(route[0][0])
    trvSchm = ['startTime', 'trvTime']
    trvTimeList = sqlContext.createDataFrame([('00:00:00',0)], trvSchm).limit(0)
    #newRow = sqlContext.createDataFrame([('00:00:01',1)], schema)
    #trvTimeList = trvTimeList.unionAll(newRow)

    for busrow in idList:
        print busrow.nbBikes
        temp = sqlContext.sql("SELECT dateTime, dirTag FROM rawData WHERE station_id=" + str(busrow.station_id) + " ORDER BY dateTime ASC").collect()
        rangeSize = len(temp)
        print str(temp[0].dateTime) + " " + str(temp[0].dirTag)
        print "List Size: " + str(rangeSize)
        trvStart = temp[0].dateTime
        trvCount = 0
        trvSum = 0
        trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
        for i in range(1, rangeSize):
            #print temp[i]
            if temp[i].dirTag != temp[i-1].dirTag:
                trvEnd = temp[i-1].dateTime #DT.datetime.strptime(temp[i-1].dateTime, "%Y-%m-%d %H:%M:%S")
                tempTrip = (trvEnd - trvStart).total_seconds() / 60  # calculate travel time in minutes
                if tempTrip > minTravel:
                    trvSum += tempTrip
                    trvCount += 1
                    #trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
                    #newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
                    #trvTimeList = trvTimeList.unionAll(newRow)
                    #print "new: " + str(trvStart.hour) + " " + str(trvInt) + " " + str(tempTrip)
                trvStart = temp[i].dateTime
                if (int(trvStart.hour / timeIntHr) * timeIntHr != trvInt) and (trvCount != 0):
                #    print "trvInt: " + str(trvInt) + " " + str(trvStart.hour / timeIntHr)
#......... the rest of the code is omitted here .........
Developer: zsy2053, Project: BIXI, Lines of code: 103, Source file: BIXITravel.py

Example 3: SparkConf

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row


if __name__ == '__main__':
    # Define Spark configuration
    conf = SparkConf()
    conf.setAppName("spark_demo")
    conf.set("spark.cassandra.connection.host", "CASSANDRA_DOCKER_IP")
    conf.set("spark.cassandra.auth.username", "username")
    conf.set("spark.cassandra.auth.password", "password")

    # Define the Cassandra keyspace and column family to query
    keyspace = "spark_demo"
    table = "person_data"

    spark_context = SparkContext(conf=conf)
    sql_context = SQLContext(spark_context)

    data_frame = sql_context.read.format("org.apache.spark.sql.cassandra") \
            .options(keyspace=keyspace, table=table).load()

    sql_context.registerDataFrameAsTable(data_frame, table)

    # Perform an SQL query against the Spark data frame
    query = sql_context.sql("""SELECT first_name, count(*) as count from person_data group by first_name order by count desc""")
    
    # Show the output via sql_context (top 25)
    query.show(25)

    spark_context.stop()
Developer: danpilch, Project: cassandra-spark-analytics, Lines of code: 32, Source file: count.py

Example 4: SparkContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext, Row
conf = (SparkConf().set("spark.driver.memory", "5G"))
sc = SparkContext(conf=conf,appName="Challenge-2")
sql = SQLContext(sc)

# Loading Train and Test Data
train_triplets = sc.textFile('file:///users/jeevan4/challenge2/hash_train_triplets.txt').map(lambda x:x.split('\t')).map(lambda x: Row(user=x[0],song=x[1],counts=int(x[2])))
train_visible = sc.textFile('file:///users/jeevan4/challenge2/hash_year1_*visible.txt').map(lambda x:x.split('\t')).map(lambda x: Row(user=x[0],song=x[1],counts=int(x[2])))
test_hidden = sc.textFile('file:///users/jeevan4/challenge2/hash_year1_*hidden.txt').map(lambda x:x.split('\t')).map(lambda x: Row(user=x[0],song=x[1],counts=int(x[2])))
train_data = train_triplets.union(train_visible)

# total_data = train_data.union(test_hidden)

# Creating a dataframe so that they can be used as Structured way 
train_df = sql.createDataFrame(train_data)

test_df = sql.createDataFrame(test_hidden)

# Creating a logical table from dataframe
sql.registerDataFrameAsTable(train_df,'train_data_table')

sql.registerDataFrameAsTable(test_df,'test_data_table')

# Generating Normalized test dataset by calculating max counts from the entire dataset
test_norm_data = sql.sql('Select x.user,song,x.counts/y.maximum as c2 from test_data_table x INNER JOIN (select user,max(counts) as maximum from train_data_table group by user) y on x.user = y.user')

# saving the test normalized dataset into filesystem with single partition
test_norm_data.map(lambda x: x.user+','+x.song+','+str(x.c2)).coalesce(1).saveAsTextFile('file:///users/jeevan4/challenge2/test_norm')
Developer: jeevan4, Project: recommendation_system, Lines of code: 31, Source file: test_norm.py

Example 5: SparkContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
endTime = DT.datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
station_id = sys.argv[3]
fileName = sys.argv[4]
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]

# Start Spark SQL Context
sc = SparkContext("local", "TTCData")
sqlContext = SQLContext(sc)

# Query lat and lon data for specified time period and routeTag
initQuery = "(SELECT dateTime, nbBikes, lat, lon FROM BIXI WHERE dateTime>='" + str(startTime) + "' AND dateTime<'" + str(endTime) + \
                "' AND station_id=" + str(station_id) + ") AS T"
routeTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName)
sqlContext.registerDataFrameAsTable(routeTable, "rawData")

# Query route stop lat and lon
initQuery = "(SELECT start_lat, start_lon FROM BIXI_ROUTES WHERE route_short_name=" + str(routeTag) + ") AS T"
routeLoc = sqlContext.load(None, "jdbc", None, url=dbDestUrl, dbtable=initQuery, driver=driverName).collect()

# Calculate lat and lon ranges
startLatUpper = round(float(str(routeLoc[0].start_lat)), Prec) + Tol
startLatLower = round(float(str(routeLoc[0].start_lat)), Prec) - Tol
startLonUpper = round(float(str(routeLoc[0].start_lon)), Prec) + Tol
startLonLower = round(float(str(routeLoc[0].start_lon)), Prec) - Tol

# Loop through at time interval
curTime = startTime
freqList = []
freqCount = {}
Developer: zsy2053, Project: BIXI, Lines of code: 33, Source file: BIXIFreq.py

Example 6:

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
)

# display the contents of the dataframe.
df.show()

# display the schema of the dataframe.
df.printSchema()

# a data frame is equivalent to a relational table in Spark SQL. To select a column from the dataframe:
print df.select(df.name).collect()

# registering data frame as a temporary table.
# df.registerTempTable("person")

# register dataframe as a sqlContext table.
sqlContext.registerDataFrameAsTable(df, "person")

# perform sql select query upon the registered context table person.
d = sqlContext.sql("select name from person")

# creating a new RDD by applying a function to each row.
names = d.map(lambda p: "name: " + p.name)

# to print new row data.
for name in names.collect():
    print name

# to print list of tables in the current database.
print sqlContext.tableNames()

# creating new data frame containing union of rows in two dataframes.
Developer: rajbirk, Project: python-spark-mysql-example, Lines of code: 33, Source file: sql_code.py

Example 7: findAgeCoded

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
    age2 = findAgeCoded(row2['age'], listCoded, listOfRows)
    resultAge = abs(age2-age1)
    resultGender = antiSimOfGender(row1['gender'],row2['gender'])
    resultOccupation = antiSimOfOccupation(row1['occupation'],row2['occupation'])
    resultZipCode = antiSimOfZipCode(row1['zipCode'],row2['zipCode'])
    return (resultGender+resultOccupation+resultZipCode+resultAge)/4
    
conf = SparkConf().setAppName('rawDataToOutlineData').setMaster('spark://HP-Pavilion:7077')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

dfUser = sqlContext.read.parquet('user_base')
dfUser.show()
print(dfUser.dtypes)
print(dfUser.schema)
sqlContext.registerDataFrameAsTable(dfUser,'user_base')
"""对序数性属性age进行编码"""
dfUserGrouped = dfUser.groupBy(dfUser.age).count()
listOfRows = dfUserGrouped.orderBy('age').collect()
print(listOfRows)
num = len(listOfRows)
l = []
for index in range(num):
    temp = {}
    if(index == 0):
        temp[listOfRows[index]['age']] = 1
    else:
        M = l[index-1][listOfRows[index-1]['age']]+listOfRows[index-1]['count']
        temp[listOfRows[index]['age']] = M
    l.append(temp)
print(l)
Developer: raymon-tian, Project: recommenderSystemBasedOnSpark, Lines of code: 33, Source file: countAntiSimForTwoUsers.py

Example 8: map

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
# Replace ' ' by '_' in column names
col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)
f_crimes.names = col_names
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")

# Expose H2O frames as Spark DataFrame

df_weather = h2oContext.as_spark_frame(f_weather)
df_census = h2oContext.as_spark_frame(f_census)
df_crimes = h2oContext.as_spark_frame(f_crimes)

# Use Spark SQL to join datasets

# Register DataFrames as tables in SQL context
sqlContext.registerDataFrameAsTable(df_weather, "chicagoWeather")
sqlContext.registerDataFrameAsTable(df_census, "chicagoCensus")
sqlContext.registerDataFrameAsTable(df_crimes, "chicagoCrime")


crimeWithWeather = sqlContext.sql("""SELECT
a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,
a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,
a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,
b.minTemp, b.maxTemp, b.meanTemp,
c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,
c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,
c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
FROM chicagoCrime a
JOIN chicagoWeather b
ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day
Developer: Zolok, Project: sparkling-water, Lines of code: 33, Source file: ChicagoCrime.py

Example 9: count

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
full['average_stars'].isnull().sum()

###############################################################################################################
#### Data Exploration for each Dataset
###############################################################################################################

business = train_bus
user = train_usr
review = train_rvw
checkin = train_check



################# Summary of User Table ###########################
user.describe().show()
sqlContext.registerDataFrameAsTable(user, "user")
# no duplicate records
# no duplicate on userid column
# only one user has zero average_stars --- zero average_stars means the user has given no ratings but has written a review
# average of average_stars = 3.75
user.printSchema()
t = user.dropDuplicates(['user_id'])
t.count()

df2 = sqlContext.sql("SELECT count(review_count) as count from user where review_count = 0.0 ")

# For test data
test_usr.describe().show()
# min review count is one
test_usr.printSchema()
t = test_usr.dropDuplicates(['user_id'])
Developer: USF-ML2, Project: Rectastic-, Lines of code: 33, Source file: DataExploration.py

Example 10: StructField

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
#load stores ... I'm lazy and use string ids
stores_rdd=sc.textFile(base_path+"stores.txt").map(lambda x:x.split('\t')) # id | name
stores_fields=[
	StructField('id',StringType(),False), # name,type,nullable
	StructField('name',StringType(),False), 
]
stores=sqlCtx.createDataFrame(stores_rdd,StructType(stores_fields))

products_rdd=sc.textFile(base_path+"products.txt").map(lambda x:x.split('\t')) # id | name | category
products_fields=[
	StructField('id',StringType(),False), # name,type,nullable
	StructField('name',StringType(),False), 
	StructField('category',StringType(),True), 
]
products=sqlCtx.createDataFrame(products_rdd,StructType(products_fields))

sqlCtx.registerDataFrameAsTable(sales,"sales")
sqlCtx.registerDataFrameAsTable(stores,"stores")
sqlCtx.registerDataFrameAsTable(products,"products")

# can do SQL, including joins and GroupBy !!
sqlCtx.sql("SELECT * FROM sales sa join stores st on sa.store=st.id").show()
sqlCtx.sql("SELECT * FROM sales s join products p on s.product=p.id").show()

# .explain
# .agg
from pyspark.sql import functions as funcs

sales.groupBy('day').agg(funcs.min('store').alias('MinStore'),funcs.max('quantity').alias('MaxQty')).show()
Developer: Spicysheep, Project: spark-pycon15, Lines of code: 31, Source file: sales_datatable.py

Example 11: countBus

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
def countBus(startInt, endInt):
    #if startInt < DT.datetime.strptime("2015-03-23 12:00:00", "%Y-%m-%d %H:%M:%S"): return    # ATTENTION: TAKE OUT

    # Start Spark SQL Context
    sc = SparkContext("local", "BIXIData")
    sqlContext = SQLContext(sc)

    # Query route list
    routeData = requests.get('http://portal.cvst.ca/api/0.1/ttc/routes')
    routeTable = routeData.json()
    routeSize = len(routeTable) 

    # Count bus freq for list of routes
    for route in routeTable:
        #if route.route_short_name != 85:
        #    continue
        # Get the upper and lower bounds for the start location of the route
        trgLat = route.start_lat
        trgLon = route.start_lon
        latUpper = round(trgLat, Prec) + Tol
        latLower = round(trgLat, Prec) - Tol
        lonUpper = round(trgLon, Prec) + Tol
        lonLower = round(trgLon, Prec) - Tol

        # Scale factor for route with same start and end location
        rpFactor = 1
        if (route.end_lat >= latLower) and (route.end_lat <= latUpper) and (route.end_lon >= lonLower) and (route.end_lon <= lonUpper):
            rpFactor = 2
        else:
            rpFactor = 1

        # get ttc raw data table
        Period = {'startTime': startInt, 'endTime': endInt}
        ttcRawTable = requests.get('http://portal.cvst.ca/api/0.1/ttc/routetag/'+route.route_short_name,params=Period)
        sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")

        # Filter data with matching location
        resTable = sqlContext.sql("SELECT nbBikes, dateTime FROM rawData WHERE " +\
                                "lat>=" + str(latLower) + " AND lat<=" + str(latUpper) +\
                                " AND lon>=" + str(lonLower) + " AND lon<=" + str(lonUpper))
        sqlContext.registerDataFrameAsTable(resTable, "resData")

        # Select unique buses in matched data
        idList = sqlContext.sql("SELECT nbBikes FROM resData").collect()

        # Find freq of each unique bus
        freq = 0
        for idRow in idList:
            finList = sqlContext.sql("SELECT dateTime FROM resData WHERE station_id=" + str(idRow.station_id) + " ORDER BY dateTime ASC").collect()
            for i in range(0, len(finList) - 1):
                if (finList[i + 1].dateTime - finList[i].dateTime) > timeInt:
                    freq += 1
            freq += 1

        #insert into db
        freq_table = {'routeTag': route.route_short_name, 'startTime': startInt, 'freq': int(round(freq/rpFactor))}
        r = requests.post('http://portal.cvst.ca/api/0.1/ttc_freq/store',data=freq_table) 

    # Stop Spark
    sc.stop()
    return
Developer: zsy2053, Project: BIXI, Lines of code: 63, Source file: BIXIFreq_modified.py
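
Note that Example 11 registers the raw requests response object as a table, which registerDataFrameAsTable cannot consume; the JSON payload has to be turned into a DataFrame first. A minimal hedged sketch of that conversion, reusing the sqlContext created in the example and assuming the API returns a JSON list of flat records (the route tag and parameter values are illustrative):

import requests
from pyspark.sql import Row

resp = requests.get('http://portal.cvst.ca/api/0.1/ttc/routetag/85',
                    params={'startTime': '2015-03-23 12:00:00',
                            'endTime': '2015-03-23 13:00:00'})
# Convert each JSON record (a dict) into a Row, then into a DataFrame.
rows = [Row(**record) for record in resp.json()]
ttcRawTable = sqlContext.createDataFrame(rows)
sqlContext.registerDataFrameAsTable(ttcRawTable, "rawData")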

Example 12: SparkContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
driverName = environ["jdbcDriverName"]
dbSrcUrl = environ["mysql_srcUrl"]
dbDestUrl = environ["mysql_destUrl"]
#print sys.argv[1] + " " + sys.argv[2] + ' ' + sys.argv[3] + ' ' + sys.argv[4]


# Start Spark SQL Context
sc = SparkContext("local", "BIXIData")
sqlContext = SQLContext(sc)

# Query lat and lon data
initQuery = "(SELECT dateTime, lat, lon FROM BIXI WHERE dateTime<'" + str(endTime) + "' AND dateTime>='" + str(startTime) + "'" + \
            " AND station_id=" + str(station_id) + ") AS T"
#print initQuery
lineTable = sqlContext.load(None, "jdbc", None, url=dbSrcUrl, dbtable=initQuery, driver=driverName).sample(False, sampleRate)
sqlContext.registerDataFrameAsTable(lineTable, "lineTbl")

# Compute standard deviations and means of lat and lon data sets
latList = []
lonList = []

for row in lineTable.collect():
    latList.append(row.lat)
    lonList.append(row.lon)

latDev = numpy.std(latList)
lonDev = numpy.std(lonList)
latMean = numpy.mean(latList)
lonMean = numpy.mean(lonList)

# Compute valid ranges of lat and lon based on <stdDevOut>
Developer: zsy2053, Project: BIXI, Lines of code: 33, Source file: BIXILine.py

Example 13: they

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
# defining the conf, SparkContext and SQLContext objects for Spark
# Note: these need not be defined if the code is run on a Databricks cluster, because they are provided by default
conf = SparkConf().setAppName("Dota2")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


# The path should be changed to wherever the data resides.
# In our case we are using a Databricks Spark cluster,
# so this path refers to Databricks tables backed by AWS S3 storage.
# reading the data as a dataframe
df = sqlContext.read.json("/FileStore/tables/3v7vab741462231829765/my_match_history.json")
df.printSchema()

# registering the dataframe as a sqlContext table
sqlContext.registerDataFrameAsTable(df, "my_full_table")

# performing some ETL and field profiling using the queries below.
# hero_summary and item_summary ETL and profiling
hero_summary = sqlContext.read.json("/FileStore/tables/wbn4nbby1461991735655/hero_summary.json")
sqlContext.registerDataFrameAsTable(hero_summary, "hero_summary")
hero_summary.printSchema()

item_summary = sqlContext.read.json("/FileStore/tables/vvzkk2qc1462214278912/item_summmary.json")
sqlContext.registerDataFrameAsTable(item_summary, "item_summary")
item_summary.printSchema()

# Queries intended to remap ID values to names - profiling (see the note after this example)
hero_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.hero_id = hero_summary.heroes.localized_name from hero_summary where my_full_table.reslt.players.hero_id = hero_summary.heroes.hero_id")
item0_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.item_0 = item_summary.items.localized_name from item_summary where my_full_table.reslt.players.item_0 = item_summary.items.id")
item1_profile = sqlContext.sql("UPDATE my_full_table SET my_full_table.reslt.players.item_1 = item_summary.items.localized_name from item_summary where my_full_table.reslt.players.item_1 = item_summary.items.id")
Developer: asha-mandot, Project: BigDota2, Lines of code: 33, Source file: spark.py
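
A caveat on the last three queries in Example 13: Spark's SQLContext does not support UPDATE statements, so they would fail to parse. Below is a hedged sketch of one way to get the same hero-ID-to-name remapping with a join instead; the nested field names (reslt.players, heroes, hero_id, localized_name) follow the queries above, and flattening both sides with explode() is an assumption about the JSON layout rather than the project's actual approach.

from pyspark.sql.functions import explode

# One row per match player and one row per hero entry (assumed array fields).
players = df.select(explode(df.reslt.players).alias("player"))
heroes = hero_summary.select(explode(hero_summary.heroes).alias("hero"))

sqlContext.registerDataFrameAsTable(players, "players_flat")
sqlContext.registerDataFrameAsTable(heroes, "heroes_flat")

# Attach the localized hero name instead of mutating the source table in place.
hero_profile = sqlContext.sql(
    "SELECT p.player AS player, h.hero.localized_name AS hero_name "
    "FROM players_flat p JOIN heroes_flat h ON p.player.hero_id = h.hero.hero_id")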

Example 14: SparkContext

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
master_public_dns = os.environ['master_public_dns']

# setting SparkContext and SQLContext
sc = SparkContext("spark://" + master_ip + ":7077", "2013_events")
sqlContext = SQLContext(sc)

# reading events data for 2013 from HDFS
df13 = sqlContext.jsonFile("hdfs://" + master_public_dns + ":9000/data_jan2015/2013-*.*")

# filtering rows with just the three relevant events
df13_watch = df13.filter("type='WatchEvent'")
df13_commit = df13.filter("type='CommitCommentEvent'")
df13_fork = df13.filter("type='ForkEvent'")

# registering  dataframes as tables to be able to select just the three relevant columns
sqlContext.registerDataFrameAsTable(df13_watch, "df13_watch_table")
sqlContext.registerDataFrameAsTable(df13_commit, "df13_commit_table")
sqlContext.registerDataFrameAsTable(df13_fork, "df13_fork_table")

# creating new dataframes with just the relevant columns
df13_watch_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_watch_table")
df13_commit_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_commit_table")
df13_fork_altered = sqlContext.sql("SELECT actor, repository, type FROM df13_fork_table")

# registering dataframes as tables to get a union of all
sqlContext.registerDataFrameAsTable(df13_watch_altered, "df13_watch_altered_table")
sqlContext.registerDataFrameAsTable(df13_commit_altered, "df13_commit_altered_table")
sqlContext.registerDataFrameAsTable(df13_fork_altered, "df13_fork_altered_table")

# unifying tables with filtered events and columns
df13_altered_union = sqlContext.sql("SELECT * from df13_watch_altered_table UNION ALL SELECT * from df13_commit_altered_table UNION ALL SELECT * from df13_fork_altered_table")
Developer: patrickzheng, Project: gitgraph, Lines of code: 33, Source file: 2013_events.py

Example 15: __repr__

# Required module import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import registerDataFrameAsTable [as alias]
  def __repr__(self):
    return repr((self.username, self.follows))

# connecting to "watch_events" keyspace in Cassandra and creating new Spark Context
connection.setup(['127.0.0.1', '52.8.41.216'], "watch_events")
sc = SparkContext("spark://ip-172-31-2-89:7077", "watchevents")
sqlContext = SQLContext(sc)

# reading data from HDFS and filtering with 'WatchEvent', 'CommitEvent', 'PushEvent'
df = sqlContext.jsonFile("hdfs://ec2-52-8-127-252.us-west-1.compute.amazonaws.com:9000/data_jan2015/2015-01-01-*")
df_watch = df.filter("type='WatchEvent'")
df_push = df.filter("type='PushEvent'")
df_commit = df.filter("type='CommitEvent'")

# registering data frames as tables to unite all rows
sqlContext.registerDataFrameAsTable(df_watch, "df_watch_table")
sqlContext.registerDataFrameAsTable(df_push, "df_push_table")
sqlContext.registerDataFrameAsTable(df_commit, "df_commit_table")
df_union = sqlContext.sql("SELECT * from df_watch_table UNION ALL SELECT * from df_push_table")

# spark job to group all repos for a given username considering the three filtered events above
userrepomap = df_union.map(lambda p: (p.actor.login, tuple((p.repo.name, datetime.strptime(p.created_at[:-1], "%Y-%m-%dT%H:%M:%S"))))).groupByKey()
userrepocoll = userrepomap.map(lambda x: (x[0], dict(x[1]))).collect()

# getting users->followers from HDFS
# reverse mapping it to users->following
dffoll = sqlContext.jsonFile("hdfs://ec2-52-8-127-252.us-west-1.compute.amazonaws.com:9000/user/2015-01-01-0_followers.json")
collfoll = dffoll.map(lambda x : (x.login, x.following)).collect()
follr = []

# follr is a list of dicts with users and people who these users follow
Developer: patrickzheng, Project: gitgraph, Lines of code: 33, Source file: 2015-01-01-0-graph.py


Note: The pyspark.sql.SQLContext.registerDataFrameAsTable method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For redistribution and use, please refer to the corresponding project's license. Do not reproduce without permission.