This article collects and organizes typical usage examples of the Python method pyspark.sql.SQLContext.clearCache. If you have been wondering what exactly SQLContext.clearCache does, how to call it, or how it is used in practice, the hand-picked code examples below should help. You can also explore further usage examples of the containing class, pyspark.sql.SQLContext.
A total of 4 code examples of SQLContext.clearCache are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
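Before the examples, here is a minimal, self-contained sketch of what clearCache does. The toy table name "demo" and its data are made up for illustration; the point is simply that cacheTable pins a registered table in the in-memory cache and clearCache drops every cached table at once.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="clearCacheDemo")
sqlContext = SQLContext(sc)

# Build a tiny DataFrame and register it as a temporary table
df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
sqlContext.registerDataFrameAsTable(df, "demo")

sqlContext.cacheTable("demo")        # pin the table in the in-memory cache
print(sqlContext.isCached("demo"))   # True

sqlContext.clearCache()              # remove ALL cached tables in one call
print(sqlContext.isCached("demo"))   # False

sc.stop()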
Example 1: travelTime
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import clearCache [as alias]
#......... part of this code has been omitted here .........
if (int(trvStart.hour / timeIntHr) * timeIntHr != trvInt) and (trvCount != 0):
# print "trvInt: " + str(trvInt) + " " + str(trvStart.hour / timeIntHr)
# print "new: " + str((trvInt-1) * timeIntHr) + " " + str(trvSum/trvCount)
# newRow = sqlContext.createDataFrame([((trvInt-1) * timeIntHr, int(trvSum / trvCount))], trvSchm)
# trvTimeList = trvTimeList.unionAll(newRow)
print "new: " + " " + str(trvInt) + " " + str(int(trvSum / trvCount))
newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
trvTimeList = trvTimeList.unionAll(newRow)
trvCount = 0
trvSum = 0
trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
# trvInt = trvStart.hour / timeIntHr
#print str(busrow.vehicle_id) + " > " + str(trvStart.hour) + " / " + str(timeIntHr) + " = " + str(trvInt)
elif i == (rangeSize - 1):
trvEnd = temp[i].dateTime
#trvSum += (trvEnd - trvStart).total_seconds() / 60 # calculate travel time in minutes
tempTrip = (trvEnd - trvStart).total_seconds() / 60
if tempTrip > minTravel:
#trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
#newRow = sqlContext.createDataFrame([(trvInt, int(tempTrip))], trvSchm)
#trvTimeList = trvTimeList.unionAll(newRow)
#print "new: " + str(trvInt) + " " + str(tempTrip)
trvSum += tempTrip
trvCount += 1
print "new: " + " " + str(trvInt) + " " + str(int(trvSum / trvCount))
newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
trvTimeList = trvTimeList.unionAll(newRow)
#trvCount += 1
#newRow = sqlContext.createDataFrame([(trvInt * timeIntHr, int(trvSum / trvCount))], trvSchm)
#trvTimeList = trvTimeList.unionAll(newRow)
trvTimeList = trvTimeList.groupBy('startTime').avg('trvTime').collect()
for row in trvTimeList:
tempTime = startTime + timedelta(hours = int(row.startTime))
tempStr = "INSERT INTO BIXI_TRAVEL (station_id, startTime, travelTime) VALUES ("
tempStr += str(route[0][0]) + ", '" + str(tempTime) + "', " + str(round(row[1], timePrec)) + ");"
print tempStr
curs.execute(tempStr)
sqlContext.clearCache()
sc.stop()
conn.commit()
sys.exit() # REMOVE
return
# Calculate 1 travel time per 3-hr interval
while curTime < endInt:
idList = sqlContext.sql("SELECT station_id, nbBikes FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "'").sample(False, sampleRate).collect() # station_id is referenced below, so it must be selected here
print "idList: " + str(len(idList)) + " [" + str(route[0][0]) + "]"
for row in idList:
tempTable = sqlContext.sql("SELECT dateTime FROM rawData WHERE station_id=" + str(row.station_id))
print "Count: " + str(tempTable.count()) + " [" + str(row.station_id) + "]"
print str(tempTable.sort(asc('dateTime')).collect()[0][0]) + " - " + str(tempTable.sort(desc('dateTime')).collect()[0][0])
curTime += timeInt
continue
tempTable = sqlContext.sql("SELECT * FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "'")
sqlContext.registerDataFrameAsTable(tempTable, "results")
startTable = sqlContext.sql("SELECT station_id, dateTime FROM results WHERE lat>=" + str(startLatLower) + \
" AND lat<=" + str(startLatUpper) + " AND lon>=" + str(startLonLower) + \
" AND lon<=" + str(startLonUpper) + " ORDER BY dateTime ASC")
startTableSize = startTable.count()
startTable = startTable.first()
endTable = sqlContext.sql("SELECT station_id, dateTime FROM results WHERE lat>=" + str(endLatLower) + \
" AND lat<=" + str(endLatUpper) + " AND lon>=" + str(endLonLower) + \
" AND lon<=" + str(endLonUpper) + " ORDER BY dateTime ASC")
endTableSize = endTable.count()
endTable = endTable.collect()
#print str(startTableSize) + " " + str(endTableSize)
#STOPPED HERE count zeros
if endTableSize < 1 or startTableSize < 1:
curTime += timeInt
continue
#print "startTable: " + str(startTable[0]) + " " + str(startTable[1])
# Loop to find first matching stop in endTable
for j in range (0, endTableSize):
#tripTime = DT.datetime.strptime(endTable[j].dateTime, "%Y-%m-%d %H:%M:%S") - DT.datetime.strptime(startTable[1], "%Y-%m-%d %H:%M:%S")
tripTime = endTable[j].dateTime - startTable[1]
#print str(endTable[j].dateTime) + " - " + str(startTable[1]) + " = " + str(tripTime) + " > " + str(minTravel) + " " + str(tripTime > minTravel)
#print str(tripTime) + " " + str(startTable[0]) + " == " + str(endTable[j].vehicle_id) + " " + str(endTable[j].vehicle_id == startTable[0])
if (endTable[j].station_id == startTable[0]) and (tripTime > minTravel):
#print str(endTable[j].vehicle_id) + " " + str(tripTime)
tempStr = "INSERT INTO BIXI_TRAVEL (station_id, startTime, travelTime) VALUES ("
tempStr += str(route[0][0]) + ", '" + str(curTime) + "', '" + str(endTable[j].dateTime - startTable.dateTime) + "');"
#print tempStr
curs.execute(tempStr)
break
curTime += timeInt
# Stop Spark Context
sc.stop()
return
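Stripped of the project-specific bookkeeping, the pattern Example 1 relies on is: register the raw data as a temp table, query it one time window at a time, and call clearCache() before the Spark context is stopped. The sketch below assumes a temp table named "rawData" with a dateTime column and mirrors the string-concatenated SQL style used above; it is an illustration, not the original function.

from datetime import timedelta

def scan_windows(sqlContext, start_time, end_time, interval_hours=3):
    cur = start_time
    step = timedelta(hours=interval_hours)
    while cur < end_time:
        # Pull one time window of raw data, exactly as the example's queries do
        window_df = sqlContext.sql("SELECT * FROM rawData WHERE dateTime>='" +
                                   str(cur) + "' AND dateTime<'" + str(cur + step) + "'")
        sqlContext.registerDataFrameAsTable(window_df, "results")
        # ... per-window processing (as in the example above) would go here ...
        cur += step
    sqlContext.clearCache()  # release every cached table before stopping the context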
Example 2: StructType
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import clearCache [as alias]
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229','sip:[email protected]'),
('XO4','missing','2014-02-13T12:32:36.881','sip:[email protected]'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323','sip:[email protected]')])
schema3 = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True),
StructField('ANI', StringType(), True)])
df3 = sqlContext.createDataFrame(rdd3, schema3)
#######################################################################################
# USEFUL CODE SNIPPETS
#######################################################################################
IVRCallLogs.columns # show all column headers
IVRCallLogs.show(10) # show first ten rows of a dataframe
IVRCallLogs.take(10) # return the first ten rows as a list of Row objects
sqlContext.clearCache() # Removes all cached tables from the in-memory cache.
#######################################################################################
# DATA EXPLORATION TASKS
#######################################################################################
# Frequency Counts
df2.Department.distinct().count()
#######################################################################################
# DATA MUNGING TASKS
#######################################################################################
# DEALING WITH DUPLICATES--------------------------------------------------------------
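The snippet is cut off right at the duplicates header; assuming the df3 DataFrame built above, duplicate handling with the DataFrame API would typically look like the lines below (a sketch of the usual calls, not the page's original continuation).

df3.dropDuplicates()           # drop rows that are identical across all columns
df3.dropDuplicates(['ANI'])    # keep one row per distinct ANI value
df3.distinct().count()         # count the unique rows that remain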
Example 3: __init__
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import clearCache [as alias]
class DataAnalysis:
def __init__(self):
self.conf = (SparkConf()
.setAppName("Data Analysis")
.set("spark.executor.cores", '1')
# Specify the MySQL driver jar
.set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')
.set("spark.shuffle.blockTransferService", "nio")
)
self.sc = SparkContext(conf=self.conf)
self.sql_context = SQLContext(self.sc)
# Batch size for bulk updates
self.BATCH_SIZE = 10
# Connect to the database
self.mysqlconn = MysqlConnection(db='core', host='10.9.29.212', passwd='')
self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8'
logging.config.fileConfig('./conf/logging.conf')
self.logger = logging.getLogger('simpleLogger')
self.hdfs_root_url = 'hdfs://master:9000'
self.local_root_url = '/data/jjzhu'
# self.logger = Logger()
def load_from_mysql(self, db, dbtable):
"""
Load the specified MySQL table as a DataFrame.
:param db: database name
:param dbtable: table name
:return: DataFrame
"""
url = "jdbc:mysql://10.9.29.212:3306/" + db + "?user=root&characterEncoding=UTF-8"
df = self.sql_context.read.format("jdbc").options(url=url,
dbtable=dbtable,
driver="com.mysql.jdbc.Driver").load()
# df = sqlContext.read.jdbc(url, dbtable, properties=properties)
return df
def update_acc_detail(self):
"""
Fill in the account detail table: rows that have only the 15-digit account number and are missing the 22-digit one, and rows that have only the 22-digit number and are missing the 15-digit one.
:return:
"""
no15_update_query = u'update t_CMMS_ACCOUNT_DETAIL set ACC_NO15=%s where ACC_NO22=%s'
no22_update_query = u'update t_CMMS_ACCOUNT_DETAIL set ACC_NO22=%s where ACC_NO15=%s'
# Load the account detail table
acc_detail_df = self.load_from_mysql(u'core', u't_CMMS_ACCOUNT_DETAIL')
acc_detail_df.cache() # cache it for reuse
# Load the account list
acc_list_df = self.load_from_mysql(u'core', u't_CMMS_ACCOUNT_LIST')
# Keep only the 15 -> 22 account number mapping from the account list
acc_list_filter_df = acc_list_df.select(acc_list_df.ACC_NO22.alias('l_no22'),
acc_list_df.ACC_NO15.alias('l_no15'))
acc_list_filter_df.cache() # cache it for reuse
# Select detail rows whose 15-digit account number is empty
no15_empty_df = acc_detail_df.filter(acc_detail_df.ACC_NO15 == '').select('ACC_NO22')
# Left outer join the two tables
union_df = no15_empty_df.join(acc_list_filter_df,
no15_empty_df.ACC_NO22 == acc_list_filter_df.l_no22,
'left_outer')
result = []
for row in union_df.collect():
row_dic = row.asDict()
if result.__len__() >= self.BATCH_SIZE: # flush a batch of updates
self.mysqlconn.execute_many(no15_update_query, result)
result.clear() # empty the pending list
# If the 15-digit number is present
if row_dic['l_no15'] is not None:
# print(row_dic)
# Add to the pending-update list
result.append((row_dic['l_no15'], row_dic['ACC_NO22']))
if result.__len__() != 0:
self.mysqlconn.execute_many(no15_update_query, result)
result.clear()
'''
The following updates rows whose 22-digit account number is empty; the procedure is exactly the same as for the 15-digit case.
'''
no22_empty_df = acc_detail_df.filter(acc_detail_df.ACC_NO22 == '').select('ACC_NO15')
union_df = no22_empty_df.join(acc_list_filter_df,
no22_empty_df.ACC_NO15 == acc_list_filter_df.l_no15,
'left_outer')
for row in union_df.take(10):
row_dic = row.asDict()
if result.__len__() >= self.BATCH_SIZE:
self.mysqlconn.execute_many(no22_update_query, result)
result.clear()
if row_dic['l_no22'] is not None:
print(row_dic)
result.append((row_dic['l_no22'], row_dic['ACC_NO15']))
if result.__len__() != 0:
self.mysqlconn.execute_many(no22_update_query, result)
result.clear()
# Clear the cache
self.sql_context.clearCache()
def update_acc_list(self):
"""
#......... part of this code has been omitted here .........
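Example 3's load_from_mysql helper is a thin wrapper around the standard JDBC reader. A minimal usage sketch is shown below; the database and table names are the ones hard-coded in the class above, and the MySQL host, credentials, and logging config would all have to exist for this to run.

analysis = DataAnalysis()
detail_df = analysis.load_from_mysql('core', 't_CMMS_ACCOUNT_DETAIL')
detail_df.cache()
print(detail_df.count())
analysis.sql_context.clearCache()  # drop every cached table once the job is done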
Example 4: str
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import clearCache [as alias]
# save data and proceed to the next time interval
print str(trvInt)
newRow = sqlContext.createDataFrame([(str(trvInt), int(trvSum / trvCount))], trvSchm)
trvTimeList = trvTimeList.unionAll(newRow)
trvCount = 0
trvSum = 0
trvInt = str(trvStart.date()) + " " + str(trvStart.hour).zfill(2) + ":00:00"
trvInt = DT.datetime.strptime(trvInt, "%Y-%m-%d %H:%M:%S")
temp = trvTimeList.groupBy('startTime').agg({'trvTime':'sum', 'startTime':'count'}).collect() # group travel time by interval
addResult(temp)
trvTimeList = trvTimeList.limit(0) # clear data in dataframe to prevent overflow (max 100)
#Stop Spark Context
sqlContext.clearCache()
sc.stop()
# Compute average travel time
for key in trvSumList.keys():
tempTime = DT.datetime.strptime(key, "%Y-%m-%d %H:%M:%S")
tempStr = tempTime.strftime("%H:%M") + " - " + (tempTime + timeInt).strftime("%H:%M")
travelTime['startTime'] = tempStr
if trvCountList[key] == 0:
travelTime['travelTime'] = trvSumList[key]
else:
travelTime['travelTime'] = round(trvSumList[key] / trvCountList[key])
travelList.append(travelTime.copy())
# Construct output JSON string
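The excerpt stops just before the JSON string is built. Assuming travelList is the list of dicts accumulated above, the output step would plausibly be no more than a json.dumps call; this is a guess at the omitted code, not the original.

import json
outJSON = json.dumps(travelList)  # e.g. [{"startTime": "09:00 - 12:00", "travelTime": 25}, ...]
print outJSON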