

Python SQLContext.clearCache Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.SQLContext.clearCache. If you are wondering what SQLContext.clearCache does, how to use it, or what it looks like in real code, the hand-picked examples below may help. You can also explore further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


The following presents 4 code examples of SQLContext.clearCache, sorted by popularity by default.
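Before the full examples, here is a minimal sketch of where SQLContext.clearCache() typically fits: a DataFrame is registered as a temporary table, cached, queried, and then the whole in-memory table cache is dropped once it is no longer needed. This sketch is illustrative only and is not taken from any of the projects below; the table name "demo" and the sample data are made up.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="clearCacheDemo")
sqlContext = SQLContext(sc)

# build a small DataFrame and expose it to SQL (names are illustrative)
df = sqlContext.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
sqlContext.registerDataFrameAsTable(df, 'demo')
sqlContext.cacheTable('demo')                       # cache the table in memory

print(sqlContext.sql("SELECT COUNT(*) FROM demo").collect())
print(sqlContext.isCached('demo'))                  # True while the table is cached

sqlContext.clearCache()                             # removes ALL cached tables at once
print(sqlContext.isCached('demo'))                  # False after clearCache()
sc.stop()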

Example 1: travelTime

# Required import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import clearCache [as alias]

#......... part of the code is omitted here .........
                if (int(trvStart.hour / timeIntHr) * timeIntHr != trvInt) and (trvCount != 0):
                #    print "trvInt: " + str(trvInt) + " " + str(trvStart.hour / timeIntHr)
                #    print "new: " + str((trvInt-1) * timeIntHr) + " " + str(trvSum/trvCount)
                #    newRow = sqlContext.createDataFrame([((trvInt-1) * timeIntHr, int(trvSum / trvCount))], trvSchm)
                #    trvTimeList = trvTimeList.unionAll(newRow)
                    print "new: " + " " + str(trvInt) + " " + str(int(trvSum / trvCount))
                    newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
                    trvTimeList = trvTimeList.unionAll(newRow)
                    trvCount = 0
                    trvSum = 0
                    trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
                #    trvInt = trvStart.hour / timeIntHr
                    #print str(busrow.vehicle_id) + " > " + str(trvStart.hour) + " / " + str(timeIntHr) + " = " + str(trvInt)
            elif i == (rangeSize - 1):
                trvEnd = temp[i].dateTime
                #trvSum += (trvEnd - trvStart).total_seconds() / 60  # caculate travel time in minutes
                tempTrip = (trvEnd - trvStart).total_seconds() / 60
                if tempTrip > minTravel:
                    #trvInt = int(trvStart.hour / timeIntHr) * timeIntHr
                    #newRow = sqlContext.createDataFrame([(trvInt, int(tempTrip))], trvSchm)
                    #trvTimeList = trvTimeList.unionAll(newRow)
                    #print "new: " + str(trvInt) + " " + str(tempTrip)
                    trvSum += tempTrip
                    trvCount += 1
                    print "new: " + " " + str(trvInt) + " " + str(int(trvSum / trvCount))
                    newRow = sqlContext.createDataFrame([(trvInt, int(trvSum / trvCount))], trvSchm)
                    trvTimeList = trvTimeList.unionAll(newRow)


                #trvCount += 1
                #newRow = sqlContext.createDataFrame([(trvInt * timeIntHr, int(trvSum / trvCount))], trvSchm)
                #trvTimeList = trvTimeList.unionAll(newRow)

    trvTimeList = trvTimeList.groupBy('startTime').avg('trvTime').collect()

    for row in trvTimeList:
        tempTime = startTime + timedelta(hours = int(row.startTime))
        tempStr = "INSERT INTO BIXI_TRAVEL (station_id, startTime, travelTime) VALUES ("
        tempStr += str(route[0][0]) + ", '" + str(tempTime) + "', " + str(round(row[1], timePrec)) + ");"
        print tempStr
        curs.execute(tempStr)

    sqlContext.clearCache()
    sc.stop()
    conn.commit()
    sys.exit()  # REMOVE
    return

    # Calculate 1 travel time per 3-hr interval
    while curTime < endInt:
        idList = sqlContext.sql("SELECT nbBikes FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "'").sample(False, sampleRate).collect()
        print "idList: " + str(len(idList)) + " [" + str(route[0][0]) + "]"
        for row in idList:
            tempTable = sqlContext.sql("SELECT dateTime FROM rawData WHERE station_id=" + str(row.station_id))
            print "Count: " + str(tempTable.count()) + " [" + str(row.station_id) + "]"
            print str(tempTable.sort(asc('dateTime')).collect()[0][0]) + " - " + str(tempTable.sort(desc('dateTime')).collect()[0][0])

            
        curTime += timeInt
        continue

        tempTable = sqlContext.sql("SELECT * FROM rawData WHERE dateTime>='" + str(curTime) + "' AND dateTime<'" + str(curTime + timeInt) + "'")
        sqlContext.registerDataFrameAsTable(tempTable, "results")
        startTable = sqlContext.sql("SELECT station_id, dateTime FROM results WHERE lat>=" + str(startLatLower) + \
                                   " AND lat<=" + str(startLatUpper) + " AND lon>=" + str(startLonLower) + \
                                   " AND lon<=" + str(startLonUpper) + " ORDER BY dateTime ASC")
        startTableSize = startTable.count()
        startTable = startTable.first()
        endTable = sqlContext.sql("SELECT station_id, dateTime FROM results WHERE lat>=" + str(endLatLower) + \
                                   " AND lat<=" + str(endLatUpper) + " AND lon>=" + str(endLonLower) + \
                                   " AND lon<=" + str(endLonUpper) + " ORDER BY dateTime ASC")
        endTableSize = endTable.count()
        endTable = endTable.collect()
        #print str(startTableSize) + " " + str(endTableSize)

        #STOPPED HERE        count zeros
        if endTableSize < 1 or startTableSize < 1: 
            curTime += timeInt
            continue
        #print "startTable: " + str(startTable[0]) + " " + str(startTable[1])

        # Loop to find first matching stop in endTable
        for j in range (0, endTableSize):
            #tripTime = DT.datetime.strptime(endTable[j].dateTime, "%Y-%m-%d %H:%M:%S") -  DT.datetime.strptime(startTable[1], "%Y-%m-%d %H:%M:%S")
            tripTime = endTable[j].dateTime - startTable[1]
            #print str(endTable[j].dateTime) + " - " + str(startTable[1]) + " = " + str(tripTime) + " > " + str(minTravel) + " " + str(tripTime > minTravel)
            #print str(tripTime) + " " + str(startTable[0]) + " == " + str(endTable[j].vehicle_id) + " " + str(endTable[j].vehicle_id == startTable[0])

            if (endTable[j].station_id == startTable[0]) and (tripTime > minTravel):
                #print str(endTable[j].vehicle_id) + " " +  str(tripTime)
                tempStr = "INSERT INTO BIXI_TRAVEL (station_id, startTime, travelTime) VALUES ("
                tempStr += str(route[0][0]) + ", '" + str(curTime) + "', '" + str(endTable[j].dateTime - startTable.dateTime) + "');"
                #print tempStr
                curs.execute(tempStr)
                break
        curTime += timeInt

    # Stop Spark Context
    sc.stop()
    return
Developer ID: zsy2053, Project: BIXI, Lines of code: 104, Source file: BIXITravel.py
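A note on the pattern in Example 1: trvTimeList is grown by creating a one-row DataFrame and calling unionAll inside the loop. That works, but every union lengthens the DataFrame's lineage and tends to slow down as the loop runs. A common alternative, sketched below under the assumption that the per-interval results fit in driver memory (computed_intervals is a hypothetical iterable standing in for the loop above; sqlContext and trvSchm are the same objects the example uses), is to accumulate plain tuples in a Python list and build the DataFrame once:

# Hedged alternative sketch: collect rows locally, then create the DataFrame once.
rows = []
for trvInt, trvSum, trvCount in computed_intervals:       # hypothetical iterable of (interval, sum, count)
    rows.append((trvInt, int(trvSum / trvCount)))

trvTimeList = sqlContext.createDataFrame(rows, trvSchm)   # same schema as in the example
avgByInterval = trvTimeList.groupBy('startTime').avg('trvTime').collect()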

Example 2: StructType

# Required import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import clearCache [as alias]
#......... part of the code is omitted here (the excerpt begins inside the list of rows used to build rdd3) .........
                       ('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229','sip:[email protected]'),
                       ('XO4','missing','2014-02-13T12:32:36.881','sip:[email protected]'),
                       ('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323','sip:[email protected]')])
schema3 = StructType([StructField('ID', StringType(), True),
                          StructField('EndDateTime', StringType(), True),
                          StructField('StartDateTime', StringType(), True),
                          StructField('ANI', StringType(), True)])
df3 = sqlContext.createDataFrame(rdd3, schema3)

#######################################################################################
# USEFUL CODE SNIPPETS
#######################################################################################
IVRCallLogs.columns         # show all column headers
IVRCallLogs.show(10)        # show first ten rows of a dataframe
IVRCallLogs.take(10)        # return the first ten rows as a list of Row objects
sqlContext.clearCache()     # Removes all cached tables from the in-memory cache.

#######################################################################################
# DATA EXPLORATION TASKS
#######################################################################################

# Frequency Counts
df2.Department.distinct().count()


#######################################################################################
# DATA MUNGING TASKS
#######################################################################################

# DEALING WITH DUPLICATES--------------------------------------------------------------
Developer ID: macjas, Project: Apache-Spark-Code-Snippets, Lines of code: 32, Source file: 2.+Spark+Cookbook.py
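The snippet above points out that clearCache() removes every cached table at once. When only one table should be dropped, SQLContext also offers cacheTable, uncacheTable and isCached; the short sketch below is illustrative only (the table name 'calls' is assumed and is not part of the cookbook):

# Illustrative sketch: per-table caching versus clearing everything.
sqlContext.registerDataFrameAsTable(df3, 'calls')   # df3 is the DataFrame built in the snippet above
sqlContext.cacheTable('calls')                      # cache just this one table
print(sqlContext.isCached('calls'))                 # True

sqlContext.uncacheTable('calls')                    # drop only 'calls' from the cache
# ...or wipe every cached table in a single call:
sqlContext.clearCache()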

Example 3: __init__

# Required import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import clearCache [as alias]
class DataAnalysis:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("Data Analysis")
                     .set("spark.executor.cores", '1')
                     # specify the MySQL driver jar
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')
                     .set("spark.shuffle.blockTransferService", "nio")
                     )
        self.sc = SparkContext(conf=self.conf)
        self.sql_context = SQLContext(self.sc)
        # batch insert size
        self.BATCH_SIZE = 10

        # connect to the database
        self.mysqlconn = MysqlConnection(db='core', host='10.9.29.212', passwd='')
        self.mysql_url = 'jdbc:mysql://10.9.29.212:3306/core?user=root&characterEncoding=UTF-8'
        logging.config.fileConfig('./conf/logging.conf')
        self.logger = logging.getLogger('simpleLogger')
        self.hdfs_root_url = 'hdfs://master:9000'
        self.local_root_url = '/data/jjzhu'

        # self.logger = Logger()

    def load_from_mysql(self, db, dbtable):
        """
        Load a table from the given MySQL database as a DataFrame.
        :param db: database name
        :param dbtable: table name
        :return: DataFrame
        """
        url = "jdbc:mysql://10.9.29.212:3306/" + db + "?user=root&characterEncoding=UTF-8"
        df = self.sql_context.read.format("jdbc").options(url=url,
                                                          dbtable=dbtable,
                                                          driver="com.mysql.jdbc.Driver").load()
        # df = sqlContext.read.jdbc(url, dbtable, properties=properties)
        return df

    def update_acc_detail(self):
        """
        Fill in missing account numbers in the account detail table: records that only have a 15-digit number get the 22-digit one, and records that only have a 22-digit number get the 15-digit one.
        :return:
        """
        no15_update_query = u'update t_CMMS_ACCOUNT_DETAIL set ACC_NO15=%s where ACC_NO22=%s'
        no22_update_query = u'update t_CMMS_ACCOUNT_DETAIL set ACC_NO22=%s where ACC_NO15=%s'
        # load the account detail table
        acc_detail_df = self.load_from_mysql(u'core', u't_CMMS_ACCOUNT_DETAIL')
        acc_detail_df.cache()  # cache it
        # load the account list
        acc_list_df = self.load_from_mysql(u'core', u't_CMMS_ACCOUNT_LIST')
        # keep only the 15-digit -> 22-digit account number mapping from the account list
        acc_list_filter_df = acc_list_df.select(acc_list_df.ACC_NO22.alias('l_no22'),
                                                acc_list_df.ACC_NO15.alias('l_no15'))
        acc_list_filter_df.cache()  # cache it
        # select the detail records whose 15-digit account number is empty
        no15_empty_df = acc_detail_df.filter(acc_detail_df.ACC_NO15 == '').select('ACC_NO22')
        # left outer join the two tables
        union_df = no15_empty_df.join(acc_list_filter_df,
                                      no15_empty_df.ACC_NO22 == acc_list_filter_df.l_no22,
                                      'left_outer')
        result = []

        for row in union_df.collect():
            row_dic = row.asDict()

            if result.__len__() >= self.BATCH_SIZE:  # insert in batches
                self.mysqlconn.execute_many(no15_update_query, result)
                result.clear()  # empty the list
            # if the 15-digit account number is not null
            if row_dic['l_no15'] is not None:
                # print(row_dic)
                # append to the pending-update list
                result.append((row_dic['l_no15'], row_dic['ACC_NO22']))
        if result.__len__() != 0:
            self.mysqlconn.execute_many(no15_update_query, result)
            result.clear()

        '''
        The following updates the records whose 22-digit account number is empty; the procedure is exactly the same as the 15-digit update above.
        '''
        no22_empty_df = acc_detail_df.filter(acc_detail_df.ACC_NO22 == '').select('ACC_NO15')
        union_df = no22_empty_df.join(acc_list_filter_df,
                                      no22_empty_df.ACC_NO15 == acc_list_filter_df.l_no15,
                                      'left_outer')
        for row in union_df.take(10):
            row_dic = row.asDict()
            if result.__len__() >= self.BATCH_SIZE:
                self.mysqlconn.execute_many(no22_update_query, result)
                result.clear()
            if row_dic['l_no22'] is not None:
                print(row_dic)
                result.append((row_dic['l_no22'], row_dic['ACC_NO15']))
        if result.__len__() != 0:
            self.mysqlconn.execute_many(no22_update_query, result)
            result.clear()
        # clear the SQLContext cache
        self.sql_context.clearCache()

    def update_acc_list(self):
        """
#......... part of the code is omitted here .........
Developer ID: summer-apple, Project: spark, Lines of code: 103, Source file: jj_analysis.py
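A possible way to use the class above, assuming a reachable Spark cluster, the MySQL connector jar on the executor classpath and the connection settings hard-coded in __init__ (this driver code is only a sketch and is not part of the original project):

# Hedged usage sketch for DataAnalysis; it mirrors the cache/clearCache pattern of the class itself.
analysis = DataAnalysis()

detail_df = analysis.load_from_mysql('core', 't_CMMS_ACCOUNT_DETAIL')  # load a MySQL table as a DataFrame
detail_df.cache()
print(detail_df.count())

analysis.update_acc_detail()          # fills in the missing 15/22-digit account numbers
analysis.sql_context.clearCache()     # the same call the method itself makes when it finishes
analysis.sc.stop()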

Example 4: str

# Required import: from pyspark.sql import SQLContext [as alias]
# Alternatively: from pyspark.sql.SQLContext import clearCache [as alias]
#......... part of the code is omitted here .........
                # save data and proceed to next time interval
                print str(trvInt)
                newRow = sqlContext.createDataFrame([(str(trvInt), int(trvSum / trvCount))], trvSchm)
                trvTimeList = trvTimeList.unionAll(newRow)
                trvCount = 0
                trvSum = 0
                trvInt = str(trvStart.date()) + " " + str(trvStart.hour).zfill(2) + ":00:00"
                trvInt = DT.datetime.strptime(trvInt, "%Y-%m-%d %H:%M:%S")

    temp = trvTimeList.groupBy('startTime').agg({'trvTime':'sum', 'startTime':'count'}).collect()  # group travel time by interval
    addResult(temp)

    trvTimeList = trvTimeList.limit(0)  # clear data in dataframe to prevent overflow (max 100)

#Stop Spark Context
sqlContext.clearCache()
sc.stop()

# Compute average travel time
for key in trvSumList.keys():
    tempTime = DT.datetime.strptime(key, "%Y-%m-%d %H:%M:%S")
    tempStr = tempTime.strftime("%H:%M") + " - " + (tempTime + timeInt).strftime("%H:%M")
    travelTime['startTime'] = tempStr

    if trvCountList[key] == 0:
        travelTime['travelTime'] = trvSumList[key]
    else:
        travelTime['travelTime'] = round(trvSumList[key] / trvCountList[key])
    travelList.append(travelTime.copy())

# Construct output JSON string
Developer ID: zsy2053, Project: BIXI, Lines of code: 33, Source file: BIXITravel.py
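After Spark is stopped, the example computes the per-interval averages by hand from the trvSumList / trvCountList dictionaries. If the grouped DataFrame were still available, the same average could instead be computed inside Spark before clearing the cache; the following is only a sketch that reuses the column names from the example, not the project's actual code:

# Hedged sketch: let Spark compute the per-interval average directly.
avgDf = trvTimeList.groupBy('startTime').avg('trvTime')   # the result column is named 'avg(trvTime)'
for row in avgDf.collect():
    print(str(row['startTime']) + " " + str(round(row['avg(trvTime)'], 1)))

sqlContext.clearCache()   # clear cached tables before stopping, as in the example
sc.stop()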


Note: The pyspark.sql.SQLContext.clearCache method examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution and use should follow the corresponding project's License. Do not reproduce without permission.