本文整理汇总了Python中pyspark.SQLContext.clearCache方法的典型用法代码示例。如果您正苦于以下问题:Python SQLContext.clearCache方法的具体用法?Python SQLContext.clearCache怎么用?Python SQLContext.clearCache使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.SQLContext的用法示例。
在下文中一共展示了SQLContext.clearCache方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from pyspark import SQLContext [as 别名]
# 或者: from pyspark.SQLContext import clearCache [as 别名]
#.........这里部分代码省略.........
aum_now = self.sqlctx.sql(aum_now_sql)
# 清除缓存表
self.sqlctx.dropTempTable('group_in')
# 联合
union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')
# 计算用户开户至今时间(months)
# 载入账户表
account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
account_age_aql = "select CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE from " \
"(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
"from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"
account_age = self.sqlctx.sql(account_age_aql)
# calculate last tran date
account_1 = account.select('CUST_NO', 'ACC_NO15')
detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
a_d = account_1.join(detail, 'ACC_NO15', 'outer')
a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')
last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"
last_tr_date = self.sqlctx.sql(last_tr_date_sql)
# 联合 season aum_now account_age last_tr_date
unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')
# 清除缓存表
self.sqlctx.dropTempTable('account')
self.sqlctx.dropTempTable('adtable')
self.sqlctx.clearCache()
# 结果插入表
print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...')
insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"
# 缓冲区
temp = []
for row in unions.collect():
row_dic = row.asDict()
if len(temp) >= 1000: # 批量写入数据库
self.mysql_helper.executemany(insert_lifecycle_sql, temp)
temp.clear()
# 加载数据到缓冲区
try:
# 计算增长率
increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)']
except Exception:
increase = 0
# 计算开户时长(月份数) 若无则视为6个月以上
if row_dic['ACCOUNT_AGE'] is None:
row_dic['ACCOUNT_AGE'] = 7
# 最后交易日期
ltd = row_dic['LAST_TR_DATE']
if ltd is not None:
try:
ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
except Exception:
示例2: __init__
# 需要导入模块: from pyspark import SQLContext [as 别名]
# 或者: from pyspark.SQLContext import clearCache [as 别名]
class Credit:
def __init__(self):
    """Build the Spark configuration, contexts and MySQL helper for this job.

    Populates ``self.conf``, ``self.sc``, ``self.sqlctx``,
    ``self.mysql_helper`` and ``self.base``.
    """
    spark_conf = SparkConf()
    spark_conf = spark_conf.setAppName("CREDIT")
    spark_conf = spark_conf.set("spark.cores.max", "2")
    # Put the MySQL JDBC connector jar on the executor classpath.
    spark_conf = spark_conf.set('spark.executor.extraClassPath',
                                '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')
    self.conf = spark_conf

    self.sc = SparkContext(conf=self.conf)
    self.sqlctx = SQLContext(self.sc)
    self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
    self.base = 'hdfs://master:9000/gmc/'
def load_from_mysql(self, table, database='core'):
    """Load a MySQL table through the JDBC data source of ``self.sqlctx``.

    :param table: value passed as the JDBC ``dbtable`` option
    :param database: database name substituted into the JDBC URL
    :return: the object returned by ``load()`` on the configured reader
    """
    jdbc_url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
    reader = self.sqlctx.read.format("jdbc")
    reader = reader.options(url=jdbc_url, dbtable=table, driver="com.mysql.jdbc.Driver")
    return reader.load()
def sql_operate(self, sql, rdd, once_size=1000):
    """Run ``sql`` against MySQL for every collected row of ``rdd``, in batches.

    :param sql: statement handed to ``self.mysql_helper.executemany``
    :param rdd: object whose ``collect()`` yields the parameter rows
    :param once_size: number of buffered rows that triggers a flush
    :return: None
    """
    pending = []

    def flush():
        # Hand the buffered rows to MySQL, then reuse the same buffer.
        self.mysql_helper.executemany(sql, pending)
        pending.clear()

    for record in rdd.collect():
        # Flush before appending once the buffer has reached the batch size.
        if not len(pending) < once_size:
            flush()
        pending.append(record)

    # Write whatever is left over after the last full batch.
    if pending:
        flush()
def prepare_fpgrowth_data(self):
    """Mine frequent merchant-category itemsets per account and print them.

    Loads ``t_CMMS_CREDIT_TRAN``, keeps rows with ``BILL_AMTFLAG = '+'``
    whose ``MER_CAT_CD`` is neither 0 nor 6013, groups the category codes
    by ``ACCTNBR`` into one set per account, prints the first 10 sets,
    trains ``FPGrowth`` on all of them and prints every frequent itemset.

    :return: None
    """
    tran_df = (self.load_from_mysql('t_CMMS_CREDIT_TRAN')
               .filter("BILL_AMTFLAG = '+'")
               .select('ACCTNBR', 'MER_CAT_CD')
               .filter("MER_CAT_CD != 0")
               .filter("MER_CAT_CD != 6013"))

    # Map over the underlying RDD explicitly: DataFrame.map only exists in
    # Spark 1.x (where it delegated to .rdd.map), while .rdd works on both
    # 1.x and 2.x+.
    grouped = tran_df.rdd.map(
        lambda row: (str(int(row['ACCTNBR'])), str(int(row['MER_CAT_CD'])))
    ).groupByKey()

    # One set of category codes per account; the set collapses repeated codes.
    baskets = grouped.map(lambda pair: set(pair[1]))

    for basket in baskets.take(10):
        print(basket)

    model = FPGrowth.train(baskets, minSupport=0.05, numPartitions=10)
    for itemset in model.freqItemsets().collect():
        print(itemset)
def cycle_credit(self):
'''
信用卡聚类数据预处理
:return:
'''
print('---------------------------信用卡-Start--------------------------')
# 交易流水
credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT',
'BILL_AMTFLAG').filter(
"BILL_AMTFLAG ='-'").cache()
# 卡账户信息
credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE')
# 还款计算
return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT')
return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED'))
# 去除0最低还款额,即未消费的账单月
join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0')
# 清除缓存
self.sqlctx.clearCache()
def which_cycle_type(line):
    """Tag one account/month record with a repayment flag.

    Reads ``STM_MINDUE`` and ``RETURNED`` from ``line`` and returns a
    ``Row`` with ``ACCTNBR`` (as int), ``MONTH_NBR``, ``DUE_FLAG`` and
    ``STM_MINDUE``.

    ``DUE_FLAG`` values:
        0: normal, everything repaid (repaid >= 10 x minimum due)
        1: revolving credit (minimum due < repaid < 10 x minimum due)
        2: overdue, nothing repaid (minimum due present, repaid is None)
        9: none of the above
    """
    min_due = line['STM_MINDUE']
    repaid = line['RETURNED']

    if min_due is not None and repaid is None:
        due_flag = 2
    elif repaid >= min_due * 10:
        due_flag = 0
    elif min_due < repaid and repaid < min_due * 10:
        due_flag = 1
    else:
        due_flag = 9

    return Row(ACCTNBR=int(line['ACCTNBR']),
               MONTH_NBR=line['MONTH_NBR'],
               DUE_FLAG=due_flag,
               STM_MINDUE=line['STM_MINDUE'])
#.........这里部分代码省略.........