本文整理匯總了Python中pyspark.SQLContext.clearCache方法的典型用法代碼示例。如果您正苦於以下問題:Python SQLContext.clearCache方法的具體用法?Python SQLContext.clearCache怎麼用?Python SQLContext.clearCache使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類pyspark.SQLContext的用法示例。
在下文中一共展示了SQLContext.clearCache方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點贊,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: __init__
# Required import: from pyspark import SQLContext [as alias]
# Note: clearCache is an instance method of SQLContext, so it cannot be imported on its own; call it on an instance, e.g. sqlctx.clearCache()
#.........這裏部分代碼省略.........
aum_now = self.sqlctx.sql(aum_now_sql)
# 清除緩存表
self.sqlctx.dropTempTable('group_in')
# 聯合
union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')
# 計算用戶開戶至今時間(months)
# 載入賬戶表
account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
account_age_aql = "select CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE from " \
"(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
"from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"
account_age = self.sqlctx.sql(account_age_aql)
# calculate last tran date
account_1 = account.select('CUST_NO', 'ACC_NO15')
detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
a_d = account_1.join(detail, 'ACC_NO15', 'outer')
a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')
last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"
last_tr_date = self.sqlctx.sql(last_tr_date_sql)
# 聯合 season aum_now account_age last_tr_date
unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')
# 清除緩存表
self.sqlctx.dropTempTable('account')
self.sqlctx.dropTempTable('adtable')
self.sqlctx.clearCache()
# 結果插入表
print('結果插入臨時表:t_CMMS_TEMP_LIFECYCLE...')
insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"
# 緩衝區
temp = []
for row in unions.collect():
row_dic = row.asDict()
if len(temp) >= 1000: # 批量寫入數據庫
self.mysql_helper.executemany(insert_lifecycle_sql, temp)
temp.clear()
# 加載數據到緩衝區
try:
# 計算增長率
increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)']
except Exception:
increase = 0
# 計算開戶時長(月份數) 若無則視為6個月以上
if row_dic['ACCOUNT_AGE'] is None:
row_dic['ACCOUNT_AGE'] = 7
# 最後交易日期
ltd = row_dic['LAST_TR_DATE']
if ltd is not None:
try:
ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
except Exception:
示例2: __init__
# Required import: from pyspark import SQLContext [as alias]
# Note: clearCache is an instance method of SQLContext, so it cannot be imported on its own; call it on an instance, e.g. sqlctx.clearCache()
class Credit:
def __init__(self):
    """Create the Spark contexts and the MySQL helper used by this job.

    Sets ``conf``, ``sc``, ``sqlctx``, ``mysql_helper`` and ``base`` on the
    instance.
    """
    # Build the configuration step by step; each setter updates the same
    # SparkConf object.
    spark_conf = SparkConf()
    spark_conf.setAppName("CREDIT")
    spark_conf.set("spark.cores.max", "2")
    spark_conf.set('spark.executor.extraClassPath',
                   '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')
    self.conf = spark_conf

    # The SQL context is layered on top of the Spark context.
    self.sc = SparkContext(conf=self.conf)
    self.sqlctx = SQLContext(self.sc)

    # Helper used for writes back to MySQL, plus an HDFS base URI.
    self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
    self.base = 'hdfs://master:9000/gmc/'
def load_from_mysql(self, table, database='core'):
    """Load a MySQL table through the JDBC data source of ``self.sqlctx``.

    :param table: value passed as the JDBC ``dbtable`` option.
    :param database: database name inserted into the JDBC URL
        (default ``'core'``).
    :return: the object returned by the reader's ``load()`` call.
    """
    jdbc_url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
    reader = self.sqlctx.read.format("jdbc")
    reader = reader.options(url=jdbc_url, dbtable=table, driver="com.mysql.jdbc.Driver")
    return reader.load()
def sql_operate(self, sql, rdd, once_size=1000):
    """Run ``sql`` against MySQL for every row of ``rdd``, in batches.

    All rows are first collected via ``rdd.collect()``; they are then handed
    to ``self.mysql_helper.executemany`` in chunks.

    :param sql: statement passed to ``executemany``.
    :param rdd: object exposing ``collect()`` that yields the rows.
    :param once_size: number of buffered rows that triggers a flush
        (default 1000).
    :return: None
    """
    pending = []

    def flush():
        # Send whatever is buffered, then reuse the same list.
        self.mysql_helper.executemany(sql, pending)
        pending.clear()

    for record in rdd.collect():
        # The buffer is checked before appending, so a full batch is written
        # when the next row arrives.
        if len(pending) >= once_size:
            flush()
        pending.append(record)

    # Write the final, possibly partial, batch.
    if pending:
        flush()
def prepare_fpgrowth_data(self):
    """Mine frequent merchant-category itemsets from credit transactions.

    Loads ``t_CMMS_CREDIT_TRAN``, keeps rows with ``BILL_AMTFLAG = '+'`` whose
    ``MER_CAT_CD`` is neither 0 nor 6013, collapses the category codes of each
    ``ACCTNBR`` into one set, prints the first ten sets, trains FP-Growth
    (``minSupport=0.05``, ``numPartitions=10``) and prints every frequent
    itemset.

    :return: None; the results are only printed.
    """
    tran_df = (self.load_from_mysql('t_CMMS_CREDIT_TRAN')
               .filter("BILL_AMTFLAG = '+'")
               .select('ACCTNBR', 'MER_CAT_CD')
               .filter("MER_CAT_CD != 0")
               .filter("MER_CAT_CD != 6013"))

    # Go through ``.rdd`` explicitly: DataFrame.map() only exists on
    # Spark 1.x, where it is shorthand for ``.rdd.map()``.
    grouped = tran_df.rdd.map(
        lambda row: (str(int(row['ACCTNBR'])), [str(int(row['MER_CAT_CD']))])
    ).groupByKey()

    def to_item_set(pair):
        # pair is (account, iterable of one-element lists); keep each
        # category code once per account.
        return {codes[0] for codes in pair[1]}

    transactions = grouped.map(to_item_set)
    for sample in transactions.take(10):
        print(sample)

    model = FPGrowth.train(transactions, minSupport=0.05, numPartitions=10)
    for itemset in model.freqItemsets().collect():
        print(itemset)
def cycle_credit(self):
'''
信用卡聚類數據預處理
:return:
'''
print('---------------------------信用卡-Start--------------------------')
# 交易流水
credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT',
'BILL_AMTFLAG').filter(
"BILL_AMTFLAG ='-'").cache()
# 卡賬戶信息
credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE')
# 還款計算
return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT')
return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED'))
# 去除0最低還款額,即未消費的賬單月
join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0')
# 清除緩存
self.sqlctx.clearCache()
def which_cycle_type(line):
    """Tag one joined account-month row with a repayment flag.

    DUE_FLAG values (labels taken from the original note):
        0: normal, all returned  -- RETURNED >= 10 * STM_MINDUE
        1: cycle credit          -- STM_MINDUE < RETURNED < 10 * STM_MINDUE
        2: overdue, don't return money -- STM_MINDUE present, RETURNED missing
        9: any other case

    :param line: row exposing ``ACCTNBR``, ``MONTH_NBR``, ``STM_MINDUE`` and
        ``RETURNED`` by key.
    :return: ``Row`` with ``ACCTNBR`` (as int), ``MONTH_NBR``, ``DUE_FLAG``
        and ``STM_MINDUE``.
    """
    min_due = line['STM_MINDUE']
    repaid = line['RETURNED']

    if repaid is None and min_due is not None:
        due_flag = 2
    elif repaid >= min_due * 10:
        due_flag = 0
    elif repaid > min_due and repaid < min_due * 10:
        due_flag = 1
    else:
        due_flag = 9

    return Row(
        ACCTNBR=int(line['ACCTNBR']),
        MONTH_NBR=line['MONTH_NBR'],
        DUE_FLAG=due_flag,
        STM_MINDUE=min_due,
    )
#.........這裏部分代碼省略.........