Commit 78c9b003 authored by 王家华's avatar 王家华

Monitor V0.1

parents
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 21:44:56 2018
@author: Jason Wang
"""
import time
import os
import pymysql
import pandas as pd
import numpy as np
import openpyxl
import decimal
import matplotlib.pyplot as plt
import os
from matplotlib.font_manager import FontProperties
from matplotlib.lines import Line2D
import datetime
from django.db import transaction, DatabaseError
# Channel ids drilled into individually by separateBychannel() when a
# feature breaches its control limits.
kalist = [1,198,214,217,333,159507,159384,159563,159561,159538,159609,159537]
############################## SQL ##############################################
#applied_channel = [1,214,217,198,159384,159483,159479,159478,333,158748,158764,158932,159457,159459,159519,159507,159538,159561]
#applice_type = []
#channelDict = {159384:'平安H5高净值',159483:'平安低净值',159479:'车险保单贷',159478:'法人贷',333:'融360',158748:'汽车之家',158764:'翼支付',158932:'拉卡拉',159457:'惠金所',159459:'惠金所',159519:'亿融普惠'}
# applied_type filter values and their display names (index-aligned:
# entry 0 = all types combined, then first/repeat application, repeat loan).
appliedTypeList = ['1,2,3','1','2','3']
appliedType_type = ['客群总体','首申','复申','复贷']
# extract channel list where except recalling channel
# Channels with at least one funded loan > 20000 last calendar month;
# recall channels (159481,159486,159528) are excluded in the WHERE clause.
sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel FROM risk_analysis
WHERE transacted = 1
AND real_loan_amount > 20000
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01')
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528)
'''
# Daily feature-extraction template. @feature and @applied_channel are
# substituted via str.replace() before execution (string templating,
# not parameterized SQL — inputs come from the internal data dictionary).
sql = '''
SELECT date_format(applied_at,'%Y-%m-%d') as applied_at,applied_from,applied_type,@feature FROM risk_analysis
WHERE DATE_FORMAT(applied_at,'%Y-%m') BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -2 MONTH),'%Y-%m')
AND DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m')
AND applied_from in (@applied_channel)
AND applied = 1
'''
########################## DB Configuration #####################################
# NOTE(security): plaintext credentials committed to source control — move
# them to environment variables or a secrets store.
risk_analysis_config = {'user' : 'jiahua_wang',
                        'password' : 'IqHKCIyZ',
                        'host' : '172.20.6.9',
                        'port' : 9030,
                        'database' : 'risk_analysis',
                        'encoding' : 'utf8'}
#################################################################################
pwd = os.getcwd()
# Output folders for the charts; path_alarm receives features that breached
# their control limits, path_sepatate the per-channel drill-downs.
path = "E:\\Python\\su Project\\plot\\VLM\\"
path_alarm = "E:\\Python\\su Project\\plot\\VLM\\alarm\\"
path_sepatate = "E:\\Python\\su Project\\plot\\separateByChannel\\"
# Today's date, e.g. '2018-11-26'.
now = time.strftime("%Y-%m-%d")
# make directory, if it exists return path, else return created folder path
#def mkdir(path,name):
# folder = os.path.exists(path+name)
# if folder:
# return path+name+'\\'
# else:
# os.makedirs(path+name)
# return path+name+'\\'
# VLM with one variable
def plotLine(title,y,row,col,table,save_path,upperBoundary=0,bottomBoundary=0):
    """Plot one daily series with mean and +/-3-sigma control limits plus a table.

    Params:
        title - chart title, also used as the PNG file name.
        y - pandas Series of daily values (mean or std of a feature).
        row - row labels for the bottom table.
        col - pandas Index of date strings ('YYYY-MM-DD'), one per point.
        table - list of row value sequences for the bottom table.
        save_path - directory the PNG is written to.
        upperBoundary / bottomBoundary - caller-supplied control limits;
            0 means "compute mean +/- 3*std from *y* itself".
    Returns:
        1 always (legacy convention).
    """
    # Skip sparse/flat series: fewer than ~10 points or a near-zero total.
    # BUG FIX: the original used `10 | int(y.sum())` — bitwise OR binds tighter
    # than `<=`, so the guard compared against `10 | sum` instead of using a
    # logical OR of the two conditions.
    if len(col) <= 10 or int(y.sum()) <= 10:
        return 1
    # Strip the 'YYYY-' prefix so table column headers fit.
    cols = [item[5:] for item in col.values.tolist()]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font
    plt.rcParams['savefig.dpi'] = 226  # output resolution
    fig, axs = plt.subplots(1, 1, figsize=(33, 11), linewidth=0.1)
    x = range(len(col))
    axs.plot(x, y)
    # Monthly-mean reference line.
    # NOTE: annotate's text is passed positionally — the old `s=` keyword was
    # removed from matplotlib (renamed to `text` in 3.3).
    axs.add_line(Line2D((x[0], x[-1]), (y.mean(), y.mean()), linestyle='--', color='darkorange'))
    plt.annotate('月均{}'.format(round(y.mean(), 2)), xy=(x[-1] + 0.1, y.mean()))
    # Upper control limit: caller-supplied value wins, else mean + 3*std.
    upper = y.mean() + 3 * y.std() if upperBoundary == 0 else upperBoundary
    axs.add_line(Line2D((x[0], x[-1]), (upper, upper), linestyle='--', color='lightcoral'))
    plt.annotate('Mean+3STD\n{}'.format(round(upper, 2)), xy=(x[-1] + 0.1, upper))
    # Lower control limit: caller-supplied value wins, else mean - 3*std.
    lower = y.mean() - 3 * y.std() if bottomBoundary == 0 else bottomBoundary
    axs.add_line(Line2D((x[0], x[-1]), (lower, lower), linestyle='--', color='lightcoral'))
    plt.annotate('Mean-3STD\n{}'.format(round(lower, 2)), xy=(x[-1] + 0.1, lower))
    # Light vertical guide under each point; keep the baseline near the data.
    bottom = 0
    if y.min() - y.std() * 3 - y.mean() * 0.02 > 0:
        bottom = y.min() - y.std() * 3 - y.std() * 0.1
    plt.vlines(x, [bottom], y, color='lightgrey', linestyle='--')
    axs.grid()
    plt.xticks([])
    the_table = plt.table(cellText=table,
                          rowLabels=row,
                          colLabels=cols,
                          colWidths=[0.91 / (len(col) - 1)] * len(col),
                          loc='bottom')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(9)
    fig.subplots_adjust(left=0.032, right=0.97)
    fig.set_size_inches(33, 11)
    plt.title(title, fontsize=18)
    plt.savefig(save_path + title + ".png")
    plt.show()
    return 1
def readExcel(path, sheet=None):
    """Load *sheet* of the Excel workbook at *path* into pandas structures.

    With sheet=None pandas returns a dict of DataFrames keyed by sheet name.
    """
    workbook = pd.read_excel(path, sheet)
    return workbook
#conn = connect2DB()
# Accumulators for the data-dictionary traversal below.
dict_keylist = []
dict_vallist = []
# Data dictionary workbook — presumably one sheet per model listing
# feature/queries/description columns (TODO confirm workbook layout).
dict_DD = readExcel("E:\\Python\\su Project\\DD.xlsx")
# Sheet names double as the list of monitored models.
modelList = [model for model in dict_DD.keys()]
def mkdir(path, fd):
    """Ensure folder *fd* exists under *path* and return its full path.

    BUG FIX: the original recursed into itself unconditionally when the folder
    was missing (infinite recursion) and returned the literal string
    `path + 'fd'` when it existed. It now creates the directory once and
    always returns `path + fd`.
    """
    target = path + fd
    if not os.path.exists(target):
        os.makedirs(target)
    return target
def connect2DB(db_config):
    """Open a pymysql connection described by the *db_config* dict."""
    conn_args = {
        'host': db_config['host'],
        'port': db_config['port'],
        'user': db_config['user'],
        'passwd': db_config['password'],
        'db': db_config['database'],
        'charset': db_config['encoding'],
    }
    return pymysql.connect(**conn_args)
def query_sql(sql, db_config=risk_analysis_config):
    """Run *sql* against the configured DB and return a DataFrame.

    Preserves the legacy contract of returning 0 on any failure (callers
    rely on it), but now logs the error instead of swallowing it silently
    and closes the connection even when read_sql raises (the original
    leaked the connection on query errors).
    """
    conn = None
    try:
        conn = connect2DB(db_config)
        return pd.read_sql(sql, conn)
    except Exception as e:
        print('query_sql failed:', e)
        return 0
    finally:
        if conn is not None:
            conn.close()
def dataManipul(df,keyword):
    """Aggregate one feature's records into daily monitoring series.

    Params:
        df - DataFrame with columns ['applied_at', keyword]; NaN = missing.
        keyword - the feature column name.
    Returns:
        (zero_rate, missing_rate, cols, df_sum):
        zero_rate    - daily % of exact-zero values among non-null rows
        missing_rate - daily % of missing values among all rows
        cols         - the daily date index
        df_sum       - daily mean/std/count of the middle 98% of non-null values
    """
    # Daily count of non-null values (groupby().count() ignores NaN).
    df_count = df[['applied_at',keyword]].groupby('applied_at').count()[keyword]
    # All-zero Series on the full date index, used to align partial groupbys.
    df_zeros = pd.Series(np.zeros(df_count.shape),index = df_count.index)
    # Daily count of missing rows, aligned to the full date index.
    df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count()
    df_missing = pd.concat([df_zeros,df_missing], axis = 1, sort = True).fillna(0)[keyword]
    # Missing share of ALL rows: missing / (non-null + missing).
    missing_rate = df_missing / (df_count + df_missing) * 100
    del df_missing
    # Daily share of exact-zero values among non-null observations.
    df_zero = df[df[keyword] == 0].groupby('applied_at')[keyword].count()
    df_zero = pd.concat([df_zeros,df_zero], axis = 1, sort = True).fillna(0)[keyword]
    zero_rate = df_zero / df_count * 100
    del df_zero
    # Trim the global top/bottom 1% (by value) before daily mean/std/count,
    # so a handful of outliers cannot move the control limits.
    df_noneNA = df.dropna(axis = 0)
    df_noneNA = df_noneNA.sort_values(by=keyword,ascending=False).reset_index().drop('index',axis=1)
    df_sum = df_noneNA.iloc[int(len(df_noneNA)*0.01):int(len(df_noneNA)*0.99)].groupby('applied_at').agg(['mean','std','count'])
    # Align to the full date index; drop the helper zero column (label 0).
    df_sum = pd.concat([df_zeros,df_sum], axis = 1, sort = True).fillna(0).drop(columns=[0])
    df_sum.columns = ['mean','std','count']
    cols = df_count.index
    return zero_rate.fillna(0).round(1),missing_rate.fillna(0).round(1),cols,df_sum
#########################################################################
# check via channel details
def separateBychannel(df,key,meansub3std,meanpls3std):
    """Drill a breaching feature down channel by channel and plot each one.

    Re-runs the daily aggregation per channel in *kalist* and plots any
    channel whose last-30-day mean breaches the overall control limits.

    NOTE(review): this function reads the module globals modelList, i,
    description, fea_i, appliedType_index, appliedType_type and appliedType
    set by the main loop below — it is only valid when called from there.

    Params:
        df - DataFrame with ['applied_at','applied_from',key].
        key - feature column name.
        meansub3std / meanpls3std - control limits computed on the overall series.
    """
    try:
        for appliedFrom in kalist:
            try:
                #df.applied_from = df.applied_from.astype('str')
                zero_rate_total,missing_rate_total,cols_total,df_sum = dataManipul(df[df.applied_from == appliedFrom][['applied_at',key]],key)
                table = []
                y_total = df_sum['mean']
                table.append(df_sum['mean'].round(1)) #.round(1).values.tolist()
                table.append(df_sum['count'].astype('int'))
                table.append(missing_rate_total.astype('str')+'%')
                table.append(zero_rate_total.astype('str')+'%')
                # Plot only channels whose recent window breaches the limits.
                if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
                    plotLine(str(modelList[i])+'-'+description[fea_i]+'-Mean-'+appliedType_type[appliedType_index]+'with'+str(appliedFrom)+'-VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path_sepatate)
                del table
            except ValueError as e: #ValueError
                # Channels with too little data raise inside dataManipul; skip them.
                continue
    except Exception as e :
        print('channel Exception : ',key,appliedType,e)
########### extract channel list #############
# Channels that funded loans > 20000 last month (recall channels excluded in SQL).
applied_channel = query_sql(sql_channel).applied_from.tolist()
# Inline the channel id list into the monitoring query template.
sql = sql.replace('@applied_channel',str(applied_channel).strip('[').strip(']'))
#########################################################################
#for model in modelList:
# df_model = dict_DD[model].dropna(axis = 0)
# dict_keylist.append(df_model.feature.tolist())
# dict_keylist.append(df_model.query.tolist())
# dict_vallist.append(df_model.description.tolist())
#
#for li in dict_keylist:
# Main VLM loop: for every model sheet in the data dictionary, pull each
# feature's daily series and plot mean/std control charts per applied_type.
for i in range(len(modelList)):
    # drop colums from data dict where there has no description
    df_model_list = dict_DD[modelList[i]].dropna(axis = 0)
    #feature key list
    features = df_model_list.reset_index().feature
    # query key list
    queries = df_model_list.reset_index().queries
    #feature descriptions list
    description = df_model_list.reset_index().description
    modelVar_index = 0
    for fea_i in range(len(features)):
        appliedType_index = 0
        try:
            key = queries[fea_i].strip()
            print('key: ',key)
            # Fetch the last two months of this feature for all channels.
            df = query_sql(sql.replace('@feature',queries[fea_i]))
            # Normalise sentinel values: None and negatives both become NaN.
            df.loc[:,key] = df.loc[:,key].map(lambda x : np.nan if x == None else x)
            df.loc[:,key] = df.loc[:,key].map(lambda x : np.nan if x < 0 else x)
        # exception of interger == mysql query meets a exception
        except Exception as a:
            print(a)
            continue
        for appliedType in appliedTypeList:
            print('appliedType',appliedType)
            # Index 0 ('1,2,3') means all applied types; others filter on one type.
            if appliedType_index == 0:
                df_tmp = df[['applied_at','applied_from',key]]
            else:
                df_tmp = df[df.applied_type == int(appliedType)][['applied_at','applied_from',key]]
            #print('appliedType: ',appliedType)
            # --- daily MEAN control chart ---
            try:
                #df.applied_from = df.applied_from.astype('str')
                zero_rate_total,missing_rate_total,cols_total,df_sum = dataManipul(df_tmp[['applied_at',key]],key)
                table = []
                y_total = df_sum['mean']
                table.append(df_sum['mean'].round(1)) #.round(1).values.tolist()
                table.append(df_sum['count'].astype('int'))
                table.append(missing_rate_total.astype('str')+'%')
                table.append(zero_rate_total.astype('str')+'%')
                # 3-sigma control limits on the daily mean.
                meanpls3std = y_total.mean() + y_total.std() * 3
                meansub3std = y_total.mean() - y_total.std() * 3
                #mean_mean = y_total.mean()
                # Alarm when the last 30 days breach either limit;
                # breaching features also get a per-channel drill-down.
                if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
                    plotLine(str(modelList[i])+'-'+description[fea_i]+'-Mean-'+appliedType_type[appliedType_index]+'-变化VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path_alarm)
                    separateBychannel(df_tmp,key,meansub3std,meanpls3std)
                else:
                    plotLine(str(modelList[i])+'-'+description[fea_i]+'-Mean-'+appliedType_type[appliedType_index]+'-变化VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path)
                del table
            except Exception as e: #ValueError
                print('Mean Exception : ',key,appliedType,e)
                appliedType_index += 1
                continue
            # --- daily STD control chart (same recipe on the std series) ---
            try:
                zero_rate_total,missing_rate_total,cols_total,df_sum = dataManipul(df_tmp[['applied_at',key]],key)
                table = []
                y_total = df_sum['std']
                table.append(df_sum['std'].round(1))
                table.append(df_sum['count'])
                table.append(missing_rate_total.astype('str')+'%')
                table.append(zero_rate_total.astype('str')+'%')
                del df_sum
                stdpls3std = y_total.mean() + y_total.std() * 3
                stdsub3std = y_total.mean() - y_total.std() * 3
                #std_mean = y_total.mean()
                if (y_total.iloc[-30:-1].max() > stdpls3std) | (y_total.iloc[-30:-1].min() < stdsub3std):
                    plotLine(str(modelList[i])+'-'+description[fea_i]+'-Std-'+appliedType_type[appliedType_index]+'-变化VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path_alarm)
                else:
                    plotLine(str(modelList[i])+'-'+description[fea_i]+'-Std-'+appliedType_type[appliedType_index]+'-变化VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path)
                del table
            except Exception as e:
                print('Std Exception : ',e)
                appliedType_index += 1
                continue
            appliedType_index += 1
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 3 18:18:12 2018
@author: Jason Wang
"""
import time
import os
import pymysql
import pandas as pd
import numpy as np
import openpyxl
import decimal
import matplotlib.pyplot as plt
import os
from matplotlib.font_manager import FontProperties
from matplotlib.lines import Line2D
import datetime
import sklearn.metrics
from django.db import transaction, DatabaseError
# Benchmark sample: the first 30 days this score was live for the channel,
# each funded loan labelled overdue when passdue_day > @passdueday.
# Placeholders (@modelVar, @channelID, @appliedType, @passdueday) are filled
# in via str.replace() in the main loop below (string templating, not
# parameterized SQL — values come from the internal mapping workbook).
sql_bins = '''
SELECT @modelVar,transacted,IF(passdue_day>@passdueday,1,0) as overdue FROM risk_analysis
WHERE applied_at BETWEEN
(SELECT date_format(applied_at,'%Y-%m-%d')
FROM risk_analysis
WHERE !ISNULL(@modelVar) AND transacted=1 and applied_from IN (@channelID)
ORDER BY applied_at asc
LIMIT 1) AND DATE_ADD((SELECT date_format(applied_at,'%Y-%m-%d')
FROM risk_analysis
WHERE !ISNULL(@modelVar) AND transacted=1 and applied_from IN (@channelID)
ORDER BY applied_at asc
LIMIT 1),INTERVAL 30 DAY)
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
AND @modelVar > 0
'''
"""
### sql_bins_360 = '''
SELECT @modelVar,transacted,IF(passdue_day > 15,1,0) as overdue
FROM risk_analysis
WHERE !ISNULL(@modelVar)
AND applied_at >= '2018-08-01' AND applied_at <= '2018-09-01'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
AND @modelVar > 0
'''
"""
# Observation sample: the score over the last three calendar months,
# grouped by month — compared against the benchmark for PSI.
sql_observation = '''
SELECT date_format(applied_at,'%Y-%m') as applied_at,@modelVar
FROM risk_analysis
WHERE DATE_FORMAT(applied_at,'%Y-%m')
BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -3 MONTH),'%Y-%m')
AND DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
'''
######## calculate with natural mon ###########
# Matured loans bucketed into three rolling 30-day cohorts by deadline:
# 'T-1' (deadline 45..15 days ago), 'T-2' (75..45), 'T-3' (105..75);
# each loan labelled overdue when passdue_day > @passdue_day.
sql_passdueday = '''
(SELECT order_no,'T-1' as applied_at,@modelVar,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -15 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1)
UNION ALL
(SELECT order_no,'T-2' as applied_at,@modelVar,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1)
UNION ALL
(SELECT order_no,'T-3' as applied_at,@modelVar,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -105 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1)
'''
############ calculate with T-N mon #############
"""
sql_passdueday = '''
SELECT date_format(loan_start_date,'%Y-%m') as applied_at,@modelVar,IF(passdue_day > @passdueday,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(loan_start_date,'%Y-%m')
BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -4 MONTH),'%Y-%m-%s')
AND DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -2 MONTH),'%Y-%m')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
AND transacted = 1
'''
"""
# Overdue threshold (days) used by sql_passdueday's @passdue_day placeholder.
passdue_day = 15
#AND applied_from IN (@channelID)
##################################### db config ###############################
# NOTE(security): plaintext credentials committed to source control — move
# them to environment variables or a secrets store.
risk_analysis_config = {'user' : 'jiahua_wang',
                        'password' : 'IqHKCIyZ',
                        'host' : '172.20.6.9',
                        'port' : 9030,
                        'database' : 'risk_analysis',
                        'encoding' : 'utf8'}
#################################################################################
# Output folder for PSI / validation plots.
path = "E:\\Python\\su Project\\plot\\PSI&VAL\\"
# Mapping workbook: one row per monitored score — display name (description),
# column name (feature), applied types and channels to evaluate.
mapping_path = "E:\\Python\\su Project\\query_score.xlsx"
mapping = pd.read_excel(mapping_path,sheet_name='score_risk_anlysis')
modelType = mapping.description.tolist()
modelList = mapping.feature.tolist()
appliedTypeList = mapping.appliedType.tolist()
channelIDList = mapping.channel.tolist()
#modelBound_dict = mapping[['feature','boundary']].set_index('feature').boundary.to_dict()
del mapping
# (historical hard-coded model/channel lists kept below for reference)
#modelList = ['xinyan_xy_fstapply_point','xinyan_xy_reapply_point','xinyan_xy_reloan_point','reloan_v3_point','lxf_v2_point','v6_operator_score_raw','dhb_score','tongdun_score','shuchuang_phone_apply','pingan_markingCriterion','tencent_tencentAntiFraudScore','eleven_bei_score','ljj_old_score','ljj_model_trusty_score']
#modelList = ['xinyan_xy_reloan_point']
#modelType = ['新颜首申分','新颜复申分','新颜复贷分','复贷分','量信分','V6分','电话邦分','同盾分','数创多头','腾讯反欺诈分','十一贝分','量晶晶首贷分','量晶晶复贷分']
#modelType = ['新颜复贷']
#channelIDList = ['217,214,198,1,159481,158748,333,159384,149483,159479,159479,158764,158932,159457,159459,159519','217,214,198,1,159481','158748','333','159384','149483,159479,159479','158764,158932,159457,159459,159519']
#channel = ['全部渠道','内部','汽车之家','融360','平安高净值','平安非高净值','其他外部渠道']
#appliedTypeList = ['1,2,3','1','2','3']
#appliedTypeList = ['1']
#appliedType_type = ['总体','首申','复申','复贷']
# Display names for applied_type filter strings.
appliedType_type = {'1,2,3':'总体','1':'首申','2':'复申','3':'复贷'}
#appliedType_type = ['首申']
passdueday = 15 #more than N days (fstOverdue N+)
def connect2DB(db_config):
    """Create and return a pymysql connection from a config dict."""
    return pymysql.connect(host=db_config['host'],
                           port=db_config['port'],
                           user=db_config['user'],
                           passwd=db_config['password'],
                           db=db_config['database'],
                           charset=db_config['encoding'])
def query_sql(sql, db_config=risk_analysis_config):
    """Run *sql* against the configured DB and return a DataFrame.

    Preserves the legacy contract of returning 0 on any failure (callers
    rely on it), but now logs the error instead of swallowing it silently
    and closes the connection even when read_sql raises (the original
    leaked the connection on query errors).
    """
    conn = None
    try:
        conn = connect2DB(db_config)
        return pd.read_sql(sql, conn)
    except Exception as e:
        print('query_sql failed:', e)
        return 0
    finally:
        if conn is not None:
            conn.close()
################################### plot PSI ##################################
#+'\nmissing:'+str(missing[int(i/2)])+'%'
def plotPSI(title, y_list, dateList, psi, missing, rows, cols, table_value, save_path):
    """Plot per-month score-bin distributions with PSI / missing-rate legend.

    One line per month in *y_list*; a table of per-bin counts sits below the
    axes. The figure is saved as '<save_path><title>.png' and shown.
    Returns 1 (legacy convention).
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # output resolution
    plt.rcParams['figure.dpi'] = 100  # on-screen resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    # One distribution curve per month, PSI and missing rate in the legend.
    for idx, curve in enumerate(y_list):
        legend_text = dateList[idx][0:7] + ' PSI:' + str(psi[idx]) + '\n缺失率:' + str(missing[idx]) + '%'
        axs.plot(range(len(curve)), curve, marker='o', label=legend_text)
    the_table = plt.table(cellText=table_value,
                          rowLabels=rows,
                          colLabels=cols,
                          colWidths=[0.91 / (len(cols) - 1)] * len(cols),
                          loc='bottom')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(8)
    fig.subplots_adjust(bottom=0.2)
    plt.grid()
    plt.ylabel('各分段样本占比' + ' (%)')
    plt.legend()
    plt.xticks([])
    fig.suptitle(title)
    plt.savefig(save_path + title + ".png")
    plt.show()
    return 1
########################### validation liftchart###############################
def plotLiftChart(title, y_list, dateList, aucri, auc, rows, cols, table_value, save_path):
    """Plot per-cohort overdue-rate curves by score bin with an AUC legend.

    One line per cohort in *y_list*; AUCRI/AUC figures go into the legend and
    a count table sits below the axes. Saves '<save_path><title>.png'.
    Returns 1 (legacy convention).
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # output resolution
    plt.rcParams['figure.dpi'] = 100  # on-screen resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    for idx, curve in enumerate(y_list):
        legend_text = dateList[idx][0:7] + ' (AUCRI:' + str(aucri[idx]) + ') AUC: ' + str(auc[idx])
        axs.plot(range(len(curve)), curve, marker='o', label=legend_text)
    the_table = plt.table(cellText=table_value,
                          rowLabels=rows,
                          colLabels=cols,
                          colWidths=[0.91 / (len(cols) - 1)] * len(cols),
                          loc='bottom')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(8)
    fig.subplots_adjust(bottom=0.2)
    plt.legend()
    plt.grid()
    plt.ylabel('贷后首逾' + str(passdueday) + '+ (%)')
    plt.xticks([])
    fig.suptitle(title)
    plt.savefig(save_path + title + ".png")
    plt.show()
    return 1
###############################################################################
#def dataManipul(df,keyword,interval):
#
# # df count of all records
#
# df_count = df[['applied_at',keyword]].fillna(0).groupby('applied_at').count()[keyword]
# df_zeros = pd.Series(np.zeros(df_count.shape),index = df_count.index)
#
# df_missing = df[['applied_at',keyword]].groupby('applied_at').count()
# df_missing = pd.concat([df_zeros,df_missing], axis = 1, sort = True).fillna(0)[keyword]
#
## df_shape = pd.DataFrame(np.zeros(df_count.shape))
##
## df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count()
## df_missing = df_shape + df_missing
# missing_rate = df_missing / df_count * 100
#
# df_zero = df[df[keyword] == 0].groupby('applied_at')[keyword].count()
# df_zero = pd.concat([df_zeros,df_zero], axis = 1, sort = True).fillna(0)[keyword]
# zero_rate = df_zero / df_count * 100
#
# df_noneNA = df.dropna(axis = 0)
# df_sum = df_noneNA.groupby('applied_at').agg(['mean','std','count'])
#
# cols = df_count.index
# return zero_rate,missing_rate,cols,df_sum
def dataManipul(df, keyword, interval):
    """Bin an observation-window score series and compute distribution stats.

    Params:
        df - DataFrame with columns ['applied_at', keyword]; 'applied_at' is a
             month label ('YYYY-MM'); NaN and negative scores count as missing.
        keyword - score column name.
        interval - bin edges (typically the benchmark deciles).
    Returns:
        (zero_rate, missing_rate, rows, cols, y, count):
        zero_rate    - % of exact-zero scores per (month, bin)
        missing_rate - % of missing scores per month
        rows         - sorted month labels
        cols         - bin labels as strings
        y            - % of each month's valid samples falling in each bin
        count        - valid-sample count per (month, bin)
    """
    # Work on a copy so the caller's frame is untouched.
    df = df.copy()
    # BUG FIX: the original `df.dropna(axis=0)[keyword] = ...` assigned into a
    # temporary copy and silently did nothing; negatives are now really nulled.
    df.loc[df[keyword] < 0, keyword] = np.nan
    df_noneNA = df.dropna(axis=0)
    # Per-month totals: all rows (missing included via fillna) vs valid rows.
    month_total = df[['applied_at', keyword]].fillna(0).groupby('applied_at').count()[keyword]
    month_valid = df_noneNA[['applied_at', keyword]].groupby('applied_at').count()[keyword]
    month_missing = (month_total - month_valid).fillna(0)
    missing_rate = month_missing / month_total.replace(0, 1) * 100
    # Bin the valid scores; observed=False keeps empty bins in the groupby
    # output so every month reports all bins.
    df_noneNA['bins'] = pd.cut(df_noneNA[keyword], interval, precision=6)
    cols = df_noneNA['bins'].value_counts().sort_index().index.astype('str')
    bin_count = df_noneNA[['applied_at', 'bins', keyword]].groupby(['applied_at', 'bins'], observed=False).count()[keyword]
    zero_count = df_noneNA[df_noneNA[keyword] == 0][['applied_at', 'bins', keyword]].groupby(['applied_at', 'bins'], observed=False).count()[keyword]
    zero_count = zero_count.reindex(bin_count.index, fill_value=0)
    zero_rate = zero_count / bin_count.replace(0, 1) * 100
    # Share of each month's valid samples per bin; Series.div broadcasts the
    # per-month totals over the 'applied_at' level of the MultiIndex (the
    # original divided mismatched indexes, which modern pandas cannot align).
    y = bin_count.div(month_valid, level='applied_at') * 100
    rows = y.index.levels[0].tolist()
    return zero_rate.round(1), missing_rate.round(1), rows, cols, y.round(1), bin_count
def psi_bins(df, keyword, interval):
    """Benchmark distribution: percentage of samples falling in each score bin.

    Adds a 'bins' column to *df* as a side effect (the callers rely on the
    frame keeping that column) and returns the per-bin share in percent.
    """
    df.loc[:, 'bins'] = pd.cut(df[keyword], interval, precision=6)
    per_bin = df.groupby('bins').count()[keyword]
    share = per_bin / per_bin.values.sum() * 100
    return share
# draw liftchart
def liftchart(df,keyword,interval):
    """Per-cohort overdue rate per score bin (lift-chart input).

    Params:
        df - DataFrame with ['applied_at','overdue',keyword]; 'applied_at'
             holds cohort labels ('T-1','T-2','T-3'), 'overdue' is 0/1.
        keyword - score column name.
        interval - bin edges (numpy array of benchmark deciles).
    Returns:
        (bin counts, overdue counts, overdue-rate % per (cohort,bin),
         cohort labels, bin labels as strings)
    """
    # Extend the bins downward when negative scores exist so pd.cut keeps them.
    if len(df[df.loc[:,keyword]<0][keyword])>0:
        bins_interval = interval.tolist()
        bins_interval.append(-10000000)
        bins_interval.sort()
    else:
        bins_interval = interval
    df.loc[:,'bins'] = pd.cut(df[keyword],bins_interval,precision=6)
    # count of sample
    df_count = df[['applied_at','bins','overdue']].groupby(['applied_at','bins']).count()
    df_zeros = pd.Series(np.zeros(df_count['overdue'].shape),index = df_count.index)
    # overdue samples only from here on.
    df = df[df.overdue == 1]
    df_overdue = df[['applied_at','bins','overdue']].groupby(['applied_at','bins']).count()
    # Align overdue counts to the full (cohort, bin) grid, filling zeros.
    df_overdue = pd.concat([df_zeros,df_overdue],axis=1)['overdue'].fillna(0)
    # Overdue rate per (cohort, bin); empty bins guarded by replace(0,1).
    y = df_overdue / df_count['overdue'].replace(0,1) * 100
    rows = y.index.levels[0].tolist()
    # NOTE(review): labels are read from the overdue-filtered frame; the
    # categorical dtype still carries all bins, so labels stay complete.
    cols = df['bins'].value_counts().sort_index().index.astype('str').tolist()
    return df_count['overdue'],df_overdue,y.round(3),rows,cols
# extract channel list where except recalling channel
sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel FROM risk_analysis
WHERE transacted = 1
AND real_loan_amount > 20000
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01')
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528)
'''
# Named channel groups: comma-separated id string -> display name; the id
# strings are substituted directly into SQL IN (...) clauses.
channel = {'1,214,217,198':'内部','159507':'浅橙','159537':'360金融','333':'融360','159384,159483':'平安','159561':'51公积金API'}
channelId = query_sql(sql_channel).applied_from
# Flatten the grouped id strings into a flat list of ints.
l=''
for i in channel.keys():
    l = l + i+','
# NOTE(review): eval on a string built from the dict above — safe only while
# channel keys stay numeric literals; explicit int() parsing would be safer.
l = eval('['+l+']')
# Active channels not named above become '其他渠道'; all ids form '全部渠道'.
channel[str(channelId[channelId.map(lambda x : True if x not in l else False)].tolist()).strip('[').strip(']')] = '其他渠道'
channel[str(channelId.tolist()).strip('[').strip(']')] = '全部渠道'
# traverse each model & applied_type & channelbins_interval
# Main PSI/AUC loop: for every (score, applied_type, channel group) compute
# the PSI of recent months vs the score's launch-window benchmark, then the
# per-cohort lift chart and AUC ratio.
for modelVar in modelList:
    print('model: ',modelVar)
    for appliedType in str(appliedTypeList[modelList.index(modelVar)]).split(';'):
        # print('appliedType',appliedType)
        # print('appliedTypeList[model_index]',appliedTypeList[modelList.index(modelVar)])
        for channelID in channel.keys():
            # --- PSI: benchmark deciles vs monthly observation distribution ---
            try:
                print('channelID:',channelID)
                # Benchmark: the score's first 30 live days for this channel.
                df_bins = query_sql(sql_bins.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday))).dropna(axis=0)
                # Observation: the last three calendar months.
                df_observation = query_sql(sql_observation.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID))
                df_observation.loc[:,modelVar] = df_observation.loc[:,modelVar].map(lambda x : np.nan if x < 0 else x)
                #df_bins = df_bins.apply(lambda x :np.nan if x < 0 else x)
                # Decile bin edges from the benchmark sample; floor forced to 0.
                Nothing,interval = pd.qcut(df_bins.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
                interval[0] = 0
                del Nothing
                BM_count = psi_bins(df_bins,modelVar,interval)
                zero_rate,missing_rate,dateList,cols,y,count = dataManipul(df_observation,modelVar,np.array(interval).round(6))
                #df_observation_with_bin = pd.cut(df_observation.dropna(axis=0)[modelVar],interval)
                # del df_bins
                del interval
                value_tab = []
                rows = []
                y_list = []
                psi = []
                # plot line separated by mon
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    #value_tab.append(y.loc[mon].astype('str')+'%')
                    value_tab.append(count.loc[mon].astype('str')+'(zeroR:'+zero_rate.loc[mon].astype('str')+'%)')
                    #rows.append(str(mon)+' Value');
                    rows.append(str(mon)+' Count')
                    #(y-10).sum() / np.log10(y/10)
                    # PSI contribution summed over bins for this month.
                    psi.append((((y.loc[mon]-BM_count) * np.log10(y.loc[mon]/BM_count)).sum()/100).round(3))
                plotPSI(modelType[modelList.index(modelVar)]+'-'+appliedType_type[appliedType]+'-' + channel[channelID] + ' PSI',y_list,dateList,psi,missing_rate,rows,cols,value_tab,path)
            except Exception as e:
                print('psi exception',e)
            # --- Validation: lift chart + AUC ratio vs benchmark AUC ---
            try:
                # Overdue dataframe
                df_bins_auc = df_bins[df_bins.transacted == 1]
                del df_bins
                auc_BM = sklearn.metrics.roc_auc_score(df_bins_auc.overdue, df_bins_auc.loc[:,modelVar])
                print('AUC_BM: ',auc_BM)
                Nothing,interval = pd.qcut(df_bins_auc.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
                interval[0] = 0
                del Nothing
                # Matured loans in T-1/T-2/T-3 rolling 30-day cohorts.
                df_passdueday = query_sql(sql_passdueday.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdue_day',str(passdueday)))
                count,df_overdue,y,dateList,cols = liftchart(df_passdueday,modelVar,np.array(interval).round(6))
                value_tab = []
                rows = []
                y_list = []
                aucri = []
                auc = []
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    #value_tab.append(y.loc[mon].astype('str')+'%')
                    value_tab.append(df_overdue.loc[mon].astype('str') + ' (总计 ' + count.loc[mon].astype('str') + ')' )
                    #rows.append(str(mon)+' OverdueRate');
                    rows.append(str(mon)+' Count')
                    df_passdueday = df_passdueday.dropna(axis=0)
                    # AUCRI = cohort AUC relative to the benchmark AUC.
                    aucri.append(round((sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar])/auc_BM),3))
                    auc.append(round(sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar]),3))
                    # Append the benchmark AUC to this cohort's legend entry.
                    auc[-1] = str(auc[-1]) + '\n AUC基准: ' + str(round(auc_BM,3))
                plotLiftChart(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' AUC WITH '+ str(passdueday) + '+',y_list,dateList,aucri,auc,rows,cols,value_tab,path)
            except Exception as e: # ZeroDivisionError
                print('val exception',e)
"""
#V5 333
modelVar = 'v5_filter_fraud_point_v5_without_zhima'
channelID = '333'
for appliedType in str(appliedTypeList[modelList.index(modelVar)]).split(';'):
print('appliedType',appliedType)
print('appliedTypeList[model_index]',appliedTypeList[modelList.index(modelVar)])
try:
df_bins = pd.read_sql(sql_bins.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday)),conn).dropna(axis=0)
df_observation = pd.read_sql(sql_observation.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID),conn)
df_observation.loc[:,modelVar] = df_observation.loc[:,modelVar].map(lambda x : np.nan if x < 0 else x)
#df_bins = df_bins.apply(lambda x :np.nan if x < 0 else x)
Nothing,interval = pd.qcut(df_bins.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
interval[0] = 0
del Nothing
BM_count = psi_bins(df_bins,modelVar,interval)
zero_rate,missing_rate,dateList,cols,y,count = dataManipul(df_observation,modelVar,np.array(interval).round(6))
#df_observation_with_bin = pd.cut(df_observation.dropna(axis=0)[modelVar],interval)
# del df_bins
del interval
value_tab = []
rows = []
y_list = []
psi = []
# plot line separated by mon
for mon in dateList:
y_list.append(y.loc[mon].values)
value_tab.append(y.loc[mon].astype('str')+'%')
value_tab.append(count.loc[mon].astype('str')+'(zeroR:'+zero_rate.loc[mon].astype('str')+'%)')
rows.append(str(mon)+' Value');rows.append(str(mon)+' Count')
#(y-10).sum() / np.log10(y/10)
psi.append((((y.loc[mon]-BM_count) * np.log10(y.loc[mon]/BM_count)).sum()/100).round(3))
plotPSI(modelType[modelList.index(modelVar)]+'-'+appliedType_type[appliedType]+'-' + channel[channelID] + ' PSI',y_list,dateList,psi,missing_rate,rows,cols,value_tab,path)
except Exception as e:
print(e)
try:
# Overdue dataframe
df_bins_auc = df_bins[df_bins.transacted == 1]
del df_bins
auc_BM = sklearn.metrics.roc_auc_score(df_bins_auc.overdue, df_bins_auc.loc[:,modelVar])
print('AUC_BM: ',auc_BM)
Nothing,interval = pd.qcut(df_bins_auc.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
interval[0] = 0
del Nothing
df_passdueday = pd.read_sql(sql_passdueday.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday)),conn)
count,df_overdue,y,dateList,cols = liftchart(df_passdueday,modelVar,np.array(interval).round(6))
value_tab = []
rows = []
y_list = []
aucri = []
auc = []
for mon in dateList:
y_list.append(y.loc[mon].values)
value_tab.append(y.loc[mon].astype('str')+'%')
value_tab.append(df_overdue.loc[mon].astype('str') + ' (总计 ' + count.loc[mon].astype('str') + ')' )
rows.append(str(mon)+' OverdueRate');rows.append(str(mon)+' Count')
aucri.append(round((sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar])/auc_BM),3))
auc.append(round(sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar]),3))
auc[-1] = str(auc[-1]) + '\n AUC基准: ' + str(round(auc_BM,3))
plotLiftChart(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' AUC WITH '+ str(passdueday) + '+',y_list,dateList,aucri,auc,rows,cols,value_tab,path)
except Exception as e:
print(e)
"""
def plot_table_df(dataset, auc, title='untitled', X_label=None, y_label=None,
                  tab_df=None, plot_tab=True, saved_path=None):
    '''
    instructions : visualization of pivot with single dataframe
    Params :
        dataset - pivot DataFrame: one plotted line per column
        auc - auc list / array, one entry per column of *dataset*
        title - title of plot('untitled' as default)
        X_label - X axis label of plot
        y_label - y axis label of plot
        tab_df - optional DataFrame for the bottom table (defaults to *dataset*)
        plot_tab - plot table or not , default as True
        saved_path - saved path, set as None as there has no download needs
    Returns : 1 on completion
    '''
    # (the original had a stray `print(tab_df)` debug line before the
    # docstring, which also made the docstring a dead expression — removed)
    fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1)
    table_rows = dataset.columns
    # Compact the x labels: drop spaces and the leading '0.' of decimals.
    table_cols = pd.Series(dataset.index).astype(str).map(lambda x: x.replace(' ', '')).map(lambda x: x.replace('0.', '.'))
    # One line per dataframe column, AUC in the legend.
    for i in range(len(table_rows)):
        x = range(len(table_cols))
        y = dataset.iloc[:, i]
        axs.plot(x, y, label=str(table_rows[i]) + ' AUC: ' + str(auc[i]))
    if plot_tab:
        # BUG FIX: `tab_df == None` on a DataFrame does an elementwise compare
        # whose truth value is ambiguous; use an identity check instead.
        if tab_df is None:
            # BUG FIX: the original always copied column 1 into every table
            # row; each row now carries its own column's values.
            tab_df = [list(dataset.iloc[:, i].values) for i in range(len(table_rows))]
        else:
            table_rows = tab_df.columns
            table_cols = tab_df.index
            tab_df = [list(tab_df.iloc[:, i].values) for i in range(len(table_rows))]
        the_table = plt.table(cellText=tab_df,
                              rowLabels=table_rows,
                              colLabels=table_cols,
                              colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
                              loc='bottom')
        plt.xticks([])
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(9)
        fig.subplots_adjust(bottom=0.2)
    # otherwise, nothing to do here
    plt.grid()
    if y_label is not None:
        plt.ylabel(y_label)
    if X_label is not None:
        plt.xlabel(X_label)
    plt.legend()
    plt.title(title)
    if saved_path is not None:
        plt.savefig(saved_path + title + ".png")
    plt.show()
    return 1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment