Commit 9a3d42b6 authored by 王家华

V1.0

parent 78c9b003
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (model_mvp)" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/model_monitor.iml" filepath="$PROJECT_DIR$/.idea/model_monitor.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="1ecd0b9f-60aa-441d-b8e6-0ca91e7a02ef" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" beforeDir="false" afterPath="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" beforeDir="false" afterPath="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/query_score.xlsx" beforeDir="false" afterPath="$PROJECT_DIR$/query_score.xlsx" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2104">
<caret line="103" lean-forward="true" selection-start-line="103" selection-end-line="103" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="238">
<caret line="14" column="33" lean-forward="true" selection-start-line="14" selection-start-column="33" selection-end-line="14" selection-end-column="33" />
<folding>
<element signature="e#50#79#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/monitoring_VLM_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374">
<caret line="23" column="44" lean-forward="true" selection-start-line="23" selection-start-column="44" selection-end-line="23" selection-end-column="44" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="8" column="17" selection-start-line="8" selection-start-column="17" selection-end-line="8" selection-end-column="17" />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>E:</find>
<find>E:\</find>
</findStrings>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" />
<option value="$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py" />
<option value="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" />
<option value="$PROJECT_DIR$/Monitor_mongoDB/monitoring_VLM_mongodb.py" />
</list>
</option>
</component>
<component name="ProjectConfigurationFiles">
<option name="files">
<list>
<option value="$PROJECT_DIR$/.idea/model_monitor.iml" />
<option value="$PROJECT_DIR$/.idea/vcs.xml" />
<option value="$PROJECT_DIR$/.idea/misc.xml" />
<option value="$PROJECT_DIR$/.idea/modules.xml" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="312" />
<option name="y" value="137" />
<option name="width" value="1228" />
<option name="height" value="675" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="model_monitor" type="b2602c69:ProjectViewProjectNode" />
<item name="model_monitor" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_monitor" type="b2602c69:ProjectViewProjectNode" />
<item name="model_monitor" type="462c0819:PsiDirectoryNode" />
<item name="Monitor_mongoDB" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_monitor" type="b2602c69:ProjectViewProjectNode" />
<item name="model_monitor" type="462c0819:PsiDirectoryNode" />
<item name="Monitor_risk_analysis" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="ASKED_SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="1ecd0b9f-60aa-441d-b8e6-0ca91e7a02ef" name="Default Changelist" comment="" />
<created>1562726148779</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1562726148779</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1936" height="1066" extended-state="6" />
<editor active="true" />
<layout>
<window_info id="Favorites" side_tool="true" />
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.1564805" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info anchor="bottom" id="Version Control" />
<window_info anchor="bottom" id="Python Console" />
<window_info anchor="bottom" id="Terminal" />
<window_info anchor="bottom" id="Event Log" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
</layout>
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-2104">
<caret line="103" lean-forward="true" selection-start-line="103" selection-end-line="103" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="136">
<caret line="8" column="17" selection-start-line="8" selection-start-column="17" selection-end-line="8" selection-end-column="17" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="238">
<caret line="14" column="33" lean-forward="true" selection-start-line="14" selection-start-column="33" selection-end-line="14" selection-end-column="33" />
<folding>
<element signature="e#50#79#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/monitoring_VLM_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="374">
<caret line="23" column="44" lean-forward="true" selection-start-line="23" selection-start-column="44" selection-end-line="23" selection-end-column="44" />
</state>
</provider>
</entry>
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python
# coding: utf-8
import time
import datetime
import pymongo
import pymysql
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from dateutil.relativedelta import relativedelta

pd.options.display.max_columns = 1000
# KA channel IDs used for the per-channel drill-down in separateBychannel()
kalist = [1,198,214,217,333,159507,159384,159478,459483,159563,159561,159538,159609]
risk_analysis_config = {'user': 'fengkong_read_only',
                        'password': 'mT2HFUgI',
                        'host': '172.20.6.9',
                        'port': 9030,
                        'database': 'risk_analysis',
                        'encoding': 'utf8'}
# read the MongoDB field mapping from Excel
#mapping_score = pd.read_excel("./mongodb.xlsx",sheet_name='score').dropna(axis=0)
#mapping_variable = pd.read_excel("./mongodb.xlsx",sheet_name='variable').dropna(axis=0)
def readExcel(path, sheet=None):
    # sheet=None loads every sheet, so this returns a dict of DataFrames keyed by sheet (model) name
    return pd.read_excel(path, sheet)

dict_DD = readExcel("../features_mongodb.xlsx")
modelList = list(dict_DD.keys())
# pymongo filter / projection templates; the @placeholders are substituted before eval
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
query = "{'order_id':1,'@key':1,'_id':0}"
# monitoring window: first day of the month ~57 days back up to the first day of this month
vlm_start_date = (datetime.date.today() - relativedelta(days=+57)).strftime("%Y-%m-01 00:00:00")
vlm_end_date = time.strftime("%Y-%m-01 00:00:00")
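# For reference, after substitution the two templates above eval to ordinary
# pymongo find() arguments; the values below are illustrative only:
#   limit -> {'wf_created_at': {'$gte': '2019-05-01 00:00:00', '$lt': '2019-07-01 00:00:00'}}
#   query -> {'order_id': 1, 'some_feature_key': 1, '_id': 0}
# i.e. a date-range filter on wf_created_at plus a projection onto order_id and one feature.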
# extract the channel list, excluding recall channels
sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel FROM risk_analysis
WHERE transacted = 1
AND real_loan_amount > 20000
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01')
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528)
'''
sql = '''
SELECT date_format(applied_at,'%Y-%m-%d') as applied_at,applied_from,applied_type,order_no FROM risk_analysis
WHERE DATE_FORMAT(applied_at,'%Y-%m') BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -2 MONTH),'%Y-%m')
AND DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m')
AND applied_from in (@applied_channel)
AND applied = 1
'''
path_alarm = "../plot/VLM/alarm/"
path = "../plot/VLM/"
def querymongo(limit, query):
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    mydb = myclient["rc_dp_feature_pro"]
    mycol = mydb["rc_feature_analysis_timing_v2"]
    # $gte = greater than or equal, $lt = less than; the template strings are eval'ed into dicts
    x = mycol.find(eval(limit), eval(query))
    # materialize the cursor before closing the connection
    df = pd.DataFrame(list(x))
    myclient.close()
    return df
def connect2DB(db_config):
    db = pymysql.connect(
        host=db_config['host'],
        port=db_config['port'],
        user=db_config['user'],
        passwd=db_config['password'],
        db=db_config['database'],
        charset=db_config['encoding'])
    return db

def query_sql(sql, db=risk_analysis_config):
    try:
        conn = connect2DB(db)
        df = pd.read_sql(sql, conn)
        conn.close()
        return df
    except Exception as e:
        # swallow the error; callers receive 0 instead of a DataFrame on failure
        return 0
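# The pipeline below joins two sources per feature: feature values come from
# MongoDB keyed by order_id (querymongo), application metadata comes from the
# risk_analysis MySQL table keyed by order_no (query_sql), and the two frames are
# combined with pd.merge(..., how='right', left_on='order_id', right_on='order_no').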
# VLM plot for one variable
def plotLine(title, y, row, col, table, save_path, upperBoundary=0, bottomBoundary=0):
    # skip the plot when there are too few points or the series is essentially empty
    if len(col) <= 10 or int(y.sum()) <= 10:
        return 1
    else:
        cols = [item[5:] for item in col.values.tolist()]
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['savefig.dpi'] = 226  # saved-image DPI
        fig, axs = plt.subplots(1, 1, figsize=(33, 11), linewidth=0.1)
        x = range(len(col))
        axs.plot(x, y)
        # monthly-mean reference line
        axs.add_line(Line2D((x[0], x[-1]), (y.mean(), y.mean()), linestyle='--', color='darkorange'))
        plt.annotate(s='月均{}'.format(round(y.mean(), 2)), xy=(x[-1] + 0.1, y.mean()))
        # upper boundary: default to mean + 3 * std unless an explicit boundary is passed in
        if upperBoundary == 0:
            axs.add_line(Line2D((x[0], x[-1]), (y.mean() + 3 * y.std(), y.mean() + 3 * y.std()), linestyle='--', color='lightcoral'))
            plt.annotate(s='Mean+3STD\n{}'.format(round(y.mean() + 3 * y.std(), 2)), xy=(x[-1] + 0.1, y.mean() + 3 * y.std()))
        else:
            axs.add_line(Line2D((x[0], x[-1]), (upperBoundary, upperBoundary), linestyle='--', color='lightcoral'))
            plt.annotate(s='Mean+3STD\n{}'.format(round(upperBoundary, 2)), xy=(x[-1] + 0.1, upperBoundary))
        # bottom boundary: default to mean - 3 * std unless an explicit boundary is passed in
        if bottomBoundary == 0:
            axs.add_line(Line2D((x[0], x[-1]), (y.mean() - 3 * y.std(), y.mean() - 3 * y.std()), linestyle='--', color='lightcoral'))
            plt.annotate(s='Mean-3STD\n{}'.format(round(y.mean() - 3 * y.std(), 2)), xy=(x[-1] + 0.1, y.mean() - 3 * y.std()))
        else:
            axs.add_line(Line2D((x[0], x[-1]), (bottomBoundary, bottomBoundary), linestyle='--', color='lightcoral'))
            plt.annotate(s='Mean-3STD\n{}'.format(round(bottomBoundary, 2)), xy=(x[-1] + 0.1, bottomBoundary))
        # draw a vertical guide line at each point
        bottom = 0
        if y.min() - y.std() * 3 - y.mean() * 0.02 > 0:
            bottom = y.min() - y.std() * 3 - y.std() * 0.1
        plt.vlines(x, [bottom], y, color='lightgrey', linestyle='--')
        axs.grid()
        plt.xticks([])
        the_table = plt.table(cellText=table,
                              rowLabels=row,
                              colLabels=cols,
                              colWidths=[0.91 / (len(col) - 1)] * len(col),
                              loc='bottom')
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(9)
        fig.subplots_adjust(left=0.032, right=0.97)
        fig.set_size_inches(33, 11)
        plt.title(title, fontsize=18)
        plt.savefig(save_path + title + ".png")
        plt.show()
        return 1
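# Alarm rule used throughout this script (sketch): a daily series y alarms when
# any of its last 30 points escapes the whole-window 3-sigma band, i.e.
#   upper = y.mean() + 3 * y.std(); lower = y.mean() - 3 * y.std()
#   alarm = (y.iloc[-30:].max() > upper) or (y.iloc[-30:].min() < lower)
# alarmed charts are written to path_alarm, quiet ones to path.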
############################################
def dataManipul(df, keyword):
    # per-day count of non-missing values for the monitored field; TODO: recheck this aggregation
    df_count = df[['applied_at', keyword]].groupby('applied_at').count()[keyword]
    df_zeros = pd.Series(np.zeros(df_count.shape), index=df_count.index)
    # per-day missing count and missing rate (%)
    df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count()
    df_missing = pd.concat([df_zeros, df_missing], axis=1, sort=True).fillna(0)[keyword]
    missing_rate = df_missing / (df_count + df_missing) * 100
    del df_missing
    # per-day zero count and zero rate (%)
    df_zero = df[df[keyword] == 0].groupby('applied_at')[keyword].count()
    df_zero = pd.concat([df_zeros, df_zero], axis=1, sort=True).fillna(0)[keyword]
    zero_rate = df_zero / df_count * 100
    del df_zero
    # trim the top and bottom 1% of non-missing values, then aggregate per day
    df_noneNA = df.dropna(axis=0)
    df_noneNA = df_noneNA.sort_values(by=keyword, ascending=False).reset_index().drop('index', axis=1)
    df_sum = df_noneNA.iloc[int(len(df_noneNA) * 0.01):int(len(df_noneNA) * 0.99)].groupby('applied_at').agg(['mean', 'std', 'count'])
    df_sum = pd.concat([df_zeros, df_sum], axis=1, sort=True).fillna(0).drop(columns=[0])
    df_sum.columns = ['mean', 'std', 'count']
    cols = df_count.index
    return zero_rate.fillna(0).round(1), missing_rate.fillna(0).round(1), cols, df_sum
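# A minimal sketch of how dataManipul is consumed (toy frame, illustrative names):
#   demo = pd.DataFrame({'applied_at': ['2019-06-01', '2019-06-01', '2019-06-02'],
#                        'f1': [0.0, np.nan, 3.2]})
#   zero_rate, missing_rate, cols, df_sum = dataManipul(demo, 'f1')
# zero_rate / missing_rate are daily percentages; df_sum carries the trimmed
# per-day mean / std / count that the plots and 3*STD alarms below are built on.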
#############################################
# per-channel drill-down, run when the overall series breaches its control limits
def separateBychannel(df, key, meansub3std, meanpls3std):
    # note: relies on module-level loop variables (model, description, fea_i,
    # appliedType_type, appliedType_index, appliedType) set in the main loop below
    try:
        for appliedFrom in kalist:
            try:
                zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(df[df.applied_from == appliedFrom][['applied_at', key]], key)
                table = []
                y_total = df_sum['mean']
                table.append(df_sum['mean'].round(1))
                table.append(df_sum['count'].astype('int'))
                table.append(missing_rate_total.astype('str') + '%')
                table.append(zero_rate_total.astype('str') + '%')
                if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
                    plotLine(str(model) + '-' + description[fea_i] + '-Mean-' + appliedType_type[appliedType_index] + 'with' + str(appliedFrom) + '-VLM', y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, './plot/vlm_separate_By_channel/')
                del table
            except ValueError as e:
                continue
    except Exception as e:
        print('channel Exception : ', key, appliedType, e)
########### extract channel list #############
applied_channel = query_sql(sql_channel).applied_from.tolist()
sql = sql.replace('@applied_channel',str(applied_channel).strip('[').strip(']'))
#########################################################################
#########################################################################
for model in modelList:
    # feature key list
    features = dict_DD[model].feature
    # MongoDB query key list
    queries = dict_DD[model].queries
    # feature description list
    description = dict_DD[model].description
    appliedTypeList = ['1,2,3', '1', '2', '3']
    appliedType_type = ['客群总体', '首申', '复申', '复贷']
    for fea_i in range(len(queries)):
        appliedType_index = 0
        key = queries[fea_i].strip()
        # feature values from MongoDB for the monitoring window
        df = querymongo(limit.replace('@start_date', vlm_start_date).replace('@end_date', vlm_end_date), query.replace('@key', key))
        df = df.applymap(lambda x: np.nan if x == '' else x)
        # application metadata from MySQL, joined on the order id
        df_offline = query_sql(sql, risk_analysis_config)
        df = pd.merge(df, df_offline, how='right', left_on='order_id', right_on='order_no')[['applied_at', 'applied_from', 'applied_type', key]]
        del df_offline
        df[key] = df[key].astype('float')
        df.applied_type = df['applied_type'].astype('int')
        for appliedType in appliedTypeList:
            # the first pass covers all applied types, later passes filter to one type
            if appliedType_index == 0:
                df_tmp = df[['applied_at', 'applied_from', key]]
            else:
                df_tmp = df[df.applied_type == int(appliedType)][['applied_at', 'applied_from', key]]
            # daily mean monitoring
            try:
                zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(df_tmp[['applied_at', key]], key)
                table = []
                y_total = df_sum['mean']
                table.append(df_sum['mean'].round(1))
                table.append(df_sum['count'].astype('int'))
                table.append(missing_rate_total.astype('str') + '%')
                table.append(zero_rate_total.astype('str') + '%')
                meanpls3std = y_total.mean() + y_total.std() * 3
                meansub3std = y_total.mean() - y_total.std() * 3
                if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
                    plotLine(model + '-' + description[fea_i] + '-Mean-' + appliedType_type[appliedType_index] + '-变化VLM', y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, path_alarm)
                    separateBychannel(df_tmp, key, meansub3std, meanpls3std)
                else:
                    plotLine(model + '-' + description[fea_i] + '-Mean-' + appliedType_type[appliedType_index] + '-变化VLM', y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, path)
                del table
            except Exception as e:
                print('Mean Exception : ', key, appliedType, e)
                appliedType_index += 1
                continue
            # daily standard-deviation monitoring
            try:
                zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(df_tmp[['applied_at', key]], key)
                table = []
                y_total = df_sum['std']
                table.append(df_sum['std'].round(1))
                table.append(df_sum['count'])
                table.append(missing_rate_total.astype('str') + '%')
                table.append(zero_rate_total.astype('str') + '%')
                del df_sum
                stdpls3std = y_total.mean() + y_total.std() * 3
                stdsub3std = y_total.mean() - y_total.std() * 3
                if (y_total.iloc[-30:-1].max() > stdpls3std) | (y_total.iloc[-30:-1].min() < stdsub3std):
                    plotLine(model + '-' + description[fea_i] + '-Std-' + appliedType_type[appliedType_index] + '-变化VLM', y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, path_alarm)
                else:
                    plotLine(model + '-' + description[fea_i] + '-Std-' + appliedType_type[appliedType_index] + '-变化VLM', y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, path)
                del table
            except Exception as e:
                print('Std Exception : ', e)
                appliedType_index += 1
                continue
            appliedType_index += 1
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
pd.options.display.max_columns = 1000
import pymongo
import pymysql
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
# read the MongoDB score mapping from Excel
mapping_score = pd.read_excel("./query_score.xlsx",sheet_name='score_mongo').dropna(axis=0)
#mapping_variable = pd.read_excel("./mongodb.xlsx",sheet_name='variable').dropna(axis=0)
# pymongo filter / projection templates; the @placeholders are substituted before eval
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
query = "{'order_id':1,'@key':1}"
passdue_day = 15
appliedType_type = {'1,2,3':'总体','1,2':'首贷','1':'首申','2':'复申','3':'复贷'}
path = "../plot/PSI_VAL/"
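# Script flow (sketch): for each model score, (1) cut decile bins on a fixed
# benchmark window, (2) track the monthly PSI of the score distribution against
# that benchmark, and (3) validate rank-ordering on matured T-1/T-2/T-3 cohorts
# via lift charts plus AUC relative to the benchmark AUC (reported as AUCRI).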
################################### plot PSI ##################################
def plotPSI(title, y_list, dateList, psi, missing, rows, cols, table_value, save_path):
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved-image DPI
    plt.rcParams['figure.dpi'] = 100  # on-screen DPI
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    # one line per observation month, labelled with its PSI and missing rate
    for y_index in range(len(y_list)):
        y = y_list[y_index]
        x = range(len(y))
        axs.plot(x, y, marker='o', label=dateList[y_index][0:7] + ' PSI:' + str(psi[y_index]) + '\n缺失率:' + str(missing[y_index]) + '%')
    the_table = plt.table(cellText=table_value,
                          rowLabels=rows,
                          colLabels=cols,
                          colWidths=[0.91 / (len(cols) - 1)] * len(cols),
                          loc='bottom')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(8)
    fig.subplots_adjust(bottom=0.2)
    plt.grid()
    plt.ylabel('各分段样本占比' + ' (%)')
    plt.legend()
    plt.xticks([])
    fig.suptitle(title)
    plt.savefig(save_path + title + ".png")
    plt.show()
    return 1
# draw the lift chart data
def liftchart(df, keyword, interval):
    # extend the benchmark bin edges with a large negative floor when negative scores exist
    if len(df[df.loc[:, keyword] < 0][keyword]) > 0:
        bins_interval = interval.tolist()
        bins_interval.append(-10000000)
        bins_interval.sort()
    else:
        bins_interval = interval
    df.loc[:, 'bins'] = pd.cut(df[keyword], bins_interval, precision=6)
    # sample count per (cohort, bin)
    df_count = df[['applied_at', 'bins', 'overdue']].groupby(['applied_at', 'bins']).count()
    df_zeros = pd.Series(np.zeros(df_count['overdue'].shape), index=df_count.index)
    # overdue count per (cohort, bin)
    df = df[df.overdue == 1]
    df_overdue = df[['applied_at', 'bins', 'overdue']].groupby(['applied_at', 'bins']).count()
    df_overdue = pd.concat([df_zeros, df_overdue], axis=1)['overdue'].fillna(0)
    # per-bin overdue rate (%)
    y = df_overdue / df_count['overdue'].replace(0, 1) * 100
    rows = y.index.levels[0].tolist()
    cols = df['bins'].value_counts().sort_index().index.astype('str').tolist()
    return df_count['overdue'], df_overdue, y.round(3), rows, cols
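# Reading the lift chart: bins are the benchmark score deciles, each line is one
# cohort, and the y-value is the overdue rate (%) within each bin. For a healthy
# score the curve should move monotonically across bins and cohorts should track
# each other; crossings or flattening suggest rank-ordering decay.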
############################## validation lift chart ###############################
def plotLiftChart(title, y_list, dateList, aucri, auc, rows, cols, table_value, save_path):
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved-image DPI
    plt.rcParams['figure.dpi'] = 100  # on-screen DPI
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    # one line per cohort, labelled with its AUC and its AUC ratio to the benchmark (AUCRI)
    for y_index in range(len(y_list)):
        y = y_list[y_index]
        x = range(len(y))
        axs.plot(x, y, marker='o', label=dateList[y_index][0:7] + ' (AUCRI:' + str(aucri[y_index]) + ') AUC: ' + str(auc[y_index]))
    the_table = plt.table(cellText=table_value,
                          rowLabels=rows,
                          colLabels=cols,
                          colWidths=[0.91 / (len(cols) - 1)] * len(cols),
                          loc='bottom')
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(8)
    fig.subplots_adjust(bottom=0.2)
    plt.legend()
    plt.grid()
    plt.ylabel('贷后首逾' + str(15) + '+ (%)')
    plt.xticks([])
    fig.suptitle(title)
    plt.savefig(save_path + title + ".png")
    plt.show()
    return 1
def psi_bins(df, keyword, interval):
    # percentage share of benchmark samples falling into each score bin
    df.loc[:, 'bins'] = pd.cut(df[keyword], interval, precision=6)
    BM = df.groupby('bins').count()[keyword]
    BM_count = BM / BM.values.sum() * 100
    return BM_count
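# PSI refresher (sketch): with benchmark share b_i and observed share o_i per bin,
#   PSI = sum_i (o_i - b_i) * ln(o_i / b_i)
# common rule of thumb: < 0.1 stable, 0.1-0.25 drifting, > 0.25 shifted. The main
# loop below computes it on percentage shares and with log10 rather than ln, so
# its values run a constant factor (~0.434) below the natural-log convention.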
def querymongo(limit, query):
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    mydb = myclient["rc_dp_feature_pro"]
    mycol = mydb["rc_feature_analysis_timing_v2"]
    x = mycol.find(eval(limit), eval(query))
    # materialize the cursor before closing the connection
    df = pd.DataFrame(list(x))
    myclient.close()
    return df
benchmark_start_date = "2018-12-21 00:00:00"
benchmark_end_date = "2019-01-22 00:00:00"
psi_start_date = "2019-03-01 00:00:00"
psi_end_date = "2019-06-01 00:00:00"
val_start_date = "2019-02-03 00:00:00"
val_end_date = "2019-05-03 00:00:00"
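# Window layout: the benchmark window (2018-12-21 .. 2019-01-22) fixes the score
# bins and the reference AUC; the PSI window supplies the observation months that
# are compared against it; the validation window sits earlier so that its loans
# have had time to mature before overdue performance is measured.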
risk_analysis_config = {'user': 'fengkong_read_only',
                        'password': 'mT2HFUgI',
                        'host': '172.20.6.9',
                        'port': 9030,
                        'database': 'risk_analysis',
                        'encoding': 'utf8'}
def connect2DB(db_config):
    db = pymysql.connect(
        host=db_config['host'],
        port=db_config['port'],
        user=db_config['user'],
        passwd=db_config['password'],
        db=db_config['database'],
        charset=db_config['encoding'])
    return db

def query_sql(sql, db=risk_analysis_config):
    try:
        conn = connect2DB(db)
        df = pd.read_sql(sql, conn)
        conn.close()
        return df
    except Exception as e:
        # swallow the error; callers receive 0 instead of a DataFrame on failure
        return 0
def dataManipul(df, keyword, interval):
    # treat negative scores as missing
    df = df.copy()
    df.loc[df[keyword] < 0, keyword] = np.nan
    df_noneNA = df.dropna(axis=0)
    # total per-month record count (missing values filled so every row is counted)
    df_count = df[['applied_at', keyword]].fillna(0).groupby('applied_at').count()
    df_zeros = pd.Series(np.zeros(df_count[keyword].shape), index=df_count.index)
    # per-month missing count and missing rate (%)
    df_missing = df_count - df_noneNA[['applied_at', keyword]].groupby('applied_at').count()
    df_missing = pd.concat([df_zeros, df_missing], axis=1)[keyword].fillna(0)
    missing_rate = df_missing / df_count[keyword].replace(0, 1) * 100
    # bin the non-missing scores on the benchmark edges
    df_noneNA['bins'] = pd.cut(df_noneNA[keyword], interval, precision=6)
    cols = df_noneNA['bins'].value_counts().sort_index().index.astype('str')
    df_count = df_noneNA[['applied_at', 'bins', keyword]].groupby(['applied_at', 'bins']).count()
    df_zeros = pd.Series(np.zeros(df_count[keyword].shape), index=df_count.index)
    df_zero = df_noneNA[df_noneNA[keyword] == 0][['applied_at', 'bins', keyword]].groupby(['applied_at', 'bins']).count()
    df_zero = pd.concat([df_zeros, df_zero], axis=1)[keyword].fillna(0)
    zero_rate = df_zero / df_count[keyword].replace(0, 1) * 100
    # per-bin share (%) of each month's non-missing samples
    y = df_count / df_noneNA[['applied_at', keyword]].groupby('applied_at').count() * 100
    rows = y.index.levels[0].tolist()
    return zero_rate.round(1), missing_rate.round(1), rows, cols, y[keyword].round(1), df_count[keyword]
sql_bins = '''
SELECT order_no,transacted,IF(passdue_day>15,1,0) as overdue
FROM risk_analysis
WHERE applied_at >= '@benchmark_start_date' and applied_at <= '@benchmark_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND repayment_status != 4
'''
sql_observation = '''
SELECT order_no,date_format(applied_at,'%Y-%m') as applied_at
FROM risk_analysis
WHERE applied_at >= '@psi_start_date' and applied_at <= '@psi_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND repayment_status != 4
'''
######## alternative: calculate by natural month ###########
"""
sql_passdueday = '''
SELECT order_no,date_format(loan_start_date,'%Y-%m') as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE applied_at >= '@val_start_date' and applied_at <= '@val_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4
'''
"""
######## calculate with T-n month windows ###########
sql_passdueday = '''
(SELECT order_no,'T-1' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -15 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
UNION ALL
(SELECT order_no,'T-2' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
UNION ALL
(SELECT order_no,'T-3' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -105 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
'''
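# The three UNIONed blocks above bucket loans by how long ago their first
# installment deadline fell: T-1 = 15-45 days ago, T-2 = 45-75, T-3 = 75-105.
# Each cohort has therefore had at least passdue_day (15) days to mature before
# the overdue flag (passdue_day > @passdue_day) is evaluated.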
# benchmark
#df = querymongo(limit.replace('@start_date',benchmark_start_date).replace('@end_date',benchmark_end_date),query.replace('@key',key))[['order_id',key]]
modelType = mapping_score.description.tolist()
modelList = mapping_score.score.tolist()
appliedTypeList = mapping_score.appliedType.tolist()
#channelIDList = mapping_score.channel.tolist()
conn = connect2DB(risk_analysis_config)
# extract the active channel list, excluding recall channels
sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel
FROM risk_analysis
WHERE applied_from IN
(SELECT applied_from FROM risk_analysis
WHERE transacted = 1
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01')
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528)
GROUP BY 1
HAVING SUM(real_loan_amount) > 100000
ORDER BY sum(real_loan_amount) DESC)
'''
channel = {'1,214,217,198':'内部','159507':'浅橙','159537':'360金融','333':'融360','159384,159483':'平安','159561':'51公积金API'}
channelId = pd.read_sql(sql_channel, conn).applied_from
# flatten the named-channel ids, then add an "other channels" and an "all channels" bucket
l = [int(x) for key in channel.keys() for x in key.split(',')]
channel[str(channelId[channelId.map(lambda x: x not in l)].tolist()).strip('[').strip(']')] = '其他渠道'
channel[str(channelId.tolist()).strip('[').strip(']')] = '全部渠道'
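# channel now maps comma-joined applied_from id strings to display names. The two
# synthetic entries appended above are '其他渠道' (active channels not named in the
# dict) and '全部渠道' (all active channels); each key is substituted verbatim into
# the SQL IN (@channelID) clauses below.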
for modelVar in modelList:
    for appliedType in str(appliedTypeList[modelList.index(modelVar)]).split(';'):
        print('appliedType', appliedType)
        print('appliedTypeList[model_index]', appliedTypeList[modelList.index(modelVar)])
        for channelID in channel.keys():
            # PSI: compare monthly score distributions against the benchmark deciles
            try:
                df_bins = querymongo(limit.replace('@start_date', benchmark_start_date).replace('@end_date', benchmark_end_date), query.replace('@key', modelVar))[['order_id', modelVar]]
                df_bins = df_bins.applymap(lambda x: np.nan if x == '' else x)
                df_bins[modelVar] = df_bins[modelVar].astype('float')
                df_offline = query_sql(sql_bins.replace('@appliedType', appliedType).replace('@channelID', channelID).replace('@benchmark_start_date', benchmark_start_date).replace('@benchmark_end_date', benchmark_end_date), risk_analysis_config)
                df_bins = pd.merge(df_bins, df_offline, how='right', left_on='order_id', right_on='order_no')[['transacted', 'overdue', modelVar]].dropna(axis=0)
                del df_offline
                df_observation = querymongo(limit.replace('@start_date', psi_start_date).replace('@end_date', psi_end_date), query.replace('@key', modelVar))[['order_id', modelVar]]
                df_observation = df_observation.applymap(lambda x: np.nan if x == '' else x)
                df_observation[modelVar] = df_observation[modelVar].astype('float')
                df_offline = query_sql(sql_observation.replace('@appliedType', appliedType).replace('@channelID', channelID).replace('@psi_start_date', psi_start_date).replace('@psi_end_date', psi_end_date), risk_analysis_config)
                df_observation = pd.merge(df_observation, df_offline, how='right', left_on='order_id', right_on='order_no')[['applied_at', modelVar]]
                del df_offline
                df_observation.loc[:, modelVar] = df_observation.loc[:, modelVar].map(lambda x: np.nan if x < 0 else x)
                # decile edges from the benchmark scores
                Nothing, interval = pd.qcut(df_bins.loc[:, modelVar], 10, retbins=True, precision=6, duplicates='drop')
                interval[0] = 0
                del Nothing
                BM_count = psi_bins(df_bins, modelVar, interval)
                zero_rate, missing_rate, dateList, cols, y, count = dataManipul(df_observation, modelVar, np.array(interval).round(6))
                del interval
                value_tab = []
                rows = []
                y_list = []
                psi = []
                # one line per observation month
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    value_tab.append(count.loc[mon].astype('str') + '(zeroR:' + zero_rate.loc[mon].astype('str') + '%)')
                    rows.append(str(mon) + ' Count')
                    # PSI on percentage shares; note log10 rather than the conventional ln
                    psi.append((((y.loc[mon] - BM_count) * np.log10(y.loc[mon] / BM_count)).sum() / 100).round(3))
                plotPSI(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' PSI', y_list, dateList, psi, missing_rate, rows, cols, value_tab, path)
            except Exception as e:
                print("psi exception", e)
            # validation: lift charts and AUC on matured cohorts, relative to the benchmark AUC
            try:
                df_bins_auc = df_bins[df_bins.transacted == 1]
                del df_bins
                auc_BM = sklearn.metrics.roc_auc_score(df_bins_auc.overdue, df_bins_auc.loc[:, modelVar])
                print('AUC_BM: ', auc_BM)
                Nothing, interval = pd.qcut(df_bins_auc.loc[:, modelVar], 10, retbins=True, precision=6, duplicates='drop')
                interval[0] = 0
                del Nothing
                df_passdueday = querymongo(limit.replace('@start_date', val_start_date).replace('@end_date', val_end_date), query.replace('@key', modelVar))[['order_id', modelVar]]
                df_passdueday = df_passdueday.applymap(lambda x: np.nan if x == '' else x)
                df_passdueday[modelVar] = df_passdueday[modelVar].astype('float')
                df_offline = query_sql(sql_passdueday.replace('@appliedType', appliedType).replace('@channelID', channelID).replace('@val_start_date', val_start_date).replace('@val_end_date', val_end_date).replace('@passdue_day', str(passdue_day)), risk_analysis_config)
                df_passdueday = pd.merge(df_passdueday, df_offline, how='inner', left_on='order_id', right_on='order_no')[['applied_at', 'overdue', modelVar]].dropna(axis=0)
                del df_offline
                count, df_overdue, y, dateList, cols = liftchart(df_passdueday, modelVar, np.array(interval).round(6))
                value_tab = []
                rows = []
                y_list = []
                aucri = []
                auc = []
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    value_tab.append(df_overdue.loc[mon].astype('str') + ' (总计 ' + count.loc[mon].astype('str') + ')')
                    rows.append(str(mon) + ' Count')
                    # AUCRI = cohort AUC relative to the benchmark AUC
                    aucri.append(round((sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at == mon].overdue, df_passdueday[df_passdueday.applied_at == mon].loc[:, modelVar]) / auc_BM), 3))
                    auc.append(round(sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at == mon].overdue, df_passdueday[df_passdueday.applied_at == mon].loc[:, modelVar]), 3))
                    auc[-1] = str(auc[-1]) + '\n AUC基准: ' + str(round(auc_BM, 3))
                plotLiftChart(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' AUC WITH ' + str(15) + '+', y_list, dateList, aucri, auc, rows, cols, value_tab, path)
            except Exception as e:
                print("val exception", e)
...@@ -5,42 +5,39 @@ Created on Mon Nov 26 21:44:56 2018 ...@@ -5,42 +5,39 @@ Created on Mon Nov 26 21:44:56 2018
@author: Jason Wang @author: Jason Wang
""" """
import time import time
import os
import pymysql import pymysql
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import openpyxl
import decimal
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import os import os
from matplotlib.font_manager import FontProperties
from matplotlib.lines import Line2D from matplotlib.lines import Line2D
import datetime import datetime
from django.db import transaction, DatabaseError
kalist = [1,198,214,217,333,159507,159384,159563,159561,159538,159609,159537] kalist = [1, 198, 214, 217, 333, 159507, 159384, 159563, 159561, 159538, 159609, 159537]
############################## SQL ############################################## ############################## SQL ##############################################
#applied_channel = [1,214,217,198,159384,159483,159479,159478,333,158748,158764,158932,159457,159459,159519,159507,159538,159561] # applied_channel = [1,214,217,198,159384,159483,159479,159478,333,158748,158764,158932,159457,159459,159519,159507,159538,159561]
#applice_type = [] # applice_type = []
#channelDict = {159384:'平安H5高净值',159483:'平安低净值',159479:'车险保单贷',159478:'法人贷',333:'融360',158748:'汽车之家',158764:'翼支付',158932:'拉卡拉',159457:'惠金所',159459:'惠金所',159519:'亿融普惠'} # channelDict = {159384:'平安H5高净值',159483:'平安低净值',159479:'车险保单贷',159478:'法人贷',333:'融360',158748:'汽车之家',158764:'翼支付',158932:'拉卡拉',159457:'惠金所',159459:'惠金所',159519:'亿融普惠'}
appliedTypeList = ['1,2,3','1','2','3']
appliedType_type = ['客群总体','首申','复申','复贷']
appliedTypeList = ['1,2,3', '1', '2', '3']
appliedType_type = ['客群总体', '首申', '复申', '复贷']
# extract channel list where except recall channel
# extract channel list where except recalling channel
sql_channel = ''' sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel FROM risk_analysis SELECT DISTINCT(applied_from),applied_channel
FROM risk_analysis
WHERE applied_from IN
(SELECT applied_from FROM risk_analysis
WHERE transacted = 1 WHERE transacted = 1
AND real_loan_amount > 20000
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01') AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01')
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01') AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528) and applied_from not in (159481,159486,159528)
GROUP BY 1
HAVING SUM(real_loan_amount) > 100000
ORDER BY sum(real_loan_amount) DESC)
''' '''
sql = ''' sql = '''
SELECT date_format(applied_at,'%Y-%m-%d') as applied_at,applied_from,applied_type,@feature FROM risk_analysis SELECT date_format(applied_at,'%Y-%m-%d') as applied_at,applied_from,applied_type,@feature FROM risk_analysis
WHERE DATE_FORMAT(applied_at,'%Y-%m') BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -2 MONTH),'%Y-%m') WHERE DATE_FORMAT(applied_at,'%Y-%m') BETWEEN DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -2 MONTH),'%Y-%m')
...@@ -50,8 +47,8 @@ AND applied = 1 ...@@ -50,8 +47,8 @@ AND applied = 1
''' '''
########################## DB Configuration ##################################### ########################## DB Configuration #####################################
risk_analysis_config = {'user' : 'jiahua_wang', risk_analysis_config = {'user' : 'fengkong_read_only',
'password' : 'IqHKCIyZ', 'password' : 'mT2HFUgI',
'host' : '172.20.6.9', 'host' : '172.20.6.9',
'port' : 9030, 'port' : 9030,
'database' : 'risk_analysis', 'database' : 'risk_analysis',
...@@ -67,7 +64,7 @@ now = time.strftime("%Y-%m-%d") ...@@ -67,7 +64,7 @@ now = time.strftime("%Y-%m-%d")
# make directory, if it exists return path, else return created folder path # make directory, if it exists return path, else return created folder path
#def mkdir(path,name): # def mkdir(path,name):
# folder = os.path.exists(path+name) # folder = os.path.exists(path+name)
# if folder: # if folder:
# return path+name+'\\' # return path+name+'\\'
...@@ -76,182 +73,193 @@ now = time.strftime("%Y-%m-%d") ...@@ -76,182 +73,193 @@ now = time.strftime("%Y-%m-%d")
# return path+name+'\\' # return path+name+'\\'
# VLM with one variable # VLM with one variable
def plotLine(title,y,row,col,table,save_path,upperBoundary=0,bottomBoundary=0): def plotLine(title, y, row, col, table, save_path, upperBoundary=0, bottomBoundary=0):
# if x less than 10, ignored its plot # if x less than 10, ignored its plot
if len(col) <= 10 | int(y.sum()) <= 10 : if len(col) <= 10 | int(y.sum()) <= 10:
return 1 return 1
else: else:
cols = [item[5:] for item in col.values.tolist()] cols = [item[5:] for item in col.values.tolist()]
#print(content) # print(content)
plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['savefig.dpi'] = 226 #图片像素 plt.rcParams['savefig.dpi'] = 226 # 图片像素
#分辨率 # 分辨率
fig,axs = plt.subplots(1,1,figsize=(33,11),linewidth=0.1) fig, axs = plt.subplots(1, 1, figsize=(33, 11), linewidth=0.1)
x = range(len(col)) x = range(len(col))
axs.plot(x,y) axs.plot(x, y)
axs.add_line(Line2D((x[0],x[-1]),(y.mean(),y.mean()),linestyle='--',color='darkorange')) axs.add_line(Line2D((x[0], x[-1]), (y.mean(), y.mean()), linestyle='--', color='darkorange'))
plt.annotate(s = '月均{}'.format(round(y.mean(),2)),xy=(x[-1] + 0.1,y.mean())) plt.annotate(s='月均{}'.format(round(y.mean(), 2)), xy=(x[-1] + 0.1, y.mean()))
# upper boundary # upper boundary
if upperBoundary == 0: if upperBoundary == 0:
axs.add_line(Line2D((x[0],x[-1]),(y.mean() + 3 * y.std(),y.mean() + 3 * y.std()),linestyle = '--',color = 'lightcoral')) axs.add_line(Line2D((x[0], x[-1]), (y.mean() + 3 * y.std(), y.mean() + 3 * y.std()), linestyle='--',
plt.annotate(s = 'Mean+3STD\n{}'.format(round(y.mean() + 3 * y.std(),2)),xy = (x[-1] + 0.1,y.mean() + 3 * y.std())) color='lightcoral'))
plt.annotate(s='Mean+3STD\n{}'.format(round(y.mean() + 3 * y.std(), 2)),
xy=(x[-1] + 0.1, y.mean() + 3 * y.std()))
else: else:
axs.add_line(Line2D((x[0],x[-1]),(upperBoundary,upperBoundary),linestyle = '--',color = 'lightcoral')) axs.add_line(Line2D((x[0], x[-1]), (upperBoundary, upperBoundary), linestyle='--', color='lightcoral'))
plt.annotate(s = 'Mean+3STD\n{}'.format(round(upperBoundary,2)),xy = (x[-1] + 0.1,upperBoundary)) plt.annotate(s='Mean+3STD\n{}'.format(round(upperBoundary, 2)), xy=(x[-1] + 0.1, upperBoundary))
# bottom boundary # bottom boundary
if bottomBoundary == 0: if bottomBoundary == 0:
axs.add_line(Line2D((x[0],x[-1]),(y.mean() - 3 * y.std(),y.mean() - 3 * y.std()),linestyle = '--',color = 'lightcoral')) axs.add_line(Line2D((x[0], x[-1]), (y.mean() - 3 * y.std(), y.mean() - 3 * y.std()), linestyle='--',
plt.annotate(s = 'Mean-3STD\n{}'.format(round(y.mean() - 3 * y.std(),2)),xy = (x[-1] + 0.1,y.mean() - 3 * y.std())) color='lightcoral'))
plt.annotate(s='Mean-3STD\n{}'.format(round(y.mean() - 3 * y.std(), 2)),
xy=(x[-1] + 0.1, y.mean() - 3 * y.std()))
else: else:
print('gonna here') print('gonna here')
axs.add_line(Line2D((x[0],x[-1]),(bottomBoundary,bottomBoundary),linestyle = '--',color = 'lightcoral')) axs.add_line(Line2D((x[0], x[-1]), (bottomBoundary, bottomBoundary), linestyle='--', color='lightcoral'))
plt.annotate(s = 'Mean-3STD\n{}'.format(round(bottomBoundary,2)),xy = (x[-1] + 0.1,bottomBoundary)) plt.annotate(s='Mean-3STD\n{}'.format(round(bottomBoundary, 2)), xy=(x[-1] + 0.1, bottomBoundary))
# draw vertical line of each points # draw vertical line of each points
bottom = 0 bottom = 0
if y.min() - y.std() * 3 - y.mean() * 0.02 > 0: if y.min() - y.std() * 3 - y.mean() * 0.02 > 0:
bottom = y.min() - y.std() * 3 - y.std() * 0.1 bottom = y.min() - y.std() * 3 - y.std() * 0.1
plt.vlines(x,[bottom],y,color = 'lightgrey',linestyle = '--') plt.vlines(x, [bottom], y, color='lightgrey', linestyle='--')
axs.grid() axs.grid()
plt.xticks([]) plt.xticks([])
the_table = plt.table(cellText=table, the_table = plt.table(cellText=table,
rowLabels=row, rowLabels=row,
colLabels=cols, colLabels=cols,
colWidths=[0.91 / (len(col) - 1)]*len(col), colWidths=[0.91 / (len(col) - 1)] * len(col),
loc='bottom') loc='bottom')
the_table.auto_set_font_size(False) the_table.auto_set_font_size(False)
the_table.set_fontsize(9) the_table.set_fontsize(9)
fig.subplots_adjust(left=0.032,right=0.97) fig.subplots_adjust(left=0.032, right=0.97)
fig.set_size_inches(33,11) fig.set_size_inches(33, 11)
#fig.suptitle(title) # fig.suptitle(title)
plt.title(title,fontsize=18) plt.title(title, fontsize=18)
plt.savefig(save_path + title + ".png") plt.savefig(save_path + title + ".png")
plt.show() plt.show()
return 1 return 1
def readExcel(path,sheet=None):
return pd.read_excel(path,sheet)
#conn = connect2DB() def readExcel(path, sheet=None):
return pd.read_excel(path, sheet)
# conn = connect2DB()
dict_keylist = [] dict_keylist = []
dict_vallist = [] dict_vallist = []
dict_DD = readExcel("E:\\Python\\su Project\\DD.xlsx") dict_DD = readExcel("E:\\Python\\su Project\\features_DD.xlsx")
modelList = [model for model in dict_DD.keys()] modelList = [model for model in dict_DD.keys()]
def mkdir(path,fd):
if not os.path.exists(path+fd): def mkdir(path, fd):
folder = mkdir(path,fd) if not os.path.exists(path + fd):
folder = mkdir(path, fd)
return folder return folder
else: else:
return path + 'fd' return path + 'fd'
def connect2DB(db_config): def connect2DB(db_config):
db = pymysql.connect( db = pymysql.connect(
host = db_config['host'], host=db_config['host'],
port = db_config['port'], port=db_config['port'],
user = db_config['user'], user=db_config['user'],
passwd = db_config['password'], passwd=db_config['password'],
db = db_config['database'], db=db_config['database'],
charset = db_config['encoding']) charset=db_config['encoding'])
return db return db
def query_sql(sql,db_config=risk_analysis_config): def query_sql(sql, db_config=risk_analysis_config):
try: try:
conn = connect2DB(db_config) conn = connect2DB(db_config)
df = pd.read_sql(sql,conn) df = pd.read_sql(sql, conn)
conn.close() conn.close()
return df return df
except Exception as e: except Exception as e:
return 0 return 0
def dataManipul(df,keyword):
#df_withoutna = df.dropna(axis=0).sort_values(by=keyword,ascending=False).reset_index().drop('index',axis=1)
#df = pd.merge(df_withoutna[keyword].iloc[int(len(df_withoutna)*0.01):int(len(df_withoutna)*0.99)]
#df.dropna(axis=0).loc[:,keyword] = df.dropna(axis=0)[keyword].map(lambda x : np.nan if x < 0 else x ) def dataManipul(df, keyword):
# df_withoutna = df.dropna(axis=0).sort_values(by=keyword,ascending=False).reset_index().drop('index',axis=1)
# df = pd.merge(df_withoutna[keyword].iloc[int(len(df_withoutna)*0.01):int(len(df_withoutna)*0.99)]
df_count = df[['applied_at',keyword]].groupby('applied_at').count()[keyword] # need 2 recheck # df.dropna(axis=0).loc[:,keyword] = df.dropna(axis=0)[keyword].map(lambda x : np.nan if x < 0 else x )
df_zeros = pd.Series(np.zeros(df_count.shape),index = df_count.index) df_count = df[['applied_at', keyword]].groupby('applied_at').count()[keyword] # need 2 recheck
df_zeros = pd.Series(np.zeros(df_count.shape), index=df_count.index)
df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count() df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count()
df_missing = pd.concat([df_zeros,df_missing], axis = 1, sort = True).fillna(0)[keyword] df_missing = pd.concat([df_zeros, df_missing], axis=1, sort=True).fillna(0)[keyword]
# df_shape = pd.DataFrame(np.zeros(df_count.shape)) # df_shape = pd.DataFrame(np.zeros(df_count.shape))
# #
# df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count() # df_missing = df[df[keyword].isnull()].fillna(0).groupby('applied_at')[keyword].count()
# df_missing = df_shape + df_missing # df_missing = df_shape + df_missing
missing_rate = df_missing / (df_count + df_missing) * 100 missing_rate = df_missing / (df_count + df_missing) * 100
del df_missing del df_missing
df_zero = df[df[keyword] == 0].groupby('applied_at')[keyword].count() df_zero = df[df[keyword] == 0].groupby('applied_at')[keyword].count()
df_zero = pd.concat([df_zeros,df_zero], axis = 1, sort = True).fillna(0)[keyword] df_zero = pd.concat([df_zeros, df_zero], axis=1, sort=True).fillna(0)[keyword]
zero_rate = df_zero / df_count * 100 zero_rate = df_zero / df_count * 100
del df_zero del df_zero
df_noneNA = df.dropna(axis = 0) df_noneNA = df.dropna(axis=0)
df_noneNA = df_noneNA.sort_values(by=keyword,ascending=False).reset_index().drop('index',axis=1) df_noneNA = df_noneNA.sort_values(by=keyword, ascending=False).reset_index().drop('index', axis=1)
df_sum = df_noneNA.iloc[int(len(df_noneNA)*0.01):int(len(df_noneNA)*0.99)].groupby('applied_at').agg(['mean','std','count']) df_sum = df_noneNA.iloc[int(len(df_noneNA) * 0.01):int(len(df_noneNA) * 0.99)].groupby('applied_at').agg(
df_sum = pd.concat([df_zeros,df_sum], axis = 1, sort = True).fillna(0).drop(columns=[0]) ['mean', 'std', 'count'])
df_sum.columns = ['mean','std','count'] df_sum = pd.concat([df_zeros, df_sum], axis=1, sort=True).fillna(0).drop(columns=[0])
df_sum.columns = ['mean', 'std', 'count']
cols = df_count.index cols = df_count.index
return zero_rate.fillna(0).round(1),missing_rate.fillna(0).round(1),cols,df_sum return zero_rate.fillna(0).round(1), missing_rate.fillna(0).round(1), cols, df_sum
######################################################################### #########################################################################
# check via channel details # check via channel details
def separateBychannel(df,key,meansub3std,meanpls3std): def separateBychannel(df, key, meansub3std, meanpls3std):
try: try:
for appliedFrom in kalist: for appliedFrom in kalist:
try: try:
#df.applied_from = df.applied_from.astype('str') # df.applied_from = df.applied_from.astype('str')
zero_rate_total,missing_rate_total,cols_total,df_sum = dataManipul(df[df.applied_from == appliedFrom][['applied_at',key]],key) zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(
df[df.applied_from == appliedFrom][['applied_at', key]], key)
table = [] table = []
y_total = df_sum['mean'] y_total = df_sum['mean']
table.append(df_sum['mean'].round(1)) #.round(1).values.tolist() table.append(df_sum['mean'].round(1)) # .round(1).values.tolist()
table.append(df_sum['count'].astype('int')) table.append(df_sum['count'].astype('int'))
table.append(missing_rate_total.astype('str')+'%') table.append(missing_rate_total.astype('str') + '%')
table.append(zero_rate_total.astype('str')+'%') table.append(zero_rate_total.astype('str') + '%')
if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std): if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
plotLine(str(modelList[i])+'-'+description[fea_i]+'-Mean-'+appliedType_type[appliedType_index]+'with'+str(appliedFrom)+'-VLM',y_total,['value','count','Missing Rate','Zero Rate'],cols_total,table,path_sepatate) plotLine(str(modelList[i]) + '-' + description[fea_i] + '-Mean-' + appliedType_type[
appliedType_index] + 'with' + str(appliedFrom) + '-VLM', y_total,
['value', 'count', 'Missing Rate', 'Zero Rate'], cols_total, table, path_sepatate)
del table del table
except ValueError as e: #ValueError except ValueError as e: # ValueError
continue continue
except Exception as e : except Exception as e:
print('channel Exception : ',key,appliedType,e) print('channel Exception : ', key, appliedType, e)
########### extract channel list ############# ########### extract channel list #############
applied_channel = query_sql(sql_channel).applied_from.tolist() applied_channel = query_sql(sql_channel).applied_from.tolist()
sql = sql.replace('@applied_channel',str(applied_channel).strip('[').strip(']')) sql = sql.replace('@applied_channel', str(applied_channel).strip('[').strip(']'))
######################################################################### #########################################################################
#for model in modelList: # for model in modelList:
# df_model = dict_DD[model].dropna(axis = 0) # df_model = dict_DD[model].dropna(axis = 0)
# dict_keylist.append(df_model.feature.tolist()) # dict_keylist.append(df_model.feature.tolist())
# dict_keylist.append(df_model.query.tolist()) # dict_keylist.append(df_model.query.tolist())
# dict_vallist.append(df_model.description.tolist()) # dict_vallist.append(df_model.description.tolist())
# #
#for li in dict_keylist: # for li in dict_keylist:
for i in range(len(modelList)): for i in range(len(modelList)):
# drop colums from data dict where there has no description # drop colums from data dict where there has no description
df_model_list = dict_DD[modelList[i]].dropna(axis = 0) df_model_list = dict_DD[modelList[i]].dropna(axis=0)
#feature key list # feature key list
features = df_model_list.reset_index().feature features = df_model_list.reset_index().feature
# query key list # query key list
queries = df_model_list.reset_index().queries queries = df_model_list.reset_index().queries
#feature descriptions list # feature descriptions list
description = df_model_list.reset_index().description description = df_model_list.reset_index().description
# applied_from
# cv channel = df_model_list.reset_index().applied_type
modelVar_index = 0 modelVar_index = 0
...@@ -259,70 +267,77 @@ for i in range(len(modelList)): ...@@ -259,70 +267,77 @@ for i in range(len(modelList)):
appliedType_index = 0 appliedType_index = 0
try: try:
key = queries[fea_i].strip() key = queries[fea_i].strip()
print('key: ',key) print('key: ', key)
df = query_sql(sql.replace('@feature',queries[fea_i])) df = query_sql(sql.replace('@feature', queries[fea_i]))
# except None # except None
df.loc[:,key] = df.loc[:,key].map(lambda x : np.nan if x == None else x) df.loc[:, key] = df.loc[:, key].map(lambda x: np.nan if x == None else x)
df.loc[:,key] = df.loc[:,key].map(lambda x : np.nan if x < 0 else x) df.loc[:, key] = df.loc[:, key].map(lambda x: np.nan if x < 0 else x)
# exception of interger == mysql query meets a exception # exception of interger == mysql query meets a exception
except Exception as a: except Exception as a:
print(a) print(a)
continue continue
for appliedType in appliedTypeList: for appliedType in appliedTypeList:
print('appliedType',appliedType) print('appliedType', appliedType)
if appliedType_index == 0: if appliedType_index == 0:
df_tmp = df[['applied_at','applied_from',key]] df_tmp = df[['applied_at', 'applied_from', key]]
else: else:
df_tmp = df[df.applied_type == int(appliedType)][['applied_at','applied_from',key]] df_tmp = df[df.applied_type == int(appliedType)][['applied_at', 'applied_from', key]]
#print('appliedType: ',appliedType) # print('appliedType: ',appliedType)
            try:
                # df.applied_from = df.applied_from.astype('str')
                zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(df_tmp[['applied_at', key]], key)
                table = []
                y_total = df_sum['mean']
                table.append(df_sum['mean'].round(1))  # .round(1).values.tolist()
                table.append(df_sum['count'].astype('int'))
                table.append(missing_rate_total.astype('str') + '%')
                table.append(zero_rate_total.astype('str') + '%')
                meanpls3std = y_total.mean() + y_total.std() * 3
                meansub3std = y_total.mean() - y_total.std() * 3
                # alarm when any of the last 30 daily means leaves the mean +/- 3*std band
                if (y_total.iloc[-30:].max() > meanpls3std) | (y_total.iloc[-30:].min() < meansub3std):
                    plotLine(str(modelList[i]) + '-' + description[fea_i] + '-Mean-'
                             + appliedType_type[appliedType_index] + '-变化VLM',
                             y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'],
                             cols_total, table, path_alarm)
                    separateBychannel(df_tmp, key, meansub3std, meanpls3std)
                else:
                    plotLine(str(modelList[i]) + '-' + description[fea_i] + '-Mean-'
                             + appliedType_type[appliedType_index] + '-变化VLM',
                             y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'],
                             cols_total, table, path)
                del table
            except Exception as e:  # ValueError
                print('Mean Exception: ', key, appliedType, e)
                appliedType_index += 1
                continue
            try:
                zero_rate_total, missing_rate_total, cols_total, df_sum = dataManipul(df_tmp[['applied_at', key]], key)
                table = []
                y_total = df_sum['std']
                table.append(df_sum['std'].round(1))
                table.append(df_sum['count'])
                table.append(missing_rate_total.astype('str') + '%')
                table.append(zero_rate_total.astype('str') + '%')
                del df_sum
                stdpls3std = y_total.mean() + y_total.std() * 3
                stdsub3std = y_total.mean() - y_total.std() * 3
                # same 3-sigma rule on the daily std (latest, possibly partial, day excluded)
                if (y_total.iloc[-30:-1].max() > stdpls3std) | (y_total.iloc[-30:-1].min() < stdsub3std):
                    plotLine(str(modelList[i]) + '-' + description[fea_i] + '-Std-'
                             + appliedType_type[appliedType_index] + '-变化VLM',
                             y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'],
                             cols_total, table, path_alarm)
                else:
                    plotLine(str(modelList[i]) + '-' + description[fea_i] + '-Std-'
                             + appliedType_type[appliedType_index] + '-变化VLM',
                             y_total, ['value', 'count', 'Missing Rate', 'Zero Rate'],
                             cols_total, table, path)
                del table
            except Exception as e:
                print('Std Exception: ', e)
                appliedType_index += 1
                continue
            appliedType_index += 1
......
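Both alarms above implement the same rule: flag a feature when any recent daily statistic leaves the mean ± 3σ band of its own history. A minimal standalone sketch of that check (the function name and the 30-day window are illustrative, not part of this repo):

import pandas as pd

def out_of_control(series: pd.Series, window: int = 30) -> bool:
    # True if any of the last `window` points falls outside the
    # series' mean +/- 3 * std control band
    upper = series.mean() + 3 * series.std()
    lower = series.mean() - 3 * series.std()
    recent = series.iloc[-window:]
    return bool((recent.max() > upper) | (recent.min() < lower))

When this fires, the script writes the plot to path_alarm instead of path, so alarming features end up in a separate folder.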
@@ -20,8 +20,6 @@ import sklearn.metrics
from django.db import transaction, DatabaseError

sql_bins = '''
SELECT @modelVar,transacted,IF(passdue_day>@passdueday,1,0) as overdue FROM risk_analysis
WHERE applied_at BETWEEN
@@ -40,18 +38,6 @@ AND !ISNULL(@modelVar)
AND @modelVar > 0
'''
"""
### sql_bins_360 = '''
SELECT @modelVar,transacted,IF(passdue_day > 15,1,0) as overdue
FROM risk_analysis
WHERE !ISNULL(@modelVar)
AND applied_at >= '2018-08-01' AND applied_at <= '2018-09-01'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
AND @modelVar > 0
'''
"""
sql_observation = '''
SELECT date_format(applied_at,'%Y-%m') as applied_at,@modelVar
@@ -64,8 +50,7 @@ AND applied_type IN (@appliedType)
AND !ISNULL(@modelVar)
'''
######## calculate with T-N mon (rows labelled 'T-1', 'T-2', ...) ###########
sql_passdueday = '''
(SELECT order_no,'T-1' as applied_at,@modelVar,IF(passdue_day > @passdue_day,1,0) as overdue
@@ -90,8 +75,7 @@ AND applied_type IN (@appliedType)
AND transacted = 1)
'''
############ calculate with natural mon #############
""" """
sql_passdueday = ''' sql_passdueday = '''
SELECT date_format(loan_start_date,'%Y-%m') as applied_at,@modelVar,IF(passdue_day > @passdueday,1,0) as overdue SELECT date_format(loan_start_date,'%Y-%m') as applied_at,@modelVar,IF(passdue_day > @passdueday,1,0) as overdue
...@@ -106,26 +90,26 @@ AND transacted = 1 ...@@ -106,26 +90,26 @@ AND transacted = 1
''' '''
""" """
passdue_day = 15
# AND applied_from IN (@channelID)
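The templates above are parameterised with @-tokens and filled by literal string replacement before the query runs. A short sketch of that pattern (the concrete values are illustrative, taken from the lists further down):

query = (sql_observation
         .replace('@modelVar', 'reloan_v3_point')
         .replace('@appliedType', '1,2,3')
         .replace('@channelID', '333'))
# the filled query is then executed with pandas, e.g. pd.read_sql(query, conn)

Plain replacement means a misspelled or unreplaced token reaches MySQL verbatim, which is one reason the analysis code wraps each query in try/except.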
##################################### db config ###############################
-risk_analysis_config = {'user' : 'jiahua_wang',
-                        'password' : 'IqHKCIyZ',
+risk_analysis_config = {'user' : 'fengkong_read_only',
+                        'password' : 'mT2HFUgI',
                         'host' : '172.20.6.9',
                         'port' : 9030,
                         'database' : 'risk_analysis',
                         'encoding' : 'utf8'}
#################################################################################
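connect2DB below presumably wraps a DB-API driver around this config dict; its body sits outside the visible hunk. A hypothetical sketch, assuming a PyMySQL-style driver (the driver choice and the body are assumptions, not from this diff):

import pymysql

def connect2DB(db_config):
    # hypothetical implementation; the actual one is not shown in this diff
    return pymysql.connect(host=db_config['host'],
                           port=db_config['port'],
                           user=db_config['user'],
                           password=db_config['password'],
                           database=db_config['database'],
                           charset='utf8')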
path = "E:\\Python\\su Project\\plot\\PSI&VAL\\" path = "../plot/PSI_VAL/"
mapping_path = "E:\\Python\\su Project\\query_score.xlsx" mapping_path = "./query_score.xlsx"
mapping = pd.read_excel(mapping_path,sheet_name='score_risk_anlysis') mapping = pd.read_excel(mapping_path,sheet_name='score_risk_anlysis')
modelType = mapping.description.tolist() modelType = mapping.description.tolist()
modelList = mapping.feature.tolist() modelList = mapping.score.tolist()
appliedTypeList = mapping.appliedType.tolist() appliedTypeList = mapping.appliedType.tolist()
channelIDList = mapping.channel.tolist() channelIDList = mapping.channel.tolist()
@@ -133,21 +117,8 @@ channelIDList = mapping.channel.tolist()
del mapping
-#modelList = ['xinyan_xy_fstapply_point','xinyan_xy_reapply_point','xinyan_xy_reloan_point','reloan_v3_point','lxf_v2_point','v6_operator_score_raw','dhb_score','tongdun_score','shuchuang_phone_apply','pingan_markingCriterion','tencent_tencentAntiFraudScore','eleven_bei_score','ljj_old_score','ljj_model_trusty_score']
-#modelList = ['xinyan_xy_reloan_point']
-#modelType = ['新颜首申分','新颜复申分','新颜复贷分','复贷分','量信分','V6分','电话邦分','同盾分','数创多头','腾讯反欺诈分','十一贝分','量晶晶首贷分','量晶晶复贷分']
-#modelType = ['新颜复贷']
-#channelIDList = ['217,214,198,1,159481,158748,333,159384,149483,159479,159479,158764,158932,159457,159459,159519','217,214,198,1,159481','158748','333','159384','149483,159479,159479','158764,158932,159457,159459,159519']
-#channel = ['全部渠道','内部','汽车之家','融360','平安高净值','平安非高净值','其他外部渠道']
-#appliedTypeList = ['1,2,3','1','2','3']
-#appliedTypeList = ['1']
-#appliedType_type = ['总体','首申','复申','复贷']
# applied_type labels: '1,2,3' = overall, '1' = first application, '2' = re-application, '3' = repeat loan
appliedType_type = {'1,2,3':'总体','1':'首申','2':'复申','3':'复贷'}
-#appliedType_type = ['首申']
passdueday = 15  # overdue = more than N days past due (first overdue N+)

def connect2DB(db_config):
@@ -420,77 +391,6 @@ for modelVar in modelList:
        except Exception as e:  # ZeroDivisionError
            print('val exception', e)
"""
#V5 333
modelVar = 'v5_filter_fraud_point_v5_without_zhima'
channelID = '333'
for appliedType in str(appliedTypeList[modelList.index(modelVar)]).split(';'):
print('appliedType',appliedType)
print('appliedTypeList[model_index]',appliedTypeList[modelList.index(modelVar)])
try:
df_bins = pd.read_sql(sql_bins.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday)),conn).dropna(axis=0)
df_observation = pd.read_sql(sql_observation.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID),conn)
df_observation.loc[:,modelVar] = df_observation.loc[:,modelVar].map(lambda x : np.nan if x < 0 else x)
#df_bins = df_bins.apply(lambda x :np.nan if x < 0 else x)
Nothing,interval = pd.qcut(df_bins.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
interval[0] = 0
del Nothing
BM_count = psi_bins(df_bins,modelVar,interval)
zero_rate,missing_rate,dateList,cols,y,count = dataManipul(df_observation,modelVar,np.array(interval).round(6))
#df_observation_with_bin = pd.cut(df_observation.dropna(axis=0)[modelVar],interval)
# del df_bins
del interval
value_tab = []
rows = []
y_list = []
psi = []
# plot line separated by mon
for mon in dateList:
y_list.append(y.loc[mon].values)
value_tab.append(y.loc[mon].astype('str')+'%')
value_tab.append(count.loc[mon].astype('str')+'(zeroR:'+zero_rate.loc[mon].astype('str')+'%)')
rows.append(str(mon)+' Value');rows.append(str(mon)+' Count')
#(y-10).sum() / np.log10(y/10)
psi.append((((y.loc[mon]-BM_count) * np.log10(y.loc[mon]/BM_count)).sum()/100).round(3))
plotPSI(modelType[modelList.index(modelVar)]+'-'+appliedType_type[appliedType]+'-' + channel[channelID] + ' PSI',y_list,dateList,psi,missing_rate,rows,cols,value_tab,path)
except Exception as e:
print(e)
try:
# Overdue dataframe
df_bins_auc = df_bins[df_bins.transacted == 1]
del df_bins
auc_BM = sklearn.metrics.roc_auc_score(df_bins_auc.overdue, df_bins_auc.loc[:,modelVar])
print('AUC_BM: ',auc_BM)
Nothing,interval = pd.qcut(df_bins_auc.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
interval[0] = 0
del Nothing
df_passdueday = pd.read_sql(sql_passdueday.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday)),conn)
count,df_overdue,y,dateList,cols = liftchart(df_passdueday,modelVar,np.array(interval).round(6))
value_tab = []
rows = []
y_list = []
aucri = []
auc = []
for mon in dateList:
y_list.append(y.loc[mon].values)
value_tab.append(y.loc[mon].astype('str')+'%')
value_tab.append(df_overdue.loc[mon].astype('str') + ' (总计 ' + count.loc[mon].astype('str') + ')' )
rows.append(str(mon)+' OverdueRate');rows.append(str(mon)+' Count')
aucri.append(round((sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar])/auc_BM),3))
auc.append(round(sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar]),3))
auc[-1] = str(auc[-1]) + '\n AUC基准: ' + str(round(auc_BM,3))
plotLiftChart(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' AUC WITH '+ str(passdueday) + '+',y_list,dateList,aucri,auc,rows,cols,value_tab,path)
except Exception as e:
print(e)
"""
def plot_table_df(dataset, auc, title='untitled', X_label=None, y_label=None,
                  tab_df=None, plot_tab=True, saved_path=None):
......
No preview for this file type