Commit edc7499b authored by 舒皓月's avatar 舒皓月

...

parent cb1c5b5d
......@@ -2,13 +2,8 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="e1b3e57f-dd82-4187-916a-8212c6c521a7" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
<change afterPath="$PROJECT_DIR$/model_score.xlsx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/doc/image/31EA97A8-19B7-45c6-8302-4148D19BAABA.png" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/doc/image/C6640ABE-9017-42b5-A92A-2DE5601A15D8.png" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" beforeDir="false" afterPath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test.py" beforeDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
......@@ -18,11 +13,11 @@
</component>
<component name="FileEditorManager">
<leaf>
<file pinned="false" current-in-tab="true">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tmp.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-136">
<caret line="495" column="19" selection-start-line="495" selection-start-column="19" selection-end-line="495" selection-end-column="19" />
<state relative-caret-position="332">
<caret line="334" column="39" selection-start-line="334" selection-start-column="39" selection-end-line="334" selection-end-column="39" />
<folding>
<element signature="e#2742#2953#0" />
<element signature="e#2931#3547#0" />
......@@ -57,8 +52,6 @@
<element signature="e#11135#11353#0" />
<element signature="e#11490#11717#0" />
<element signature="e#11819#11905#0" />
<element signature="e#12107#12803#0" />
<element signature="e#12277#12548#0" />
<element signature="e#12877#13384#0" />
<element signature="e#13413#13513#0" />
<element signature="e#13575#13737#0" />
......@@ -100,81 +93,68 @@
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152">
<caret line="481" column="24" selection-start-line="481" selection-start-column="24" selection-end-line="481" selection-end-column="24" />
<state relative-caret-position="368">
<caret line="337" column="39" selection-start-line="337" selection-start-column="39" selection-end-line="337" selection-end-column="39" />
<folding>
<element signature="e#16#34#0" expanded="true" />
<element signature="e#509#2431#0" />
<element signature="e#2143#2314#0" />
<element signature="e#2471#2578#0" />
<element signature="e#2634#2768#0" />
<element signature="e#2804#2887#0" />
<element signature="e#2933#3549#0" />
<element signature="e#2933#3035#1" />
<element signature="e#3113#3384#0" />
<element signature="e#3413#3549#0" />
<element signature="e#3497#3549#0" />
<element signature="e#3601#4067#0" />
<element signature="e#3601#3720#1" />
<element signature="e#3746#4027#0" />
<element signature="e#4166#6211#0" />
<element signature="e#4166#4321#1" />
<element signature="e#4815#5478#0" />
<element signature="e#4995#5054#0" />
<element signature="e#5595#6158#0" />
<element signature="e#5626#5702#0" />
<element signature="e#5745#6158#0" />
<element signature="e#6071#6158#0" />
<element signature="e#6310#8760#0" />
<element signature="e#6310#6465#1" />
<element signature="e#7061#8339#0" />
<element signature="e#7251#7310#0" />
<element signature="e#7930#8200#0" />
<element signature="e#8245#8339#0" />
<element signature="e#8420#8704#0" />
<element signature="e#8459#8536#0" />
<element signature="e#8801#13386#0" />
<element signature="e#9246#9397#0" />
<element signature="e#9353#9397#0" />
<element signature="e#9431#9516#0" />
<element signature="e#9597#9609#0" />
<element signature="e#10491#10501#0" />
<element signature="e#11091#11355#0" />
<element signature="e#11137#11355#0" />
<element signature="e#11492#11719#0" />
<element signature="e#11821#11907#0" />
<element signature="e#12109#12805#0" />
<element signature="e#12279#12550#0" />
<element signature="e#12879#13386#0" />
<element signature="e#12969#13077#0" />
<element signature="e#13143#13307#0" />
<element signature="e#13427#18635#0" />
<element signature="e#14091#14242#0" />
<element signature="e#14198#14242#0" />
<element signature="e#14276#14361#0" />
<element signature="e#14442#14454#0" />
<element signature="e#15534#15544#0" />
<element signature="e#16140#16425#0" />
<element signature="e#16186#16425#0" />
<element signature="e#16562#16786#0" />
<element signature="e#16888#16974#0" />
<element signature="e#17178#18133#0" />
<element signature="e#17348#17878#0" />
<element signature="e#17467#17516#0" />
<element signature="e#18207#18635#0" />
<element signature="e#18218#18326#0" />
<element signature="e#18392#18556#0" />
<element signature="e#18742#19104#0" />
<element signature="e#19265#19320#0" />
<element signature="e#19452#19653#0" />
<element signature="e#19470#19652#0" />
<element signature="e#19894#19949#0" />
<element signature="e#20774#21074#0" />
<element signature="e#20811#21047#0" />
<element signature="e#21084#21089#0" />
<element signature="e#2839#3050#0" />
<element signature="e#4164#4780#0" />
<element signature="e#4164#4266#1" />
<element signature="e#4344#4615#0" />
<element signature="e#4644#4780#0" />
<element signature="e#4728#4780#0" />
<element signature="e#4832#5298#0" />
<element signature="e#4832#4951#1" />
<element signature="e#4977#5258#0" />
<element signature="e#5397#7442#0" />
<element signature="e#5397#5552#1" />
<element signature="e#6046#6709#0" />
<element signature="e#6226#6285#0" />
<element signature="e#6826#7389#0" />
<element signature="e#6857#6933#0" />
<element signature="e#6976#7389#0" />
<element signature="e#7302#7389#0" />
<element signature="e#7541#9990#0" />
<element signature="e#7541#7696#1" />
<element signature="e#8291#9569#0" />
<element signature="e#8481#8540#0" />
<element signature="e#9160#9430#0" />
<element signature="e#9475#9569#0" />
<element signature="e#9650#9934#0" />
<element signature="e#9689#9766#0" />
<element signature="e#10467#10618#0" />
<element signature="e#10574#10618#0" />
<element signature="e#10925#10937#0" />
<element signature="e#11819#11829#0" />
<element signature="e#12419#12683#0" />
<element signature="e#12465#12683#0" />
<element signature="e#12789#13016#0" />
<element signature="e#13118#13204#0" />
<element signature="e#14176#14668#0" />
<element signature="e#14266#14362#0" />
<element signature="e#14428#14589#0" />
<element signature="e#15686#15837#0" />
<element signature="e#15793#15837#0" />
<element signature="e#16144#16156#0" />
<element signature="e#17236#17246#0" />
<element signature="e#17842#18127#0" />
<element signature="e#17888#18127#0" />
<element signature="e#18233#18457#0" />
<element signature="e#18559#18645#0" />
<element signature="e#18849#19804#0" />
<element signature="e#19019#19549#0" />
<element signature="e#19138#19187#0" />
<element signature="e#19889#19985#0" />
<element signature="e#20051#20305#0" />
<element signature="e#22953#23042#0" />
<element signature="e#23127#23384#0" />
<element signature="e#23189#23322#0" />
<element signature="e#23487#23700#0" />
<element signature="e#23866#24109#0" />
<element signature="e#23903#24104#0" />
<element signature="e#24119#24124#0" />
<element signature="e#21487#21515#0" />
</folding>
</state>
......@@ -197,9 +177,9 @@
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/.gitignore" />
<option value="$PROJECT_DIR$/model_monitor_PSI_AUC.py" />
<option value="$PROJECT_DIR$/test.py" />
<option value="$PROJECT_DIR$/tmp.py" />
<option value="$PROJECT_DIR$/model_monitor_PSI_AUC.py" />
</list>
</option>
</component>
......@@ -239,6 +219,7 @@
</panes>
</component>
<component name="PropertiesComponent">
<property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RunDashboard">
......@@ -332,85 +313,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152">
<caret line="481" column="24" selection-start-line="481" selection-start-column="24" selection-end-line="481" selection-end-column="24" />
<folding>
<element signature="e#16#34#0" expanded="true" />
<element signature="e#509#2431#0" />
<element signature="e#2143#2314#0" />
<element signature="e#2471#2578#0" />
<element signature="e#2634#2768#0" />
<element signature="e#2804#2887#0" />
<element signature="e#2933#3549#0" />
<element signature="e#2933#3035#1" />
<element signature="e#3113#3384#0" />
<element signature="e#3413#3549#0" />
<element signature="e#3497#3549#0" />
<element signature="e#3601#4067#0" />
<element signature="e#3601#3720#1" />
<element signature="e#3746#4027#0" />
<element signature="e#4166#6211#0" />
<element signature="e#4166#4321#1" />
<element signature="e#4815#5478#0" />
<element signature="e#4995#5054#0" />
<element signature="e#5595#6158#0" />
<element signature="e#5626#5702#0" />
<element signature="e#5745#6158#0" />
<element signature="e#6071#6158#0" />
<element signature="e#6310#8760#0" />
<element signature="e#6310#6465#1" />
<element signature="e#7061#8339#0" />
<element signature="e#7251#7310#0" />
<element signature="e#7930#8200#0" />
<element signature="e#8245#8339#0" />
<element signature="e#8420#8704#0" />
<element signature="e#8459#8536#0" />
<element signature="e#8801#13386#0" />
<element signature="e#9246#9397#0" />
<element signature="e#9353#9397#0" />
<element signature="e#9431#9516#0" />
<element signature="e#9597#9609#0" />
<element signature="e#10491#10501#0" />
<element signature="e#11091#11355#0" />
<element signature="e#11137#11355#0" />
<element signature="e#11492#11719#0" />
<element signature="e#11821#11907#0" />
<element signature="e#12109#12805#0" />
<element signature="e#12279#12550#0" />
<element signature="e#12879#13386#0" />
<element signature="e#12969#13077#0" />
<element signature="e#13143#13307#0" />
<element signature="e#13427#18635#0" />
<element signature="e#14091#14242#0" />
<element signature="e#14198#14242#0" />
<element signature="e#14276#14361#0" />
<element signature="e#14442#14454#0" />
<element signature="e#15534#15544#0" />
<element signature="e#16140#16425#0" />
<element signature="e#16186#16425#0" />
<element signature="e#16562#16786#0" />
<element signature="e#16888#16974#0" />
<element signature="e#17178#18133#0" />
<element signature="e#17348#17878#0" />
<element signature="e#17467#17516#0" />
<element signature="e#18207#18635#0" />
<element signature="e#18218#18326#0" />
<element signature="e#18392#18556#0" />
<element signature="e#18742#19104#0" />
<element signature="e#19265#19320#0" />
<element signature="e#19452#19653#0" />
<element signature="e#19470#19652#0" />
<element signature="e#19894#19949#0" />
<element signature="e#20774#21074#0" />
<element signature="e#20811#21047#0" />
<element signature="e#21084#21089#0" />
<element signature="e#21487#21515#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/.PyCharmCE2019.1/system/python_stubs/1626812534/builtins.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="226">
......@@ -430,8 +332,8 @@
</entry>
<entry file="file://$PROJECT_DIR$/tmp.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-136">
<caret line="495" column="19" selection-start-line="495" selection-start-column="19" selection-end-line="495" selection-end-column="19" />
<state relative-caret-position="332">
<caret line="334" column="39" selection-start-line="334" selection-start-column="39" selection-end-line="334" selection-end-column="39" />
<folding>
<element signature="e#2742#2953#0" />
<element signature="e#2931#3547#0" />
......@@ -466,8 +368,6 @@
<element signature="e#11135#11353#0" />
<element signature="e#11490#11717#0" />
<element signature="e#11819#11905#0" />
<element signature="e#12107#12803#0" />
<element signature="e#12277#12548#0" />
<element signature="e#12877#13384#0" />
<element signature="e#13413#13513#0" />
<element signature="e#13575#13737#0" />
......@@ -496,5 +396,71 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="368">
<caret line="337" column="39" selection-start-line="337" selection-start-column="39" selection-end-line="337" selection-end-column="39" />
<folding>
<element signature="e#2839#3050#0" />
<element signature="e#4164#4780#0" />
<element signature="e#4164#4266#1" />
<element signature="e#4344#4615#0" />
<element signature="e#4644#4780#0" />
<element signature="e#4728#4780#0" />
<element signature="e#4832#5298#0" />
<element signature="e#4832#4951#1" />
<element signature="e#4977#5258#0" />
<element signature="e#5397#7442#0" />
<element signature="e#5397#5552#1" />
<element signature="e#6046#6709#0" />
<element signature="e#6226#6285#0" />
<element signature="e#6826#7389#0" />
<element signature="e#6857#6933#0" />
<element signature="e#6976#7389#0" />
<element signature="e#7302#7389#0" />
<element signature="e#7541#9990#0" />
<element signature="e#7541#7696#1" />
<element signature="e#8291#9569#0" />
<element signature="e#8481#8540#0" />
<element signature="e#9160#9430#0" />
<element signature="e#9475#9569#0" />
<element signature="e#9650#9934#0" />
<element signature="e#9689#9766#0" />
<element signature="e#10467#10618#0" />
<element signature="e#10574#10618#0" />
<element signature="e#10925#10937#0" />
<element signature="e#11819#11829#0" />
<element signature="e#12419#12683#0" />
<element signature="e#12465#12683#0" />
<element signature="e#12789#13016#0" />
<element signature="e#13118#13204#0" />
<element signature="e#14176#14668#0" />
<element signature="e#14266#14362#0" />
<element signature="e#14428#14589#0" />
<element signature="e#15686#15837#0" />
<element signature="e#15793#15837#0" />
<element signature="e#16144#16156#0" />
<element signature="e#17236#17246#0" />
<element signature="e#17842#18127#0" />
<element signature="e#17888#18127#0" />
<element signature="e#18233#18457#0" />
<element signature="e#18559#18645#0" />
<element signature="e#18849#19804#0" />
<element signature="e#19019#19549#0" />
<element signature="e#19138#19187#0" />
<element signature="e#19889#19985#0" />
<element signature="e#20051#20305#0" />
<element signature="e#22953#23042#0" />
<element signature="e#23127#23384#0" />
<element signature="e#23189#23322#0" />
<element signature="e#23487#23700#0" />
<element signature="e#23866#24109#0" />
<element signature="e#23903#24104#0" />
<element signature="e#24119#24124#0" />
<element signature="e#21487#21515#0" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
\ No newline at end of file
......@@ -97,6 +97,7 @@
- min_aucr: 最小AUCR, 小于则视为该客群异常.
- 执行run函数.
```python
......
......@@ -13,15 +13,17 @@ import pymongo
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict
warnings.filterwarnings('ignore')
class ModelMonitor:
def __init__(self, excel_path='./model_score.xlsx', sheet_name='mongo_model',
passdue_day=15, save_path='./image/',
last_month=7, num_month=4, min_user_group=500):
num_month=4, min_user_group=500, max_psi=0.1, min_aucr=0.85):
# 考虑到数据库配置基本不变, 所以不设置创建对象时对应输入变量.
self.mysql_engine = pymysql.connect(host='172.20.6.9',
......@@ -45,9 +47,19 @@ class ModelMonitor:
# 一些定义的常量
self.passdue_day = passdue_day # 逾期天数, 默认15.
self.save_path = save_path # 图片保存位置, 默认./image.
self.last_month = last_month # 取数的最后一个月.
self.num_month = num_month # 取数的月数.
self.min_user_group = min_user_group # 最小客群数量.
self.max_psi = max_psi # 最大PSI, 超过视为异常.
self.min_aucr = min_aucr # 最小AUC比率, 小于视为异常.
# 获取当天日期信息.
self.current_date = (datetime.date.today() + relativedelta(days=-1)).strftime('%Y-%m-%d')
self.response_date = (datetime.date.today() + relativedelta(days=-(31 + self.passdue_day))).strftime('%Y-%m-%d')
self.first_date = (datetime.date.today() + relativedelta(months=-self.num_month + 1)).strftime('%Y-%m-01')
self.current_month = (datetime.date.today() + datetime.timedelta(days=-1)).month
self.response_month = (datetime.date.today() + relativedelta(days=-46)).month
self.first_month = self.current_month - self.num_month + 1
# 将会从数据库中读取的数据.
self.mysql_df = None
......@@ -55,30 +67,51 @@ class ModelMonitor:
self.merge_data = None
# 统计数据记录.
psi_cols = ['model_name', 'group_name']
auc_cols = ['model_name', 'group_name']
for m in range(self.last_month - self.num_month, self.last_month):
psi_cols.append(str(m) + 'm_num')
psi_cols.append(str(m) + 'm_psi')
auc_cols.append(str(m) + 'm_num')
auc_cols.append(str(m) + 'm_auc')
psi_cols = ['模型名称', '客群名称']
auc_cols = ['模型名称', '客群名称']
for m in range(self.first_month, self.current_month + 1):
psi_cols.append(str(m) + '月数量')
psi_cols.append(str(m) + '月PSI')
auc_cols.append(str(m) + '月数量')
auc_cols.append(str(m) + '月AUC')
auc_cols.append(str(m) + '月AUCR')
self.psi_info_df = pd.DataFrame(columns=psi_cols)
self.auc_info_df = pd.DataFrame(columns=auc_cols)
self.na_enough_data_psi_set = set() # 一些新的模型没有足够数据用于统计.
self.na_enough_data_auc_set = set() # 一些新的模型没有足够数据用于统计.
def sql_query(self, sql):
    '''
    Run a SQL statement on the MySQL connection and return the rows.

    :param sql: str, the SQL statement to execute.
    :return: DataFrame with the query result, or None when the query fails.
    '''
    try:
        return pd.read_sql(sql, self.mysql_engine)
    except Exception as exc:
        # Best-effort: report the failure together with its cause and hand
        # None back to the caller. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop the run.
        print('SQL查询出现错误.')
        print(repr(exc))
        return None
def mongo_query(self, condition, fields):
    '''
    Query the MongoDB collection and return the matching documents.

    :param condition: dict, the filter passed to ``find``.
    :param fields: dict, the projection passed to ``find``.
    :return: DataFrame built from the matching documents, or None when the
        query fails.
    '''
    try:
        return pd.DataFrame(list(self.mongo_table.find(condition, fields)))
    except Exception as exc:
        # Best-effort: report the failure together with its cause and hand
        # None back to the caller. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop the run.
        print('Mongo查询出现错误.')
        print(repr(exc))
        return None
def int2str(self, x):
'''
将int转换为str, 用于日期.
e.g. 5 --> 05
:param x: int
:return: str.
'''
if x >= 10:
return str(x)
else:
......@@ -165,7 +198,7 @@ class ModelMonitor:
info_dict[user_group_name][m]['psi'] = -999
print('计算PSI出现错误.')
print('处理完成.')
print('='*40)
print('=' * 40)
def helper_auc(self, user_group_name=None, df=None, info_dict=None, field=None):
'''
......@@ -180,7 +213,6 @@ class ModelMonitor:
if 0 in month_list:
month_list.remove(0)
df_g = df.groupby(['month_label', 'bins'])['overdue'].agg({'overdue': ['count', 'sum', 'mean']})
df_g.columns = ['_'.join(x) for x in df_g.columns.ravel()]
df_g = df_g.reset_index()
......@@ -196,11 +228,14 @@ class ModelMonitor:
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum()
info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = df_g.loc[df_g['month_label'] == m, 'overdue_sum'].values
info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = df_g.loc[df_g['month_label'] == m, 'overdue_mean'].values
info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = df_g.loc[
df_g['month_label'] == m, 'overdue_sum'].values
info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = df_g.loc[
df_g['month_label'] == m, 'overdue_mean'].values
print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量']))
try:
info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score(df.loc[(df['month_label'] == m) & (df[field].notna()), 'overdue'],
info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score(
df.loc[(df['month_label'] == m) & (df[field].notna()), 'overdue'],
df.loc[(df['month_label'] == m) & (df[field].notna()), field])
except:
print('AUC计算发生错误.')
......@@ -211,7 +246,8 @@ class ModelMonitor:
info_dict[user_group_name][m]['aucR'] = 1
bench_month = m
else:
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / info_dict[user_group_name][bench_month]['auc']
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / \
info_dict[user_group_name][bench_month]['auc']
print('处理完成.')
print('=' * 40)
......@@ -226,14 +262,15 @@ class ModelMonitor:
df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel']].copy()
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
for m in range(self.last_month - self.num_month, self.last_month):
for m in range(self.first_month, self.current_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins:
print('以%d月为基准月.' % m)
break
if not bins:
print('%s 数据时间跨度不足, 放弃画图.' % field)
print('='*40)
self.na_enough_data_psi_set.add(self.model_feild_name_dict[field])
print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field])
print('=' * 40)
return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱.
......@@ -252,7 +289,6 @@ class ModelMonitor:
# '各分箱样本占比': [...]}}}
info_dict = {}
# 全样本
self.helper_psi('全样本', df_copy, info_dict, field)
# 按申请类型划分.
......@@ -273,13 +309,15 @@ class ModelMonitor:
app_chan_set = df_copy_g['applied_channel'].unique()
for app_type in app_type_set:
for app_chan in app_chan_set:
if df_copy_g.loc[(df_copy_g['applied_type'] == app_type) & (df_copy_g['applied_channel'] == app_chan)].shape[0] != 0:
if df_copy_g.loc[
(df_copy_g['applied_type'] == app_type) & (df_copy_g['applied_channel'] == app_chan)].shape[0] != 0:
user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan)
del df_copy_g
## 按划分的客群处理数据.
print(user_group_dict)
for user_group_name in user_group_dict:
self.helper_psi(user_group_name, df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (df_copy['applied_channel'] == user_group_dict[user_group_name][1])],
self.helper_psi(user_group_name, df_copy.loc[
(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])],
info_dict, field)
# 过滤不包含信息的客群.
remove_list = []
......@@ -291,14 +329,15 @@ class ModelMonitor:
# 画图.
print('开始画图.')
print('='*40)
print('=' * 40)
for user_group_name in info_dict:
print(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.figure(figsize=(16, 8))
for m in info_dict[user_group_name]:
plt.plot(range(len(info_dict[user_group_name][m]['各分箱样本占比'])),
info_dict[user_group_name][m]['各分箱样本占比'],
label='%s PSI: %.3f \n 样本量: %d' % (m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量']))
[round(x[0], 3) for x in info_dict[user_group_name][m]['各分箱样本占比']],
label='%s PSI: %.3f \n 样本量: %d' % (
m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'PSI/' + self.model_feild_name_dict[field] + '-' + user_group_name)
......@@ -307,11 +346,11 @@ class ModelMonitor:
# 保存统计信息.
for user_group_name in info_dict:
# print(self.model_feild_name_dict[field] + '-' + user_group_name)
tmp_dict = {'model_name': [self.model_feild_name_dict[field]],
'group_name': [user_group_name]}
tmp_dict = {'模型名称': [self.model_feild_name_dict[field]],
'客群名称': [user_group_name]}
for m in info_dict[user_group_name]:
tmp_dict[m[0] + 'm_num'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + 'm_psi'] = [round(info_dict[user_group_name][m]['psi'], 3)]
tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + '月PSI'] = [round(info_dict[user_group_name][m]['psi'], 3)]
self.psi_info_df = self.psi_info_df.append(pd.DataFrame(tmp_dict))
def plot_auc(self, field):
......@@ -321,20 +360,28 @@ class ModelMonitor:
if not os.path.exists(self.save_path + 'AUC/'):
os.mkdir(self.save_path + 'AUC/')
# 分离数据.
df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day']].copy()
df_copy = self.merge_data[
[field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day', 'applied_at']].copy()
## 筛选出放款, 且逾期表现的数据.
df_copy = df_copy.loc[(df_copy[field].notna()) & (df_copy['month_label'] != self.last_month - 1) & (df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
if repr(df_copy['applied_at'].dtype) == "dtype('O')":
df_copy = df_copy.loc[
(df_copy[field].notna()) & (df_copy['applied_at'].apply(lambda x: x[:10]) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
else:
df_copy = df_copy.loc[(df_copy[field].notna()) & (
df_copy['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d')) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
for m in range(self.last_month - self.num_month, self.last_month):
for m in range(self.first_month, self.response_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins:
print('以%d月为基准月.' % m)
break
if not bins:
print('%s 数据时间跨度不足, 放弃画图.' % field)
print('='*40)
self.na_enough_data_auc_set.add(self.model_feild_name_dict[field])
print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field])
print('=' * 40)
return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱.
......@@ -357,7 +404,6 @@ class ModelMonitor:
# 'aucR': float}}}
info_dict = {}
# 全样本
self.helper_auc('全样本', df_copy, info_dict, field)
# 按申请类型划分.
......@@ -383,10 +429,11 @@ class ModelMonitor:
user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan)
del df_copy_g
## 按划分的客群处理数据.
print(user_group_dict)
for user_group_name in user_group_dict:
self.helper_auc(user_group_name, df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])], info_dict, field)
self.helper_auc(user_group_name,
df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])], info_dict,
field)
# 过滤不包含信息的客群.
remove_list = []
for user_group_name in info_dict:
......@@ -409,7 +456,8 @@ class ModelMonitor:
plt.plot(range(len(info_dict[user_group_name][m]['各分箱逾期率'])),
info_dict[user_group_name][m]['各分箱逾期率'],
label='%s AUC: %.3f AUCR: %.3f \n 样本量: %d' % (
m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'], info_dict[user_group_name][m]['该月样本量']))
m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'],
info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'AUC/' + self.model_feild_name_dict[field] + '-' + user_group_name)
......@@ -417,28 +465,48 @@ class ModelMonitor:
# 保存统计信息.
for user_group_name in info_dict:
tmp_dict = {'model_name': [self.model_feild_name_dict[field]],
'group_name': [user_group_name]}
tmp_dict = {'模型名称': [self.model_feild_name_dict[field]],
'客群名称': [user_group_name]}
for m in info_dict[user_group_name]:
tmp_dict[m[0] + 'm_num'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + 'm_auc'] = [round(info_dict[user_group_name][m]['auc'], 3)]
tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + '月AUC'] = [round(info_dict[user_group_name][m]['auc'], 3)]
tmp_dict[m[0] + '月AUCR'] = [round(info_dict[user_group_name][m]['aucR'], 3)]
self.auc_info_df = self.auc_info_df.append(pd.DataFrame(tmp_dict))
def abnormal_psi(self):
    '''
    Add the '是否异常' column to ``self.psi_info_df``.

    A row is marked True when any column whose label contains 'PSI' holds a
    value greater than ``self.max_psi``; otherwise it is marked False.
    '''
    def is_abnormal_psi(data):
        # True as soon as one PSI column of this row exceeds the threshold.
        return any('PSI' in label and data[label] > self.max_psi
                   for label in data.index)

    flags = self.psi_info_df.apply(is_abnormal_psi, axis=1)
    self.psi_info_df['是否异常'] = flags
def abnormal_auc(self):
    '''
    Add the '是否异常' column to ``self.auc_info_df``.

    A row is marked True when any column whose label contains 'AUCR' holds a
    value smaller than ``self.min_aucr``; otherwise it is marked False.
    '''
    def is_abnormal_auc(data):
        # True as soon as one AUCR column of this row falls below the threshold.
        return any('AUCR' in label and data[label] < self.min_aucr
                   for label in data.index)

    flags = self.auc_info_df.apply(is_abnormal_auc, axis=1)
    self.auc_info_df['是否异常'] = flags
def run(self):
# 获取MySQL数据, 取last_month往前num_month个月数据.
# 获取MySQL数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
self.mysql_df = self.sql_query('''SELECT order_no, applied_at,
applied_type, applied_from, applied_channel, transacted, passdue_day
FROM risk_analysis
WHERE applied_at > "2019-%s-01 00:00:00"
AND applied_at < "2019-%s-01 00:00:00"'''
% (self.int2str(self.last_month - self.num_month), self.int2str(self.last_month)))
WHERE applied_at >= "%s 00:00:00"
AND applied_at <= "%s 00:00:00"'''
% (self.first_date, self.current_date))
print('MySQL数据获取成功.')
# self.mysql_df.to_csv('./mysql_data.csv', index=False)
# self.mysql_df = pd.read_csv('./mysql_data.csv')
# 获取MongoDB数据, 取last_month往前num_month个月数据.
condition = {'wf_created_at': {'$gte': '2019-%s-01 00:00:00' % self.int2str(self.last_month - self.num_month),
'$lte': '2019-%s-01 00:00:00' % self.int2str(self.last_month)}}
# 获取MongoDB数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
condition = {'wf_created_at': {'$gte': '%s 00:00:00' % self.first_date,
'$lte': '%s 00:00:00' % self.current_date}}
fields = {'wf_biz_no': 1, 'wf_created_at': 1}
for f in self.model_feild_list: # 加入Excel中预置的模型分名称
fields[f] = 1
......@@ -456,8 +524,15 @@ class ModelMonitor:
# 拼接数据.
self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df,
left_on='order_no', right_on='wf_biz_no', how='left')
## 定义逾期用户.
self.merge_data['overdue'] = self.merge_data['passdue_day'] > self.passdue_day
def overdue(data):
if pd.isnull(data):
return np.nan
else:
return float(data > self.passdue_day)
self.merge_data['overdue'] = self.merge_data['passdue_day'].apply(overdue)
# 清洗数据.
def clean_data(data):
......@@ -465,6 +540,7 @@ class ModelMonitor:
return float(data)
except:
return np.nan
na_field_list = []
for field in self.model_feild_list:
if field in self.merge_data.columns.tolist():
......@@ -475,38 +551,51 @@ class ModelMonitor:
## 去除因为一些原因未抽取到的字段.
print('不包含以下字段:')
for field in na_field_list:
print(self.model_feild_name_dict[field])
self.model_feild_list.remove(field)
self.model_name_list.remove(self.model_feild_name_dict[field])
del self.model_feild_name_dict[field]
print(self.model_feild_name_dict[field])
print('数据拼接完成.')
# 数据按月划分.
self.merge_data['month_label'] = 0
for m in range(self.num_month):
for m in range(self.first_month, self.current_month + 1):
self.merge_data.loc[
(self.merge_data['applied_at'] > '2019-%s-01 00:00:00' % self.int2str(self.last_month - m - 1)) &
(self.merge_data['applied_at'] < '2019-%s-01 00:00:00' % self.int2str(self.last_month - m)),
'month_label'] = self.last_month - m - 1
(self.merge_data['applied_at'] >= '2019-%s-01 00:00:00' % self.int2str(m)) &
(self.merge_data['applied_at'] < '2019-%s-01 00:00:00' % self.int2str(m + 1)),
'month_label'] = m
# 画图.
## AUC.
print('开始画图-AUC.')
for field in self.model_feild_list:
self.plot_auc(field)
## PSI.
print('开始画图-PSI.')
for field in self.model_feild_list:
self.plot_psi(field)
## AUC.
print('开始画图-AUC.')
for field in self.model_feild_list:
self.plot_auc(field)
# 输出数据不足的模型.
print('PSI 数据不足以统计的模型:')
for model_name in self.na_enough_data_psi_set:
print(model_name)
print('=' * 40)
print('AUC 数据不足以统计的模型:')
for model_name in self.na_enough_data_auc_set:
print(model_name)
print('=' * 40)
# 检测是否异常.
self.abnormal_psi()
self.abnormal_auc()
# 保存统计信息.
self.psi_info_df.to_csv('./psi_info.csv', index=False)
self.auc_info_df.to_csv('./auc_info.csv', index=False)
print('统计信息保存成功.')
# if __name__ == '__main__':
# pass
# mm = ModelMonitor(excel_path='./model_score.xlsx', save_path='./image/', last_month=7, num_month=2)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment