Commit edc7499b authored by 舒皓月's avatar 舒皓月

...

parent cb1c5b5d
...@@ -2,13 +2,8 @@ ...@@ -2,13 +2,8 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="e1b3e57f-dd82-4187-916a-8212c6c521a7" name="Default Changelist" comment=""> <list default="true" id="e1b3e57f-dd82-4187-916a-8212c6c521a7" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
<change afterPath="$PROJECT_DIR$/model_score.xlsx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" /> <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/doc/image/31EA97A8-19B7-45c6-8302-4148D19BAABA.png" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/doc/image/C6640ABE-9017-42b5-A92A-2DE5601A15D8.png" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" beforeDir="false" afterPath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" beforeDir="false" afterPath="$PROJECT_DIR$/model_monitor_PSI_AUC.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test.py" beforeDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
...@@ -18,11 +13,11 @@ ...@@ -18,11 +13,11 @@
</component> </component>
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf> <leaf>
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tmp.py"> <entry file="file://$PROJECT_DIR$/tmp.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-136"> <state relative-caret-position="332">
<caret line="495" column="19" selection-start-line="495" selection-start-column="19" selection-end-line="495" selection-end-column="19" /> <caret line="334" column="39" selection-start-line="334" selection-start-column="39" selection-end-line="334" selection-end-column="39" />
<folding> <folding>
<element signature="e#2742#2953#0" /> <element signature="e#2742#2953#0" />
<element signature="e#2931#3547#0" /> <element signature="e#2931#3547#0" />
...@@ -57,8 +52,6 @@ ...@@ -57,8 +52,6 @@
<element signature="e#11135#11353#0" /> <element signature="e#11135#11353#0" />
<element signature="e#11490#11717#0" /> <element signature="e#11490#11717#0" />
<element signature="e#11819#11905#0" /> <element signature="e#11819#11905#0" />
<element signature="e#12107#12803#0" />
<element signature="e#12277#12548#0" />
<element signature="e#12877#13384#0" /> <element signature="e#12877#13384#0" />
<element signature="e#13413#13513#0" /> <element signature="e#13413#13513#0" />
<element signature="e#13575#13737#0" /> <element signature="e#13575#13737#0" />
...@@ -100,81 +93,68 @@ ...@@ -100,81 +93,68 @@
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py"> <entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152"> <state relative-caret-position="368">
<caret line="481" column="24" selection-start-line="481" selection-start-column="24" selection-end-line="481" selection-end-column="24" /> <caret line="337" column="39" selection-start-line="337" selection-start-column="39" selection-end-line="337" selection-end-column="39" />
<folding> <folding>
<element signature="e#16#34#0" expanded="true" /> <element signature="e#2839#3050#0" />
<element signature="e#509#2431#0" /> <element signature="e#4164#4780#0" />
<element signature="e#2143#2314#0" /> <element signature="e#4164#4266#1" />
<element signature="e#2471#2578#0" /> <element signature="e#4344#4615#0" />
<element signature="e#2634#2768#0" /> <element signature="e#4644#4780#0" />
<element signature="e#2804#2887#0" /> <element signature="e#4728#4780#0" />
<element signature="e#2933#3549#0" /> <element signature="e#4832#5298#0" />
<element signature="e#2933#3035#1" /> <element signature="e#4832#4951#1" />
<element signature="e#3113#3384#0" /> <element signature="e#4977#5258#0" />
<element signature="e#3413#3549#0" /> <element signature="e#5397#7442#0" />
<element signature="e#3497#3549#0" /> <element signature="e#5397#5552#1" />
<element signature="e#3601#4067#0" /> <element signature="e#6046#6709#0" />
<element signature="e#3601#3720#1" /> <element signature="e#6226#6285#0" />
<element signature="e#3746#4027#0" /> <element signature="e#6826#7389#0" />
<element signature="e#4166#6211#0" /> <element signature="e#6857#6933#0" />
<element signature="e#4166#4321#1" /> <element signature="e#6976#7389#0" />
<element signature="e#4815#5478#0" /> <element signature="e#7302#7389#0" />
<element signature="e#4995#5054#0" /> <element signature="e#7541#9990#0" />
<element signature="e#5595#6158#0" /> <element signature="e#7541#7696#1" />
<element signature="e#5626#5702#0" /> <element signature="e#8291#9569#0" />
<element signature="e#5745#6158#0" /> <element signature="e#8481#8540#0" />
<element signature="e#6071#6158#0" /> <element signature="e#9160#9430#0" />
<element signature="e#6310#8760#0" /> <element signature="e#9475#9569#0" />
<element signature="e#6310#6465#1" /> <element signature="e#9650#9934#0" />
<element signature="e#7061#8339#0" /> <element signature="e#9689#9766#0" />
<element signature="e#7251#7310#0" /> <element signature="e#10467#10618#0" />
<element signature="e#7930#8200#0" /> <element signature="e#10574#10618#0" />
<element signature="e#8245#8339#0" /> <element signature="e#10925#10937#0" />
<element signature="e#8420#8704#0" /> <element signature="e#11819#11829#0" />
<element signature="e#8459#8536#0" /> <element signature="e#12419#12683#0" />
<element signature="e#8801#13386#0" /> <element signature="e#12465#12683#0" />
<element signature="e#9246#9397#0" /> <element signature="e#12789#13016#0" />
<element signature="e#9353#9397#0" /> <element signature="e#13118#13204#0" />
<element signature="e#9431#9516#0" /> <element signature="e#14176#14668#0" />
<element signature="e#9597#9609#0" /> <element signature="e#14266#14362#0" />
<element signature="e#10491#10501#0" /> <element signature="e#14428#14589#0" />
<element signature="e#11091#11355#0" /> <element signature="e#15686#15837#0" />
<element signature="e#11137#11355#0" /> <element signature="e#15793#15837#0" />
<element signature="e#11492#11719#0" /> <element signature="e#16144#16156#0" />
<element signature="e#11821#11907#0" /> <element signature="e#17236#17246#0" />
<element signature="e#12109#12805#0" /> <element signature="e#17842#18127#0" />
<element signature="e#12279#12550#0" /> <element signature="e#17888#18127#0" />
<element signature="e#12879#13386#0" /> <element signature="e#18233#18457#0" />
<element signature="e#12969#13077#0" /> <element signature="e#18559#18645#0" />
<element signature="e#13143#13307#0" /> <element signature="e#18849#19804#0" />
<element signature="e#13427#18635#0" /> <element signature="e#19019#19549#0" />
<element signature="e#14091#14242#0" /> <element signature="e#19138#19187#0" />
<element signature="e#14198#14242#0" /> <element signature="e#19889#19985#0" />
<element signature="e#14276#14361#0" /> <element signature="e#20051#20305#0" />
<element signature="e#14442#14454#0" /> <element signature="e#22953#23042#0" />
<element signature="e#15534#15544#0" /> <element signature="e#23127#23384#0" />
<element signature="e#16140#16425#0" /> <element signature="e#23189#23322#0" />
<element signature="e#16186#16425#0" /> <element signature="e#23487#23700#0" />
<element signature="e#16562#16786#0" /> <element signature="e#23866#24109#0" />
<element signature="e#16888#16974#0" /> <element signature="e#23903#24104#0" />
<element signature="e#17178#18133#0" /> <element signature="e#24119#24124#0" />
<element signature="e#17348#17878#0" />
<element signature="e#17467#17516#0" />
<element signature="e#18207#18635#0" />
<element signature="e#18218#18326#0" />
<element signature="e#18392#18556#0" />
<element signature="e#18742#19104#0" />
<element signature="e#19265#19320#0" />
<element signature="e#19452#19653#0" />
<element signature="e#19470#19652#0" />
<element signature="e#19894#19949#0" />
<element signature="e#20774#21074#0" />
<element signature="e#20811#21047#0" />
<element signature="e#21084#21089#0" />
<element signature="e#21487#21515#0" /> <element signature="e#21487#21515#0" />
</folding> </folding>
</state> </state>
...@@ -197,9 +177,9 @@ ...@@ -197,9 +177,9 @@
<option name="CHANGED_PATHS"> <option name="CHANGED_PATHS">
<list> <list>
<option value="$PROJECT_DIR$/.gitignore" /> <option value="$PROJECT_DIR$/.gitignore" />
<option value="$PROJECT_DIR$/model_monitor_PSI_AUC.py" />
<option value="$PROJECT_DIR$/test.py" /> <option value="$PROJECT_DIR$/test.py" />
<option value="$PROJECT_DIR$/tmp.py" /> <option value="$PROJECT_DIR$/tmp.py" />
<option value="$PROJECT_DIR$/model_monitor_PSI_AUC.py" />
</list> </list>
</option> </option>
</component> </component>
...@@ -239,6 +219,7 @@ ...@@ -239,6 +219,7 @@
</panes> </panes>
</component> </component>
<component name="PropertiesComponent"> <component name="PropertiesComponent">
<property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" /> <property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component> </component>
<component name="RunDashboard"> <component name="RunDashboard">
...@@ -332,85 +313,6 @@ ...@@ -332,85 +313,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152">
<caret line="481" column="24" selection-start-line="481" selection-start-column="24" selection-end-line="481" selection-end-column="24" />
<folding>
<element signature="e#16#34#0" expanded="true" />
<element signature="e#509#2431#0" />
<element signature="e#2143#2314#0" />
<element signature="e#2471#2578#0" />
<element signature="e#2634#2768#0" />
<element signature="e#2804#2887#0" />
<element signature="e#2933#3549#0" />
<element signature="e#2933#3035#1" />
<element signature="e#3113#3384#0" />
<element signature="e#3413#3549#0" />
<element signature="e#3497#3549#0" />
<element signature="e#3601#4067#0" />
<element signature="e#3601#3720#1" />
<element signature="e#3746#4027#0" />
<element signature="e#4166#6211#0" />
<element signature="e#4166#4321#1" />
<element signature="e#4815#5478#0" />
<element signature="e#4995#5054#0" />
<element signature="e#5595#6158#0" />
<element signature="e#5626#5702#0" />
<element signature="e#5745#6158#0" />
<element signature="e#6071#6158#0" />
<element signature="e#6310#8760#0" />
<element signature="e#6310#6465#1" />
<element signature="e#7061#8339#0" />
<element signature="e#7251#7310#0" />
<element signature="e#7930#8200#0" />
<element signature="e#8245#8339#0" />
<element signature="e#8420#8704#0" />
<element signature="e#8459#8536#0" />
<element signature="e#8801#13386#0" />
<element signature="e#9246#9397#0" />
<element signature="e#9353#9397#0" />
<element signature="e#9431#9516#0" />
<element signature="e#9597#9609#0" />
<element signature="e#10491#10501#0" />
<element signature="e#11091#11355#0" />
<element signature="e#11137#11355#0" />
<element signature="e#11492#11719#0" />
<element signature="e#11821#11907#0" />
<element signature="e#12109#12805#0" />
<element signature="e#12279#12550#0" />
<element signature="e#12879#13386#0" />
<element signature="e#12969#13077#0" />
<element signature="e#13143#13307#0" />
<element signature="e#13427#18635#0" />
<element signature="e#14091#14242#0" />
<element signature="e#14198#14242#0" />
<element signature="e#14276#14361#0" />
<element signature="e#14442#14454#0" />
<element signature="e#15534#15544#0" />
<element signature="e#16140#16425#0" />
<element signature="e#16186#16425#0" />
<element signature="e#16562#16786#0" />
<element signature="e#16888#16974#0" />
<element signature="e#17178#18133#0" />
<element signature="e#17348#17878#0" />
<element signature="e#17467#17516#0" />
<element signature="e#18207#18635#0" />
<element signature="e#18218#18326#0" />
<element signature="e#18392#18556#0" />
<element signature="e#18742#19104#0" />
<element signature="e#19265#19320#0" />
<element signature="e#19452#19653#0" />
<element signature="e#19470#19652#0" />
<element signature="e#19894#19949#0" />
<element signature="e#20774#21074#0" />
<element signature="e#20811#21047#0" />
<element signature="e#21084#21089#0" />
<element signature="e#21487#21515#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$USER_HOME$/.PyCharmCE2019.1/system/python_stubs/1626812534/builtins.py"> <entry file="file://$USER_HOME$/.PyCharmCE2019.1/system/python_stubs/1626812534/builtins.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="226"> <state relative-caret-position="226">
...@@ -430,8 +332,8 @@ ...@@ -430,8 +332,8 @@
</entry> </entry>
<entry file="file://$PROJECT_DIR$/tmp.py"> <entry file="file://$PROJECT_DIR$/tmp.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-136"> <state relative-caret-position="332">
<caret line="495" column="19" selection-start-line="495" selection-start-column="19" selection-end-line="495" selection-end-column="19" /> <caret line="334" column="39" selection-start-line="334" selection-start-column="39" selection-end-line="334" selection-end-column="39" />
<folding> <folding>
<element signature="e#2742#2953#0" /> <element signature="e#2742#2953#0" />
<element signature="e#2931#3547#0" /> <element signature="e#2931#3547#0" />
...@@ -466,8 +368,6 @@ ...@@ -466,8 +368,6 @@
<element signature="e#11135#11353#0" /> <element signature="e#11135#11353#0" />
<element signature="e#11490#11717#0" /> <element signature="e#11490#11717#0" />
<element signature="e#11819#11905#0" /> <element signature="e#11819#11905#0" />
<element signature="e#12107#12803#0" />
<element signature="e#12277#12548#0" />
<element signature="e#12877#13384#0" /> <element signature="e#12877#13384#0" />
<element signature="e#13413#13513#0" /> <element signature="e#13413#13513#0" />
<element signature="e#13575#13737#0" /> <element signature="e#13575#13737#0" />
...@@ -496,5 +396,71 @@ ...@@ -496,5 +396,71 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/model_monitor_PSI_AUC.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="368">
<caret line="337" column="39" selection-start-line="337" selection-start-column="39" selection-end-line="337" selection-end-column="39" />
<folding>
<element signature="e#2839#3050#0" />
<element signature="e#4164#4780#0" />
<element signature="e#4164#4266#1" />
<element signature="e#4344#4615#0" />
<element signature="e#4644#4780#0" />
<element signature="e#4728#4780#0" />
<element signature="e#4832#5298#0" />
<element signature="e#4832#4951#1" />
<element signature="e#4977#5258#0" />
<element signature="e#5397#7442#0" />
<element signature="e#5397#5552#1" />
<element signature="e#6046#6709#0" />
<element signature="e#6226#6285#0" />
<element signature="e#6826#7389#0" />
<element signature="e#6857#6933#0" />
<element signature="e#6976#7389#0" />
<element signature="e#7302#7389#0" />
<element signature="e#7541#9990#0" />
<element signature="e#7541#7696#1" />
<element signature="e#8291#9569#0" />
<element signature="e#8481#8540#0" />
<element signature="e#9160#9430#0" />
<element signature="e#9475#9569#0" />
<element signature="e#9650#9934#0" />
<element signature="e#9689#9766#0" />
<element signature="e#10467#10618#0" />
<element signature="e#10574#10618#0" />
<element signature="e#10925#10937#0" />
<element signature="e#11819#11829#0" />
<element signature="e#12419#12683#0" />
<element signature="e#12465#12683#0" />
<element signature="e#12789#13016#0" />
<element signature="e#13118#13204#0" />
<element signature="e#14176#14668#0" />
<element signature="e#14266#14362#0" />
<element signature="e#14428#14589#0" />
<element signature="e#15686#15837#0" />
<element signature="e#15793#15837#0" />
<element signature="e#16144#16156#0" />
<element signature="e#17236#17246#0" />
<element signature="e#17842#18127#0" />
<element signature="e#17888#18127#0" />
<element signature="e#18233#18457#0" />
<element signature="e#18559#18645#0" />
<element signature="e#18849#19804#0" />
<element signature="e#19019#19549#0" />
<element signature="e#19138#19187#0" />
<element signature="e#19889#19985#0" />
<element signature="e#20051#20305#0" />
<element signature="e#22953#23042#0" />
<element signature="e#23127#23384#0" />
<element signature="e#23189#23322#0" />
<element signature="e#23487#23700#0" />
<element signature="e#23866#24109#0" />
<element signature="e#23903#24104#0" />
<element signature="e#24119#24124#0" />
<element signature="e#21487#21515#0" />
</folding>
</state>
</provider>
</entry>
</component> </component>
</project> </project>
\ No newline at end of file
...@@ -97,6 +97,7 @@ ...@@ -97,6 +97,7 @@
- min_aucr: 最小AUCR, 小于则视为该客群异常. - min_aucr: 最小AUCR, 小于则视为该客群异常.
- 执行run函数. - 执行run函数.
```python ```python
......
...@@ -13,15 +13,17 @@ import pymongo ...@@ -13,15 +13,17 @@ import pymongo
import os import os
import pickle import pickle
import warnings import warnings
warnings.filterwarnings('ignore') import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict from collections import OrderedDict
warnings.filterwarnings('ignore')
class ModelMonitor: class ModelMonitor:
def __init__(self, excel_path='./model_score.xlsx', sheet_name='mongo_model', def __init__(self, excel_path='./model_score.xlsx', sheet_name='mongo_model',
passdue_day=15, save_path='./image/', passdue_day=15, save_path='./image/',
last_month=7, num_month=4, min_user_group=500): num_month=4, min_user_group=500, max_psi=0.1, min_aucr=0.85):
# 考虑到数据库配置基本不变, 所以不设置创建对象时对应输入变量. # 考虑到数据库配置基本不变, 所以不设置创建对象时对应输入变量.
self.mysql_engine = pymysql.connect(host='172.20.6.9', self.mysql_engine = pymysql.connect(host='172.20.6.9',
...@@ -45,9 +47,19 @@ class ModelMonitor: ...@@ -45,9 +47,19 @@ class ModelMonitor:
# 一些定义的常量 # 一些定义的常量
self.passdue_day = passdue_day # 逾期天数, 默认15. self.passdue_day = passdue_day # 逾期天数, 默认15.
self.save_path = save_path # 图片保存位置, 默认./image. self.save_path = save_path # 图片保存位置, 默认./image.
self.last_month = last_month # 取数的最后一个月.
self.num_month = num_month # 取数的月数. self.num_month = num_month # 取数的月数.
self.min_user_group = min_user_group # 最小客群数量. self.min_user_group = min_user_group # 最小客群数量.
self.max_psi = max_psi # 最大PSI, 超过视为异常.
self.min_aucr = min_aucr # 最小AUC比率, 小于视为异常.
# 获取当天日期信息.
self.current_date = (datetime.date.today() + relativedelta(days=-1)).strftime('%Y-%m-%d')
self.response_date = (datetime.date.today() + relativedelta(days=-(31 + self.passdue_day))).strftime('%Y-%m-%d')
self.first_date = (datetime.date.today() + relativedelta(months=-self.num_month + 1)).strftime('%Y-%m-01')
self.current_month = (datetime.date.today() + datetime.timedelta(days=-1)).month
self.response_month = (datetime.date.today() + relativedelta(days=-46)).month
self.first_month = self.current_month - self.num_month + 1
# 将会从数据库中读取的数据. # 将会从数据库中读取的数据.
self.mysql_df = None self.mysql_df = None
...@@ -55,30 +67,51 @@ class ModelMonitor: ...@@ -55,30 +67,51 @@ class ModelMonitor:
self.merge_data = None self.merge_data = None
# 统计数据记录. # 统计数据记录.
psi_cols = ['model_name', 'group_name'] psi_cols = ['模型名称', '客群名称']
auc_cols = ['model_name', 'group_name'] auc_cols = ['模型名称', '客群名称']
for m in range(self.last_month - self.num_month, self.last_month): for m in range(self.first_month, self.current_month + 1):
psi_cols.append(str(m) + 'm_num') psi_cols.append(str(m) + '月数量')
psi_cols.append(str(m) + 'm_psi') psi_cols.append(str(m) + '月PSI')
auc_cols.append(str(m) + 'm_num') auc_cols.append(str(m) + '月数量')
auc_cols.append(str(m) + 'm_auc') auc_cols.append(str(m) + '月AUC')
auc_cols.append(str(m) + '月AUCR')
self.psi_info_df = pd.DataFrame(columns=psi_cols) self.psi_info_df = pd.DataFrame(columns=psi_cols)
self.auc_info_df = pd.DataFrame(columns=auc_cols) self.auc_info_df = pd.DataFrame(columns=auc_cols)
self.na_enough_data_psi_set = set() # 一些新的模型没有足够数据用于统计.
self.na_enough_data_auc_set = set() # 一些新的模型没有足够数据用于统计.
def sql_query(self, sql): def sql_query(self, sql):
'''
连接MySQL数据库, 根据SQL返回数据.
:param sql: str.
:return: DataFrame.
'''
try: try:
return pd.read_sql(sql, self.mysql_engine) return pd.read_sql(sql, self.mysql_engine)
except: except:
print('SQL查询出现错误.') print('SQL查询出现错误.')
def mongo_query(self, condition, fields): def mongo_query(self, condition, fields):
'''
连接MongoDB, 根据查询返回数据.
:param condition: dict
:param fields: dict
:return: DataFrame
'''
try: try:
return pd.DataFrame(list(self.mongo_table.find(condition, fields))) return pd.DataFrame(list(self.mongo_table.find(condition, fields)))
except: except:
print('Mongo查询出现错误.') print('Mongo查询出现错误.')
def int2str(self, x): def int2str(self, x):
'''
将int转换为str, 用于日期.
e.g. 5 --> 05
:param x: int
:return: str.
'''
if x >= 10: if x >= 10:
return str(x) return str(x)
else: else:
...@@ -123,98 +156,101 @@ class ModelMonitor: ...@@ -123,98 +156,101 @@ class ModelMonitor:
return None return None
def helper_psi(self, user_group_name=None, df=None, info_dict=None, field=None): def helper_psi(self, user_group_name=None, df=None, info_dict=None, field=None):
''' '''
信息提取函数. 信息提取函数.
:param user_group_name: str, 客群名称. :param user_group_name: str, 客群名称.
:param df: Dataframe, 对应客群数据. :param df: Dataframe, 对应客群数据.
:return: None. :return: None.
''' '''
print('正在处理%s客群数据.' % user_group_name) print('正在处理%s客群数据.' % user_group_name)
info_dict[user_group_name] = OrderedDict() info_dict[user_group_name] = OrderedDict()
month_list = list(sorted(df['month_label'].unique().tolist())) month_list = list(sorted(df['month_label'].unique().tolist()))
if 0 in month_list: if 0 in month_list:
month_list.remove(0) month_list.remove(0)
df_g = df.groupby(['month_label', 'bins']).agg({field: ['count']}) df_g = df.groupby(['month_label', 'bins']).agg({field: ['count']})
df_g = df_g.reset_index() df_g = df_g.reset_index()
df_g = df_g.sort_values(['month_label', 'bins']) df_g = df_g.sort_values(['month_label', 'bins'])
for i, m in enumerate(month_list): for i, m in enumerate(month_list):
amt_in_bins = df_g.loc[df_g['month_label'] == m, field].values amt_in_bins = df_g.loc[df_g['month_label'] == m, field].values
# 某月样本量小于阈值, 放弃记录信息. # 某月样本量小于阈值, 放弃记录信息.
if amt_in_bins.sum() < self.min_user_group: if amt_in_bins.sum() < self.min_user_group:
print('%d月样本量过小, 放弃提取信息.' % m) print('%d月样本量过小, 放弃提取信息.' % m)
continue continue
info_dict[user_group_name][str(m) + '月'] = {} info_dict[user_group_name][str(m) + '月'] = {}
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum() info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum()
info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱样本占比'] = amt_in_bins / amt_in_bins.sum() info_dict[user_group_name][str(m) + '月']['各分箱样本占比'] = amt_in_bins / amt_in_bins.sum()
print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量'])) print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量']))
# 计算PSI, 以样本量达标的第一个月为基准. # 计算PSI, 以样本量达标的第一个月为基准.
for i, m in enumerate(info_dict[user_group_name]): for i, m in enumerate(info_dict[user_group_name]):
if i == 0: if i == 0:
info_dict[user_group_name][m]['psi'] = 0 info_dict[user_group_name][m]['psi'] = 0
bench_month = m bench_month = m
else:
psi = self.calc_psi(
df_g.loc[df_g['month_label'] == int(bench_month[0]), field],
df_g.loc[df_g['month_label'] == int(m[0]), field])
if psi:
info_dict[user_group_name][m]['psi'] = psi
else: else:
psi = self.calc_psi( info_dict[user_group_name][m]['psi'] = -999
df_g.loc[df_g['month_label'] == int(bench_month[0]), field], print('计算PSI出现错误.')
df_g.loc[df_g['month_label'] == int(m[0]), field]) print('处理完成.')
if psi: print('=' * 40)
info_dict[user_group_name][m]['psi'] = psi
else:
info_dict[user_group_name][m]['psi'] = -999
print('计算PSI出现错误.')
print('处理完成.')
print('='*40)
def helper_auc(self, user_group_name=None, df=None, info_dict=None, field=None): def helper_auc(self, user_group_name=None, df=None, info_dict=None, field=None):
''' '''
信息提取函数. 信息提取函数.
:param user_group_name: str, 客群名称. :param user_group_name: str, 客群名称.
:param df: Dataframe, 对应客群数据. :param df: Dataframe, 对应客群数据.
:return: None. :return: None.
''' '''
print('正在处理%s客群数据.' % user_group_name) print('正在处理%s客群数据.' % user_group_name)
info_dict[user_group_name] = OrderedDict() info_dict[user_group_name] = OrderedDict()
month_list = list(sorted(df['month_label'].unique().tolist())) month_list = list(sorted(df['month_label'].unique().tolist()))
if 0 in month_list: if 0 in month_list:
month_list.remove(0) month_list.remove(0)
df_g = df.groupby(['month_label', 'bins'])['overdue'].agg({'overdue': ['count', 'sum', 'mean']})
df_g = df.groupby(['month_label', 'bins'])['overdue'].agg({'overdue': ['count', 'sum', 'mean']}) df_g.columns = ['_'.join(x) for x in df_g.columns.ravel()]
df_g.columns = ['_'.join(x) for x in df_g.columns.ravel()] df_g = df_g.reset_index()
df_g = df_g.reset_index() df_g = df_g.sort_values(['month_label', 'bins'])
df_g = df_g.sort_values(['month_label', 'bins'])
for i, m in enumerate(month_list):
for i, m in enumerate(month_list): amt_in_bins = df_g.loc[df_g['month_label'] == m, 'overdue_count'].values
amt_in_bins = df_g.loc[df_g['month_label'] == m, 'overdue_count'].values # 某月样本量小于阈值, 放弃记录信息.
# 某月样本量小于阈值, 放弃记录信息. if amt_in_bins.sum() < self.min_user_group:
if amt_in_bins.sum() < self.min_user_group: print('%d月样本量过小, 放弃提取信息.' % m)
print('%d月样本量过小, 放弃提取信息.' % m) continue
continue info_dict[user_group_name][str(m) + '月'] = {}
info_dict[user_group_name][str(m) + '月'] = {}
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum()
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum() info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = df_g.loc[
info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = df_g.loc[df_g['month_label'] == m, 'overdue_sum'].values df_g['month_label'] == m, 'overdue_sum'].values
info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = df_g.loc[df_g['month_label'] == m, 'overdue_mean'].values info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = df_g.loc[
print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量'])) df_g['month_label'] == m, 'overdue_mean'].values
try: print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量']))
info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score(df.loc[(df['month_label'] == m) & (df[field].notna()), 'overdue'], try:
df.loc[(df['month_label'] == m) & (df[field].notna()), field]) info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score(
except: df.loc[(df['month_label'] == m) & (df[field].notna()), 'overdue'],
print('AUC计算发生错误.') df.loc[(df['month_label'] == m) & (df[field].notna()), field])
info_dict[user_group_name][str(m) + '月']['auc'] = -999 except:
print('AUC计算发生错误.')
for i, m in enumerate(info_dict[user_group_name]): info_dict[user_group_name][str(m) + '月']['auc'] = -999
if i == 0: # 基准月.
info_dict[user_group_name][m]['aucR'] = 1
bench_month = m
else:
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / info_dict[user_group_name][bench_month]['auc']
print('处理完成.') for i, m in enumerate(info_dict[user_group_name]):
print('=' * 40) if i == 0: # 基准月.
info_dict[user_group_name][m]['aucR'] = 1
bench_month = m
else:
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / \
info_dict[user_group_name][bench_month]['auc']
print('处理完成.')
print('=' * 40)
def plot_psi(self, field): def plot_psi(self, field):
# 创建文件夹保存图片. # 创建文件夹保存图片.
...@@ -226,14 +262,15 @@ class ModelMonitor: ...@@ -226,14 +262,15 @@ class ModelMonitor:
df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel']].copy() df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel']].copy()
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准. # 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
for m in range(self.last_month - self.num_month, self.last_month): for m in range(self.first_month, self.current_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field]) bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins: if bins:
print('以%d月为基准月.' % m) print('以%d月为基准月.' % m)
break break
if not bins: if not bins:
print('%s 数据时间跨度不足, 放弃画图.' % field) self.na_enough_data_psi_set.add(self.model_feild_name_dict[field])
print('='*40) print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field])
print('=' * 40)
return None return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱. df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱.
...@@ -252,7 +289,6 @@ class ModelMonitor: ...@@ -252,7 +289,6 @@ class ModelMonitor:
# '各分箱样本占比': [...]}}} # '各分箱样本占比': [...]}}}
info_dict = {} info_dict = {}
# 全样本 # 全样本
self.helper_psi('全样本', df_copy, info_dict, field) self.helper_psi('全样本', df_copy, info_dict, field)
# 按申请类型划分. # 按申请类型划分.
...@@ -273,13 +309,15 @@ class ModelMonitor: ...@@ -273,13 +309,15 @@ class ModelMonitor:
app_chan_set = df_copy_g['applied_channel'].unique() app_chan_set = df_copy_g['applied_channel'].unique()
for app_type in app_type_set: for app_type in app_type_set:
for app_chan in app_chan_set: for app_chan in app_chan_set:
if df_copy_g.loc[(df_copy_g['applied_type'] == app_type) & (df_copy_g['applied_channel'] == app_chan)].shape[0] != 0: if df_copy_g.loc[
(df_copy_g['applied_type'] == app_type) & (df_copy_g['applied_channel'] == app_chan)].shape[0] != 0:
user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan) user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan)
del df_copy_g del df_copy_g
## 按划分的客群处理数据. ## 按划分的客群处理数据.
print(user_group_dict)
for user_group_name in user_group_dict: for user_group_name in user_group_dict:
self.helper_psi(user_group_name, df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (df_copy['applied_channel'] == user_group_dict[user_group_name][1])], self.helper_psi(user_group_name, df_copy.loc[
(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])],
info_dict, field) info_dict, field)
# 过滤不包含信息的客群. # 过滤不包含信息的客群.
remove_list = [] remove_list = []
...@@ -291,14 +329,15 @@ class ModelMonitor: ...@@ -291,14 +329,15 @@ class ModelMonitor:
# 画图. # 画图.
print('开始画图.') print('开始画图.')
print('='*40) print('=' * 40)
for user_group_name in info_dict: for user_group_name in info_dict:
print(self.model_feild_name_dict[field] + '-' + user_group_name) print(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.figure(figsize=(16, 8)) plt.figure(figsize=(16, 8))
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
plt.plot(range(len(info_dict[user_group_name][m]['各分箱样本占比'])), plt.plot(range(len(info_dict[user_group_name][m]['各分箱样本占比'])),
info_dict[user_group_name][m]['各分箱样本占比'], [round(x[0], 3) for x in info_dict[user_group_name][m]['各分箱样本占比']],
label='%s PSI: %.3f \n 样本量: %d' % (m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量'])) label='%s PSI: %.3f \n 样本量: %d' % (
m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name) plt.title(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'PSI/' + self.model_feild_name_dict[field] + '-' + user_group_name) plt.savefig(self.save_path + 'PSI/' + self.model_feild_name_dict[field] + '-' + user_group_name)
...@@ -307,11 +346,11 @@ class ModelMonitor: ...@@ -307,11 +346,11 @@ class ModelMonitor:
# 保存统计信息. # 保存统计信息.
for user_group_name in info_dict: for user_group_name in info_dict:
# print(self.model_feild_name_dict[field] + '-' + user_group_name) # print(self.model_feild_name_dict[field] + '-' + user_group_name)
tmp_dict = {'model_name': [self.model_feild_name_dict[field]], tmp_dict = {'模型名称': [self.model_feild_name_dict[field]],
'group_name': [user_group_name]} '客群名称': [user_group_name]}
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
tmp_dict[m[0] + 'm_num'] = [int(info_dict[user_group_name][m]['该月样本量'])] tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + 'm_psi'] = [round(info_dict[user_group_name][m]['psi'], 3)] tmp_dict[m[0] + '月PSI'] = [round(info_dict[user_group_name][m]['psi'], 3)]
self.psi_info_df = self.psi_info_df.append(pd.DataFrame(tmp_dict)) self.psi_info_df = self.psi_info_df.append(pd.DataFrame(tmp_dict))
def plot_auc(self, field): def plot_auc(self, field):
...@@ -321,20 +360,28 @@ class ModelMonitor: ...@@ -321,20 +360,28 @@ class ModelMonitor:
if not os.path.exists(self.save_path + 'AUC/'): if not os.path.exists(self.save_path + 'AUC/'):
os.mkdir(self.save_path + 'AUC/') os.mkdir(self.save_path + 'AUC/')
# 分离数据. # 分离数据.
df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day']].copy() df_copy = self.merge_data[
[field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day', 'applied_at']].copy()
## 筛选出放款, 且逾期表现的数据. ## 筛选出放款, 且逾期表现的数据.
df_copy = df_copy.loc[(df_copy[field].notna()) & (df_copy['month_label'] != self.last_month - 1) & (df_copy[field] > 0) & (df_copy['passdue_day'].notna())] if repr(df_copy['applied_at'].dtype) == "dtype('O')":
df_copy = df_copy.loc[
(df_copy[field].notna()) & (df_copy['applied_at'].apply(lambda x: x[:10]) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
else:
df_copy = df_copy.loc[(df_copy[field].notna()) & (
df_copy['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d')) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准. # 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
for m in range(self.last_month - self.num_month, self.last_month): for m in range(self.first_month, self.response_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field]) bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins: if bins:
print('以%d月为基准月.' % m) print('以%d月为基准月.' % m)
break break
if not bins: if not bins:
print('%s 数据时间跨度不足, 放弃画图.' % field) self.na_enough_data_auc_set.add(self.model_feild_name_dict[field])
print('='*40) print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field])
print('=' * 40)
return None return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱. df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱.
...@@ -357,7 +404,6 @@ class ModelMonitor: ...@@ -357,7 +404,6 @@ class ModelMonitor:
# 'aucR': float}}} # 'aucR': float}}}
info_dict = {} info_dict = {}
# 全样本 # 全样本
self.helper_auc('全样本', df_copy, info_dict, field) self.helper_auc('全样本', df_copy, info_dict, field)
# 按申请类型划分. # 按申请类型划分.
...@@ -383,10 +429,11 @@ class ModelMonitor: ...@@ -383,10 +429,11 @@ class ModelMonitor:
user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan) user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan)
del df_copy_g del df_copy_g
## 按划分的客群处理数据. ## 按划分的客群处理数据.
print(user_group_dict)
for user_group_name in user_group_dict: for user_group_name in user_group_dict:
self.helper_auc(user_group_name, df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & ( self.helper_auc(user_group_name,
df_copy['applied_channel'] == user_group_dict[user_group_name][1])], info_dict, field) df_copy.loc[(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])], info_dict,
field)
# 过滤不包含信息的客群. # 过滤不包含信息的客群.
remove_list = [] remove_list = []
for user_group_name in info_dict: for user_group_name in info_dict:
...@@ -409,7 +456,8 @@ class ModelMonitor: ...@@ -409,7 +456,8 @@ class ModelMonitor:
plt.plot(range(len(info_dict[user_group_name][m]['各分箱逾期率'])), plt.plot(range(len(info_dict[user_group_name][m]['各分箱逾期率'])),
info_dict[user_group_name][m]['各分箱逾期率'], info_dict[user_group_name][m]['各分箱逾期率'],
label='%s AUC: %.3f AUCR: %.3f \n 样本量: %d' % ( label='%s AUC: %.3f AUCR: %.3f \n 样本量: %d' % (
m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'], info_dict[user_group_name][m]['该月样本量'])) m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'],
info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name) plt.title(self.model_feild_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'AUC/' + self.model_feild_name_dict[field] + '-' + user_group_name) plt.savefig(self.save_path + 'AUC/' + self.model_feild_name_dict[field] + '-' + user_group_name)
...@@ -417,28 +465,48 @@ class ModelMonitor: ...@@ -417,28 +465,48 @@ class ModelMonitor:
# 保存统计信息. # 保存统计信息.
for user_group_name in info_dict: for user_group_name in info_dict:
tmp_dict = {'model_name': [self.model_feild_name_dict[field]], tmp_dict = {'模型名称': [self.model_feild_name_dict[field]],
'group_name': [user_group_name]} '客群名称': [user_group_name]}
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
tmp_dict[m[0] + 'm_num'] = [int(info_dict[user_group_name][m]['该月样本量'])] tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
tmp_dict[m[0] + 'm_auc'] = [round(info_dict[user_group_name][m]['auc'], 3)] tmp_dict[m[0] + '月AUC'] = [round(info_dict[user_group_name][m]['auc'], 3)]
tmp_dict[m[0] + '月AUCR'] = [round(info_dict[user_group_name][m]['aucR'], 3)]
self.auc_info_df = self.auc_info_df.append(pd.DataFrame(tmp_dict)) self.auc_info_df = self.auc_info_df.append(pd.DataFrame(tmp_dict))
def abnormal_psi(self):
    """Mark each row of ``self.psi_info_df`` as abnormal or not, based on PSI.

    A row is abnormal when any column whose label contains ``'PSI'`` holds a
    value strictly greater than ``self.max_psi``.  The boolean result is
    written in place to the ``'是否异常'`` column; nothing is returned.

    Missing values (NaN) never count as abnormal, because ``NaN > x`` is
    False.
    """
    # Only string labels can contain the 'PSI' marker; skipping other label
    # types avoids a TypeError from the ``in`` test.
    psi_columns = [col for col in self.psi_info_df.columns
                   if isinstance(col, str) and 'PSI' in col]
    # Column-wise comparison instead of a row-wise ``apply``: a row-wise
    # apply on a frame with no rows yields an empty DataFrame, which cannot
    # be assigned to a single column.  This form also yields an all-False
    # column when there are no PSI columns at all.
    self.psi_info_df['是否异常'] = (
        self.psi_info_df[psi_columns].gt(self.max_psi).any(axis=1))
def abnormal_auc(self):
    """Mark each row of ``self.auc_info_df`` as abnormal or not, based on AUCR.

    A row is abnormal when any column whose label contains ``'AUCR'`` holds
    a value strictly less than ``self.min_aucr``.  Plain ``AUC`` columns are
    not inspected.  The boolean result is written in place to the
    ``'是否异常'`` column; nothing is returned.

    Missing values (NaN) never count as abnormal, because ``NaN < x`` is
    False.
    """
    # Only string labels can contain the 'AUCR' marker; skipping other label
    # types avoids a TypeError from the ``in`` test.
    aucr_columns = [col for col in self.auc_info_df.columns
                    if isinstance(col, str) and 'AUCR' in col]
    # Column-wise comparison instead of a row-wise ``apply``: a row-wise
    # apply on a frame with no rows yields an empty DataFrame, which cannot
    # be assigned to a single column.  This form also yields an all-False
    # column when there are no AUCR columns at all.
    self.auc_info_df['是否异常'] = (
        self.auc_info_df[aucr_columns].lt(self.min_aucr).any(axis=1))
def run(self): def run(self):
# 获取MySQL数据, 取last_month往前num_month个月数据. # 获取MySQL数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
self.mysql_df = self.sql_query('''SELECT order_no, applied_at, self.mysql_df = self.sql_query('''SELECT order_no, applied_at,
applied_type, applied_from, applied_channel, transacted, passdue_day applied_type, applied_from, applied_channel, transacted, passdue_day
FROM risk_analysis FROM risk_analysis
WHERE applied_at > "2019-%s-01 00:00:00" WHERE applied_at >= "%s 00:00:00"
AND applied_at < "2019-%s-01 00:00:00"''' AND applied_at <= "%s 00:00:00"'''
% (self.int2str(self.last_month - self.num_month), self.int2str(self.last_month))) % (self.first_date, self.current_date))
print('MySQL数据获取成功.') print('MySQL数据获取成功.')
# self.mysql_df.to_csv('./mysql_data.csv', index=False) # self.mysql_df.to_csv('./mysql_data.csv', index=False)
# self.mysql_df = pd.read_csv('./mysql_data.csv') # self.mysql_df = pd.read_csv('./mysql_data.csv')
# 获取MongoDB数据, 取last_month往前num_month个月数据. # 获取MongoDB数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
condition = {'wf_created_at': {'$gte': '2019-%s-01 00:00:00' % self.int2str(self.last_month - self.num_month), condition = {'wf_created_at': {'$gte': '%s 00:00:00' % self.first_date,
'$lte': '2019-%s-01 00:00:00' % self.int2str(self.last_month)}} '$lte': '%s 00:00:00' % self.current_date}}
fields = {'wf_biz_no': 1, 'wf_created_at': 1} fields = {'wf_biz_no': 1, 'wf_created_at': 1}
for f in self.model_feild_list: # 加入Excel中预置的模型分名称 for f in self.model_feild_list: # 加入Excel中预置的模型分名称
fields[f] = 1 fields[f] = 1
...@@ -456,8 +524,15 @@ class ModelMonitor: ...@@ -456,8 +524,15 @@ class ModelMonitor:
# 拼接数据. # 拼接数据.
self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df, self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df,
left_on='order_no', right_on='wf_biz_no', how='left') left_on='order_no', right_on='wf_biz_no', how='left')
## 定义逾期用户. ## 定义逾期用户.
self.merge_data['overdue'] = self.merge_data['passdue_day'] > self.passdue_day def overdue(data):
if pd.isnull(data):
return np.nan
else:
return float(data > self.passdue_day)
self.merge_data['overdue'] = self.merge_data['passdue_day'].apply(overdue)
# 清洗数据. # 清洗数据.
def clean_data(data): def clean_data(data):
...@@ -465,6 +540,7 @@ class ModelMonitor: ...@@ -465,6 +540,7 @@ class ModelMonitor:
return float(data) return float(data)
except: except:
return np.nan return np.nan
na_field_list = [] na_field_list = []
for field in self.model_feild_list: for field in self.model_feild_list:
if field in self.merge_data.columns.tolist(): if field in self.merge_data.columns.tolist():
...@@ -475,38 +551,51 @@ class ModelMonitor: ...@@ -475,38 +551,51 @@ class ModelMonitor:
## 去除因为一些原因未抽取到的字段. ## 去除因为一些原因未抽取到的字段.
print('不包含以下字段:') print('不包含以下字段:')
for field in na_field_list: for field in na_field_list:
print(self.model_feild_name_dict[field])
self.model_feild_list.remove(field) self.model_feild_list.remove(field)
self.model_name_list.remove(self.model_feild_name_dict[field]) self.model_name_list.remove(self.model_feild_name_dict[field])
del self.model_feild_name_dict[field] del self.model_feild_name_dict[field]
print(self.model_feild_name_dict[field])
print('数据拼接完成.') print('数据拼接完成.')
# 数据按月划分. # 数据按月划分.
self.merge_data['month_label'] = 0 self.merge_data['month_label'] = 0
for m in range(self.num_month): for m in range(self.first_month, self.current_month + 1):
self.merge_data.loc[ self.merge_data.loc[
(self.merge_data['applied_at'] > '2019-%s-01 00:00:00' % self.int2str(self.last_month - m - 1)) & (self.merge_data['applied_at'] >= '2019-%s-01 00:00:00' % self.int2str(m)) &
(self.merge_data['applied_at'] < '2019-%s-01 00:00:00' % self.int2str(self.last_month - m)), (self.merge_data['applied_at'] < '2019-%s-01 00:00:00' % self.int2str(m + 1)),
'month_label'] = self.last_month - m - 1 'month_label'] = m
# 画图. # 画图.
## AUC.
print('开始画图-AUC.')
for field in self.model_feild_list:
self.plot_auc(field)
## PSI. ## PSI.
print('开始画图-PSI.') print('开始画图-PSI.')
for field in self.model_feild_list: for field in self.model_feild_list:
self.plot_psi(field) self.plot_psi(field)
## AUC. # 输出数据不足的模型.
print('开始画图-AUC.') print('PSI 数据不足以统计的模型:')
for field in self.model_feild_list: for model_name in self.na_enough_data_psi_set:
self.plot_auc(field) print(model_name)
print('=' * 40)
print('AUC 数据不足以统计的模型:')
for model_name in self.na_enough_data_auc_set:
print(model_name)
print('=' * 40)
# 检测是否异常.
self.abnormal_psi()
self.abnormal_auc()
# 保存统计信息. # 保存统计信息.
self.psi_info_df.to_csv('./psi_info.csv', index=False) self.psi_info_df.to_csv('./psi_info.csv', index=False)
self.auc_info_df.to_csv('./auc_info.csv', index=False) self.auc_info_df.to_csv('./auc_info.csv', index=False)
print('统计信息保存成功.') print('统计信息保存成功.')
# if __name__ == '__main__': # if __name__ == '__main__':
# pass # pass
# mm = ModelMonitor(excel_path='./model_score.xlsx', save_path='./image/', last_month=7, num_month=2) # mm = ModelMonitor(excel_path='./model_score.xlsx', save_path='./image/', last_month=7, num_month=2)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment