Commit f0404627 authored by 舒皓月's avatar 舒皓月

重写代码提交 07 26

parent 7b4f202f
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (model_monitor)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7 (model_monitor)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/model_monitor.iml" filepath="$PROJECT_DIR$/.idea/model_monitor.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="1ecd0b9f-60aa-441d-b8e6-0ca91e7a02ef" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py" beforeDir="false" afterPath="$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" beforeDir="false" afterPath="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" beforeDir="false" afterPath="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="5220">
<caret line="311" column="58" selection-start-line="311" selection-start-column="53" selection-end-line="311" selection-end-column="58" />
<folding>
<element signature="e#89#100#0" expanded="true" />
<element signature="e#2551#2626#0" />
<element signature="e#2929#5687#0" />
<element signature="e#5727#5760#0" />
<element signature="e#5950#6075#0" />
<element signature="e#6109#6351#0" />
<element signature="e#6410#6555#0" />
<element signature="e#6592#8303#0" />
<element signature="e#8470#9706#0" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="828">
<caret line="35" column="25" selection-start-line="35" selection-start-column="25" selection-end-line="35" selection-end-column="25" />
<folding>
<element signature="e#89#100#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="576">
<caret line="16" selection-start-line="16" selection-end-line="16" />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>E:</find>
<find>E:\</find>
<find>query_sql</find>
<find>sql_channel</find>
<find>liftchart</find>
<find>plotPSI</find>
<find>dateList</find>
<find>modelList</find>
<find>path</find>
<find>alarm</find>
</findStrings>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/Monitor_mongoDB/monitoring_VLM_mongodb.py" />
<option value="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" />
<option value="$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py" />
<option value="$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py" />
<option value="$PROJECT_DIR$/test.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="310" />
<option name="y" value="135" />
<option name="width" value="1230" />
<option name="height" value="675" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes />
</component>
<component name="PropertiesComponent">
<property name="ASKED_SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="last_opened_file_path" value="D:/work_space/test" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.test">
<configuration name="Monitor_VLM_riskanalysi" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_monitor" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/Monitor_risk_analysis" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_monitor" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.test" />
<item itemvalue="Python.Monitor_VLM_riskanalysi" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="1ecd0b9f-60aa-441d-b8e6-0ca91e7a02ef" name="Default Changelist" comment="" />
<created>1562726148779</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1562726148779</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-7" y="-7" width="1550" height="838" extended-state="6" />
<layout>
<window_info content_ui="combo" id="Project" order="0" weight="0.15587847" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.6090652" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
</layout>
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/psi_from_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-12858">
<caret line="15" column="33" selection-start-line="15" selection-start-column="33" selection-end-line="15" selection-end-column="33" />
<folding>
<element signature="e#50#79#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_mongoDB/monitoring_VLM_mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1014">
<caret line="23" column="44" selection-start-line="23" selection-start-column="44" selection-end-line="23" selection-end-column="44" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/Monitor_VLM_riskanalysi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="5220">
<caret line="311" column="58" selection-start-line="311" selection-start-column="53" selection-end-line="311" selection-end-column="58" />
<folding>
<element signature="e#89#100#0" expanded="true" />
<element signature="e#2551#2626#0" />
<element signature="e#2929#5687#0" />
<element signature="e#5727#5760#0" />
<element signature="e#5950#6075#0" />
<element signature="e#6109#6351#0" />
<element signature="e#6410#6555#0" />
<element signature="e#6592#8303#0" />
<element signature="e#8470#9706#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/Monitor_risk_analysis/PSI&amp;VAL_riskanalysis.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="828">
<caret line="35" column="25" selection-start-line="35" selection-start-column="25" selection-end-line="35" selection-end-column="25" />
<folding>
<element signature="e#89#100#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="576">
<caret line="16" selection-start-line="16" selection-end-line="16" />
</state>
</provider>
</entry>
</component>
</project>
\ No newline at end of file
......@@ -3,43 +3,123 @@
# 项目介绍
- 本项目主要用于风控模型监控.
- 运用统计绘图分析, 来检测入模特征, 客群变化, 模型性能.
- 基本流程:
数据库取数 --> 整合/处理数据 --> 绘图
- 数据库取数 --> 整合/处理数据 --> 绘图 --> 统计
- 主要统计图类别:
- 单变量分析图
- VLM
- PSI
- PSI
- Lift Chart
# 版本信息 - 新代码
- V 0.0.1
基本完成对PSI和Lift Chart关于模型分在MongoDB的重写.
# VLM
- 待重写.
# PSI & Lift Chart
- 因为这两个指标的统计都需要用到模型分, 所以放到一起.
## 代码流程
- 首先对需要计算的模型分, 在指定的统一时间跨度内进行数据抽取(在MySQL和MongoDB中). 包括如下一些主要字段:
- 模型分1, 模型分2, ...
- 订单号, 申请时间, 申请类型, 渠道类型, 逾期天数.
- 根据预定义逾期阈值, 得到是否逾期标签.
- 根据该模型分有记录的第一个月, 计算分箱规则(由模型分等频分箱区间, PSI在申请集上计算, AUC在放款集上计算).
- 分别对全样本, 首申/复申/复贷, 以及各达标客群(客群数量大于预设阈值), 计算每月统计信息:
- PSI:
- 样本量.
- 各分箱样本量
- 各分箱样本量占比
- 以该客群模型分有记录的第一个月为基准的PSI.
- Lift Chart:
- 样本量
- 各分箱样本量
- 各分箱逾期率
- AUC
- 以该客群模型分有记录的第一个月为基准的AUC ratio.
- 统计表格信息, 方便筛选排序查看.
- 包含以下字段:
- 模型分名称.
- a月样本量, b月样本量...
- a月PSI, b月PSI...
- a月AUC, b月AUC...
- NOTE:
- 当某月样本量很小, 或者没有样本时, 标记为NaN. 对应的PSI, AUC也为NaN.
- 当某月样本量比较小, 导致PSI, AUC计算异常(如某些分箱没有样本, 全为非逾期样本), 则标记为-999.
- 基准月的PSI为0.
- 示例:
![PSI](doc/image/C6640ABE-9017-42b5-A92A-2DE5601A15D8.png)
![](doc/image/31EA97A8-19B7-45c6-8302-4148D19BAABA.png)
## 代码使用方法
- 准备一个Excel表格, 其中放置模型分名称, 以及对应的在数据库中的字段名.
- 创建一个模型监控对象(这样你就有对象了).
```python
mm = ModelMonitor(excel_path='./model_score.xlsx', save_path='./image/', last_month=7, num_month=4, min_user_group=200)
```
- excel_path: Excel文件路径.
- save_path: 统计图存放路径.
不用自己再另外手动创建文件夹, 代码会判断文件夹是否存在并创建.
- last_month: 想要统计的最后一个月.
- num_month: 想要统计几个月.
# Univariate Chart
如last_month=7, num_month=3, 表示统计4, 5, 6三个月的信息.
NOTE: AUC的计算逻辑为了保证样本有响应, 在此基础上还要往前推一个月, 会统计4, 5月的信息.
- min_user_group: 最小客群数量.
# PSI
这个参数越大, 颗粒越粗, 最后的统计图(客群数量)会越少.
反之颗粒越小, 最后统计图会越多.
- 执行run函数.
# Lift Chart
```python
mm.run()
```
- 输出
- 图片保存在./image中.
- PSI: ./image/PSI
- Lift Chart: ./image/AUC
- 统计信息.
- PSI统计信息: ./psi_info.csv
- AUC统计信息: ./auc_info.csv
# TODO
- 添加对存在MySQL中模型分计算PSI, AUC的代码.
- 完成对VLM的重写.
# 贡献
- 王家华 (jiahua.wang@quantgroup.cn)
主要作者.
旧代码作者.
- 舒皓月 (haoyue.shu@quantgroup.cn)
维护者.
\ No newline at end of file
新代码作者.
\ No newline at end of file
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import pymysql
import pymongo
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
from collections import OrderedDict
class ModelMonitor:
    """Monthly PSI and Lift-Chart (AUC) monitoring for risk-model scores.

    Workflow (see ``run``): load order data and model scores, label every
    order with its application month, then for every model score listed in
    the Excel sheet draw one PSI chart and one overdue-rate chart per user
    group and collect the per-month numbers into ``psi_info_df`` /
    ``auc_info_df``.

    NOTE: the attribute / Excel column name ``model_feild`` (sic) is kept
    as-is because the Excel sheet and external callers rely on that spelling.
    """

    # Sentinel stored in place of a PSI / AUC value that could not be computed.
    _FAILED = -999

    def __init__(self, excel_path='../model_score.xlsx', sheet_name='mongo_model',
                 passdue_day=15, save_path='../image/',
                 last_month=7, num_month=4, min_user_group=500):
        """Open the database connections and read the model list.

        :param excel_path: Excel file with the columns ``model_name`` and ``model_feild``.
        :param sheet_name: sheet of ``excel_path`` to read.
        :param passdue_day: an order is "overdue" when its passdue_day exceeds this value.
        :param save_path: root directory for the generated charts.
        :param last_month: exclusive upper bound of the month window.
        :param num_month: number of months in the window, i.e. months
            ``last_month - num_month`` .. ``last_month - 1``.
        :param min_user_group: minimum monthly sample size for a group to be reported.
        """
        # The database configuration rarely changes, so it is not a constructor
        # argument; it can be overridden through environment variables.
        # NOTE(review): the fallback values below are credentials committed to
        # version control. They are kept only so existing deployments keep
        # working; rotate them and supply the secrets via the environment.
        self.mysql_engine = pymysql.connect(
            host=os.environ.get('MODEL_MONITOR_MYSQL_HOST', '172.20.6.9'),
            port=int(os.environ.get('MODEL_MONITOR_MYSQL_PORT', 9030)),
            user=os.environ.get('MODEL_MONITOR_MYSQL_USER', 'fengkong_read_only'),
            passwd=os.environ.get('MODEL_MONITOR_MYSQL_PASSWORD', 'mT2HFUgI'),
            db='risk_analysis',
            charset='utf8')
        self.mongo_client = pymongo.MongoClient(os.environ.get(
            'MODEL_MONITOR_MONGO_URI',
            "mongodb://haoyue.shu:x2egwRHk7WhQ4So1@172.18.3.22:27017/?authSource=rc_mgo_feature_dp"))
        self.mongo_db = self.mongo_client['rc_mgo_feature_dp']
        self.mongo_table = self.mongo_db['wf_audit_log_with_feature']

        # Model metadata maintained in the Excel sheet.
        self.model_info_df = pd.read_excel(excel_path, sheet_name=sheet_name)
        self.model_name_list = self.model_info_df.model_name.tolist()
        self.model_feild_list = self.model_info_df.model_feild.tolist()
        self.model_feild_name_dict = dict(zip(self.model_feild_list, self.model_name_list))

        # Configuration constants.
        self.passdue_day = passdue_day
        self.save_path = save_path
        self.last_month = last_month
        self.num_month = num_month
        self.min_user_group = min_user_group

        # Data that ``run`` loads.
        self.mysql_df = None
        self.mongo_df = None
        self.merge_data = None

        # Summary tables: one row per (model, user group), two columns per month.
        psi_cols = ['model_name', 'group_name']
        auc_cols = ['model_name', 'group_name']
        for m in range(self.last_month - self.num_month, self.last_month):
            psi_cols += [str(m) + 'm_num', str(m) + 'm_psi']
            auc_cols += [str(m) + 'm_num', str(m) + 'm_auc']
        self.psi_info_df = pd.DataFrame(columns=psi_cols)
        self.auc_info_df = pd.DataFrame(columns=auc_cols)

    def sql_query(self, sql):
        """Run ``sql`` on the MySQL connection.

        :return: DataFrame with the result, or None when the query fails.
        """
        try:
            return pd.read_sql(sql, self.mysql_engine)
        except Exception as exc:  # best effort: report and let the caller see None
            print('SQL查询出现错误.', exc)
            return None

    def mongo_query(self, condition, fields):
        """Run ``find(condition, fields)`` on the Mongo collection.

        :return: DataFrame with the matching documents, or None when the query fails.
        """
        try:
            return pd.DataFrame(list(self.mongo_table.find(condition, fields)))
        except Exception as exc:  # best effort: report and let the caller see None
            print('Mongo查询出现错误.', exc)
            return None

    def int2str(self, x):
        """Left-pad a number below 10 with a single '0' (7 -> '07', 12 -> '12')."""
        return str(x) if x >= 10 else '0' + str(x)

    def make_bin(self, score_list):
        """Compute equal-frequency (decile) bin edges for a model score.

        :param score_list: pd.Series of scores; NaN values are ignored.
        :return: list of 11 edges (open-ended on both sides), or None when the
            deciles are not all distinct or the computation fails.
        """
        score_list = score_list[score_list.notna()]
        try:
            bins = score_list.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9]).values.tolist()
            bins = [-99999999] + bins + [99999999]
            bins = [x for x in bins if pd.notna(x)]
            # Fewer than 11 distinct edges means duplicated or missing deciles,
            # which pd.cut cannot use.
            if len(set(bins)) < 11:
                return None
            return bins
        except Exception:
            print('分箱出现错误.')
            # Keep the offending input for offline debugging.
            with open('bin_error.pkl', 'wb') as f:
                pickle.dump(score_list, f)
            return None

    def calc_psi(self, array_1, array_2):
        """Population Stability Index between two per-bin count vectors.

        PSI = sum((p1 - p2) * ln(p1 / p2)) with p = counts / counts.sum().
        The natural logarithm is used, as in the standard PSI definition on
        which the usual 0.1 / 0.25 thresholds are based (the previous log10
        version under-reported PSI by a factor of ln(10)).

        :param array_1: pd.Series, per-bin counts of the benchmark month.
        :param array_2: pd.Series, per-bin counts of the compared month.
        :return: PSI, or None when it cannot be computed (shape mismatch,
            empty bins or empty input leading to a non-finite value).
        """
        try:
            base = np.asarray(array_1.values, dtype=float)
            curr = np.asarray(array_2.values, dtype=float)
            with np.errstate(divide='ignore', invalid='ignore'):
                base = base / base.sum()
                curr = curr / curr.sum()
                psi = ((base - curr) * np.log(base / curr)).sum()
        except Exception:
            return None
        # Empty bins produce inf / nan rather than an exception; report them
        # as failures so callers record the -999 sentinel.
        return psi if np.isfinite(psi) else None

    def helper_psi(self, user_group_name=None, df=None, info_dict=None, field=None):
        """Collect per-month bin counts, bin shares and PSI for one user group.

        Fills ``info_dict[user_group_name]`` with one entry per month whose
        sample size reaches ``min_user_group``; the first such month is the
        PSI benchmark (PSI 0).

        :param user_group_name: str, name of the user group.
        :param df: DataFrame of that group with ``field``, 'month_label' and 'bins'.
        :param info_dict: dict that receives the result (mutated in place).
        :param field: str, model-score column.
        :return: None.
        """
        print('正在处理%s客群数据.' % user_group_name)
        group_info = info_dict[user_group_name] = OrderedDict()
        # Month label 0 marks rows outside the monitored window.
        month_list = [m for m in sorted(df['month_label'].unique().tolist()) if m != 0]

        # observed=False keeps every bin for every month, so the count vectors
        # of different months stay aligned bin by bin.
        counts = (df.groupby(['month_label', 'bins'], observed=False)[field]
                    .count()
                    .reset_index(name='bin_count')
                    .sort_values(['month_label', 'bins']))

        for m in month_list:
            amt_in_bins = counts.loc[counts['month_label'] == m, 'bin_count'].values
            month_total = amt_in_bins.sum()
            # Months below the size threshold are not recorded.
            if month_total < self.min_user_group:
                print('%d月样本量过小, 放弃提取信息.' % m)
                continue
            group_info[str(m) + '月'] = {
                '该月样本量': month_total,
                '各分箱样本量': amt_in_bins,
                '各分箱样本占比': amt_in_bins / month_total,
            }
            print('%d月样本量: %d' % (m, month_total))

        # PSI against the first recorded month. The stored count vectors are
        # used directly, so no month number has to be parsed back from the key.
        bench_counts = None
        for i, month_key in enumerate(group_info):
            if i == 0:
                group_info[month_key]['psi'] = 0
                bench_counts = group_info[month_key]['各分箱样本量']
                continue
            psi = self.calc_psi(pd.Series(bench_counts),
                                pd.Series(group_info[month_key]['各分箱样本量']))
            # "is not None": a PSI of exactly 0.0 is a valid result.
            if psi is not None:
                group_info[month_key]['psi'] = psi
            else:
                group_info[month_key]['psi'] = self._FAILED
                print('计算PSI出现错误.')
        print('处理完成.')
        print('=' * 40)

    def helper_auc(self, user_group_name=None, df=None, info_dict=None, field=None):
        """Collect per-month bin counts, overdue rates, AUC and AUC ratio for one user group.

        Fills ``info_dict[user_group_name]`` with one entry per month whose
        sample size reaches ``min_user_group``; the first such month is the
        benchmark for the AUC ratio ('aucR' = 1).

        :param user_group_name: str, name of the user group.
        :param df: DataFrame of that group with ``field``, 'month_label', 'bins', 'overdue'.
        :param info_dict: dict that receives the result (mutated in place).
        :param field: str, model-score column.
        :return: None.
        """
        print('正在处理%s客群数据.' % user_group_name)
        group_info = info_dict[user_group_name] = OrderedDict()
        # Month label 0 marks rows outside the monitored window.
        month_list = [m for m in sorted(df['month_label'].unique().tolist()) if m != 0]

        # A list of aggregations replaces the dict-renaming form that newer
        # pandas rejects; the prefix restores the 'overdue_<agg>' column names.
        stats = (df.groupby(['month_label', 'bins'], observed=False)['overdue']
                   .agg(['count', 'sum', 'mean'])
                   .add_prefix('overdue_')
                   .reset_index()
                   .sort_values(['month_label', 'bins']))

        for m in month_list:
            month_stats = stats.loc[stats['month_label'] == m]
            amt_in_bins = month_stats['overdue_count'].values
            month_total = amt_in_bins.sum()
            # Months below the size threshold are not recorded.
            if month_total < self.min_user_group:
                print('%d月样本量过小, 放弃提取信息.' % m)
                continue
            month_info = group_info[str(m) + '月'] = {
                '该月样本量': month_total,
                '各分箱样本量': amt_in_bins,
                '各分箱逾期样本量': month_stats['overdue_sum'].values,
                '各分箱逾期率': month_stats['overdue_mean'].values,
            }
            print('%d月样本量: %d' % (m, month_total))
            scored = df.loc[(df['month_label'] == m) & df[field].notna()]
            try:
                month_info['auc'] = roc_auc_score(scored['overdue'], scored[field])
            except Exception:  # e.g. only one class present in the month
                print('AUC计算发生错误.')
                month_info['auc'] = self._FAILED

        bench_auc = None
        for i, month_key in enumerate(group_info):
            month_info = group_info[month_key]
            if i == 0:  # benchmark month
                month_info['aucR'] = 1
                bench_auc = month_info['auc']
            elif self._FAILED in (month_info['auc'], bench_auc) or not bench_auc:
                # A ratio involving a failed (or zero) AUC is meaningless.
                month_info['aucR'] = self._FAILED
            else:
                month_info['aucR'] = month_info['auc'] / bench_auc
        print('处理完成.')
        print('=' * 40)

    @staticmethod
    def _month_number(month_key):
        """Return the month digits of an info_dict key ('10月' -> '10')."""
        return month_key[:-1]

    def _ensure_output_dir(self, sub_dir):
        """Create ``save_path/sub_dir`` if needed and return its path."""
        target = os.path.join(self.save_path, sub_dir)
        os.makedirs(target, exist_ok=True)
        return target

    def _base_bins(self, df, field):
        """Return bin edges from the earliest window month that yields valid deciles, else None."""
        for m in range(self.last_month - self.num_month, self.last_month):
            bins = self.make_bin(df.loc[df['month_label'] == m, field])
            if bins:
                print('以%d月为基准月.' % m)
                return bins
        print('%s 数据时间跨度不足, 放弃画图.' % field)
        print('=' * 40)
        return None

    def _major_user_groups(self, df, field, min_total):
        """Return {'<type>-<channel>': (applied_type, applied_channel)} for groups with more than ``min_total`` scored rows."""
        app_type_dict = {1: '首申', 2: '复申', 3: '复贷'}
        sizes = (df.groupby(['applied_type', 'applied_channel'])[field]
                   .count()
                   .sort_values(ascending=False)
                   .reset_index())
        sizes = sizes.loc[sizes[field] > min_total]
        present = set(zip(sizes['applied_type'], sizes['applied_channel']))
        groups = {}
        for app_type in sizes['applied_type'].unique():
            for app_chan in sizes['applied_channel'].unique():
                if (app_type, app_chan) in present:
                    # Unknown application types fall back to their raw code.
                    type_name = app_type_dict.get(app_type, str(app_type))
                    groups[type_name + '-' + app_chan] = (app_type, app_chan)
        return groups

    def _collect_info(self, helper, df, field, min_total):
        """Run ``helper`` on all samples, each application type and every major group.

        :return: info dict restricted to the groups for which at least one month was recorded.
        """
        info_dict = {}
        # All samples.
        helper('全样本', df, info_dict, field)
        # Split by application type.
        for code, label in ((1, '首申-全渠道'), (2, '复申-全渠道'), (3, '复贷-全渠道')):
            helper(label, df.loc[df['applied_type'] == code], info_dict, field)
        # Split by major (type, channel) user groups.
        user_group_dict = self._major_user_groups(df, field, min_total)
        print(user_group_dict)
        for name, (app_type, app_chan) in user_group_dict.items():
            subset = df.loc[(df['applied_type'] == app_type) & (df['applied_channel'] == app_chan)]
            helper(name, subset, info_dict, field)
        return {name: info for name, info in info_dict.items() if info}

    def plot_psi(self, field):
        """Draw the per-group PSI charts of one model score and record the summary rows.

        Charts are saved under ``save_path/PSI``; one row per user group is
        added to ``psi_info_df``.

        :param field: str, model-score column of ``merge_data``.
        :return: None.
        """
        out_dir = self._ensure_output_dir('PSI')
        df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel']].copy()

        # Bin edges come from the earliest month that has usable scores.
        bins = self._base_bins(df_copy, field)
        if not bins:
            return None
        df_copy['bins'] = pd.cut(df_copy[field], bins)

        # info_dict layout:
        # {'全样本': {'3月': {'psi': 0, '该月样本量': int,
        #                     '各分箱样本量': [...], '各分箱样本占比': [...]},
        #             '4月': {'psi': float, ...}}}
        info_dict = self._collect_info(self.helper_psi, df_copy, field,
                                       self.min_user_group * self.num_month)

        print('开始画图.')
        print('=' * 40)
        model_name = self.model_feild_name_dict[field]
        for user_group_name, months in info_dict.items():
            title = model_name + '-' + user_group_name
            print(title)
            plt.figure(figsize=(16, 8))
            for m, info in months.items():
                plt.plot(range(len(info['各分箱样本占比'])),
                         info['各分箱样本占比'],
                         label='%s PSI: %.3f \n 样本量: %d' % (m, info['psi'], info['该月样本量']))
            plt.legend(loc='upper right')
            plt.title(title)
            plt.savefig(os.path.join(out_dir, title))
            plt.show()
            plt.close()  # release the figure; many groups are plotted per run

        # Summary rows.
        rows = []
        for user_group_name, months in info_dict.items():
            row = {'model_name': model_name, 'group_name': user_group_name}
            for m, info in months.items():
                month = self._month_number(m)
                row[month + 'm_num'] = int(info['该月样本量'])
                row[month + 'm_psi'] = round(info['psi'], 3)
            rows.append(row)
        if rows:
            self.psi_info_df = pd.concat([self.psi_info_df, pd.DataFrame(rows)],
                                         ignore_index=True, sort=False)

    def plot_auc(self, field):
        """Draw the per-group Lift Charts (overdue rate per bin) of one model score and record the summary rows.

        Charts are saved under ``save_path/AUC``; one row per user group is
        added to ``auc_info_df``.

        :param field: str, model-score column of ``merge_data``.
        :return: None.
        """
        out_dir = self._ensure_output_dir('AUC')
        df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel',
                                   'overdue', 'passdue_day']].copy()
        # Keep scored rows (positive score) that have a passdue_day value, and
        # leave out the most recent month of the window.
        keep = (df_copy[field].notna()
                & (df_copy['month_label'] != self.last_month - 1)
                & (df_copy[field] > 0)
                & df_copy['passdue_day'].notna())
        df_copy = df_copy.loc[keep].copy()

        # Bin edges come from the earliest month that has usable scores.
        bins = self._base_bins(df_copy, field)
        if not bins:
            return None
        df_copy['bins'] = pd.cut(df_copy[field], bins)

        # info_dict layout:
        # {'全样本': {'3月': {'该月样本量': int, '各分箱样本量': [...],
        #                     '各分箱逾期样本量': [...], '各分箱逾期率': [...],
        #                     'auc': float, 'aucR': 1},
        #             '4月': {..., 'auc': float, 'aucR': float}}}
        info_dict = self._collect_info(self.helper_auc, df_copy, field,
                                       self.min_user_group * (self.num_month - 1))

        print('开始画图.')
        print('=' * 40)
        model_name = self.model_feild_name_dict[field]
        for user_group_name, months in info_dict.items():
            title = model_name + '-' + user_group_name
            print(title)
            plt.figure(figsize=(16, 8))
            for m, info in months.items():
                # Stop drawing this group at the first under-sized month.
                if info['该月样本量'] < self.min_user_group:
                    print('样本量少于阈值, 放弃画图.')
                    break
                plt.plot(range(len(info['各分箱逾期率'])),
                         info['各分箱逾期率'],
                         label='%s AUC: %.3f AUCR: %.3f \n 样本量: %d' % (
                             m, info['auc'], info['aucR'], info['该月样本量']))
            plt.legend(loc='upper right')
            plt.title(title)
            plt.savefig(os.path.join(out_dir, title))
            plt.show()
            plt.close()  # release the figure; many groups are plotted per run

        # Summary rows.
        rows = []
        for user_group_name, months in info_dict.items():
            row = {'model_name': model_name, 'group_name': user_group_name}
            for m, info in months.items():
                month = self._month_number(m)
                row[month + 'm_num'] = int(info['该月样本量'])
                row[month + 'm_auc'] = round(info['auc'], 3)
            rows.append(row)
        if rows:
            self.auc_info_df = pd.concat([self.auc_info_df, pd.DataFrame(rows)],
                                         ignore_index=True, sort=False)

    def run(self, year=2019):
        """Load the data, label months, draw all charts and write the summary CSVs.

        Reads './mysql_data.csv' and './mongo_data.csv' and writes
        './psi_info.csv' and './auc_info.csv'.

        :param year: calendar year used for the month boundaries (default 2019).
        :return: None.
        """
        # MySQL data for the num_month months before last_month. The live
        # query is kept for reference; its result is cached in mysql_data.csv.
        # self.mysql_df = self.sql_query('''SELECT order_no, applied_at,
        # applied_type, applied_from, applied_channel, transacted, passdue_day
        # FROM risk_analysis
        # WHERE applied_at > "2019-%s-01 00:00:00"
        # AND applied_at < "2019-%s-01 00:00:00"'''
        # % (self.int2str(self.last_month - self.num_month), self.int2str(self.last_month)))
        # print('MySQL数据获取成功.')
        # self.mysql_df.to_csv('./mysql_data.csv', index=False)
        self.mysql_df = pd.read_csv('./mysql_data.csv')

        # MongoDB data for the same window. The live query is kept for
        # reference; its result is cached in mongo_data.csv.
        # condition = {'wf_created_at': {'$gte': '2019-%s-01 00:00:00' % self.int2str(self.last_month - self.num_month),
        # '$lte': '2019-%s-01 00:00:00' % self.int2str(self.last_month)}}
        # fields = {'wf_biz_no': 1, 'wf_created_at': 1}
        # for f in self.model_feild_list:  # add the model scores listed in the Excel sheet
        #     fields[f] = 1
        # self.mongo_df = self.mongo_query(condition, fields)
        # print('MongoDB数据获取成功.')
        # self.mongo_df.to_csv('./mongo_data.csv', index=False)
        self.mongo_df = pd.read_csv('./mongo_data.csv')

        # De-duplicate orders: after the ascending sort (NaN last) the row with
        # the smallest passdue_day of each order_no is kept.
        self.mysql_df = self.mysql_df.sort_values('passdue_day')
        self.mysql_df = self.mysql_df.drop_duplicates('order_no', keep='first')
        print('数据去重完成.')

        # Attach the model scores to the orders.
        self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df,
                                   left_on='order_no', right_on='wf_biz_no', how='left')
        # Overdue flag.
        self.merge_data['overdue'] = self.merge_data['passdue_day'] > self.passdue_day
        for field in self.model_feild_list:
            self.merge_data[field] = self.merge_data[field].astype('float')
        print('数据拼接完成.')

        # Month labels; rows outside the window keep the label 0.
        self.merge_data['month_label'] = 0
        for offset in range(self.num_month):
            month = self.last_month - offset - 1
            lower = '%d-%s-01 00:00:00' % (year, self.int2str(month))
            upper = '%d-%s-01 00:00:00' % (year, self.int2str(month + 1))
            # Half-open interval [lower, upper): an order stamped exactly at
            # midnight on the first of a month belongs to that month.
            in_month = ((self.merge_data['applied_at'] >= lower)
                        & (self.merge_data['applied_at'] < upper))
            self.merge_data.loc[in_month, 'month_label'] = month

        # Charts.
        print('开始画图-PSI.')
        for field in self.model_feild_list:
            self.plot_psi(field)
        print('开始画图-AUC.')
        for field in self.model_feild_list:
            self.plot_auc(field)

        # Summary tables.
        self.psi_info_df.to_csv('./psi_info.csv', index=False)
        self.auc_info_df.to_csv('./auc_info.csv', index=False)
        print('统计信息保存成功.')
if __name__ == '__main__':
    # Script entry point: build a monitor for the two months before July.
    # Only the object is created here; run() is not invoked.
    mm = ModelMonitor(
        excel_path='./model_score.xlsx',
        save_path='./image/',
        last_month=7,
        num_month=2,
    )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment