Commit 8d3de4ce authored by 舒皓月's avatar 舒皓月

加入VLM V 0.0.4, 2019 08 01

parent 04453ff3
tmp.py tmp.py
test.py test.py
\ No newline at end of file .idea/
\ No newline at end of file
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
# VLM # VLM
- 待重写. - 重写完成, 代码流程改天写.
# PSI & Lift Chart # PSI & Lift Chart
...@@ -49,9 +49,8 @@ ...@@ -49,9 +49,8 @@
- 某个客群是否异常(AUC明显下降, PSI较大). - 某个客群是否异常(AUC明显下降, PSI较大).
- NOTE: - NOTE:
- 当某月样本量很小, 或者没有样本时, 标记为NaN. 对应的PSI, AUC也为NaN. - 当某月样本量很小, 或者没有样本时, 标记为NaN. 对应的PSI, AUC也为NaN.
- 当某月样本量比较小, 导致PSI, AUC计算异常(如某些分箱没有样本, 全为非逾期样本), 则标记为-999. - 当某月样本量比较小, 导致AUC计算异常(如某些分箱没有样本, 全为非逾期样本), 则标记为-999.
- 基准月的PSI为其与全样本第一个月之间的PSI, 反应该客群与整体的差别. - 对于PSI的计算, 采用平滑处理, 根据PSI计算公式, 一旦某箱为0, 而基准月该分箱只要不为0, 就会导致计算异常, 因此计算PSI时为每箱的占比在原基础上增加0.001, 使计算更加稳定.
- 基准月的AUCR为其与全样本第一个月AUC之间的比率, 反应该客群与整体之间的差别.
## 使用方法 ## 使用方法
...@@ -60,7 +59,10 @@ ...@@ -60,7 +59,10 @@
- 创建一个模型监控对象(这样我们就有对象了^v^). - 创建一个模型监控对象(这样我们就有对象了^v^).
```python ```python
mm = ModelMonitor(excel_path='./model_score.xlsx', sheet_name='mongo_model', save_path='./image/', passdue_day=15, num_month=4, min_user_group=500, max_psi=0.1, min_aucr=0.8, if_load=False, if_save=True) # 创建对象.
mm = ModelMonitor(excel_path='./model_score.xlsx', sheet_name='mongo_model', save_path='./image/', passdue_day=15, num_month=4, min_user_group=1000, max_psi=0.1, min_aucr=0.9, if_read=True, if_save=True)
# 执行run方法.
mm.run()
``` ```
- excel_path: Excel文件路径. - excel_path: Excel文件路径.
...@@ -85,19 +87,14 @@ ...@@ -85,19 +87,14 @@
- min_aucr: 最小AUCR, 小于则视为该客群异常. - min_aucr: 最小AUCR, 小于则视为该客群异常.
- if_load: 是否从本地加载数据. - if_read: 是否从数据库读取数据.
默认为False, 即从数据库抽取数据. 默认为True, 即从数据库读取数据; 设为False时从本地保存的文件加载数据.
- if_save: 是否将数据库抽取的数据保存到本地. - if_save: 是否将数据库抽取的数据保存到本地.
- 执行run函数.
```python
mm.run()
```
- 输出: ## 输出
- 图片保存在./image中. - 图片保存在./image中.
- PSI: ./image/PSI - PSI: ./image/PSI
...@@ -109,9 +106,7 @@ ...@@ -109,9 +106,7 @@
# TODO # TODO
- 添加对存在MySQL中模型分计算PSI, AUC的代码. - 添加对存在MySQL中模型分计算PSI, AUC的代码.
- 完成对VLM的重写. - 增加模型重要特征的VLM.
- 部分(量信分, app模型)模型分报错, 进一步与模型维护者交流, 看是否字段名或者其它地方有问题.
- 对于某些客群在10个分箱中, 部分分箱数量为0导致计算异常的进行细致的处理.
# 版本信息 # 版本信息
...@@ -137,6 +132,13 @@ ...@@ -137,6 +132,13 @@
这样既可以看每个客群随时间AUC的变化, 也可以看到每个客群的AUC与整体的差别. 这样既可以看每个客群随时间AUC的变化, 也可以看到每个客群的AUC与整体的差别.
- 增加数据读写模式的功能, 主要方便debug. - 增加数据读写模式的功能, 主要方便debug.
- V 0.0.4
- 删除基准月AUCR, PSI的计算.
- 重写VLM.
- 增加过滤空跑数据的功能.
- 增强代码鲁棒性.
# 贡献 # 贡献
......
...@@ -23,8 +23,8 @@ warnings.filterwarnings('ignore') ...@@ -23,8 +23,8 @@ warnings.filterwarnings('ignore')
class ModelMonitor: class ModelMonitor:
def __init__(self, excel_path='./model_score.xlsx', sheet_name='mongo_model', def __init__(self, excel_path='./model_score.xlsx', sheet_name='mongo_model',
passdue_day=15, save_path='./image/', passdue_day=15, save_path='./image/',
num_month=4, min_user_group=500, max_psi=0.1, min_aucr=0.85, num_month=4, min_user_group=1000, max_psi=0.1, min_aucr=0.85,
if_save=True, if_load=False): if_save=True, if_read=True):
# 考虑到数据库配置基本不变, 所以不设置创建对象时对应输入变量. # 考虑到数据库配置基本不变, 所以不设置创建对象时对应输入变量.
self.mysql_engine = pymysql.connect(host='172.20.6.9', self.mysql_engine = pymysql.connect(host='172.20.6.9',
...@@ -40,10 +40,17 @@ class ModelMonitor: ...@@ -40,10 +40,17 @@ class ModelMonitor:
self.mongo_table = self.mongo_db['wf_audit_log_with_feature'] self.mongo_table = self.mongo_db['wf_audit_log_with_feature']
# 读取整理在Excel中的模型相关信息. # 读取整理在Excel中的模型相关信息.
self.model_info_df = pd.read_excel(excel_path, sheet_name=sheet_name) self.field_info_df = pd.read_excel(excel_path, sheet_name=sheet_name)
self.model_name_list = self.model_info_df.model_name.tolist() self.field_name_list = self.field_info_df.field_name.tolist()
self.model_feild_list = self.model_info_df.model_feild.tolist() self.field_query_list = self.field_info_df.field_query.tolist()
self.model_feild_name_dict = dict(zip(self.model_feild_list, self.model_name_list)) self.field_app_type_list = self.field_info_df.app_type.tolist()
self.field_app_type_list = [str(x) for x in self.field_app_type_list]
self.field_query_name_dict = dict(zip(self.field_query_list, self.field_name_list))
self.field_query_app_type_dict = dict(zip(self.field_query_list, self.field_app_type_list))
## 空跑信息.
self.na_time = self.field_info_df.na_time.tolist() # 空跑时间段
self.na_app_type = self.field_info_df.na_app_type.tolist() # 空跑申请类型
self.na_app_chan = self.field_info_df.na_app_chan.tolist() # 空跑渠道
# 一些定义的常量 # 一些定义的常量
self.passdue_day = passdue_day # 逾期天数, 默认15. self.passdue_day = passdue_day # 逾期天数, 默认15.
...@@ -56,10 +63,11 @@ class ModelMonitor: ...@@ -56,10 +63,11 @@ class ModelMonitor:
# 获取当天日期信息. # 获取当天日期信息.
self.current_date = (datetime.date.today() + relativedelta(days=-1)).strftime('%Y-%m-%d') self.current_date = (datetime.date.today() + relativedelta(days=-1)).strftime('%Y-%m-%d')
self.response_date = (datetime.date.today() + relativedelta(days=-(31 + self.passdue_day))).strftime('%Y-%m-%d') self.response_date = (datetime.date.today() + relativedelta(days=-(31 + self.passdue_day))).strftime('%Y-%m-%d')
self.first_date = (datetime.date.today() + relativedelta(months=-self.num_month + 1)).strftime('%Y-%m-01') self.first_date = (datetime.date.today() + relativedelta(days=-1) + relativedelta(
months=-self.num_month + 1)).strftime('%Y-%m-01')
self.current_month = (datetime.date.today() + datetime.timedelta(days=-1)).month self.current_month = (datetime.date.today() + datetime.timedelta(days=-1)).month
self.response_month = (datetime.date.today() + relativedelta(days=-46)).month self.response_month = (datetime.date.today() + relativedelta(days=-(31 + self.passdue_day))).month
self.first_month = self.current_month - self.num_month + 1 self.first_month = self.current_month - self.num_month + 1
# 将会从数据库中读取的数据. # 将会从数据库中读取的数据.
...@@ -83,12 +91,14 @@ class ModelMonitor: ...@@ -83,12 +91,14 @@ class ModelMonitor:
self.na_enough_data_psi_set = set() # 一些新的模型没有足够数据用于统计. self.na_enough_data_psi_set = set() # 一些新的模型没有足够数据用于统计.
self.na_enough_data_auc_set = set() # 一些新的模型没有足够数据用于统计. self.na_enough_data_auc_set = set() # 一些新的模型没有足够数据用于统计.
self.filed_bench_bins_ratio = None # 每个模型分在总体样本上第一个月的分箱比例.
self.filed_bench_auc = None # 每个模型分在总体样本上第一个月的AUC
# 程序数据读写模式. # 程序数据读写模式.
self.if_save = if_save # 是否保存从数据库抽取的数据. self.if_save = if_save # 是否保存从数据库抽取的数据.
self.if_load = if_load # 是否从保存的数据加载数据, 而不从数据库读取. self.if_read = if_read # 是否从从数据库读取.
# 分箱方式.
self.bins = None
self.bench_month = None
def sql_query(self, sql): def sql_query(self, sql):
''' '''
...@@ -136,8 +146,10 @@ class ModelMonitor: ...@@ -136,8 +146,10 @@ class ModelMonitor:
bins = score_list.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9]).values.tolist() bins = score_list.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9]).values.tolist()
bins = [-99999999] + bins + [99999999] bins = [-99999999] + bins + [99999999]
bins = [x for x in bins if pd.notna(x)] bins = [x for x in bins if pd.notna(x)]
if len(set(bins)) < 11: bins = list(sorted(list(set(bins))))
return None # print(bins)
# if len(set(bins)) < 11:
# return None
return bins return bins
except: except:
print('分箱出现错误.') print('分箱出现错误.')
...@@ -152,12 +164,68 @@ class ModelMonitor: ...@@ -152,12 +164,68 @@ class ModelMonitor:
:param array_2: array :param array_2: array
:return: PSI :return: PSI
''' '''
# 对array做平滑处理, 防止一些分箱为零的PSI计算异常.
array_1 = array_1 + 0.001
array_2 = array_2 + 0.001
try: try:
psi = ((array_1 - array_2) * np.log10(array_1 / array_2)).sum() psi = ((array_1 - array_2) * np.log10(array_1 / array_2)).sum()
return psi return psi
except: except:
return None return None
def filter_data(self, df, field):
    """Remove dry-run records of ``field`` from ``df``.

    Rows with ``applied_type == 1`` whose ``applied_channel`` contains
    ``'Android'`` are always dropped first.  The dry-run configuration of
    ``field`` is then looked up by its position in ``self.field_query_list``
    (``self.na_time``, ``self.na_app_type``, ``self.na_app_chan``) and every
    row matching ALL configured conditions is dropped:

    * time    -- ``na_time`` is ``'start~end'``; the first 10 characters of
      ``applied_at`` are compared as strings, both bounds inclusive.
    * type    -- each digit of ``int(na_type)`` is one ``applied_type`` value
      (e.g. ``12`` means types 1 and 2); a null value means "any type".
    * channel -- ``na_chan`` is a comma-separated list of substrings matched
      against ``applied_channel``; a null value means "any channel".

    :param df: DataFrame with at least the columns ``applied_type``,
        ``applied_channel`` and ``applied_at``.
    :param field: str, query name of the field; must be in
        ``self.field_query_list``.
    :return: the filtered DataFrame.  An empty ``pd.DataFrame()`` is returned
        when the dry-run period has no end yet (``'start~'``), i.e. the field
        is still running dry and nothing should be recorded.
    :raises ValueError: if ``field`` is unknown or ``na_time`` is not of the
        form ``'start~end'``.
    """
    # astype(str) keeps missing channels from raising on substring tests.
    channel = df['applied_channel'].astype(str)
    is_android = channel.str.contains('Android', regex=False).astype(bool)
    df = df[~((df['applied_type'] == 1) & is_android)]

    field_idx = self.field_query_list.index(field)
    na_time = self.na_time[field_idx]
    na_type = self.na_app_type[field_idx]
    na_chan = self.na_app_chan[field_idx]

    # No dry-run period configured: nothing more to filter.
    if pd.isnull(na_time):
        return df

    parts = str(na_time).split('~')
    if len(parts) != 2:
        raise ValueError("na_time must look like 'start~end', got %r" % (na_time,))
    t_s, t_e = (p.strip() for p in parts)

    # Open-ended period: the field is still running dry, record nothing.
    if not t_e:
        return pd.DataFrame()

    # Build one boolean mask instead of collecting row labels, so duplicate
    # index labels cannot cause unrelated rows to be dropped.
    dates = df['applied_at'].astype(str).str[:10]
    dry_run = ((dates >= t_s) & (dates <= t_e)).astype(bool)

    if not pd.isnull(na_type):
        wanted_types = [int(digit) for digit in str(int(na_type))]
        dry_run &= df['applied_type'].isin(wanted_types)

    if not pd.isnull(na_chan):
        channel = df['applied_channel'].astype(str)
        # Empty tokens (e.g. from a trailing comma) would match every row.
        tokens = [tok.strip() for tok in str(na_chan).split(',') if tok.strip()]
        chan_hit = pd.Series(False, index=df.index)
        for tok in tokens:
            chan_hit = chan_hit | channel.str.contains(tok, regex=False).astype(bool)
        dry_run &= chan_hit

    return df[~dry_run]
def helper_psi(self, user_group_name=None, df=None, info_dict=None, field=None): def helper_psi(self, user_group_name=None, df=None, info_dict=None, field=None):
''' '''
信息提取函数. 信息提取函数.
...@@ -172,35 +240,31 @@ class ModelMonitor: ...@@ -172,35 +240,31 @@ class ModelMonitor:
month_list.remove(0) month_list.remove(0)
df_g = df.groupby(['month_label', 'bins']).agg({field: ['count']}) df_g = df.groupby(['month_label', 'bins']).agg({field: ['count']})
df_g.columns = ['_'.join(x) for x in df_g.columns.ravel()]
df_g = df_g.reset_index() df_g = df_g.reset_index()
df_g = df_g.sort_values(['month_label', 'bins']) df_g = df_g.sort_values(['month_label', 'bins'])
for i, m in enumerate(month_list): for i, m in enumerate(month_list):
amt_in_bins = df_g.loc[df_g['month_label'] == m, field].values if m < self.bench_month:
continue
amt_in_bins = df_g.loc[df_g['month_label'] == m, ['bins', field + '_count']]
amt_in_bins = pd.merge(left=self.bins, right=amt_in_bins, on='bins', how='left')
amt_in_bins[field + '_count'] = amt_in_bins[field + '_count'].fillna(0)
amt_in_bins = amt_in_bins[field + '_count'].values
## 某月样本量小于阈值, 放弃记录信息. ## 某月样本量小于阈值, 放弃记录信息.
if amt_in_bins.sum() < self.min_user_group: if amt_in_bins.sum() < self.min_user_group:
print('%d月样本量过小, 放弃提取信息.' % m) print('%d月样本量过小, 放弃提取信息.' % m)
continue continue
info_dict[user_group_name][str(m) + '月'] = {} info_dict[user_group_name][str(m) + '月'] = {}
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum() info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum()
info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱样本占比'] = np.array([x[0] for x in amt_in_bins / amt_in_bins.sum()]) info_dict[user_group_name][str(m) + '月']['各分箱样本占比'] = amt_in_bins / amt_in_bins.sum()
## 若为某模型分全样本, 则记录分箱比例.
if user_group_name == '全样本' and self.filed_bench_bins_ratio is None:
self.filed_bench_bins_ratio = info_dict[user_group_name][str(m) + '月']['各分箱样本占比']
print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量'])) print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量']))
# 计算PSI, 以样本量达标的第一个月为基准. # 计算PSI, 以样本量达标的第一个月为基准.
for i, m in enumerate(info_dict[user_group_name]): for i, m in enumerate(info_dict[user_group_name]):
if i == 0: if i == 0:
if user_group_name == '全样本': info_dict[user_group_name][m]['psi'] = 0
psi = 0
else:
psi = self.calc_psi(self.filed_bench_bins_ratio, info_dict[user_group_name][m]['各分箱样本占比'])
if psi is not None:
info_dict[user_group_name][m]['psi'] = psi
else:
info_dict[user_group_name][m]['psi'] = -999
print('计算PSI出现错误.')
bench_bins_ratio = info_dict[user_group_name][m]['各分箱样本占比'] bench_bins_ratio = info_dict[user_group_name][m]['各分箱样本占比']
else: else:
psi = self.calc_psi(bench_bins_ratio, info_dict[user_group_name][m]['各分箱样本占比']) psi = self.calc_psi(bench_bins_ratio, info_dict[user_group_name][m]['各分箱样本占比'])
...@@ -231,19 +295,27 @@ class ModelMonitor: ...@@ -231,19 +295,27 @@ class ModelMonitor:
df_g = df_g.sort_values(['month_label', 'bins']) df_g = df_g.sort_values(['month_label', 'bins'])
for i, m in enumerate(month_list): for i, m in enumerate(month_list):
amt_in_bins = df_g.loc[df_g['month_label'] == m, 'overdue_count'].values if m < self.bench_month:
continue
amt_in_bins = df_g.loc[df_g['month_label'] == m, ['bins', 'overdue_count']]
amt_in_bins = pd.merge(left=self.bins, right=amt_in_bins, on='bins', how='left')
amt_in_bins['overdue_count'] = amt_in_bins['overdue_count'].fillna(0)
amt_in_bins = amt_in_bins['overdue_count'].values
# 某月样本量小于阈值, 放弃记录信息. # 某月样本量小于阈值, 放弃记录信息.
if amt_in_bins.sum() < self.min_user_group: if amt_in_bins.sum() < self.min_user_group:
print('%d月样本量过小, 放弃提取信息.' % m) print('%d月样本量过小, 放弃提取信息.' % m)
continue continue
info_dict[user_group_name][str(m) + '月'] = {} info_dict[user_group_name][str(m) + '月'] = {}
info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum() info_dict[user_group_name][str(m) + '月']['该月样本量'] = amt_in_bins.sum()
info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins info_dict[user_group_name][str(m) + '月']['各分箱样本量'] = amt_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = df_g.loc[ overdue_in_bins = df_g.loc[df_g['month_label'] == m, ['bins', 'overdue_sum']]
df_g['month_label'] == m, 'overdue_sum'].values overdue_in_bins = pd.merge(left=self.bins, right=overdue_in_bins, on='bins', how='left')
info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = df_g.loc[ overdue_in_bins['overdue_sum'] = overdue_in_bins['overdue_sum'].fillna(0)
df_g['month_label'] == m, 'overdue_mean'].values overdue_in_bins = overdue_in_bins['overdue_sum'].values
info_dict[user_group_name][str(m) + '月']['各分箱逾期样本量'] = overdue_in_bins
info_dict[user_group_name][str(m) + '月']['各分箱逾期率'] = np.array(
[x if ~np.isnan(x) else 0 for x in overdue_in_bins / amt_in_bins])
print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量'])) print('%d月样本量: %d' % (m, info_dict[user_group_name][str(m) + '月']['该月样本量']))
try: try:
info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score( info_dict[user_group_name][str(m) + '月']['auc'] = roc_auc_score(
...@@ -252,12 +324,9 @@ class ModelMonitor: ...@@ -252,12 +324,9 @@ class ModelMonitor:
except: except:
print('AUC计算发生错误.') print('AUC计算发生错误.')
info_dict[user_group_name][str(m) + '月']['auc'] = -999 info_dict[user_group_name][str(m) + '月']['auc'] = -999
if user_group_name == '全样本' and self.filed_bench_auc is None:
self.filed_bench_auc = info_dict[user_group_name][str(m) + '月']['auc']
for i, m in enumerate(info_dict[user_group_name]): for i, m in enumerate(info_dict[user_group_name]):
if i == 0: # 基准月. if i == 0: # 基准月.
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / self.filed_bench_auc info_dict[user_group_name][m]['aucR'] = 1
bench_month = m bench_month = m
else: else:
info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / \ info_dict[user_group_name][m]['aucR'] = info_dict[user_group_name][m]['auc'] / \
...@@ -273,22 +342,40 @@ class ModelMonitor: ...@@ -273,22 +342,40 @@ class ModelMonitor:
if not os.path.exists(self.save_path + 'PSI/'): if not os.path.exists(self.save_path + 'PSI/'):
os.mkdir(self.save_path + 'PSI/') os.mkdir(self.save_path + 'PSI/')
# 分离数据. # 分离数据.
df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel']].copy() df_copy = self.merge_data[[field, 'month_label', 'applied_type', 'applied_channel', 'applied_at']].copy()
# 选择包含正确申请类型的数据.
tmp_df = pd.DataFrame()
for i in self.field_query_app_type_dict[field]:
tmp_df = tmp_df.append(df_copy[df_copy['applied_type'] == int(i)])
df_copy = tmp_df
df_copy = df_copy[df_copy[field].notna()]
# 过滤空跑数据.
df_copy = self.filter_data(df_copy, field)
if df_copy.shape[0] == 0:
print('仍在空跑.')
return None
print(df_copy['applied_at'].min(), df_copy['applied_at'].max())
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准. # 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
bins = None
for m in range(self.first_month, self.current_month + 1): for m in range(self.first_month, self.current_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field]) if df_copy.loc[df_copy['month_label'] == m, field].shape[0] < self.min_user_group:
if bins: continue
print('%s以%d月为基准月.' % (self.model_feild_name_dict[field], m)) else:
break bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins:
print('%s以%d月为基准月.' % (self.field_query_name_dict[field], m))
self.bench_month = m
break
if not bins: if not bins:
self.na_enough_data_psi_set.add(self.model_feild_name_dict[field]) self.na_enough_data_psi_set.add(self.field_query_name_dict[field])
print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field]) print('%s 数据时间跨度不足, 放弃画图.' % self.field_query_name_dict[field])
print('=' * 40) print('=' * 40)
return None return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱. df_copy['bins'] = pd.cut(df_copy[field], bins, precision=8) # 根据分箱规则进行分箱.
self.bins = pd.Series(df_copy['bins'].unique(), name='bins').sort_values()
self.bins = self.bins.dropna()
# 包含各种信息的字典. # 包含各种信息的字典.
# 如: {'全样本': # 如: {'全样本':
# {'3月': # {'3月':
...@@ -317,21 +404,23 @@ class ModelMonitor: ...@@ -317,21 +404,23 @@ class ModelMonitor:
app_type_dict = {1: '首申', 2: '复申', 3: '复贷'} app_type_dict = {1: '首申', 2: '复申', 3: '复贷'}
df_copy_g = df_copy.groupby(['applied_type', 'applied_channel'])[field].count().sort_values(ascending=False) df_copy_g = df_copy.groupby(['applied_type', 'applied_channel'])[field].count().sort_values(ascending=False)
df_copy_g = df_copy_g.reset_index() df_copy_g = df_copy_g.reset_index()
## 过滤小客群. ## 过滤小客群.
df_copy_g = df_copy_g.loc[df_copy_g[field] > self.min_user_group * self.num_month] df_copy_g = df_copy_g.loc[df_copy_g[field] > self.min_user_group * self.num_month]
app_type_set = df_copy_g['applied_type'].unique()
app_chan_set = df_copy_g['applied_channel'].unique() tmp_df = pd.DataFrame()
for app_type in app_type_set: for i in range(df_copy_g.shape[0]):
for app_chan in app_chan_set: tmp_df = tmp_df.append(df_copy.loc[(df_copy['applied_type'] == df_copy_g.iloc[i]['applied_type']) &
if df_copy_g.loc[ (df_copy['applied_channel'] == df_copy_g.iloc[i]['applied_channel'])])
(df_copy_g['applied_type'] == app_type) & (df_copy_g['applied_channel'] == app_chan)].shape[0] != 0: user_group_dict[
user_group_dict[app_type_dict[app_type] + '-' + app_chan] = (app_type, app_chan) app_type_dict[df_copy_g.iloc[i]['applied_type']] + '-' + df_copy_g.iloc[i]['applied_channel']] = \
(df_copy_g.iloc[i]['applied_type'], df_copy_g.iloc[i]['applied_channel'])
del df_copy_g del df_copy_g
## 按划分的客群处理数据. ## 按划分的客群处理数据.
for user_group_name in user_group_dict: for user_group_name in user_group_dict:
self.helper_psi(user_group_name, df_copy.loc[ self.helper_psi(user_group_name, df_copy.loc[
(df_copy['applied_type'] == user_group_dict[user_group_name][0]) & ( (df_copy['applied_type'] == user_group_dict[user_group_name][0]) & (
df_copy['applied_channel'] == user_group_dict[user_group_name][1])], df_copy['applied_channel'] == user_group_dict[user_group_name][1])],
info_dict, field) info_dict, field)
# 过滤不包含信息的客群. # 过滤不包含信息的客群.
remove_list = [] remove_list = []
...@@ -346,7 +435,7 @@ class ModelMonitor: ...@@ -346,7 +435,7 @@ class ModelMonitor:
print('开始画图.') print('开始画图.')
print('=' * 40) print('=' * 40)
for user_group_name in info_dict: for user_group_name in info_dict:
print(self.model_feild_name_dict[field] + '-' + user_group_name) print(self.field_query_name_dict[field] + '-' + user_group_name)
plt.figure(figsize=(16, 8)) plt.figure(figsize=(16, 8))
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
# print(m) # print(m)
...@@ -355,16 +444,16 @@ class ModelMonitor: ...@@ -355,16 +444,16 @@ class ModelMonitor:
plt.plot(range(len(info_dict[user_group_name][m]['各分箱样本占比'])), plt.plot(range(len(info_dict[user_group_name][m]['各分箱样本占比'])),
[round(x, 3) for x in info_dict[user_group_name][m]['各分箱样本占比']], [round(x, 3) for x in info_dict[user_group_name][m]['各分箱样本占比']],
label='%s PSI: %.3f \n 样本量: %d' % ( label='%s PSI: %.3f \n 样本量: %d' % (
m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量'])) m, info_dict[user_group_name][m]['psi'], info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name) plt.title(self.field_query_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'PSI/' + self.model_feild_name_dict[field] + '-' + user_group_name) plt.savefig(self.save_path + 'PSI/' + self.field_query_name_dict[field] + '-' + user_group_name)
plt.show() plt.show()
# 保存统计信息. # 保存统计信息.
for user_group_name in info_dict: for user_group_name in info_dict:
# print(self.model_feild_name_dict[field] + '-' + user_group_name) # print(self.model_feild_name_dict[field] + '-' + user_group_name)
tmp_dict = {'模型名称': [self.model_feild_name_dict[field]], tmp_dict = {'模型名称': [self.field_query_name_dict[field]],
'客群名称': [user_group_name]} '客群名称': [user_group_name]}
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])] tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
...@@ -380,29 +469,46 @@ class ModelMonitor: ...@@ -380,29 +469,46 @@ class ModelMonitor:
# 分离数据. # 分离数据.
df_copy = self.merge_data[ df_copy = self.merge_data[
[field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day', 'applied_at']].copy() [field, 'month_label', 'applied_type', 'applied_channel', 'overdue', 'passdue_day', 'applied_at']].copy()
df_copy = df_copy[(df_copy['overdue'].notna()) & (df_copy[field].notna())]
tmp_df = pd.DataFrame()
for i in self.field_query_app_type_dict[field]:
tmp_df = tmp_df.append(df_copy[df_copy['applied_type'] == int(i)])
df_copy = tmp_df
# 过滤空跑数据.
df_copy = self.filter_data(df_copy, field)
if df_copy.shape[0] == 0:
print('仍在空跑.')
return None
## 筛选出放款, 且逾期表现的数据. ## 筛选出放款, 且逾期表现的数据.
if repr(df_copy['applied_at'].dtype) == "dtype('O')": if repr(df_copy['applied_at'].dtype) == "dtype('O')":
df_copy = df_copy.loc[ df_copy = df_copy.loc[
(df_copy[field].notna()) & (df_copy['applied_at'].apply(lambda x: x[:10]) <= self.response_date) & ( (df_copy[field].notna()) & (df_copy['applied_at'].apply(lambda x: x[:10]) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())] df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
else: else:
df_copy = df_copy.loc[(df_copy[field].notna()) & ( df_copy = df_copy.loc[(df_copy[field].notna()) & (
df_copy['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d')) <= self.response_date) & ( df_copy['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d')) <= self.response_date) & (
df_copy[field] > 0) & (df_copy['passdue_day'].notna())] df_copy[field] > 0) & (df_copy['passdue_day'].notna())]
# 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准. # 对模型分进行分箱, 选取数据中该模型分最开始的那个月作为基准.
bins = None
for m in range(self.first_month, self.response_month + 1): for m in range(self.first_month, self.response_month + 1):
bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field]) if df_copy.loc[df_copy['month_label'] == m, field].shape[0] < self.min_user_group:
if bins: continue
print('%s以%d月为基准月.' % (self.model_feild_name_dict[field], m)) else:
break bins = self.make_bin(df_copy.loc[df_copy['month_label'] == m, field])
if bins:
print('%s以%d月为基准月.' % (self.field_query_name_dict[field], m))
self.bench_month = m
break
if not bins: if not bins:
self.na_enough_data_auc_set.add(self.model_feild_name_dict[field]) self.na_enough_data_auc_set.add(self.field_query_name_dict[field])
print('%s 数据时间跨度不足, 放弃画图.' % self.model_feild_name_dict[field]) print('%s 数据时间跨度不足, 放弃画图.' % self.field_query_name_dict[field])
print('=' * 40) print('=' * 40)
return None return None
df_copy['bins'] = pd.cut(df_copy[field], bins) # 根据分箱规则进行分箱. df_copy['bins'] = pd.cut(df_copy[field], bins, precision=8) # 根据分箱规则进行分箱.
self.bins = pd.Series(df_copy['bins'].unique(), name='bins').sort_values()
self.bins = self.bins.dropna()
# 包含各种信息的字典. # 包含各种信息的字典.
# 如: {'全样本': # 如: {'全样本':
...@@ -465,7 +571,7 @@ class ModelMonitor: ...@@ -465,7 +571,7 @@ class ModelMonitor:
print('开始画图.') print('开始画图.')
print('=' * 40) print('=' * 40)
for user_group_name in info_dict: for user_group_name in info_dict:
print(self.model_feild_name_dict[field] + '-' + user_group_name) print(self.field_query_name_dict[field] + '-' + user_group_name)
plt.figure(figsize=(16, 8)) plt.figure(figsize=(16, 8))
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
## 若某月数量少, 则放弃画图. ## 若某月数量少, 则放弃画图.
...@@ -478,13 +584,13 @@ class ModelMonitor: ...@@ -478,13 +584,13 @@ class ModelMonitor:
m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'], m, info_dict[user_group_name][m]['auc'], info_dict[user_group_name][m]['aucR'],
info_dict[user_group_name][m]['该月样本量'])) info_dict[user_group_name][m]['该月样本量']))
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.title(self.model_feild_name_dict[field] + '-' + user_group_name) plt.title(self.field_query_name_dict[field] + '-' + user_group_name)
plt.savefig(self.save_path + 'AUC/' + self.model_feild_name_dict[field] + '-' + user_group_name) plt.savefig(self.save_path + 'AUC/' + self.field_query_name_dict[field] + '-' + user_group_name)
plt.show() plt.show()
# 保存统计信息. # 保存统计信息.
for user_group_name in info_dict: for user_group_name in info_dict:
tmp_dict = {'模型名称': [self.model_feild_name_dict[field]], tmp_dict = {'模型名称': [self.field_query_name_dict[field]],
'客群名称': [user_group_name]} '客群名称': [user_group_name]}
for m in info_dict[user_group_name]: for m in info_dict[user_group_name]:
tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])] tmp_dict[m[0] + '月数量'] = [int(info_dict[user_group_name][m]['该月样本量'])]
...@@ -495,17 +601,9 @@ class ModelMonitor: ...@@ -495,17 +601,9 @@ class ModelMonitor:
def abnormal_psi(self): def abnormal_psi(self):
def is_abnormal_psi(data): def is_abnormal_psi(data):
first = True
for idx in data.index: for idx in data.index:
if 'PSI' in idx and pd.notna(data[idx]): if 'PSI' in idx and pd.notna(data[idx]) and data[idx] > self.max_psi:
if first: return True
if data[idx] > self.max_psi * 5:
return True
else:
first = False
else:
if data[idx] > self.max_psi:
return True
return False return False
self.psi_info_df['是否异常'] = self.psi_info_df.apply(is_abnormal_psi, axis=1) self.psi_info_df['是否异常'] = self.psi_info_df.apply(is_abnormal_psi, axis=1)
...@@ -513,7 +611,7 @@ class ModelMonitor: ...@@ -513,7 +611,7 @@ class ModelMonitor:
def abnormal_auc(self): def abnormal_auc(self):
def is_abnormal_auc(data): def is_abnormal_auc(data):
for i in data.index: for i in data.index:
if 'AUCR' in i and data[i] < self.min_aucr: if 'AUCR' in i and pd.notna(data[i]) and data[i] < self.min_aucr:
return True return True
return False return False
...@@ -521,32 +619,41 @@ class ModelMonitor: ...@@ -521,32 +619,41 @@ class ModelMonitor:
def run(self): def run(self):
# 获取MySQL数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月). # 获取MySQL数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
if not self.if_load: if self.if_read:
self.mysql_df = self.sql_query('''SELECT order_no, applied_at, self.mysql_df = self.sql_query('''SELECT order_no, applied_at,
applied_type, applied_from, applied_channel, transacted, passdue_day applied_from, applied_channel, transacted, passdue_day
FROM risk_analysis FROM risk_analysis
WHERE applied_at >= "%s 00:00:00" WHERE applied_at >= "%s 00:00:00"
AND applied_at <= "%s 00:00:00"''' AND applied_at <= "%s 00:00:00"'''
% (self.first_date, self.current_date)) % (self.first_date, datetime.date.today().strftime('%Y-%m-%d')))
print('MySQL数据获取成功.') print('MySQL数据获取成功.')
if self.if_save: if self.if_save:
self.mysql_df.to_csv('./mysql_data.csv', index=False) self.mysql_df.to_csv('./mysql_data.csv', index=False)
else: else:
self.mysql_df = pd.read_csv('./mysql_data.csv') self.mysql_df = pd.read_csv('./mysql_data.csv')
def func_0(data):
    """Convert a raw loan-type value into an ``applied_type`` code.

    The value is parsed as an integer and shifted by one
    (presumably the stored loan type is 0-based while ``applied_type``
    starts at 1 -- confirm against the data source).

    :param data: raw value, e.g. an int, a float or a numeric string.
    :return: ``int(data) + 1``, or ``np.nan`` when ``data`` cannot be
        converted to an integer (None, NaN, infinity, non-numeric text).
    """
    try:
        return int(data) + 1
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "missing"; anything else
        # (e.g. KeyboardInterrupt) must keep propagating.
        return np.nan
# 获取MongoDB数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月). # 获取MongoDB数据, 取近期num_month个月数据(如今天7.27, 则这27天算进7月).
if not self.if_load: if self.if_read:
condition = {'wf_created_at': {'$gte': '%s 00:00:00' % self.first_date, condition = {'wf_created_at': {'$gte': '%s 00:00:00' % self.first_date,
'$lte': '%s 00:00:00' % self.current_date}} '$lte': '%s 00:00:00' % datetime.date.today().strftime('%Y-%m-%d')}}
fields = {'wf_biz_no': 1, 'wf_created_at': 1} fields = {'wf_biz_no': 1, 'wf_created_at': 1, 'wf_loan_type': 1}
for f in self.model_feild_list: # 加入Excel中预置的模型分名称 for f in self.field_query_list: # 加入Excel中预置的模型分名称
fields[f] = 1 fields[f] = 1
self.mongo_df = self.mongo_query(condition, fields) self.mongo_df = self.mongo_query(condition, fields)
self.mongo_df['applied_type'] = self.mongo_df['wf_loan_type'].apply(func_0)
del self.mongo_df['wf_loan_type']
print('MongoDB数据获取成功.') print('MongoDB数据获取成功.')
if self.if_save: if self.if_save:
self.mongo_df.to_csv('./mongo_data.csv', index=False) self.mongo_df.to_csv('./mongo_data.csv', index=False)
else: else:
self.mongo_df = pd.read_csv('./mongo_data.csv') self.mongo_df = pd.read_csv('./mongo_data.csv')
self.mongo_df = self.mongo_df.loc[self.mongo_df['applied_type'].notna()]
# MySQL数据去重. # MySQL数据去重.
self.mysql_df = self.mysql_df.sort_values('passdue_day') self.mysql_df = self.mysql_df.sort_values('passdue_day')
...@@ -555,7 +662,8 @@ class ModelMonitor: ...@@ -555,7 +662,8 @@ class ModelMonitor:
# 拼接数据. # 拼接数据.
self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df, self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df,
left_on='order_no', right_on='wf_biz_no', how='left') left_on='order_no', right_on='wf_biz_no', how='inner')
print('数据拼接完成.')
## 定义逾期用户. ## 定义逾期用户.
def overdue(data): def overdue(data):
...@@ -566,6 +674,13 @@ class ModelMonitor: ...@@ -566,6 +674,13 @@ class ModelMonitor:
self.merge_data['overdue'] = self.merge_data['passdue_day'].apply(overdue) self.merge_data['overdue'] = self.merge_data['passdue_day'].apply(overdue)
# 清洗时间格式, 使其转换成统一的字符串格式.
if repr(self.merge_data['applied_at'].dtype) == "dtype('O')":
self.merge_data['applied_at'] = self.merge_data['applied_at'].apply(lambda x: x[:10])
else:
self.merge_data['applied_at'] = self.merge_data['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d'))
# 清洗数据. # 清洗数据.
def clean_data(data): def clean_data(data):
try: try:
...@@ -574,20 +689,19 @@ class ModelMonitor: ...@@ -574,20 +689,19 @@ class ModelMonitor:
return np.nan return np.nan
na_field_list = [] na_field_list = []
for field in self.model_feild_list: for field in self.field_query_list:
if field in self.merge_data.columns.tolist(): if field in self.merge_data.columns.tolist():
print('正在清洗%s' % self.model_feild_name_dict[field]) print('正在清洗%s' % self.field_query_name_dict[field])
self.merge_data[field] = self.merge_data[field].apply(clean_data) self.merge_data[field] = self.merge_data[field].apply(clean_data)
else: else:
na_field_list.append(field) na_field_list.append(field)
## 去除因为一些原因未抽取到的字段. ## 去除因为一些原因未抽取到的字段.
print('不包含以下字段:') print('不包含以下字段:')
for field in na_field_list: for field in na_field_list:
print(self.model_feild_name_dict[field]) print(self.field_query_name_dict[field])
self.model_feild_list.remove(field) self.field_query_list.remove(field)
self.model_name_list.remove(self.model_feild_name_dict[field]) self.field_name_list.remove(self.field_query_name_dict[field])
del self.model_feild_name_dict[field] del self.field_query_name_dict[field]
print('数据拼接完成.')
# 数据按月划分. # 数据按月划分.
self.merge_data['month_label'] = 0 self.merge_data['month_label'] = 0
...@@ -601,11 +715,24 @@ class ModelMonitor: ...@@ -601,11 +715,24 @@ class ModelMonitor:
## PSI. ## PSI.
print('开始画图-PSI.') print('开始画图-PSI.')
for field in self.model_feild_list: for field in self.field_query_list:
# if self.field_query_name_dict[field] == '新颜v3':
# self.plot_psi(field)
# if self.field_query_name_dict[field] == '短信分':
# self.plot_psi(field)
# if self.field_query_name_dict[field] == '探知':
# self.plot_psi(field)
self.plot_psi(field) self.plot_psi(field)
## AUC. ## AUC.
print('开始画图-AUC.') print('开始画图-AUC.')
for field in self.model_feild_list: for field in self.field_query_list:
# if self.model_feild_name_dict[field] == '短信分':
# self.plot_auc(field)
# if self.model_feild_name_dict[field] == '百融v1':
# self.plot_auc(field)
self.plot_auc(field) self.plot_auc(field)
# 输出数据不足的模型. # 输出数据不足的模型.
......
# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict
from scipy.stats import norm
import pymysql
import pymongo
import warnings
# Suppress every warning raised during a run (applies process-wide).
warnings.filterwarnings('ignore')
# Matplotlib defaults used by all figures below.
# 'SimHei' is requested as the sans-serif font; presumably so the Chinese
# titles/labels drawn in plot() render correctly -- requires the font to be installed.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as a plain ASCII hyphen instead of the unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# Resolution (dots per inch) used by plt.savefig.
plt.rcParams['savefig.dpi'] = 100
class ModelMonitorVLM:
def __init__(self, excel_path='./model_score.xlsx',
             sheet_name='mongo_model',
             fig_save_path='./image/', info_save_path='./info/',
             data_save_path='./data/',
             if_read=True, if_save=True,
             alpha=0.01, min_user_group=10000):
    '''
    Set up database handles, load the field configuration and prepare output folders.

    :param excel_path: str, path of the Excel workbook describing the monitored fields.
    :param sheet_name: str, sheet of the workbook to read.
    :param fig_save_path: str, root folder for figures (must end with a path separator,
                          sub-folders are built by plain string concatenation).
    :param info_save_path: str, folder for the summary csv / pickle (trailing separator required).
    :param data_save_path: str, folder for cached raw data (trailing separator required).
    :param if_read: bool, True to pull data from the databases, False to load the cached csv files.
    :param if_save: bool, when pulling from the databases, also cache the result as csv.
    :param alpha: float, significance level passed to the Mann-Kendall trend test.
    :param min_user_group: int, minimum sample count (per 30 days) for a group to get its own analysis.
    '''
    # The database configuration is fixed, so it is not exposed as constructor arguments.
    # NOTE(review): credentials are hard-coded in source control; they should be rotated
    # and loaded from the environment or a secrets store instead.
    self.mysql_engine = pymysql.connect(host='172.20.6.9',
                                        port=9030,
                                        user='fengkong_read_only',
                                        passwd='mT2HFUgI',
                                        db='risk_analysis',
                                        charset='utf8')
    self.mongo_client = pymongo.MongoClient(
        "mongodb://haoyue.shu:x2egwRHk7WhQ4So1@172.18.3.22:27017/?authSource=rc_mgo_feature_dp")
    self.mongo_db = self.mongo_client['rc_mgo_feature_dp']
    self.mongo_table = self.mongo_db['wf_audit_log_with_feature']

    # Field metadata maintained in the Excel sheet. The lists below are parallel:
    # position i of each list describes the same field.
    self.field_info_df = pd.read_excel(excel_path, sheet_name=sheet_name)
    self.field_name_list = self.field_info_df.field_name.tolist()
    self.field_query_list = self.field_info_df.field_query.tolist()
    self.field_app_type_list = [str(x) for x in self.field_info_df.app_type.tolist()]
    self.field_query_name_dict = dict(zip(self.field_query_list, self.field_name_list))
    self.field_query_app_type_dict = dict(zip(self.field_query_list, self.field_app_type_list))
    ## Dry-run ("空跑") configuration, also parallel to field_query_list.
    self.na_time = self.field_info_df.na_time.tolist()  # dry-run period, 'start~end'
    self.na_app_type = self.field_info_df.na_app_type.tolist()  # dry-run application types
    self.na_app_chan = self.field_info_df.na_app_chan.tolist()  # dry-run channels

    # Output locations.
    self.fig_save_path = fig_save_path
    self.info_save_path = info_save_path
    self.data_save_path = data_save_path
    # makedirs creates missing parents and tolerates folders that already exist.
    for folder in (fig_save_path,
                   fig_save_path + 'image/',
                   fig_save_path + 'over_3std/',
                   fig_save_path + 'trend/',
                   info_save_path,
                   data_save_path):
        os.makedirs(folder, exist_ok=True)

    # Significance level of the MK test.
    self.alpha = alpha

    # Data holders, filled by run().
    self.mysql_df = None
    self.mongo_df = None
    self.merge_data = None

    # Data mode.
    self.if_read = if_read
    self.if_save = if_save

    # Date window: from the first day of the month two months ago up to yesterday.
    today = datetime.date.today()
    self.current_date = (today + relativedelta(days=-1)).strftime('%Y-%m-%d')
    self.third_date = today.strftime('%Y-%m-01')
    self.second_date = (today + relativedelta(months=-1)).strftime('%Y-%m-01')
    self.first_date = (today + relativedelta(months=-2)).strftime('%Y-%m-01')
    self.current_month = (today + datetime.timedelta(days=-1)).month
    self.second_month = (today + relativedelta(months=-1)).month
    self.first_month = (today + relativedelta(months=-2)).month
    self.num_day = len(pd.date_range(self.first_date, self.current_date))

    # Per-group statistics, filled by process_data_helper():
    # {group_name_0: {time_list: [...],
    #                 value_list: [...],
    #                 count_list: [...],
    #                 miss_rate_list: [...],
    #                 zero_rate_list: [...],
    #                 mean: float,
    #                 std: float,
    #                 trend: [...],
    #                 h: [...],
    #                 p: [...],
    #                 over_3std: bool}
    self.vlm_info_dict = OrderedDict()
    self.vlm_info_df = None
    self.min_user_group = min_user_group
def query_mysql(self, sql):
    '''
    Run a SQL statement on the MySQL connection and return the result.

    Best effort: a failed query is reported on stdout and yields None, so
    callers must check the result before using it.

    :param sql: str.
    :return: DataFrame, or None when the query failed.
    '''
    try:
        return pd.read_sql(sql, self.mysql_engine)
    except Exception as err:  # keep KeyboardInterrupt / SystemExit propagating
        print('SQL查询出现错误.', err)
        return None
def query_mongo(self, condition, fields):
    '''
    Run a find() on the MongoDB collection and return the documents as a table.

    Best effort: a failed query is reported on stdout and yields None, so
    callers must check the result before using it.

    :param condition: dict, query filter.
    :param fields: dict, projection.
    :return: DataFrame, or None when the query failed.
    '''
    try:
        return pd.DataFrame(list(self.mongo_table.find(condition, fields)))
    except Exception as err:  # keep KeyboardInterrupt / SystemExit propagating
        print('Mongo查询出现错误.', err)
        return None
def filter_data(self, df, field):
    '''
    Remove dry-run ("空跑") records of a field.

    First-application (type 1) rows coming from an Android channel are always
    dropped. Then, according to the field's dry-run configuration:
      * no dry-run period configured -> nothing else is removed;
      * period has no end date (still dry-running) -> an empty DataFrame is returned;
      * otherwise rows inside the period are removed, narrowed to the configured
        application types and, only if types are configured, to the configured channels.

    :param df: df, needs columns applied_at (str starting with YYYY-MM-DD),
               applied_type and applied_channel.
    :param field: str, field name (an entry of self.field_query_list).
    :return: df, the filtered data.
    '''
    def channel_has(frame, tokens):
        # True where the channel is a string containing any of the tokens;
        # missing / non-string channels never match.
        return frame['applied_channel'].apply(
            lambda chan: isinstance(chan, str) and any(tok in chan for tok in tokens)
        ).astype(bool)

    df = df[~((df['applied_type'] == 1) & channel_has(df, ['Android']))]

    field_idx = self.field_query_list.index(field)
    na_time = self.na_time[field_idx]
    na_type = self.na_app_type[field_idx]
    na_chan = self.na_app_chan[field_idx]

    if pd.isnull(na_time):  # no dry-run period recorded
        return df

    # Period is written as 'start~end'; an empty end means the field is still dry-running.
    t_s, t_e = na_time.split('~')
    if len(t_e) == 0:
        return pd.DataFrame()

    # Rows are selected with boolean masks on the rows themselves, so repeated
    # index labels cannot cause unrelated rows to be dropped.
    na_mask = df['applied_at'].apply(lambda day: t_s <= day[:10] <= t_e).astype(bool)

    if not pd.isnull(na_type):
        # Types are encoded as the digits of one number, e.g. 12 -> types 1 and 2.
        dry_types = {int(digit) for digit in str(int(na_type))}
        na_mask &= df['applied_type'].isin(dry_types)
        # Channels are only consulted when types are configured (original behaviour).
        if not pd.isnull(na_chan):
            na_mask &= channel_has(df, na_chan.split(','))

    return df[~na_mask]
def mk_test(self, x, alpha=0.01):
    '''
    Mann-Kendall trend test for a time series.

    :param x: sequence of numbers in time order (list, tuple, ndarray or Series).
    :param alpha: float, significance level of the two-sided test.
    :return: (trend, h, p, z) -- trend is 'increasing', 'decreasing' or 'no trend',
             h tells whether the trend is significant, p is the two-sided p-value
             and z the normalised test statistic.
    '''
    values = np.asarray(x, dtype=float)
    n = len(values)

    # S statistic: sum of sign(x[j] - x[k]) over every pair k < j.
    earlier, later = np.triu_indices(n, k=1)
    s = np.sign(values[later] - values[earlier]).sum()

    # Variance of S with the tie correction; groups of size 1 contribute 0,
    # so the same formula covers the no-tie case.
    _, tie_counts = np.unique(values, return_counts=True)
    tie_term = np.sum(tie_counts * (tie_counts - 1) * (2 * tie_counts + 5))
    var_s = (n * (n - 1) * (2 * n + 5) - tie_term) / 18

    # Continuity-corrected z score.
    if s > 0:
        z = (s - 1) / np.sqrt(var_s)
    elif s < 0:
        z = (s + 1) / np.sqrt(var_s)
    else:  # s == 0 (or undefined because of NaN input)
        z = 0

    p = 2 * (1 - norm.cdf(abs(z)))  # two tail test
    h = abs(z) > norm.ppf(1 - alpha / 2)

    if (z < 0) and h:
        trend = 'decreasing'
    elif (z > 0) and h:
        trend = 'increasing'
    else:
        trend = 'no trend'
    return trend, h, p, z
def process_data_helper(self, group_name=None, df=None, field=None):
    '''
    Compute the daily statistics and trend information of one field for one
    user group and store them in self.vlm_info_dict[group_name].

    :param group_name: str, key under which the result is stored.
    :param df: df, rows of the group; needs columns applied_at (YYYY-MM-DD str) and field.
    :param field: str, name of the numeric column to analyse.
    :return: None.
    '''
    print('正在处理%s' % group_name)
    # Continuous list of days covering the whole monitoring window.
    date_list = pd.date_range(self.first_date, self.current_date).strftime('%Y-%m-%d').values.tolist()

    # Work on a private copy so the caller's frame is never modified.
    df = df[['applied_at', field]].copy()
    # Negative values and values above 999999 are special codes: treat them as missing.
    df[field] = df[field].where((df[field] >= 0) & (df[field] <= 999999))

    # Per-day mean, row count (missing included), missing rate and zero rate.
    def count(data):
        return len(data)

    def miss_rate(data):
        return data.isnull().mean()

    def zero_rate(data):
        return (data == 0).mean()

    df_g = df.groupby('applied_at').agg({field: ['mean', count, miss_rate, zero_rate]})
    df_g.columns = ['_'.join(x) for x in df_g.columns]
    df_g = df_g.reset_index()

    # Days without data (and days whose mean is undefined) get 0 for every statistic.
    tmp_df = pd.DataFrame({'applied_at': date_list})
    df_g = pd.merge(left=tmp_df, right=df_g, on='applied_at', how='left')
    df_g = df_g.fillna(0)

    mean_col = field + '_mean'
    # Overall mean over all rows of the group.
    mean = df[field].mean()
    # Standard deviation of the *daily means* (zero-filled days included).
    std = df_g[mean_col].std()

    # Trend of the whole window and of each calendar month inside it.
    days = df_g['applied_at']
    trend_0, h_0, p_0, z_0 = self.mk_test(df_g[mean_col], alpha=self.alpha)  # whole window
    trend_1, h_1, p_1, z_1 = self.mk_test(df_g.loc[days < self.second_date, mean_col],
                                          alpha=self.alpha)  # month -2
    trend_2, h_2, p_2, z_2 = self.mk_test(
        df_g.loc[(days >= self.second_date) & (days < self.third_date), mean_col],
        alpha=self.alpha)  # month -1
    if datetime.date.today().day > 15:  # current month, only once it has more than 15 days
        trend_3, h_3, p_3, z_3 = self.mk_test(df_g.loc[days >= self.third_date, mean_col],
                                              alpha=self.alpha)
    else:
        trend_3, h_3, p_3, z_3 = 'NaN', np.nan, np.nan, np.nan

    value_list = df_g[mean_col].values.tolist()
    # Flag the group when any daily mean leaves the mean +/- 3 std band.
    over_3std = any(x > mean + 3 * std or x < mean - 3 * std for x in value_list)

    self.vlm_info_dict[group_name] = {
        'time_list': date_list,
        'value_list': value_list,
        'count_list': df_g[field + '_count'].values.tolist(),
        'miss_rate_list': df_g[field + '_miss_rate'].values.tolist(),
        'zero_rate_list': df_g[field + '_zero_rate'].values.tolist(),
        'mean': mean,
        'std': std,
        'trend': [trend_0, trend_1, trend_2, trend_3],
        'h': [h_0, h_1, h_2, h_3],
        'p': [p_0, p_1, p_2, p_3],
        'over_3std': over_3std,
    }
def process_data(self, field):
    '''
    Analyse one field for every user group it covers.

    Steps: drop dry-run rows, keep only the application types configured for
    the field, keep only (type, channel) groups with more than 100 non-null
    values, then run process_data_helper for the whole sample, for each
    application type that is large enough and for each large (type, channel) group.

    :param field: str, the field to process.
    :return: None.
    '''
    # Remove dry-run data.
    df_copy = self.merge_data[['applied_at', 'applied_type', 'applied_channel', field]].copy()
    df_copy = self.filter_data(df_copy, field)
    if df_copy.shape[0] == 0:
        print('%s还在空跑.' % self.field_query_name_dict[field])
        return None

    field_name = self.field_query_name_dict[field]
    # A group needs this many rows (min_user_group scaled to the window length)
    # to be analysed on its own.
    main_threshold = int(self.min_user_group * self.num_day / 30)

    # Application types are configured as a digit string, e.g. '123'.
    covered_types = {int(digit) for digit in self.field_query_app_type_dict[field]}
    df_copy = df_copy[df_copy['applied_type'].isin(covered_types)]

    # Collect covered groups, e.g. user_group_dict = {'首申-融360': (1, 融360)}.
    user_group_dict = {}
    main_user_group_dict = {}
    app_type_dict = {1: '首申', 2: '复申', 3: '复贷'}
    group_sizes = (df_copy.groupby(['applied_type', 'applied_channel'])[field]
                   .count()
                   .sort_values(ascending=False)
                   .reset_index())
    group_sizes = group_sizes.loc[group_sizes[field] > 100]  # drop groups that are too small
    for app_type, app_chan, size in zip(group_sizes['applied_type'],
                                        group_sizes['applied_channel'],
                                        group_sizes[field]):
        label = app_type_dict[app_type] + '-' + app_chan
        user_group_dict[label] = (app_type, app_chan)
        if size > main_threshold:
            main_user_group_dict[label] = (app_type, app_chan)
        print(app_type, app_chan)

    # Without any covered group there is nothing to analyse.
    if not user_group_dict:
        print('%s没有样本量足够的客群.' % field_name)
        return None

    # Keep only the rows that belong to a covered group.
    covered_pairs = set(user_group_dict.values())
    in_covered_group = [pair in covered_pairs
                        for pair in zip(df_copy['applied_type'], df_copy['applied_channel'])]
    df_copy = df_copy.loc[in_covered_group]

    ## Whole covered sample.
    self.process_data_helper(group_name=field_name + '-全样本', df=df_copy, field=field)

    ## Each application type, when it has enough rows.
    for app_type, type_label in app_type_dict.items():
        type_df = df_copy.loc[df_copy['applied_type'] == app_type]
        if type_df.shape[0] > main_threshold:
            self.process_data_helper(group_name=field_name + '-' + type_label,
                                     df=type_df,
                                     field=field)

    ## Each large (type, channel) group.
    for user_group_name, (app_type, app_chan) in main_user_group_dict.items():
        group_df = df_copy.loc[(df_copy['applied_type'] == app_type) &
                               (df_copy['applied_channel'] == app_chan)]
        self.process_data_helper(group_name=field_name + '-' + user_group_name,
                                 df=group_df,
                                 field=field)
def plot(self):
    '''
    Draw one figure per group stored in self.vlm_info_dict.

    Each figure shows the daily mean, the overall mean, the mean +/- 3 std band,
    the trend summary and a table with the daily value / count / missing rate /
    zero rate. Figures with a significant overall trend go to 'trend/', figures
    leaving the 3 std band go to 'over_3std/', the rest to 'image/'.
    '''
    # Month number of the current month; wraps December -> January instead of giving 13.
    third_month = self.second_month % 12 + 1
    day_axis = range(self.num_day)
    last_day = self.num_day - 1

    for user_group_name, info in self.vlm_info_dict.items():
        print(user_group_name)
        mean = info['mean']
        std = info['std']
        fig = plt.figure(figsize=(30, 15))

        # Main line.
        plt.plot(day_axis, info['value_list'])

        # Trend summary.
        plt.text(x=0.9, y=0.75, s='整体趋势: %s\n'
                                  '%d月趋势: %s\n'
                                  '%d月趋势: %s\n'
                                  '%d月趋势: %s\n' % (info['trend'][0],
                                                   self.first_month,
                                                   info['trend'][1],
                                                   self.second_month,
                                                   info['trend'][2],
                                                   third_month,
                                                   info['trend'][3]),
                 fontsize=15, transform=plt.gca().transAxes)

        # Mean and the +/- 3 std band.
        plt.hlines(y=mean, xmin=0, xmax=last_day, colors='k', linestyles='--')
        plt.hlines(y=mean + 3 * std, xmin=0, xmax=last_day, colors='r', linestyles='--')
        plt.hlines(y=mean - 3 * std, xmin=0, xmax=last_day, colors='r', linestyles='--')

        # Table under the axes.
        cell_text = [[str(round(x, 3)) for x in info['value_list']],
                     [str(int(x)) for x in info['count_list']],
                     [str(100 * x)[:4] + '%' for x in info['miss_rate_list']],
                     [str(100 * x)[:4] + '%' for x in info['zero_rate_list']]]
        rows = ['value', 'count', 'miss_rate', 'zero_rate']
        cols = [x[5:] for x in info['time_list']]  # strip the 'YYYY-' prefix
        plt.table(cellText=cell_text,
                  rowLabels=rows,
                  colLabels=cols,
                  colWidths=[0.91 / (self.num_day - 1)] * self.num_day,
                  loc='bottom')
        plt.subplots_adjust(left=.1, bottom=.15)

        # Dashed drop lines from each point down to below the lower band.
        plt.vlines(x=day_axis,
                   ymin=[mean - 3.5 * std] * self.num_day,
                   ymax=info['value_list'],
                   colors='lightgrey',
                   linestyles='--')

        plt.title(user_group_name + '-mean')
        plt.grid()
        plt.xticks([])

        # Save into the folder(s) matching the alerts raised for the group.
        is_save = False
        if info['h'][0]:
            plt.savefig(self.fig_save_path + 'trend/' + user_group_name + '-mean')
            is_save = True
        if info['over_3std']:
            plt.savefig(self.fig_save_path + 'over_3std/' + user_group_name + '-mean')
            is_save = True
        if not is_save:
            plt.savefig(self.fig_save_path + 'image/' + user_group_name + '-mean')
        plt.show()
        # Release the figure; otherwise every group keeps a 30x15 inch canvas in memory.
        plt.close(fig)
def save_vlm_info(self):
    '''
    Persist the analysis results.

    Builds self.vlm_info_df (one row per group with its trend / h / p values
    and the over_3std flag), writes it to '<info_save_path>vlm_info.csv' and
    pickles the full self.vlm_info_dict to '<info_save_path>vlm_info.dict'.
    '''
    columns = ['group_name',
               'trend_0', 'trend_1', 'trend_2', 'trend_3',
               'h_0', 'h_1', 'h_2', 'h_3',
               'p_0', 'p_1', 'p_2', 'p_3',
               'over_3std']
    rows = []
    for group_name, info in self.vlm_info_dict.items():
        row = {'group_name': group_name, 'over_3std': info['over_3std']}
        # Positions 0..3 are: whole window, month -2, month -1, current month.
        for i in range(4):
            row['trend_%d' % i] = info['trend'][i]
            row['h_%d' % i] = info['h'][i]
            row['p_%d' % i] = info['p'][i]
        rows.append(row)
    # Build the table in one go (DataFrame.append no longer exists in pandas 2).
    self.vlm_info_df = pd.DataFrame(rows, columns=columns)
    self.vlm_info_df.to_csv(self.info_save_path + 'vlm_info.csv', index=False)
    with open(self.info_save_path + 'vlm_info.dict', 'wb') as f:
        pickle.dump(self.vlm_info_dict, f)
def run(self):
    '''
    Execute the whole monitoring pipeline.

    Loads the MySQL and Mongo data (from the databases when self.if_read is
    True, otherwise from the cached csv files), de-duplicates and joins them,
    cleans the monitored fields, then processes every field, draws the figures
    and saves the summary.

    :raises RuntimeError: when a database query failed and returned no data.
    '''
    today_str = datetime.date.today().strftime('%Y-%m-%d')

    # Load the MySQL order data.
    if self.if_read:
        self.mysql_df = self.query_mysql('''SELECT order_no, applied_at,
                                            applied_from, applied_channel, passdue_day
                                            FROM risk_analysis
                                            WHERE applied_at >= "%s 00:00:00"
                                            AND applied_at <= "%s 00:00:00"'''
                                         % (self.first_date, today_str))
        if self.mysql_df is None:
            raise RuntimeError('MySQL数据获取失败.')
        if self.if_save:
            self.mysql_df.to_csv(self.data_save_path + 'mysql_data.csv', index=False)
    else:
        self.mysql_df = pd.read_csv(self.data_save_path + 'mysql_data.csv')
    print('MySQL数据获取成功.')

    def func_0(data):
        # applied_type is wf_loan_type shifted by one; unparsable values become NaN.
        try:
            return int(int(data) + 1)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    # Load the Mongo feature data.
    if self.if_read:
        condition = {'wf_created_at': {'$gte': '%s 00:00:00' % self.first_date,
                                       '$lte': '%s 00:00:00' % today_str}}
        fields = {'wf_biz_no': 1, 'wf_created_at': 1, 'wf_loan_type': 1}
        for f in self.field_query_list:  # add the fields configured in the Excel sheet
            fields[f] = 1
        self.mongo_df = self.query_mongo(condition, fields)
        if self.mongo_df is None:
            raise RuntimeError('Mongo数据获取失败.')
        self.mongo_df['applied_type'] = self.mongo_df['wf_loan_type'].apply(func_0)
        del self.mongo_df['wf_loan_type']
        if self.if_save:
            self.mongo_df.to_csv(self.data_save_path + 'mongo_data.csv', index=False)
    else:
        self.mongo_df = pd.read_csv(self.data_save_path + 'mongo_data.csv')
    # Rows without a usable application type cannot be assigned to any group.
    self.mongo_df = self.mongo_df.loc[self.mongo_df['applied_type'].notna()]
    print('Mongo数据获取成功.')

    # De-duplicate MySQL rows: keep the row with the smallest passdue_day per order.
    self.mysql_df = self.mysql_df.sort_values('passdue_day')
    self.mysql_df = self.mysql_df.drop_duplicates('order_no', keep='first')
    print('数据去重完成.')

    # Join both sources on the order number.
    self.merge_data = pd.merge(left=self.mysql_df, right=self.mongo_df,
                               left_on='order_no', right_on='wf_biz_no',
                               how='inner')
    print('拼接数据成功')

    # Convert every monitored field to float; unparsable values become NaN.
    def clean_data(data):
        try:
            return float(data)
        except (TypeError, ValueError):
            return np.nan

    na_field_list = []
    available_columns = set(self.merge_data.columns)
    for field in self.field_query_list:
        if field in available_columns:
            print('正在清洗%s' % self.field_query_name_dict[field])
            self.merge_data[field] = self.merge_data[field].apply(clean_data)
        else:
            na_field_list.append(field)

    # Normalise applied_at to 'YYYY-MM-DD' strings (datetimes from the database,
    # strings from the csv cache).
    if pd.api.types.is_datetime64_any_dtype(self.merge_data['applied_at']):
        self.merge_data['applied_at'] = self.merge_data['applied_at'].apply(lambda x: x.strftime('%Y-%m-%d'))
    else:
        self.merge_data['applied_at'] = self.merge_data['applied_at'].apply(lambda x: x[:10])

    # Restrict to the monitoring window.
    self.merge_data = self.merge_data.loc[
        (self.merge_data['applied_at'] >= self.first_date) & (self.merge_data['applied_at'] <= self.current_date)]

    # Drop the fields that could not be extracted. Every per-field list is
    # parallel to field_query_list (filter_data looks the dry-run settings up
    # by position), so the same position must be removed from all of them.
    print('不包含以下字段:')
    for field in na_field_list:
        print(self.field_query_name_dict[field])
        idx = self.field_query_list.index(field)
        for parallel_list in (self.field_query_list, self.field_name_list,
                              self.field_app_type_list,
                              self.na_time, self.na_app_type, self.na_app_chan):
            del parallel_list[idx]
        del self.field_query_name_dict[field]

    # Process the data.
    print('开始处理数据.')
    print('=' * 40)
    for field in self.field_query_list:
        self.process_data(field)
    print('数据处理完毕.')
    print('=' * 40)

    # Draw the figures.
    print('开始画图.')
    print('=' * 40)
    self.plot()
    print('画图完成.')
    print('=' * 40)

    # Save the summary.
    print('开始保存信息.')
    print('=' * 40)
    self.save_vlm_info()
    print('保存信息完成.')
    print('=' * 40)
# Script entry point: intentionally a no-op. To run the monitor, create a
# ModelMonitorVLM instance and call its run() method.
if __name__ == '__main__':
    pass
No preview for this file type
<!doctype html>
<html>
<head>
<meta charset='UTF-8'><meta name='viewport' content='width=device-width initial-scale=1'>
<title>MM_report_20190728</title><link href='https://fonts.loli.net/css?family=PT+Serif:400,400italic,700,700italic&subset=latin,cyrillic-ext,cyrillic,latin-ext' rel='stylesheet' type='text/css' /><style type='text/css'>html {overflow-x: initial !important;}:root { --bg-color:#ffffff; --text-color:#333333; --select-text-bg-color:#B5D6FC; --select-text-font-color:auto; --monospace:"Lucida Console",Consolas,"Courier",monospace; }
html { font-size: 14px; background-color: var(--bg-color); color: var(--text-color); font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; -webkit-font-smoothing: antialiased; }
body { margin: 0px; padding: 0px; height: auto; bottom: 0px; top: 0px; left: 0px; right: 0px; font-size: 1rem; line-height: 1.42857; overflow-x: hidden; background: inherit; tab-size: 4; }
iframe { margin: auto; }
a.url { word-break: break-all; }
a:active, a:hover { outline: 0px; }
.in-text-selection, ::selection { text-shadow: none; background: var(--select-text-bg-color); color: var(--select-text-font-color); }
#write { margin: 0px auto; height: auto; width: inherit; word-break: normal; word-wrap: break-word; position: relative; white-space: normal; overflow-x: visible; padding-top: 40px; }
#write.first-line-indent p { text-indent: 2em; }
#write.first-line-indent li p, #write.first-line-indent p * { text-indent: 0px; }
#write.first-line-indent li { margin-left: 2em; }
.for-image #write { padding-left: 8px; padding-right: 8px; }
body.typora-export { padding-left: 30px; padding-right: 30px; }
.typora-export .footnote-line, .typora-export li, .typora-export p { white-space: pre-wrap; }
@media screen and (max-width: 500px) {
body.typora-export { padding-left: 0px; padding-right: 0px; }
#write { padding-left: 20px; padding-right: 20px; }
.CodeMirror-sizer { margin-left: 0px !important; }
.CodeMirror-gutters { display: none !important; }
}
#write li > figure:first-child { margin-top: -20px; }
#write ol, #write ul { position: relative; }
img { max-width: 100%; vertical-align: middle; }
button, input, select, textarea { color: inherit; font-style: inherit; font-variant: inherit; font-weight: inherit; font-stretch: inherit; font-size: inherit; line-height: inherit; font-family: inherit; }
input[type="checkbox"], input[type="radio"] { line-height: normal; padding: 0px; }
*, ::after, ::before { box-sizing: border-box; }
#write h1, #write h2, #write h3, #write h4, #write h5, #write h6, #write p, #write pre { width: inherit; }
#write h1, #write h2, #write h3, #write h4, #write h5, #write h6, #write p { position: relative; }
h1, h2, h3, h4, h5, h6 { break-after: avoid-page; break-inside: avoid; orphans: 2; }
p { orphans: 4; }
h1 { font-size: 2rem; }
h2 { font-size: 1.8rem; }
h3 { font-size: 1.6rem; }
h4 { font-size: 1.4rem; }
h5 { font-size: 1.2rem; }
h6 { font-size: 1rem; }
.md-math-block, .md-rawblock, h1, h2, h3, h4, h5, h6, p { margin-top: 1rem; margin-bottom: 1rem; }
.hidden { display: none; }
.md-blockmeta { color: rgb(204, 204, 204); font-weight: 700; font-style: italic; }
a { cursor: pointer; }
sup.md-footnote { padding: 2px 4px; background-color: rgba(238, 238, 238, 0.7); color: rgb(85, 85, 85); border-radius: 4px; cursor: pointer; }
sup.md-footnote a, sup.md-footnote a:hover { color: inherit; text-transform: inherit; text-decoration: inherit; }
#write input[type="checkbox"] { cursor: pointer; width: inherit; height: inherit; }
figure { overflow-x: auto; margin: 1.2em 0px; max-width: calc(100% + 16px); padding: 0px; }
figure > table { margin: 0px !important; }
tr { break-inside: avoid; break-after: auto; }
thead { display: table-header-group; }
table { border-collapse: collapse; border-spacing: 0px; width: 100%; overflow: auto; break-inside: auto; text-align: left; }
table.md-table td { min-width: 32px; }
.CodeMirror-gutters { border-right: 0px; background-color: inherit; }
.CodeMirror-linenumber { user-select: none; }
.CodeMirror { text-align: left; }
.CodeMirror-placeholder { opacity: 0.3; }
.CodeMirror pre { padding: 0px 4px; }
.CodeMirror-lines { padding: 0px; }
div.hr:focus { cursor: none; }
#write pre { white-space: pre-wrap; }
#write.fences-no-line-wrapping pre { white-space: pre; }
#write pre.ty-contain-cm { white-space: normal; }
.CodeMirror-gutters { margin-right: 4px; }
.md-fences { font-size: 0.9rem; display: block; break-inside: avoid; text-align: left; overflow: visible; white-space: pre; background: inherit; position: relative !important; }
.md-diagram-panel { width: 100%; margin-top: 10px; text-align: center; padding-top: 0px; padding-bottom: 8px; overflow-x: auto; }
#write .md-fences.mock-cm { white-space: pre-wrap; }
.md-fences.md-fences-with-lineno { padding-left: 0px; }
#write.fences-no-line-wrapping .md-fences.mock-cm { white-space: pre; overflow-x: auto; }
.md-fences.mock-cm.md-fences-with-lineno { padding-left: 8px; }
.CodeMirror-line, twitterwidget { break-inside: avoid; }
.footnotes { opacity: 0.8; font-size: 0.9rem; margin-top: 1em; margin-bottom: 1em; }
.footnotes + .footnotes { margin-top: 0px; }
.md-reset { margin: 0px; padding: 0px; border: 0px; outline: 0px; vertical-align: top; background: 0px 0px; text-decoration: none; text-shadow: none; float: none; position: static; width: auto; height: auto; white-space: nowrap; cursor: inherit; -webkit-tap-highlight-color: transparent; line-height: normal; font-weight: 400; text-align: left; box-sizing: content-box; direction: ltr; }
li div { padding-top: 0px; }
blockquote { margin: 1rem 0px; }
li .mathjax-block, li p { margin: 0.5rem 0px; }
li { margin: 0px; position: relative; }
blockquote > :last-child { margin-bottom: 0px; }
blockquote > :first-child, li > :first-child { margin-top: 0px; }
.footnotes-area { color: rgb(136, 136, 136); margin-top: 0.714rem; padding-bottom: 0.143rem; white-space: normal; }
#write .footnote-line { white-space: pre-wrap; }
@media print {
body, html { border: 1px solid transparent; height: 99%; break-after: avoid; break-before: avoid; }
#write { margin-top: 0px; padding-top: 0px; border-color: transparent !important; }
.typora-export * { -webkit-print-color-adjust: exact; }
html.blink-to-pdf { font-size: 13px; }
.typora-export #write { padding-left: 32px; padding-right: 32px; padding-bottom: 0px; break-after: avoid; }
.typora-export #write::after { height: 0px; }
@page { margin: 20mm 0px; }
}
.footnote-line { margin-top: 0.714em; font-size: 0.7em; }
a img, img a { cursor: pointer; }
pre.md-meta-block { font-size: 0.8rem; min-height: 0.8rem; white-space: pre-wrap; background: rgb(204, 204, 204); display: block; overflow-x: hidden; }
p > .md-image:only-child:not(.md-img-error) img, p > img:only-child { display: block; margin: auto; }
p > .md-image:only-child { display: inline-block; width: 100%; }
#write .MathJax_Display { margin: 0.8em 0px 0px; }
.md-math-block { width: 100%; }
.md-math-block:not(:empty)::after { display: none; }
[contenteditable="true"]:active, [contenteditable="true"]:focus { outline: 0px; box-shadow: none; }
.md-task-list-item { position: relative; list-style-type: none; }
.task-list-item.md-task-list-item { padding-left: 0px; }
.md-task-list-item > input { position: absolute; top: 0px; left: 0px; margin-left: -1.2em; margin-top: calc(1em - 10px); border: none; }
.math { font-size: 1rem; }
.md-toc { min-height: 3.58rem; position: relative; font-size: 0.9rem; border-radius: 10px; }
.md-toc-content { position: relative; margin-left: 0px; }
.md-toc-content::after, .md-toc::after { display: none; }
.md-toc-item { display: block; color: rgb(65, 131, 196); }
.md-toc-item a { text-decoration: none; }
.md-toc-inner:hover { text-decoration: underline; }
.md-toc-inner { display: inline-block; cursor: pointer; }
.md-toc-h1 .md-toc-inner { margin-left: 0px; font-weight: 700; }
.md-toc-h2 .md-toc-inner { margin-left: 2em; }
.md-toc-h3 .md-toc-inner { margin-left: 4em; }
.md-toc-h4 .md-toc-inner { margin-left: 6em; }
.md-toc-h5 .md-toc-inner { margin-left: 8em; }
.md-toc-h6 .md-toc-inner { margin-left: 10em; }
@media screen and (max-width: 48em) {
.md-toc-h3 .md-toc-inner { margin-left: 3.5em; }
.md-toc-h4 .md-toc-inner { margin-left: 5em; }
.md-toc-h5 .md-toc-inner { margin-left: 6.5em; }
.md-toc-h6 .md-toc-inner { margin-left: 8em; }
}
a.md-toc-inner { font-size: inherit; font-style: inherit; font-weight: inherit; line-height: inherit; }
.footnote-line a:not(.reversefootnote) { color: inherit; }
.md-attr { display: none; }
.md-fn-count::after { content: "."; }
code, pre, samp, tt { font-family: var(--monospace); }
kbd { margin: 0px 0.1em; padding: 0.1em 0.6em; font-size: 0.8em; color: rgb(36, 39, 41); background: rgb(255, 255, 255); border: 1px solid rgb(173, 179, 185); border-radius: 3px; box-shadow: rgba(12, 13, 14, 0.2) 0px 1px 0px, rgb(255, 255, 255) 0px 0px 0px 2px inset; white-space: nowrap; vertical-align: middle; }
.md-comment { color: rgb(162, 127, 3); opacity: 0.8; font-family: var(--monospace); }
code { text-align: left; vertical-align: initial; }
a.md-print-anchor { white-space: pre !important; border-width: initial !important; border-style: none !important; border-color: initial !important; display: inline-block !important; position: absolute !important; width: 1px !important; right: 0px !important; outline: 0px !important; background: 0px 0px !important; text-decoration: initial !important; text-shadow: initial !important; }
.md-inline-math .MathJax_SVG .noError { display: none !important; }
.html-for-mac .inline-math-svg .MathJax_SVG { vertical-align: 0.2px; }
.md-math-block .MathJax_SVG_Display { text-align: center; margin: 0px; position: relative; text-indent: 0px; max-width: none; max-height: none; min-height: 0px; min-width: 100%; width: auto; overflow-y: hidden; display: block !important; }
.MathJax_SVG_Display, .md-inline-math .MathJax_SVG_Display { width: auto; margin: inherit; display: inline-block !important; }
.MathJax_SVG .MJX-monospace { font-family: var(--monospace); }
.MathJax_SVG .MJX-sans-serif { font-family: sans-serif; }
.MathJax_SVG { display: inline; font-style: normal; font-weight: 400; line-height: normal; zoom: 90%; text-indent: 0px; text-align: left; text-transform: none; letter-spacing: normal; word-spacing: normal; word-wrap: normal; white-space: nowrap; float: none; direction: ltr; max-width: none; max-height: none; min-width: 0px; min-height: 0px; border: 0px; padding: 0px; margin: 0px; }
.MathJax_SVG * { transition: none; }
.MathJax_SVG_Display svg { vertical-align: middle !important; margin-bottom: 0px !important; }
.os-windows.monocolor-emoji .md-emoji { font-family: "Segoe UI Symbol", sans-serif; }
.md-diagram-panel > svg { max-width: 100%; }
[lang="mermaid"] svg, [lang="flow"] svg { max-width: 100%; }
[lang="mermaid"] .node text { font-size: 1rem; }
table tr th { border-bottom: 0px; }
video { max-width: 100%; display: block; margin: 0px auto; }
iframe { max-width: 100%; width: 100%; border: none; }
.highlight td, .highlight tr { border: 0px; }
:root { --active-file-bg-color: rgba(32, 43, 51, 0.63); --active-file-text-color: white; --bg-color: #f3f2ee; --text-color: #1f0909; --control-text-color: #444; --rawblock-edit-panel-bd: #e5e5e5; --select-text-bg-color: rgba(32, 43, 51, 0.63); --select-text-font-color: white; }
pre { --select-text-bg-color: #36284e; --select-text-font-color: #fff; }
html { font-size: 16px; }
html, body { background-color: rgb(243, 242, 238); font-family: "PT Serif", "Times New Roman", Times, serif; color: rgb(31, 9, 9); line-height: 1.5em; }
#write { max-width: 40em; }
ol li { list-style-type: decimal; list-style-position: outside; }
ul li { list-style-type: disc; list-style-position: outside; }
ol, ul { list-style: none; }
blockquote, q { quotes: none; }
blockquote::before, blockquote::after, q::before, q::after { content: none; }
table { border-collapse: collapse; border-spacing: 0px; }
h1, h2, h3, h4, h5, h6 { font-weight: bold; }
h1 { font-size: 1.875em; line-height: 1.6em; margin-top: 2em; }
h2, h3 { font-size: 1.3125em; line-height: 1.15; margin-top: 2.28571em; margin-bottom: 1.15em; }
h3 { font-weight: normal; }
h4 { font-size: 1.125em; margin-top: 2.67em; }
h5, h6 { font-size: 1em; }
h1 { border-bottom: 1px solid; margin-bottom: 1.875em; padding-bottom: 0.8125em; }
a { text-decoration: none; color: rgb(6, 85, 136); }
a:hover, a:active { text-decoration: underline; }
p, blockquote, .md-fences { margin-bottom: 1.5em; }
h1, h2, h3, h4, h5, h6 { margin-bottom: 1.5em; }
blockquote { font-style: italic; border-left: 5px solid; margin-left: 2em; padding-left: 1em; }
ul, ol { margin: 0px 0px 1.5em 1.5em; }
.md-meta, .md-before, .md-after { color: rgb(153, 153, 153); }
table { margin-bottom: 1.5em; font-size: 1em; }
thead th, tfoot th { padding: 0.25em 0.25em 0.25em 0.4em; text-transform: uppercase; }
th { text-align: left; }
td { vertical-align: top; padding: 0.25em 0.25em 0.25em 0.4em; }
code, .md-fences { background-color: rgb(218, 218, 218); }
code { padding-left: 2px; padding-right: 2px; }
.md-fences { margin-left: 2em; margin-bottom: 3em; padding-left: 1ch; padding-right: 1ch; }
pre, code, tt { font-size: 0.875em; line-height: 1.71429em; }
h1 { line-height: 1.3em; font-weight: normal; margin-bottom: 0.5em; }
p + ul, p + ol { margin-top: 0.5em; }
h3 + ul, h4 + ul, h5 + ul, h6 + ul, h3 + ol, h4 + ol, h5 + ol, h6 + ol { margin-top: 0.5em; }
li > ul, li > ol { margin-top: inherit; margin-bottom: 0px; }
li ol > li { list-style-type: lower-alpha; }
li li ol > li { list-style-type: lower-roman; }
h2, h3 { margin-bottom: 0.75em; }
hr { border-top: none; border-right: none; border-bottom: 1px solid; border-left: none; }
h1 { border-color: rgb(197, 197, 197); }
blockquote { border-color: rgb(186, 186, 186); color: rgb(101, 101, 101); }
blockquote ul, blockquote ol { margin-left: 0px; }
.ty-table-edit { background-color: transparent; }
thead { background-color: rgb(218, 218, 218); }
tr:nth-child(2n) { background: rgb(232, 231, 231); }
hr { border-color: rgb(197, 197, 197); }
.task-list { padding-left: 1rem; }
.md-task-list-item { padding-left: 1.5rem; list-style-type: none; }
.md-task-list-item > input::before { content: "√"; display: inline-block; width: 1.25rem; height: 1.6rem; vertical-align: middle; text-align: center; color: rgb(221, 221, 221); background-color: rgb(243, 242, 238); }
.md-task-list-item > input:checked::before, .md-task-list-item > input[checked]::before { color: inherit; }
#write pre.md-meta-block { min-height: 1.875rem; color: rgb(85, 85, 85); border: 0px; background: transparent; margin-left: 1em; margin-top: 1em; }
.md-image > .md-meta { color: rgb(155, 81, 70); }
.md-image > .md-meta { font-family: Menlo, "Ubuntu Mono", Consolas, "Courier New", "Microsoft Yahei", "Hiragino Sans GB", "WenQuanYi Micro Hei", serif; }
#write > h3.md-focus::before { left: -1.5rem; color: rgb(153, 153, 153); border-color: rgb(153, 153, 153); }
#write > h4.md-focus::before { left: -1.5rem; top: 0.25rem; color: rgb(153, 153, 153); border-color: rgb(153, 153, 153); }
#write > h5.md-focus::before { left: -1.5rem; color: rgb(153, 153, 153); border-color: rgb(153, 153, 153); }
#write > h6.md-focus::before { left: -1.5rem; top: 0.3125rem; color: rgb(153, 153, 153); border-color: rgb(153, 153, 153); }
.md-toc:focus .md-toc-content { margin-top: 19px; }
.md-toc-content:empty::before { color: rgb(6, 85, 136); }
.md-toc-item { color: rgb(6, 85, 136); }
#write div.md-toc-tooltip { background-color: rgb(243, 242, 238); }
#typora-sidebar { background-color: rgb(243, 242, 238); box-shadow: rgba(0, 0, 0, 0.376) 0px 6px 12px; }
.pin-outline #typora-sidebar { background: inherit; box-shadow: none; border-right: 1px dashed; }
.pin-outline #typora-sidebar:hover .outline-title-wrapper { border-left: 1px dashed; }
.outline-item:hover { background-color: rgb(218, 218, 218); border-left: 28px solid rgb(218, 218, 218); border-right: 18px solid rgb(218, 218, 218); }
.typora-node .outline-item:hover { border-right: 28px solid rgb(218, 218, 218); }
.outline-expander::before { content: ""; font-family: FontAwesome; font-size: 14px; top: 1px; }
.outline-expander:hover::before, .outline-item-open > .outline-item > .outline-expander::before { content: ""; }
.modal-content { background-color: rgb(243, 242, 238); }
.auto-suggest-container ul li { list-style-type: none; }
.megamenu-menu, #top-titlebar, #top-titlebar *, .megamenu-content { background: rgb(243, 242, 238); color: rgb(31, 9, 9); }
.megamenu-menu-header { border-bottom: 1px dashed rgb(32, 43, 51); }
.megamenu-menu { box-shadow: none; border-right: 1px dashed; }
header, .context-menu, .megamenu-content, footer { font-family: "PT Serif", "Times New Roman", Times, serif; color: rgb(31, 9, 9); }
#megamenu-back-btn { color: rgb(31, 9, 9); border-color: rgb(31, 9, 9); }
.megamenu-menu-header #megamenu-menu-header-title::before { color: rgb(31, 9, 9); }
.megamenu-menu-list li a:hover, .megamenu-menu-list li a.active { color: inherit; background-color: rgb(232, 231, 223); }
.long-btn:hover { background-color: rgb(232, 231, 223); }
#recent-file-panel tbody tr:nth-child(2n-1) { background-color: transparent !important; }
.megamenu-menu-panel tbody tr:hover td:nth-child(2) { color: inherit; }
.megamenu-menu-panel .btn { background-color: rgb(210, 209, 209); }
.btn-default { background-color: transparent; }
.typora-sourceview-on #toggle-sourceview-btn, .ty-show-word-count #footer-word-count { background: rgb(199, 197, 197); }
#typora-quick-open { background-color: inherit; }
.md-diagram-panel { margin-top: 8px; }
.file-list-item-file-name { font-weight: initial; }
.file-list-item-summary { opacity: 1; }
.file-list-item { color: rgb(119, 119, 119); }
.file-list-item.active { background-color: inherit; color: black; }
.ty-side-sort-btn.active { background-color: inherit; }
.file-list-item.active .file-list-item-file-name { font-weight: bold; }
.file-list-item { opacity: 1 !important; }
.file-library-node.active > .file-node-background { background-color: var(--active-file-bg-color); }
.file-tree-node.active > .file-node-content { color: var(--active-file-text-color); }
.md-task-list-item > input { margin-left: -1.6em; margin-top: calc(1rem - 12px); }
input { border: 1px solid rgb(170, 170, 170); }
.megamenu-menu-header #megamenu-menu-header-title, .megamenu-menu-header:hover, .megamenu-menu-header:focus { color: inherit; }
.dropdown-menu .divider { border-color: rgb(229, 229, 229); }
.os-windows-7 strong, .os-windows-7 strong { font-weight: 760; }
#write { counter-reset: h1 0; }
h1 { counter-reset: h2 0; }
h2 { counter-reset: h3 0; }
h3 { counter-reset: h4 0; }
h4 { counter-reset: h5 0; }
h5 { counter-reset: h6 0; }
#write h1::before { counter-increment: h1 1; content: counter(h1) " "; }
#write h2::before { counter-increment: h2 1; content: counter(h1) "." counter(h2) " "; }
#write h3::before, h3.md-focus.md-heading::before { counter-increment: h3 1; content: counter(h1) "." counter(h2) "." counter(h3) " "; }
#write h4::before, h4.md-focus.md-heading::before { counter-increment: h4 1; content: counter(h1) "." counter(h2) "." counter(h3) "." counter(h4) " "; }
#write h5::before, h5.md-focus.md-heading::before { counter-increment: h5 1; content: counter(h1) "." counter(h2) "." counter(h3) "." counter(h4) "." counter(h5) " "; }
#write h6::before, h6.md-focus.md-heading::before { counter-increment: h6 1; content: counter(h1) "." counter(h2) "." counter(h3) "." counter(h4) "." counter(h5) "." counter(h6) " "; }
#write > h3.md-focus::before, #write > h4.md-focus::before, #write > h5.md-focus::before, #write > h6.md-focus::before, h3.md-focus::before, h4.md-focus::before, h5.md-focus::before, h6.md-focus::before { color: inherit; border: inherit; border-radius: inherit; position: inherit; left: initial; float: none; top: initial; font-size: inherit; padding-left: inherit; padding-right: inherit; vertical-align: inherit; font-weight: inherit; line-height: inherit; }
.typora-export li, .typora-export p, .typora-export, .footnote-line {white-space: normal;}
</style>
</head>
<body class='typora-export os-windows' >
<div id='write' class = 'is-node'><div class='md-toc' mdtype='toc'><p class="md-toc-content"><span class="md-toc-item md-toc-h1" data-ref="n2"><a class="md-toc-inner" href="#header-n2">概述</a></span><span class="md-toc-item md-toc-h2" data-ref="n3"><a class="md-toc-inner" href="#header-n3">报告内容</a></span><span class="md-toc-item md-toc-h2" data-ref="n11"><a class="md-toc-inner" href="#header-n11">监控的模型</a></span><span class="md-toc-item md-toc-h2" data-ref="n64"><a class="md-toc-inner" href="#header-n64">时间跨度</a></span><span class="md-toc-item md-toc-h1" data-ref="n68"><a class="md-toc-inner" href="#header-n68">PSI</a></span><span class="md-toc-item md-toc-h2" data-ref="n69"><a class="md-toc-inner" href="#header-n69">与整体差别较大的客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n70"><a class="md-toc-inner" href="#header-n70">概述</a></span><span class="md-toc-item md-toc-h3" data-ref="n78"><a class="md-toc-inner" href="#header-n78">有哪些客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n144"><a class="md-toc-inner" href="#header-n144">资质相对好的客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n153"><a class="md-toc-inner" href="#header-n153">资质相对差的客群</a></span><span class="md-toc-item md-toc-h2" data-ref="n158"><a class="md-toc-inner" href="#header-n158">随时间分布变化较大的客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n159"><a class="md-toc-inner" href="#header-n159">有哪些客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n200"><a class="md-toc-inner" href="#header-n200">资质变好的客群</a></span><span class="md-toc-item md-toc-h3" data-ref="n214"><a class="md-toc-inner" href="#header-n214">资质变差的客群</a></span><span class="md-toc-item md-toc-h1" data-ref="n226"><a class="md-toc-inner" href="#header-n226">AUC</a></span><span class="md-toc-item md-toc-h2" data-ref="n227"><a class="md-toc-inner" href="#header-n227">概述</a></span><span class="md-toc-item md-toc-h2" data-ref="n233"><a class="md-toc-inner" href="#header-n233">相比整体模型效果较差的客群</a></span><span 
class="md-toc-item md-toc-h2" data-ref="n255"><a class="md-toc-inner" href="#header-n255">模型效果随时间变差的客群</a></span></p></div><h1><a name='header-n2' class='md-header-anchor '></a>概述</h1><h2><a name='header-n3' class='md-header-anchor '></a>报告内容</h2><ul><li>本次模型报告由于代码处于开发阶段, 所以内容并不完整.</li><li>主要包括模型分的PSI, 以及AUC随时间的变化, 以及客群与整体的差别.</li><li>尽管模型监控代码加入了自动筛选异常客群的功能, 但要更加深入分析仍需要时间, 此版报告为简略版.</li></ul><h2><a name='header-n11' class='md-header-anchor '></a>监控的模型</h2><ul><li><p>已监控模型:</p><ul><li>同盾V1</li><li>V6</li><li>百融V1</li><li>首贷融合模型</li><li>融360首贷融合模型</li><li>新颜V2</li><li>新颜V3</li><li>魔蝎</li><li>复贷新融合模型(无运营商)</li><li>复贷新融合模型(有运营商)</li><li>短信模型</li><li>探知</li><li>同盾V2</li><li>电话邦V2</li><li>百融V2</li><li>复贷V4</li><li>首贷新融合模型(无运营商)</li><li>首贷新融合模型(有运营商)</li></ul></li><li><p>暂未监控模型:</p><ul><li><p>不明原因(待与模型负责人交流):</p><ul><li>APP模型</li><li>量信分V2</li></ul></li><li><p>还有一些模型待加入.</p></li></ul></li></ul><h2><a name='header-n64' class='md-header-anchor '></a>时间跨度</h2><ul><li>考虑到整体模型的时间跨度, 本次模型监控报告的取数时间范围为2019.03.01 - 2019.07. 
26.</li></ul><h1><a name='header-n68' class='md-header-anchor '></a>PSI</h1><h2><a name='header-n69' class='md-header-anchor '></a>与整体差别较大的客群</h2><h3><a name='header-n70' class='md-header-anchor '></a>概述</h3><ul><li>由模型监控代码在已监控模型中, 产生的有效客群数量为465个.</li><li>设定PSI阈值(0.5)后, 筛选出的与整体差别较大的异常客群有15个.</li><li>设定PSI阈值(0.1)后, 筛选出的变化较大的异常客群有25个.</li></ul><h3><a name='header-n78' class='md-header-anchor '></a>有哪些客群</h3><ul><li><p>客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th><th style='text-align:center;' >月份</th><th style='text-align:center;' >PSI</th></tr></thead><tbody><tr><td style='text-align:center;' >同盾V1-复贷-百融榕树</td><td style='text-align:center;' >6</td><td style='text-align:center;' >0.57</td></tr><tr><td style='text-align:center;' >V6-首申-国美api</td><td style='text-align:center;' >6</td><td style='text-align:center;' >2.47</td></tr><tr><td style='text-align:center;' >V6-首申-壹账通H5</td><td style='text-align:center;' >3</td><td style='text-align:center;' >1.75</td></tr><tr><td style='text-align:center;' >V6-首申-平安一贷通安卓</td><td style='text-align:center;' >5</td><td style='text-align:center;' >2.07</td></tr><tr><td style='text-align:center;' >V6-复贷-51公积金API</td><td style='text-align:center;' >3</td><td style='text-align:center;' >0.53</td></tr><tr><td style='text-align:center;' >新颜V2-复申-挖财api</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.57</td></tr><tr><td style='text-align:center;' >首贷融合模型-复贷-全渠道</td><td style='text-align:center;' >5</td><td style='text-align:center;' >1.09</td></tr><tr><td style='text-align:center;' >首贷融合模型-首申-壹账通H5</td><td style='text-align:center;' >3</td><td style='text-align:center;' >1.89</td></tr><tr><td style='text-align:center;' >首贷融合模型-首申-平安一贷通安卓</td><td style='text-align:center;' >5</td><td style='text-align:center;' >2.13</td></tr><tr><td style='text-align:center;' >首贷融合模型-复申-挖财api</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.51</td></tr><tr><td 
style='text-align:center;' >融360首贷融合模型-复贷-全渠道</td><td style='text-align:center;' >5</td><td style='text-align:center;' >1.01</td></tr><tr><td style='text-align:center;' >融360首贷融合模型-首申-壹账通H5</td><td style='text-align:center;' >3</td><td style='text-align:center;' >1.82</td></tr><tr><td style='text-align:center;' >融360首贷融合模型-首申-平安一贷通安卓</td><td style='text-align:center;' >5</td><td style='text-align:center;' >2.13</td></tr><tr><td style='text-align:center;' >融360首贷融合模型-复申-挖财api</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.67</td></tr></tbody></table></figure><p>&nbsp;</p></li></ul><h3><a name='header-n144' class='md-header-anchor '></a>资质相对好的客群</h3><ul><li><p>暂时鉴别方式为在异常的基础上去看图, 准备后续加入趋势判断, 自动识别是变好还是变差.</p></li><li><p>经过查看, 均为资质相对较好的客群.</p></li><li><p>例如:</p><p><img src='../doc/image/融360首贷融合模型-复申-挖财api.png' alt='融360首贷融合模型-复申-挖财api' referrerPolicy='no-referrer' /></p></li></ul><h3><a name='header-n153' class='md-header-anchor '></a>资质相对差的客群</h3><ul><li>无.</li></ul><p>&nbsp;</p><h2><a name='header-n158' class='md-header-anchor '></a>随时间分布变化较大的客群</h2><h3><a name='header-n159' class='md-header-anchor '></a>有哪些客群</h3><ul><li><p>部分客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th><th style='text-align:center;' >月份</th><th style='text-align:center;' >PSI</th></tr></thead><tbody><tr><td style='text-align:center;' >同盾分v1-首申-百融榕树</td><td style='text-align:center;' >5</td><td style='text-align:center;' >0.29</td></tr><tr><td style='text-align:center;' >V6-全样本</td><td style='text-align:center;' >7</td><td style='text-align:center;' >0.10</td></tr><tr><td style='text-align:center;' >新颜V2-复贷-全渠道</td><td style='text-align:center;' >5</td><td style='text-align:center;' >0.66</td></tr><tr><td style='text-align:center;' >新颜v3-全样本</td><td style='text-align:center;' >6</td><td style='text-align:center;' >0.10</td></tr><tr><td style='text-align:center;' >魔蝎-全样本</td><td style='text-align:center;' >6</td><td 
style='text-align:center;' >0.34</td></tr><tr><td style='text-align:center;' >短信分-全样本</td><td style='text-align:center;' >7</td><td style='text-align:center;' >0.13</td></tr><tr><td style='text-align:center;' >融360首贷融合模型-首申-融360</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.12</td></tr><tr><td style='text-align:center;' >电话邦V2-首申-全渠道</td><td style='text-align:center;' >6</td><td style='text-align:center;' >0.13</td></tr></tbody></table></figure></li></ul><h3><a name='header-n200' class='md-header-anchor '></a>资质变好的客群</h3><ul><li><p>暂时鉴别方式为在异常的基础上去看图, 准备后续加入自动识别.</p></li><li><p>部分资质变好客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th></tr></thead><tbody><tr><td style='text-align:center;' >V6-全样本</td></tr><tr><td style='text-align:center;' >新颜V2-复贷-全渠道</td></tr></tbody></table></figure><p><img src='../doc/image/V6-全样本.png' alt='V6-全样本' referrerPolicy='no-referrer' /></p></li></ul><h3><a name='header-n214' class='md-header-anchor '></a>资质变差的客群</h3><ul><li><p>部分资质变差客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th></tr></thead><tbody><tr><td style='text-align:center;' >同盾分v1-首申-百融榕树</td></tr><tr><td style='text-align:center;' >短信分-全样本</td></tr></tbody></table></figure><p><img src='../doc/image/同盾分v1-首申-百融榕树.png' alt='同盾分v1-首申-百融榕树' referrerPolicy='no-referrer' /></p></li></ul><h1><a name='header-n226' class='md-header-anchor '></a>AUC</h1><h2><a name='header-n227' class='md-header-anchor '></a>概述</h2><ul><li>由代码产生的客群为113个, 因为本身放款数量相对申请数量少, 且需要响应时间.</li><li>设定AUCR阈值为0.85, 得到的异常客群数量为23个.</li></ul><h2><a name='header-n233' class='md-header-anchor '></a>相比整体模型效果较差的客群</h2><ul><li><p>客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th><th style='text-align:center;' >月份</th><th style='text-align:center;' >AUCR</th></tr></thead><tbody><tr><td style='text-align:center;' >新颜V2-首申-挖财api</td><td style='text-align:center;' >4</td><td style='text-align:center;' 
>0.80</td></tr><tr><td style='text-align:center;' >新颜V2-首申-爱奇艺H5</td><td style='text-align:center;' >3</td><td style='text-align:center;' >0.80</td></tr><tr><td style='text-align:center;' >新颜v3-首申-360金融API</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.85</td></tr></tbody></table></figure><p><img src='../doc/image/新颜V2-首申-爱奇艺H5.png' alt='新颜V2-首申-爱奇艺H5' referrerPolicy='no-referrer' /></p></li></ul><h2><a name='header-n255' class='md-header-anchor '></a>模型效果随时间变差的客群</h2><ul><li><p>部分客群列举:</p><figure><table><thead><tr><th style='text-align:center;' >客群名称</th><th style='text-align:center;' >月份</th><th style='text-align:center;' >AUCR</th></tr></thead><tbody><tr><td style='text-align:center;' >同盾分v1-首申-爱奇艺H5</td><td style='text-align:center;' >4</td><td style='text-align:center;' >0.76</td></tr><tr><td style='text-align:center;' >同盾分v1-首申-挖财api</td><td style='text-align:center;' >6</td><td style='text-align:center;' >0.77</td></tr><tr><td style='text-align:center;' >V6-复申-全渠道</td><td style='text-align:center;' >6</td><td style='text-align:center;' >0.70</td></tr><tr><td style='text-align:center;' >首贷融合模型-首申-融360</td><td style='text-align:center;' >5</td><td style='text-align:center;' >0.75</td></tr></tbody></table></figure><p><img src='../doc/image/V6-复申-全渠道.png' alt='V6-复申-全渠道' referrerPolicy='no-referrer' /></p></li></ul><p>&nbsp;</p></div>
</body>
</html>
\ No newline at end of file
[TOC]
# 概述
## 报告内容
- 本次模型报告由于代码处于开发阶段, 所以内容并不完整.
- 主要包括模型分的PSI和AUC随时间的变化, 以及客群与整体的差别.
- 尽管模型监控代码加入了自动筛选异常客群的功能, 但要更加深入分析仍需要时间, 此版报告为简略版.
## 监控的模型
- 已监控模型:
- 同盾V1
- V6
- 百融V1
- 首贷融合模型
- 融360首贷融合模型
- 新颜V2
- 新颜V3
- 魔蝎
- 复贷新融合模型(无运营商)
- 复贷新融合模型(有运营商)
- 短信模型
- 探知
- 同盾V2
- 电话邦V2
- 百融V2
- 复贷V4
- 首贷新融合模型(无运营商)
- 首贷新融合模型(有运营商)
- 暂未监控模型:
- 不明原因(待与模型负责人交流):
- APP模型
- 量信分V2
- 还有一些模型待加入.
## 时间跨度
- 考虑到整体模型的时间跨度, 本次模型监控报告的取数时间范围为2019.03.01 - 2019.07.26.
# PSI
## 与整体差别较大的客群
### 概述
- 模型监控代码在已监控模型中产生的有效客群数量为465个.
- 设定PSI阈值(0.5)后, 筛选出的与整体差别较大的异常客群有15个.
- 设定PSI阈值(0.1)后, 筛选出的随时间分布变化较大的异常客群有25个.
### 有哪些客群
- 客群列举:
| 客群名称 | 月份 | PSI |
| :-----------------------------------: | :--: | :--: |
| 同盾V1-复贷-百融榕树 | 6 | 0.57 |
| V6-首申-国美api | 6 | 2.47 |
| V6-首申-壹账通H5 | 3 | 1.75 |
| V6-首申-平安一贷通安卓 | 5 | 2.07 |
| V6-复贷-51公积金API | 3 | 0.53 |
| 新颜V2-复申-挖财api | 4 | 0.57 |
| 首贷融合模型-复贷-全渠道 | 5 | 1.09 |
| 首贷融合模型-首申-壹账通H5 | 3 | 1.89 |
| 首贷融合模型-首申-平安一贷通安卓 | 5 | 2.13 |
| 首贷融合模型-复申-挖财api | 4 | 0.51 |
| 融360首贷融合模型-复贷-全渠道 | 5 | 1.01 |
| 融360首贷融合模型-首申-壹账通H5 | 3 | 1.82 |
| 融360首贷融合模型-首申-平安一贷通安卓 | 5 | 2.13 |
| 融360首贷融合模型-复申-挖财api | 4 | 0.67 |
### 资质相对好的客群
- 目前的鉴别方式是在筛选出异常客群的基础上人工看图; 后续准备加入趋势判断, 自动识别客群是变好还是变差.
- 经过查看, 均为资质相对较好的客群.
- 例如:
![融360首贷融合模型-复申-挖财api](../doc/image/融360首贷融合模型-复申-挖财api.png)
### 资质相对差的客群
- 无.
## 随时间分布变化较大的客群
### 有哪些客群
- 部分客群列举:
| 客群名称 | 月份 | PSI |
| :--------------------------: | :--: | :--: |
| 同盾分v1-首申-百融榕树 | 5 | 0.29 |
| V6-全样本 | 7 | 0.10 |
| 新颜V2-复贷-全渠道 | 5 | 0.66 |
| 新颜v3-全样本 | 6 | 0.10 |
| 魔蝎-全样本 | 6 | 0.34 |
| 短信分-全样本 | 7 | 0.13 |
| 融360首贷融合模型-首申-融360 | 4 | 0.12 |
| 电话邦V2-首申-全渠道 | 6 | 0.13 |
### 资质变好的客群
- 目前的鉴别方式是在筛选出异常客群的基础上人工看图; 后续准备加入自动识别.
- 部分资质变好客群列举:
| 客群名称 |
| :----------------: |
| V6-全样本 |
| 新颜V2-复贷-全渠道 |
![V6-全样本](../doc/image/V6-全样本.png)
### 资质变差的客群
- 部分资质变差客群列举:
| 客群名称 |
| :--------------------: |
| 同盾分v1-首申-百融榕树 |
| 短信分-全样本 |
![同盾分v1-首申-百融榕树](../doc/image/同盾分v1-首申-百融榕树.png)
# AUC
## 概述
- 由代码产生的客群为113个; 数量较少是因为放款数量本身相对申请数量少, 且需要响应时间.
- 设定AUCR阈值为0.85, 得到的异常客群数量为23个.
## 相比整体模型效果较差的客群
- 客群列举:
| 客群名称 | 月份 | AUCR |
| :--------------------: | :--: | :--: |
| 新颜V2-首申-挖财api | 4 | 0.80 |
| 新颜V2-首申-爱奇艺H5 | 3 | 0.80 |
| 新颜v3-首申-360金融API | 4 | 0.85 |
![新颜V2-首申-爱奇艺H5](../doc/image/新颜V2-首申-爱奇艺H5.png)
## 模型效果随时间变差的客群
- 部分客群列举:
| 客群名称 | 月份 | AUCR |
| :---------------------: | :--: | :--: |
| 同盾分v1-首申-爱奇艺H5 | 4 | 0.76 |
| 同盾分v1-首申-挖财api | 6 | 0.77 |
| V6-复申-全渠道 | 6 | 0.70 |
| 首贷融合模型-首申-融360 | 5 | 0.75 |
![V6-复申-全渠道](../doc/image/V6-复申-全渠道.png)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment