Commit c8ae753d authored by 王家华

Report version

parent 085d706c
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# GBDT-based feature selection
# def selectFromGBDT(df_train, features, target):
#     return SelectFromModel(GradientBoostingClassifier()).fit_transform(df_train[features], df_train[target])
\ No newline at end of file
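For reference, a minimal runnable sketch of the SelectFromModel pattern that the commented-out helper gestures at; the toy DataFrame and the reliance on the default 'mean' importance threshold are illustrative assumptions, not part of this commit:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

# Toy data standing in for df_train[features] / df_train[target]
X, y = make_classification(n_samples=500, n_features=10, random_state=7)
df_train = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
df_train['target'] = y
features = [c for c in df_train.columns if c != 'target']

# Keep only features whose GBDT importance exceeds the mean importance
# (SelectFromModel's default threshold when none is given).
selector = SelectFromModel(GradientBoostingClassifier(random_state=7))
X_selected = selector.fit_transform(df_train[features], df_train['target'])
kept = [f for f, keep in zip(features, selector.get_support()) if keep]
print(f'kept {len(kept)} features: {kept}')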
@@ -19,8 +19,10 @@ def topN_feature_importance_plot(model, clf, title="untitled", save_path='./mvp/
    plt.title("Feature Importances")
    path = save_path + title + "_featureImportance.png"
+    plt.tight_layout()
    plt.savefig(path)
    plt.show()
    return path
...
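The only change in this hunk inserts plt.tight_layout() before saving, which recomputes subplot padding so that long feature names are not clipped in the exported PNG. A minimal sketch of the pattern, with purely illustrative figure contents:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.barh(['overview_dun_call_total_duration', 'dun_first_call_time'], [0.4, 0.2])
ax.set_title('Feature Importances')
plt.tight_layout()               # adjust margins before saving, not after
plt.savefig('featureImportance.png')
plt.show()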
@@ -99,7 +99,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
    print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
-    for deepth in np.arange(2, 4, 1):
+    for deepth in np.arange(2, 5, 1):
        for leaves in np.arange(2, 2 ** deepth, 4):
            params['max_depth'] = deepth
            params['num_leaves'] = leaves
...
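The widened outer range now covers max_depth 2 through 4 (np.arange excludes the stop value), while the inner loop keeps num_leaves below 2 ** max_depth, the maximum leaf count a depth-limited tree can have in LightGBM. A standalone sketch that simply enumerates the grid the tuning loop visits:

import numpy as np

# Enumerate the (max_depth, num_leaves) pairs the tuning loop now tries.
# num_leaves stays under 2 ** max_depth, the cap for a tree of that depth.
for depth in np.arange(2, 5, 1):             # depth 2, 3, 4 (stop is exclusive)
    for leaves in np.arange(2, 2 ** depth, 4):
        print(f"max_depth={depth}, num_leaves={leaves}")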
'''
@allocator V1.0
+Purpose: program entry point; used to configure the report- and refit-related parameters
'''
#################################################### report settings ###################################################
from models_obj import dhb_obj
@@ -8,15 +10,18 @@ import pandas as pd
import os
from mvp import refit
from tools import datacal
+import pickle
+import time
+from sklearn.externals import joblib
-# Channel list
+# Channel list (used for report generation)
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
-# Application type list
+# Application type list (used for report generation)
-applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
+applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
-# workspace path
+# workspace path (working directory)
-worksapce = 'E:\\bla\\model_mvp\\'
+workspace = 'E:\\bla\\model_mvp\\'
# Sample path
sample_path = 'E:\\model\\model_mvp\\mvp\\sample.csv'
@@ -26,26 +31,54 @@ target = 'target'
# Online model score field
score = 'score'
+#score = 'online_score'
# Predicted model score field
prediction = 'predict'
-# Report output path
+# Report output path (temporarily set to workspace, i.e. the root directory)
-report_path = worksapce
+report_path = workspace
# Report name
report_name = "lgb_report.docx"
# Change into the workspace directory to avoid unresolved relative paths
-os.chdir(worksapce)
+os.chdir(workspace)
#################################################### training settings #################################################
# Create the 电话帮 (dhb) object with default parameters
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15)
# To adjust the feature set, simply assign dhb.features = ... here
+# dhb.features = ['overview_ntdun_call_duration_above60',
+#                 'overview_dun_call_total_duration', 'overview_dun_call_tel_total_nums',
+#                 'overview_ntdun_call_tel_total_nums',
+#                 'last_60_and_90_days_ntdun_call_in_duration',
+#                 'last_two_weeks_ntdun_call_in_duration',
+#                 'last_two_weeks_dun_call_in_duration',
+#                 'last_three_weeks_ntdun_call_in_duration',
+#                 'last_60_and_90_days_ntdun_call_avg_duration', 'dun_first_call_time',
+#                 'last_30_and_60_days_ntdun_call_total_duration',
+#                 'overview_ntdun_call_in_times',
+#                 'last_30_days_ntdun_call_tel_total_nums',
+#                 'last_60_and_90_days_ntdun_call_total_duration',
+#                 'overview_ntdun_call_total_duration',
+#                 'last_30_days_dun_call_total_duration',
+#                 'overview_ntdun_call_total_times', 'overview_ntdun_call_in_duration',
+#                 'last_30_days_ntdun_call_total_duration',
+#                 'last_30_days_dun_call_duration_between15_and_30',
+#                 'ntdun_last_call_time', 'last_30_and_60_days_ntdun_call_in_times',
+#                 'last_30_and_60_days_ntdun_call_duration_between15_and_30',
+#                 'last_30_days_dun_call_in_times',
+#                 'last_30_days_ntdun_call_duration_above60',
+#                 'last_30_days_dun_call_in_duration',
+#                 'overview_ntdun_call_duration_between15_and_30',
+#                 'last_30_and_60_days_dun_call_tel_total_nums',
+#                 'last_30_and_60_days_dun_call_in_duration']
# Extract the sample
#df_sample = dhb.dhb_features_extract()
# Here the sample is loaded directly from CSV instead
df_sample = pd.read_csv(sample_path, engine='python')
+#df_sample['target'] = df_sample.passdue_day.map(lambda x : 1 if x > 15 else 0)
# 电话帮 data preprocessing
# Custom method / default preprocessing method
@@ -59,9 +92,11 @@ df_sample = dhb.dhb_features_prepocessing(df_sample)
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                             random_state=7, split_methods='random',
                                                             time_label='applied_at')
+# delete df_sample to free memory
del df_sample
-# Model refit
+# Model refit; returns the model performance matrix and the fitted lgbm classifier
model_matrix, lgbm = refit.model_fit(df_train, df_val, df_test, dhb, target, score)
print(model_matrix)
@@ -69,6 +104,11 @@ print(model_matrix)
status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target, model_matrix,
                            score, prediction, report_path, report_name, applied_from, applied_type, topN=3)
+# Save the model as a pkl file
+joblib.dump(lgbm,'/home/public/dhb_refit'+time.strftime('%Y-%m-%d',time.localtime(time.time()))+'.pkl')
...
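A note on the persistence step: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23, so on recent versions the standalone joblib package should be imported directly. A minimal sketch of the dump/load round trip, assuming lgbm is the classifier fitted above and reusing the commit's date-stamped path scheme:

import time
import joblib  # standalone package; replaces sklearn.externals.joblib

# Serialize the fitted classifier under a date-stamped file name.
model_path = '/home/public/dhb_refit' + time.strftime('%Y-%m-%d') + '.pkl'
joblib.dump(lgbm, model_path)

# Later (e.g. in a scoring job) the model can be restored with:
lgbm_restored = joblib.load(model_path)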
@@ -15,6 +15,7 @@ import pandas as pd
def model_fit(df_train, df_val, df_test, model_obj, target, score):
    '''
+    instructions : refit the model with LightGBM
    :param df_train: training set
    :param df_val: validation set
@@ -109,10 +110,11 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
    document.add_paragraph('特征权重图')
    # Add the feature-importance image to the docx
-    document.add_picture(topnfeat_path)
+    document.add_picture(topnfeat_path,width=Inches(6))
    # New univar_chart section
    document.add_paragraph('univar_chart')
+    # Iterate over the target features and draw a univar chart for each
    for i in importanct_feat:
        # Training-set univar
@@ -133,6 +135,7 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
    # New PDP section
    document.add_paragraph('PDP_chart')
+    # Iterate over the target features and draw the corresponding PDPs
    for i in range(math.ceil(len(importanct_feat)/9)):
        # pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
@@ -143,8 +146,8 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
    # New lift-chart section
    document.add_paragraph('lift_chart')
    # Iterate over the given channels & segments; by default draw lift charts with equal-frequency bins
    try:
        lift_pred = datacal.cal_lift(df_test, score=prediction)
        lift_online = datacal.cal_lift(df_test, score=score)
@@ -157,14 +160,12 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
                                       tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=False,
                                       saved_path='./mvp/plots/cache/')
-        document.add_picture(liftChart, width=Inches(7))
+        document.add_picture(liftChart, width=Inches(5.5))
        # Iterate over the channels
        for channel in applied_from.keys():
            print('lift ',channel)
            # Slice the data
-            df_sliced = df_test[
-                df_test.applied_type.map(lambda x: True if str(x) in type.split(',') else False) & df_test.applied_from.map(
-                    lambda x: True if str(x) in channel.split(',') else False)]
+            df_sliced = df_test[df_test.applied_from.map(lambda x: True if str(x) in channel.split(',') else False)]
            #
            lift_pred = datacal.cal_lift(df_sliced, score=prediction)
            lift_online = datacal.cal_lift(df_sliced, score=score)
@@ -182,7 +183,7 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
        for type in applied_type.keys():
            print('lift ',type)
            # Slice the data
-            df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)]
+            df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False)]
            #
            lift_pred = datacal.cal_lift(df_sliced,score=prediction)
            lift_online = datacal.cal_lift(df_sliced,score=score)
@@ -195,7 +196,6 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
    except Exception as e:
        print(e)
        pass
-        # Some channels have very little volume, so the loop is wrapped in try/except error handling
    # Save the docx
...
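The two slicing fixes above decouple the loops: the channel loop now filters on applied_from only, and the type loop on applied_type only, instead of and-ing both predicates inside each loop. A self-contained sketch of the membership predicate, with toy data standing in for df_test (the column values are illustrative):

import pandas as pd

df_test = pd.DataFrame({
    'applied_from': [1, 333, 159537, 214],
    'applied_type': [1, 2, 3, 1],
})

channel = '1,214,217,198'                 # one key of the applied_from dict
ids = channel.split(',')

# Keep rows whose applied_from value appears in the channel's id list;
# str() makes the comparison robust to int-typed columns. The lambda is
# equivalent to the commit's verbose 'True if ... else False' form.
mask = df_test.applied_from.map(lambda x: str(x) in ids)
df_sliced = df_test[mask]
print(df_sliced)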