Commit 589bfcb3 authored by 王家华

Fixed several bugs in the plotting component and added support for multiple subplots.

parent e511a80c
.idea/misc.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (model_mvp)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
.idea/model_mvp.iml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (model_mvp)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
.idea/modules.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/model_mvp.iml" filepath="$PROJECT_DIR$/.idea/model_mvp.iml" />
</modules>
</component>
</project>
\ No newline at end of file
.idea/vcs.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
(Two file diffs are collapsed on the page and not shown here.)
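The collapsed diffs presumably hold the reworked plotting component itself. As a rough illustration of the multi-subplot support named in the commit message (the `plot_table_list` call in the refit script below fans a list of series out over subplots), here is a minimal sketch; the helper name, signature, and save convention are assumptions, not the component's actual code:

```python
import matplotlib.pyplot as plt

def plot_series_list(series_list, title='untitled', saved_path=None):
    # One subplot per pandas Series, side by side on a single figure.
    n = len(series_list)
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 4), squeeze=False)
    for ax, s in zip(axes[0], series_list):
        ax.plot(range(len(s)), s.values, marker='o')
        ax.set_xticks(range(len(s)))
        ax.set_xticklabels([str(i) for i in s.index], rotation=45, fontsize=7)
    fig.suptitle(title)
    if saved_path is not None:
        out = saved_path + title + '.png'
        fig.savefig(out, bbox_inches='tight')
        plt.close(fig)
        return out          # caller embeds the saved PNG in the report
    plt.show()
```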
-def topN_feature_importance(classifier, clf, topN=20, model=lgb):
+import matplotlib.pyplot as plt
+def topN_feature_importance(classifier, clf, mode, topN=20):
     '''
     plot feature importance sequence
     '''
@@ -11,3 +15,4 @@ def topN_feature_importance(classifier, clf, topN=20, model=lgb):
     plt.title("Feature Importances")
     plt.show()
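The hidden body of this helper presumably ranks the booster's importances and bar-plots the top N. A self-contained sketch of that idea against the LightGBM Booster API (internal details assumed):

```python
import matplotlib.pyplot as plt
import pandas as pd

def plot_topn_importance(booster, topN=20):
    # Booster.feature_importance() returns one value per training feature.
    imp = pd.Series(booster.feature_importance(importance_type='split'),
                    index=booster.feature_name()).sort_values().tail(topN)
    imp.plot(kind='barh', figsize=(8, 0.4 * topN))
    plt.title("Feature Importances")
    plt.tight_layout()
    plt.show()
```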
@@ -44,8 +44,7 @@ def returnAUC(clf, training_set, validation_set, features, target='target'):
     return train_auc, val_auc
-def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target',
-               featureImportance_path='../mvp/plots/', topN_featureImportance=20, featureImportance_title='lightgbm'):
+def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
     '''
     instructions : training lightgbm model with specified params
@@ -70,8 +69,6 @@ def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'
     lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
     train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
-    matplot.topN_feature_importance(lgb, lgbm, title=featureImportance_title,
-                                    save_path=featureImportance_path, topN=topN_featureImportance)
     # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
     return train_auc, val_auc, lgbm
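For reference, a minimal self-contained version of the train-and-evaluate cycle that `train_lgbm` wraps, including the `adds_on` override visible in its signature (a sketch against the pre-4.0 LightGBM API this repo already uses; Dataset handling details are assumed):

```python
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def train_and_eval(params, df_train, df_val, features, target='target', adds_on=None):
    if adds_on:
        params = dict(params, **adds_on)   # adds_on: dict of params to override
    lgb_train = lgb.Dataset(df_train[features], label=df_train[target])
    lgb_val = lgb.Dataset(df_val[features], label=df_val[target], reference=lgb_train)
    lgbm = lgb.train(params, lgb_train, valid_sets=[lgb_val], verbose_eval=False)
    train_auc = roc_auc_score(df_train[target], lgbm.predict(df_train[features]))
    val_auc = roc_auc_score(df_val[target], lgbm.predict(df_val[features]))
    return train_auc, val_auc, lgbm
```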
@@ -102,12 +99,12 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
     print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
     optimal_para = list(topn)
-    for deepth in np.arange(2, 7, 1):
-        for leaves in np.arange(2, 2 ** deepth, 2):
+    for deepth in np.arange(2, 4, 1):
+        for leaves in np.arange(2, 2 ** deepth, 4):
             params['max_depth'] = deepth
             params['num_leaves'] = leaves
             print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
-            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
+            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=30)
             # return max auc (best performance)
             auc_score = pd.Series(cv_result['auc-mean']).max()
             print('auc ', auc_score)
@@ -122,7 +119,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
                 para['max_depth'] = deepth
                 para['num_leaves'] = leaves
                 optimal_para[topn.argmin()] = para
-    return optimal_para, topn
+    return optimal_para, list(topn)
     # training_curve.append(train_auc)
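Condensed, the tuning loop this hunk narrows looks like the sketch below; it assumes `params` sets `metric='auc'` (so `lgb.cv` reports `'auc-mean'`) and uses the shrunken grid introduced by this commit:

```python
import lightgbm as lgb
import numpy as np
import pandas as pd

def grid_search_depth_leaves(params, lgb_train, cv_fold=5):
    # Joint grid over tree depth and leaf count; num_leaves < 2**max_depth
    # by construction, so every combination is feasible.
    best_auc, best_params = -np.inf, None
    for depth in np.arange(2, 4, 1):
        for leaves in np.arange(2, 2 ** depth, 4):
            trial = dict(params, max_depth=int(depth), num_leaves=int(leaves))
            cv_result = lgb.cv(trial, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
            auc = pd.Series(cv_result['auc-mean']).max()
            if auc > best_auc:
                best_auc, best_params = auc, trial
    return best_params, best_auc
```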
@@ -168,7 +165,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
 def predict(lgbm, df_test, features, target='target'):
     predictions = lgbm.predict(df_test[features])
-    auc = roc_auc_score(predictions, df_test[target])
+    auc = roc_auc_score(df_test[target], predictions)
     return predictions, auc
...
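The one-line change in `predict` puts `roc_auc_score`'s arguments in the documented order: ground-truth labels first, predicted scores second.

```python
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]

print(roc_auc_score(y_true, y_score))   # 0.75: labels first, scores second
# Swapping the arguments treats continuous scores as labels, which either
# raises an error or yields a meaningless number depending on the inputs.
```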
@@ -6,85 +6,122 @@ from models_kit import xgboost
 import lightgbm as lgb
 from graph import matplot
 from tools import filetool
+from sklearn.metrics import roc_auc_score
 dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15)
 # extract the sample
 #df_sample = dhb.dhb_features_extract()
 ######### temp #############
 import pandas as pd
-df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\dhb_loan_sample——2019-04-23.csv', engine='python')
+df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv', engine='python')
+target = 'target'
+score = 'score'
+prediction = 'predict'
 ############################
 # back up df_sample
-df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
+#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
 # default train/validation/test split
-df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify='target',
+df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                              random_state=7, split_methods='random',
                                                              time_label='applied_at')
 del df_sample
 # use cross-validation to get the optimal params (optimal_para) and the list (topn) of each candidate's best CV-validation AUC
-optimal_para, topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target='target',
+optimal_para, topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target=target,
                                                 topN=3, cv_fold=5)
 print('topn: AUCs obtained via cross-validation on train ', topn)
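`train_test_split_general` is project code, but the random split it performs here can be pictured as two chained sklearn splits; a minimal sketch under that assumption (the real helper also supports a time-ordered split via `time_label`):

```python
from sklearn.model_selection import train_test_split

def split_train_val_test(df, target='target', val_size=0.2, test_size=0.2, random_state=7):
    # First carve out the test set, stratifying on the label...
    df_rest, df_test = train_test_split(df, test_size=test_size,
                                        stratify=df[target], random_state=random_state)
    # ...then split the remainder into train/validation, re-scaling val_size
    # so it stays a fraction of the original frame (0.2 / 0.8 = 0.25 here).
    rel_val = val_size / (1.0 - test_size)
    df_train, df_val = train_test_split(df_rest, test_size=rel_val,
                                        stratify=df_rest[target], random_state=random_state)
    return df_train, df_val, df_test
```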
-# train the model with the new params (optimal_para); adds_on is the dict of params to override; outputs feature importance
-train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
-                                               adds_on=optimal_para, target='target')
-predictions, test_auc = lightgbm.predict(lgbm, df_test, features=dhb.features)
-df_test['predict'] = predictions
+# model matrix
+model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
+model_matrix = pd.DataFrame(['NULL', 'NULL', roc_auc_score(df_train[target], df_train[score]),
+                             roc_auc_score(df_val[target], df_val[score])],  # validation AUC of the online score
+                            index=model_matrix_index, columns=['线上模型'])
+pointer = 0
+for param in optimal_para:
+    train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
+                                                   adds_on=param, target=target)
+    model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc],
+                                                         index=model_matrix_index, columns=[pointer])], axis=1)
+    pointer += 1
+# simply pick the params with the highest validation-set AUC
+best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
+# retrain with the best params; adds_on is the dict of params to override
+train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
+                                               adds_on=best_params, target=target)
+# predict with the new model
+predictions, test_auc = lightgbm.predict(lgbm, df_test, dhb.features, target)
+# append the new predictions to the test set
+df_test[prediction] = predictions
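The transpose-and-sort at the end works, but the same bookkeeping reads more naturally with one row per candidate model. A sketch reusing the variables from the script above (not part of the commit):

```python
import pandas as pd

rows = []
for param in optimal_para:
    train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val,
                                                   dhb.features, adds_on=param, target=target)
    rows.append({'name': 'lightGBM', 'Params': param,
                 'trainAUC': train_auc, 'validationAUC': val_auc})

model_matrix = pd.DataFrame(rows)
# one column per field, one row per model: no transpose needed to sort
best_params = model_matrix.sort_values('validationAUC', ascending=False).iloc[0]['Params']
```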
 ####### allocator cache ############
 applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
-applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
+applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
 ####################################
 ### report
 # plot feature importance
-path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./plots/', topN=20)
+topnfeat_path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./mvp/plots/', topN=20)
 # report file
-report_path = "E:\\bla\\"
+report_path = "E:/bla/model_mvp/"
 report_name = "lgb_report.docx"
+# build the docx Document
 document = filetool.buildDocument(report_path, report_name)
+# add the report title
 document.add_heading('lightGBM algorithm refit report')
-filetool.Document.add_paragraph('feature importance chart')
-filetool.add_picture(path)
-filetool.Document.add_paragraph('univar_chart')
-for i in dhb.features:
-    univar = datacal.cal_univar(df_train, score='raw_score')
-    univarChart = matplot.plot_table(univar, title=i + ' univar Chart', saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i + ' univar Chart')
-for i in dhb.features:
-    pdp = datacal.cal_pdp(df_test, score='predict')
-    pdpChart = matplot.plot_table(pdp, title=i + ' PDP Chart', saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i + ' PDP Chart')
-for i in dhb.features:
-    lift = datacal.cal_liftchart(df_test, score='predict')
-    liftChart = matplot.plot_table(lift, title=i + ' lift Chart', saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i + ' lift Chart')
+# feature-importance section
+document.add_paragraph('feature importance chart')
+# embed the feature-importance image
+document.add_picture(topnfeat_path)
+# univar_chart section
+document.add_paragraph('univar_chart')
+# draw a univar chart for each target feature
+for i in dhb.features[:3]:
+    univar_train = datacal.cal_univar(df_train, i, target, qcut=10)
+    univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
+    univar_test = datacal.cal_univar(df_test, i, target, qcut=10)
+    univarChart = matplot.plot_table_list([univar_train, univar_val, univar_test], [1, 2, 3],
+                                          datalist_description=None, title=i + ' univar Chart',
+                                          X_label=None, y_label=None,
+                                          tab_df_list=None, plot_tab=True,
+                                          saved_path='./mvp/plots/cache/')
+    document.add_picture('./mvp/plots/cache/' + i + ' univar Chart' + '.png')
+document.add_paragraph('PDP_chart')
+# draw the corresponding PDP for each target feature
+for i in dhb.features[:3]:
+    pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
+    pdpChart = matplot.plot_table(pdp, title=i + ' PDP Chart', saved_path='./mvp/plots/cache/')
+    document.add_picture('./mvp/plots/cache/' + i + ' PDP Chart' + '.png')
+document.add_paragraph('lift_chart')
+# draw a lift chart per channel & population slice (default equal-frequency bins)
+for channel in applied_from:
+    for type in applied_type:
+        df_sliced = df_test[df_test.applied_type.map(lambda x: str(x) in type.split(',')) &
+                            df_test.applied_from.map(lambda x: str(x) in channel.split(','))]
+        lift = datacal.cal_liftchart(df_sliced, score=prediction)
+        chart_title = applied_from[channel] + '_' + applied_type[type] + ' lift Chart'
+        liftChart = matplot.plot_table(lift, title=chart_title, saved_path='./mvp/plots/cache/')
+        document.add_picture('./mvp/plots/cache/' + chart_title + '.png')
 filetool.saveDocument(document, report_path, report_name)
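`buildDocument`/`saveDocument` are project helpers, but the `add_heading`/`add_paragraph`/`add_picture` calls match python-docx's Document API, so the report skeleton can be pictured as follows (an assumption about what the helpers wrap):

```python
import os
from docx import Document

report_path = "E:/bla/model_mvp/"      # paths from the script above
report_name = "lgb_report.docx"

document = Document()
document.add_heading('lightGBM algorithm refit report')
document.add_paragraph('feature importance chart')
# document.add_picture(topnfeat_path)  # embed a previously saved PNG

os.makedirs(report_path, exist_ok=True)
document.save(os.path.join(report_path, report_name))
```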
...
@@ -4,25 +4,78 @@ import datetime
 from sklearn.model_selection import train_test_split
-def liftchart(df, target='target', qcut=10, retbins=True):
+def cal_lift(df_list, score, target='target', qcut=10, retbin=False):
     '''
     instructions : return liftchart dataframe with qcut & pivot (overdue-rate lift chart)
     Params :
-        df - dataframe (note: must be the funded-loan set!!)
+        df_list - dataframe (note: must be the funded-loan set!!) or a list of dataframes
+        score - model score column
         target - label column
         qcut - quantiles
-        retbins - return bins interval when 'retbins' is True, else False
+        retbin - return bins interval when 'retbin' is True, else False
     :return:
-        liftchart dataframe
+        liftchart pivot
     '''
-    df = df.copy()
-    # create a bins column
-    df['bins'] = pd.qcut(df, q=10, precision=6, retbins=False, duplicates='drop')
-    pivot = df[['bins','target']].groupby('bins').agg(['mean','count'])
-    return pivot
+    pivot = pd.DataFrame([])
+    if type(df_list) == pd.DataFrame:
+        df = df_list.copy()
+        # fill in missing with -1
+        df.fillna(value=-1, inplace=True)
+        df = df[[score, target]]
+        # create a bins column
+        df['bins'] = pd.qcut(df[score], q=qcut, precision=6, retbins=retbin, duplicates='drop')
+        pivot_tmp = df[['bins', target]].groupby('bins').agg(['mean', 'count'])
+        pivot = pd.concat([pivot, pivot_tmp], axis=1)
+    if type(df_list) == list:
+        for df in df_list:
+            df = df.copy()
+            df = df[[score, target]]
+            # create a bins column
+            df['bins'] = pd.qcut(df[score], q=qcut, precision=6, retbins=retbin, duplicates='drop')
+            pivot_tmp = df[['bins', target]].groupby('bins').agg(['mean', 'count'])
+            pivot = pd.concat([pivot, pivot_tmp], axis=1)
+    return pivot[target]
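A quick check of `cal_lift` on synthetic data (numbers made up): each row of the result is one score decile with the mean overdue rate and the count in that bin, so a well-ordered model shows a roughly monotone 'mean' column.

```python
import numpy as np
import pandas as pd

rng = np.random.RandomState(7)
df = pd.DataFrame({'predict': rng.rand(1000)})
# higher scores imply higher overdue probability, so the lift should be monotone
df['target'] = (rng.rand(1000) < df['predict']).astype(int)

lift = cal_lift(df, score='predict', target='target', qcut=10)
print(lift)   # mean/count of target per score decile
```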
+
+def cal_univar(df, feature, target, qcut=10):
+    '''
+    instructions : return univar pivot
+    Params:
+    :param df: dataframe with the single variable & the overdue label
+    :param feature: single feature to bin
+    :param target: label column
+    :param qcut: number of equal-frequency bins
+    :return: univar pivot (mean of target per feature bin)
+    '''
+    df = df.copy()
+    df = df[[feature, target]]
+    # fill missing with -1
+    df.fillna(value=-1, inplace=True)
+    df['bins'] = pd.qcut(df[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
+    pivot = df[[target, 'bins']].groupby('bins').sum() / df[[target, 'bins']].groupby('bins').count()
+    return pivot[target]
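The sum/count ratio above is just the per-bin mean, so an equivalent and slightly cheaper formulation is a single `groupby(...).mean()`; a small demonstration:

```python
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x': rng.rand(500)})
df['target'] = (rng.rand(500) < 0.2).astype(int)

df['bins'] = pd.qcut(df['x'], q=10, precision=6, duplicates='drop')
ratio = df.groupby('bins')['target'].sum() / df.groupby('bins')['target'].count()
mean = df.groupby('bins')['target'].mean()
assert np.allclose(ratio, mean)   # sum/count per bin equals the per-bin mean
```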
+
+def cal_pdp(df, score, feature, qcut=10):
+    '''
+    instructions : return pdp pivot
+    :param df: dataframe of the test set
+    :param score: score predicted by the model
+    :param feature: feature to bin
+    :param qcut: number of equal-frequency bins
+    :return: pdp pivot (mean score per feature bin)
+    '''
+    df = df.copy()
+    df = df[[feature, score]]
+    df['bins'] = pd.qcut(df[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
+    pivot = df[[score, 'bins']].groupby('bins').sum() / df[[score, 'bins']].groupby('bins').count()
+    return pivot[score]
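Together, `cal_univar` gives the actual overdue rate per feature bin while `cal_pdp` gives the mean predicted score per bin; when the model has learned a feature, the two curves should trend together. A usage sketch reusing `df_test` from the refit script (the feature name is hypothetical):

```python
uni = cal_univar(df_test, feature='history_overdue_cnt', target='target', qcut=10)
pdp = cal_pdp(df_test, score='predict', feature='history_overdue_cnt', qcut=10)
# if the model has learned the feature, the two curves should trend together
```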
@@ -184,45 +237,45 @@ def cal_accume(df,feature,target,bin=10,classes=[]):
     return df_out

-def cal_univar(df, feature, target, bin=10, classes=[]):
-    '''
-    group by `classes`, cut `feature` into `bin` quantiles, then count/mean/sum of `target` per quantile
-    :param df: dataframe
-    :param feature: feature in df.columns
-    :param target: in df.columns  eg: count(target) mean(target)
-    :param bins: default = 10
-    :param classes: grouping columns
-    :return:
-    '''
-    if df.shape[0] == 0:
-        raise('no data')
-    columns = df.columns.tolist()
-    if target not in columns:
-        raise('not found %s' % target)
-    if feature not in columns:
-        raise('not found %s' % feature)
-
-    tmp = df.copy()
-    tmp[feature].fillna(-1, inplace=True)
-    # == bin split; feature may be non-numeric
-    try:
-        tmp[feature] = tmp[feature].astype(float)
-        feature_grid = cal_feature_grid(tmp, feature, bin)
-        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
-        tmp['grid'] = tmp['lbl'].cat.codes
-    except ValueError:
-        tmp['lbl'] = tmp[feature]
-        tmp['grid'] = tmp[feature]
-
-    if len(classes) > 0:
-        df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean', 'sum']}).reset_index()
-        df_gp.columns = classes+['grid', 'lbl', 'count', 'mean', 'sum']
-        df_out = df_gp
-    else:
-        df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean', 'sum']}).reset_index()
-        df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
-        df_out = df_all
-    return df_out
+# def cal_univar(df, feature, target, bin=10, classes=[]):
+#     '''
+#     group by `classes`, cut `feature` into `bin` quantiles, then count/mean/sum of `target` per quantile
+#     :param df: dataframe
+#     :param feature: feature in df.columns
+#     :param target: in df.columns  eg: count(target) mean(target)
+#     :param bins: default = 10
+#     :param classes: grouping columns
+#     :return:
+#     '''
+#     if df.shape[0] == 0:
+#         raise('no data')
+#     columns = df.columns.tolist()
+#     if target not in columns:
+#         raise('not found %s' % target)
+#     if feature not in columns:
+#         raise('not found %s' % feature)
+#
+#     tmp = df.copy()
+#     tmp[feature].fillna(-1, inplace=True)
+#     # == bin split; feature may be non-numeric
+#     try:
+#         tmp[feature] = tmp[feature].astype(float)
+#         feature_grid = cal_feature_grid(tmp, feature, bin)
+#         tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
+#         tmp['grid'] = tmp['lbl'].cat.codes
+#     except ValueError:
+#         tmp['lbl'] = tmp[feature]
+#         tmp['grid'] = tmp[feature]
+#
+#     if len(classes) > 0:
+#         df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean', 'sum']}).reset_index()
+#         df_gp.columns = classes+['grid', 'lbl', 'count', 'mean', 'sum']
+#         df_out = df_gp
+#     else:
+#         df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean', 'sum']}).reset_index()
+#         df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
+#         df_out = df_all
+#     return df_out
...