Commit 589bfcb3 authored by 王家华

Fixed several bugs in the plotting component and added support for multiple subplots

parent e511a80c
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (model_mvp)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.6 (model_mvp)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/model_mvp.iml" filepath="$PROJECT_DIR$/.idea/model_mvp.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
\ No newline at end of file
-def topN_feature_importance(classifier, clf, topN=20, model=lgb):
+import matplotlib.pyplot as plt
+def topN_feature_importance(classifier, clf, mode, topN=20):
    '''
    plot feature importance sequence
    '''
@@ -11,3 +15,4 @@ def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    plt.title("Feature Importances")
    plt.show()
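For reference, a minimal sketch of what a helper along these lines can look like. It assumes a trained LightGBM Booster and infers the save-path/return-path convention from how matplot.topN_feature_importance is called later in this commit; the function and argument names below are illustrative, not the repo's actual API.

import os
import numpy as np
import matplotlib.pyplot as plt

def topn_feature_importance_sketch(booster, feature_names, topn=20,
                                   title='Feature Importances',
                                   save_path='./plots/'):
    # hypothetical helper: plot the topN importances of a trained LightGBM Booster
    importances = booster.feature_importance(importance_type='split')
    order = np.argsort(importances)[::-1][:topn]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh([feature_names[i] for i in order][::-1],
            importances[order][::-1])
    ax.set_title(title)
    fig.tight_layout()

    os.makedirs(save_path, exist_ok=True)
    out = os.path.join(save_path, title + '.png')
    fig.savefig(out)   # caller embeds this file into the docx report
    plt.close(fig)
    return out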
@@ -44,8 +44,7 @@ def returnAUC(clf, training_set, validation_set, features, target='target'):
    return train_auc, val_auc
-def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target',
-               featureImportance_path = '../mvp/plots/', topN_featureImportance=20, featureImportance_title='lightgbm'):
+def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
@@ -70,8 +69,6 @@ def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'
    lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
    train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
-    matplot.topN_feature_importance(lgb, lgbm, title=featureImportance_title,
-                                    save_path = featureImportance_path, topN=topN_featureImportance)
    # auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
    return train_auc, val_auc, lgbm
@@ -102,12 +99,12 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
    print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
-    for deepth in np.arange(2, 7, 1):
-        for leaves in np.arange(2, 2 ** deepth, 2):
+    for deepth in np.arange(2, 4, 1):
+        for leaves in np.arange(2, 2 ** deepth, 4):
            params['max_depth'] = deepth
            params['num_leaves'] = leaves
            print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
-            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
+            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=30)
            # return max auc (best performance)
            auc_score = pd.Series(cv_result['auc-mean']).max()
            print('auc ', auc_score)
@@ -122,7 +119,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
            para['max_depth'] = deepth
            para['num_leaves'] = leaves
            optimal_para[topn.argmin()] = para
-    return optimal_para, topn
+    return optimal_para, list(topn)
# training_curve.append(train_auc)
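The tuning change above shrinks the grid (max_depth 2–3, num_leaves stepped by 4) to cut cross-validation time. A self-contained sketch of the same cv-driven grid-search pattern on synthetic data, assuming a LightGBM version where lgb.cv returns an 'auc-mean' list (as cv_result['auc-mean'] above implies); the dataset here is made up for illustration:

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=10, random_state=7)
lgb_train = lgb.Dataset(pd.DataFrame(X), label=y)

params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1}
results = []
for depth in np.arange(2, 4, 1):
    for leaves in np.arange(2, 2 ** depth, 4):
        params['max_depth'] = int(depth)
        params['num_leaves'] = int(leaves)
        cv = lgb.cv(params, lgb_train, seed=7, nfold=5)
        # best mean CV AUC across boosting rounds for this combination
        results.append((int(depth), int(leaves),
                        pd.Series(cv['auc-mean']).max()))

best = max(results, key=lambda r: r[2])
print('best max_depth=%d num_leaves=%d auc=%.4f' % best)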
@@ -168,7 +165,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
def predict(lgbm, df_test, features, target='target'):
    predictions = lgbm.predict(df_test[features])
-    auc = roc_auc_score(predictions, df_test[target])
+    auc = roc_auc_score(df_test[target], predictions)
    return predictions, auc
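The change above fixes the argument order: sklearn's signature is roc_auc_score(y_true, y_score). With a binary target, the swapped call does not silently compute a wrong number; it raises, because continuous scores are not valid labels. A quick illustration:

from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]

print(roc_auc_score(y_true, y_score))   # 0.75 -- correct order
# roc_auc_score(y_score, y_true) raises ValueError:
# continuous format is not supported for y_true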
@@ -6,85 +6,122 @@ from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool
+from sklearn.metrics import roc_auc_score
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# extract samples
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
-df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\dhb_loan_sample——2019-04-23.csv',engine='python')
+df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv',engine='python')
target = 'target'
score = 'score'
prediction = 'predict'
############################
# back up df_sample
-df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
+#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# default train/validation/test split
-df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify='target',
+df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                              random_state=7,split_methods='random',
                                                              time_label='applied_at')
del df_sample
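datacal.train_test_split_general is not shown in this diff; assuming split_methods='random' means two stratified random splits, an equivalent sketch with sklearn (the helper name below is hypothetical):

from sklearn.model_selection import train_test_split

def train_val_test_split(df, val_size=0.2, test_size=0.2,
                         stratify='target', random_state=7):
    # hypothetical stand-in for datacal.train_test_split_general
    df_rest, df_test = train_test_split(
        df, test_size=test_size,
        stratify=df[stratify], random_state=random_state)
    # rescale val_size so it stays a fraction of the full dataset
    val_frac = val_size / (1.0 - test_size)
    df_train, df_val = train_test_split(
        df_rest, test_size=val_frac,
        stratify=df_rest[stratify], random_state=random_state)
    return df_train, df_val, df_test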
# use cross-validation to get the optimal params (optimal_para) and the list of best CV-validation AUCs (topn)
-optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target='target',
+optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target=target,
                                                topN=3, cv_fold=5)
print('topn: AUC list from cross-validation on the training set ', topn)
# train the model with the new params (optimal_para); adds_on is the dict of params to override; output feature importance
-train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
-                                               adds_on=optimal_para, target='target')
-predictions, test_auc = lightgbm.predict(lgbm,df_test,features=dhb.features)
-df_test['predict'] = predictions
+# model matrix
+model_matrix_index = ['name','Params','trainAUC','validationAUC']
+model_matrix = pd.DataFrame(['NULL','NULL',roc_auc_score(df_train[target],df_train[score]),roc_auc_score(df_val[target],df_val[score])],index=model_matrix_index,columns=['online model'])
+pointer = 0
+for param in optimal_para:
+    train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
+                                                   adds_on=param, target=target)
+    model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])],axis=1)
+    pointer += 1
+# simply pick the params with the highest validation-set AUC
+best_params = model_matrix.T.sort_values(by='validationAUC',ascending=False).iloc[0,:].loc['Params']
+# retrain the model with the new params (best_params); adds_on is the dict of params to override; output feature importance
+train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
+                                               adds_on=best_params, target='target')
+# predict with the new model
+predictions, test_auc = lightgbm.predict(lgbm,df_test,dhb.features,target)
+# append the new predictions to the test set
+df_test[prediction] = predictions
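The model-matrix block above keeps candidates in a transposed, column-per-model frame. An arguably tidier equivalent that collects plain records and ranks them, reusing this script's own names (lightgbm.train_lgbm, optimal_para, df_train, etc.); the baseline 'online model' row is omitted for brevity:

import pandas as pd

# one record per candidate parameter set, ranked by validation AUC
records = []
for param in optimal_para:
    train_auc, val_auc, lgbm = lightgbm.train_lgbm(
        lightgbm.params_lgb, df_train, df_val, dhb.features,
        adds_on=param, target=target)
    records.append({'name': 'lightGBM', 'Params': param,
                    'trainAUC': train_auc, 'validationAUC': val_auc})

model_matrix = pd.DataFrame(records)
best_params = (model_matrix
               .sort_values('validationAUC', ascending=False)
               .iloc[0]['Params'])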
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
-applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
+applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
####################################
### report
# plot feature importance
-path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./plots/', topN=20)
+topnfeat_path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./mvp/plots/', topN=20)
# report file
report_path = "E:\\bla\\"
report_path = "E:/bla/model_mvp/"
report_name = "lgb_report.docx"
# create the docx Document
document = filetool.buildDocument(report_path, report_name)
# add a title to the docx
document.add_heading('lightGBM algorithm refit report')
-filetool.Document.add_paragraph('Feature importance plot')
+# add a feature-importance paragraph to the docx
+document.add_paragraph('Feature importance plot')
+# add the feature-importance image to the docx
+document.add_picture(topnfeat_path)
-filetool.add_picture(path)
+# add a univar_chart paragraph
+document.add_paragraph('univar_chart')
-filetool.Document.add_paragraph('univar_chart')
+# iterate over the target features and plot univar charts
+for i in dhb.features[:3]:
+    univar_train = datacal.cal_univar(df_train, i, target, qcut=10)
+    univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
+    univar_test = datacal.cal_univar(df_test, i, target, qcut=10)
+    univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
+                                          tab_df_list=None, plot_tab=True,
+                                          saved_path='./mvp/plots/cache/')
+    document.add_picture('./mvp/plots/cache/' + i +' univar Chart' + ".png")
-for i in dhb.features:
-    univar = datacal.cal_univar(df_train,score='raw_score')
-    univarChart = matplot.plot_table(univar,title= i +' univar Chart',saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i +' univar Chart')
+document.add_paragraph('PDP_chart')
+# iterate over the target features and plot the corresponding PDPs
+for i in dhb.features[:3]:
+    pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
+    pdpChart = matplot.plot_table(pdp,title= i +' PDP Chart',saved_path='./mvp/plots/cache/')
+    document.add_picture('./mvp/plots/cache/' + i +' PDP Chart' + ".png")
-for i in dhb.features:
-    pdp = datacal.cal_pdp(df_test,score='predict')
-    pdpChart = matplot.plot_table(pdp,title= i +' PDP Chart',saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i + ' PDP Chart')
-for i in dhb.features:
-    lift = datacal.cal_liftchart(df_test,score='predict')
-    liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./plots/cache')
-    filetool.add_picture("./plots/cache" + i + ' lift Chart')
-filetool.saveDocument(document, report_path, report_name)
+document.add_paragraph('lift_chart')
+# iterate over the given channels & customer segments; plot lift charts with default equal-frequency binning
+for channel in applied_from:
+    for type in applied_type:
+        df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)]
+        lift = datacal.cal_liftchart(df_sliced,score=prediction)
+        liftChart = matplot.plot_table(lift, title=channel + '_' + type + ' lift Chart',saved_path='./mvp/plots/cache/')
+        document.add_picture('./mvp/plots/cache/' + channel + '_' + type + ' lift Chart.png')
+filetool.saveDocument(document, report_path, report_name)
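filetool.buildDocument/saveDocument are not shown in this diff; assuming they are thin wrappers around python-docx, the report assembly reduces to something like the following sketch (the image path is illustrative; the un-wrapped calls are the real python-docx API):

from docx import Document

document = Document()   # buildDocument presumably creates or opens one of these
document.add_heading('lightGBM algorithm refit report')
document.add_paragraph('Feature importance plot')
document.add_picture('./mvp/plots/feature_importance.png')  # hypothetical path
document.save('E:/bla/model_mvp/lgb_report.docx')           # what saveDocument likely does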
@@ -4,26 +4,79 @@ import datetime
from sklearn.model_selection import train_test_split
-def liftchart(df,target='target',qcut=10,retbins=True):
+def cal_lift(df_list, score, target='target', qcut=10, retbin=False):
    '''
    instructions : return liftchart dataframe with qcut & pivot (overdue-rate liftchart)
    Params :
-    df - dataframe (must be the funded-loan set!!)
+    df - dataframe (must be the funded-loan set!!) list
    score - model score
    target - label column
    qcut - quantiles
    retbin - return the bin intervals when 'retbin' is True, else False
    :return:
-    liftchart dataframe
+    liftchart pivot
    '''
+    pivot = pd.DataFrame([])
+    if type(df_list) == pd.DataFrame:
+        df = df_list.copy()
+        # fill missing with -1
+        df.fillna(value=-1,inplace=True)
+        df = df[[score, target]]
+        # create a bins column
+        df_noneNA = [df[score] < 0]
+        df['bins'] = pd.qcut(df[score], q=qcut, precision=6, retbins=retbin, duplicates='drop')
+        pivot_tmp = df[['bins', target]].groupby('bins').agg(['mean', 'count'])
+        pivot = pd.concat([pivot, pivot_tmp], axis=1)
+    if type(df_list) == list:
+        print('none')
+        for df in df_list:
+            df = df.copy()
+            df = df[[score, target]]
+            # create a bins column
+            df['bins'] = pd.qcut(df[score], q=qcut, precision=6, retbins=retbin, duplicates='drop')
+            pivot_tmp = df[['bins', target]].groupby('bins').agg(['mean', 'count'])
+            pivot = pd.concat([pivot, pivot_tmp], axis=1)
+    return pivot[target]
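A quick usage sketch of the qcut/groupby core of cal_lift on synthetic data, for anyone reviewing the change (the data here is made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
df = pd.DataFrame({'score': rng.random(1000)})
# synthetic overdue label, loosely correlated with the score
df['target'] = (rng.random(1000) < df['score']).astype(int)

df['bins'] = pd.qcut(df['score'], q=10, precision=6, duplicates='drop')
pivot = df[['bins', 'target']].groupby('bins').agg(['mean', 'count'])
print(pivot)   # overdue rate and count per score decile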
+def cal_univar(df, feature, target, qcut=10):
+    '''
+    instructions : return univar pivot
+    Params:
+    :param df: dataframe with the univariate feature & the label target (overdue label)
+    :param feature: single feature to bin
+    :param target: label column
+    :param qcut: N bins of equal frequency
+    :return: univar pivot
+    '''
-    df = df.copy()
-    # create a bins column
-    df['bins'] = pd.qcut(df, q=10, precision=6, retbins=False, duplicates='drop')
-    pivot = df[['bins','target']].groupby('bins').agg(['mean','count'])
-    return pivot
+    df = df[[feature, target]]
+    # fill missing with -1
+    df.fillna(value=-1,inplace=True)
+    df['bins'] = pd.qcut(df[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
+    pivot = df[[target,'bins']].groupby('bins').sum() / df[[target,'bins']].groupby('bins').count()
+    return pivot[target]
+def cal_pdp(df, score, feature, qcut=10):
+    '''
+    instructions : return pdp pivot
+    :param df: dataframe of the test set
+    :param score: score predicted by the model
+    :param feature: feature to bin
+    :param qcut: N bins of equal frequency
+    :return: pdp pivot
+    '''
+    df = df.copy()
+    df = df[[feature, score]]
+    df['bins'] = pd.qcut(df[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
+    pivot = df[[score,'bins']].groupby('bins').sum() / df[[score,'bins']].groupby('bins').count()
+    return pivot[score]
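Note that groupby('bins').sum() / groupby('bins').count() in cal_univar and cal_pdp is just the per-bin mean; a small self-contained check on synthetic data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
df = pd.DataFrame({'feature': rng.normal(size=1000)})
df['predict'] = 1 / (1 + np.exp(-df['feature']))   # synthetic model score

bins = pd.qcut(df['feature'], q=10, precision=6, duplicates='drop')
pdp = df.groupby(bins)['predict'].mean()   # identical to sum()/count()
print(pdp)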
@@ -184,45 +237,45 @@ def cal_accume(df,feature,target,bin=10,classes=[]):
    return df_out
-def cal_univar(df,feature,target,bin=10,classes=[]):
-    '''
-    group by classes; qcut feature into bins; then compute count, mean, sum per bin
-    :param df: dataframe
-    :param feature: feature in df.columns
-    :param target: in df.columns eg: count(target) mean(target)
-    :param bin: default=10
-    :param classes: grouping columns
-    :return:
-    '''
-    if df.shape[0]==0:
-        raise('no data')
-    columns=df.columns.tolist()
-    if target not in columns:
-        raise('not found %s' % target)
-    if feature not in columns:
-        raise('not found %s' % feature)
-
-    tmp=df.copy()
-    tmp[feature].fillna(-1, inplace=True)
-    # == bin split; feature may be non-numeric
-    try:
-        tmp[feature] = tmp[feature].astype(float)
-        feature_grid = cal_feature_grid(tmp, feature, bin)
-        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
-        tmp['grid'] = tmp['lbl'].cat.codes
-    except ValueError:
-        tmp['lbl']=tmp[feature]
-        tmp['grid']=tmp[feature]
-
-    if len(classes) > 0:
-        df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
-        df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
-        df_out=df_gp
-    else:
-        df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
-        df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
-        df_out = df_all
-    return df_out
+# def cal_univar(df,feature,target,bin=10,classes=[]):
+#     '''
+#     group by classes; qcut feature into bins; then compute count, mean, sum per bin
+#     :param df: dataframe
+#     :param feature: feature in df.columns
+#     :param target: in df.columns eg: count(target) mean(target)
+#     :param bin: default=10
+#     :param classes: grouping columns
+#     :return:
+#     '''
+#     if df.shape[0]==0:
+#         raise('no data')
+#     columns=df.columns.tolist()
+#     if target not in columns:
+#         raise('not found %s' % target)
+#     if feature not in columns:
+#         raise('not found %s' % feature)
+#
+#     tmp=df.copy()
+#     tmp[feature].fillna(-1, inplace=True)
+#     # == bin split; feature may be non-numeric
+#     try:
+#         tmp[feature] = tmp[feature].astype(float)
+#         feature_grid = cal_feature_grid(tmp, feature, bin)
+#         tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
+#         tmp['grid'] = tmp['lbl'].cat.codes
+#     except ValueError:
+#         tmp['lbl']=tmp[feature]
+#         tmp['grid']=tmp[feature]
+#
+#     if len(classes) > 0:
+#         df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
+#         df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
+#         df_out=df_gp
+#     else:
+#         df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
+#         df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
+#         df_out = df_all
+#     return df_out