Commit 37c70174 authored by linfang.wang

xgb report done

parent 75c387db
...@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10): ...@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999,-0.00001])) feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999,-0.00001]))
return feature_grid return feature_grid
def cal_accume(df, feature, target, bin=10, classes=None):
    '''
    Bin ``feature`` and append cumulative statistics to the per-bin table.

    The per-bin table is produced by ``cal_univar``; this function relies on
    it containing ``count`` and ``sum`` columns and adds:

    * ``acmCnt``       - running total of ``count``
    * ``acmEvent``     - running total of ``sum`` (number of bad samples)
    * ``acmEventRate`` - ``acmEvent / acmCnt`` (bad-sample ratio)

    :param df: input DataFrame
    :param feature: name of the column to bin
    :param target: name of the label column
    :param bin: number of bins, forwarded to ``cal_univar``
    :param classes: optional list of grouping columns; when given, the
        running totals restart for every group. ``None`` or an empty list
        means no grouping.
    :return: the ``cal_univar`` table with the three cumulative columns added
    '''
    # Avoid a shared mutable default; an empty list still means "no grouping".
    group_cols = classes if classes else []
    df_out = cal_univar(df, feature, target, bin, classes=group_cols)
    if group_cols:
        grouped = df_out.groupby(group_cols)
        df_out['acmCnt'] = grouped['count'].cumsum()
        df_out['acmEvent'] = grouped['sum'].cumsum()
    else:
        # pandas raises "No group keys passed!" for groupby([]), so the
        # ungrouped case accumulates over the whole table directly.
        df_out['acmCnt'] = df_out['count'].cumsum()
        df_out['acmEvent'] = df_out['sum'].cumsum()
    df_out['acmEventRate'] = df_out['acmEvent'] / df_out['acmCnt']
    return df_out
def cal_univar(df,feature,target,bin=10,classes=[]): def cal_univar(df,feature,target,bin=10,classes=[]):
......
from pyplotz.pyplotz import PyplotZ from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt from pyplotz.pyplotz import plt
from data.analyis import datacal
import seaborn as sns import seaborn as sns
import pandas as pd
plt.rc('figure',figsize=(8,6)) plt.rc('figure',figsize=(8,6))
font_options={ font_options={
...@@ -11,6 +12,67 @@ font_options={ ...@@ -11,6 +12,67 @@ font_options={
plt.rc('font',**font_options) plt.rc('font',**font_options)
def liftchart(df, x, y, classes='', bin=10, title='', xlabel='', ylabel=''):
    '''
    Draw a two-panel lift chart on the current pyplot figure.

    The top panel plots the per-bin ``mean`` and the bottom panel the
    cumulative ``acmEventRate``, both against ``grid``, using the table
    returned by ``datacal.cal_accume``.

    :param df: input DataFrame
    :param x: column to bin (e.g. the model score)
    :param y: label column
    :param classes: optional column name; when non-empty one line is drawn
        per distinct value of that column
    :param bin: number of bins, forwarded to ``datacal.cal_accume``
    :param title: title used for both panels
    :param xlabel: x-axis label; ``draw_lineplot`` falls back to the column
        name when empty
    :param ylabel: y-axis label; same fallback as ``xlabel``
    :return: the ``plt`` module, so callers can chain ``.savefig(...)``
    '''
    # TODO: a per-class pivot-table output (count by lbl/grid) is still pending.
    # Clear the whole figure, not just the current axes: plt.cla() would leave
    # lines from an earlier chart in the other panel of the shared figure.
    plt.clf()
    group_cols = [classes] if classes != '' else []
    df_out = datacal.cal_accume(df, x, y, bin, classes=group_cols)
    for position, column in enumerate(('mean', 'acmEventRate'), start=1):
        plt.subplot(2, 1, position)
        # hue='' is draw_lineplot's "no grouping" value, so passing classes
        # through unchanged covers both the grouped and ungrouped case.
        draw_lineplot(df_out, 'grid', column, hue=classes, title=title,
                      xlabel=xlabel, ylabel=ylabel)
    plt.tight_layout()
    # The figure is not shown here; the caller saves it via the returned plt.
    return plt
def univarchart(df, x, y, bin=10, classes='', title='', xlabel='', ylabel=''):
    '''
    Plot the relationship between a feature and the label.

    Draws the per-bin ``mean`` against ``grid`` from the table returned by
    ``datacal.cal_univar`` on a single axes of the current pyplot figure.

    :param df: input DataFrame
    :param x: feature column to bin
    :param y: label column
    :param bin: number of bins, forwarded to ``datacal.cal_univar``
    :param classes: optional column name; when non-empty one line is drawn
        per distinct value of that column
    :param title: chart title
    :param xlabel: x-axis label; ``draw_lineplot`` falls back to the column
        name when empty
    :param ylabel: y-axis label; same fallback as ``xlabel``
    :return: the ``plt`` module, so callers can chain ``.savefig(...)``
    '''
    # Clear the whole figure: plt.cla() only resets the current axes, so
    # panels left over from a previous multi-panel chart would survive.
    plt.clf()
    plt.subplot(1, 1, 1)
    group_cols = [classes] if classes != '' else []
    df_out = datacal.cal_univar(df, x, y, bin, classes=group_cols)
    # hue='' is draw_lineplot's "no grouping" value.
    draw_lineplot(df_out, 'grid', 'mean', hue=classes, title=title,
                  xlabel=xlabel, ylabel=ylabel)
    # The figure is not shown here; the caller saves it via the returned plt.
    return plt
def pdpchart(df, x, y, bin=10, classes='', title='', xlabel='模型分', ylabel='逾期率'):
    '''
    Plot the per-bin mean of ``y`` against the binned feature ``x``.

    Uses the same ``datacal.cal_univar`` table and drawing as
    ``univarchart``; only the default axis labels differ. The reporting
    code passes the predicted probability as ``y``, which presumably makes
    this a partial-dependence style view of the model score (confirm
    against ``cal_univar``).

    :param df: input DataFrame
    :param x: feature column to bin
    :param y: column whose per-bin mean is plotted
    :param bin: number of bins, forwarded to ``datacal.cal_univar``
    :param classes: optional column name; when non-empty one line is drawn
        per distinct value of that column
    :param title: chart title
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :return: the ``plt`` module, so callers can chain ``.savefig(...)``
    '''
    # Clear the whole figure: plt.cla() only resets the current axes, so
    # panels left over from a previous multi-panel chart would survive.
    plt.clf()
    plt.subplot(1, 1, 1)
    group_cols = [classes] if classes != '' else []
    df_out = datacal.cal_univar(df, x, y, bin, classes=group_cols)
    # hue='' is draw_lineplot's "no grouping" value.
    draw_lineplot(df_out, 'grid', 'mean', hue=classes, title=title,
                  xlabel=xlabel, ylabel=ylabel)
    # The figure is not shown here; the caller saves it via the returned plt.
    return plt
''' '''
双坐标轴 双坐标轴
''' '''
...@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''): ...@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
''' '''
def draw_barplot(df,x,y,hue='',title='',path=None,filename=None): def draw_barplot(df,x,y,hue='',title=''):
''' '''
:param df: dataframe :param df: dataframe
:param x: 横坐标 :param x: 横坐标
...@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None): ...@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
return fig return fig
def draw_lineplot(df,x,y,hue='',title=''): def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
''' '''
:param df: dataframe :param df: dataframe
:param x: 横坐标 :param x: 横坐标
...@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''): ...@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
''' '''
pltz = PyplotZ() pltz = PyplotZ()
pltz.enable_chinese() pltz.enable_chinese()
fig = plt.figure() # fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if hue != '': if hue != '':
for type in df[hue].unique().tolist(): for type in df[hue].unique().tolist():
# == 画图 # == 画图
...@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''): ...@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
else: else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o') plt.plot(df[x], df[y], linestyle='dashed', marker='o')
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist()) # pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
if xlabel !='':
pltz.xlabel(xlabel)
else:
pltz.xlabel(x) pltz.xlabel(x)
if ylabel !='':
pltz.ylabel(ylabel)
else:
pltz.ylabel(y) pltz.ylabel(y)
pltz.title(title) pltz.title(title)
pltz.legend() pltz.legend()
plt.grid() plt.grid()
plt.show() # plt.show()
return fig return plt
\ No newline at end of file \ No newline at end of file
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import xgboost as xgb import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV,StratifiedKFold from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics from sklearn import metrics
def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0, def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1, min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5): scale_pos_weight=1, base_score=0.5):
...@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0, ...@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:return:XGBClassifier :return:XGBClassifier
''' '''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators, return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=1,silent=True,objective='binary:logistic', verbosity=0,silent=0,objective='binary:logistic',
booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight, booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree, max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight, reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
...@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc ...@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
:param kfold: :param kfold:
:return: :return:
''' '''
kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7) # kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kflod,verbose=0,iid=True,refit=True) grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
#== 模型训练 #== 模型训练
grid_search.fit(dftrain[features].values,dftrain[resp].values) grid_search.fit(dftrain[features],dftrain[resp])
#== 获取最优参数 #== 获取最优参数
return grid_search return grid_search
def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_metric='auc',early_stopping_rounds=20): def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
''' '''
模型训练 模型训练
:type useTrainCV: object :type useTrainCV: object
...@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m ...@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
:return: :return:
''' '''
if useTrainCV: if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params() xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values) xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=cv_folds, cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True) metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
clf.set_params(n_estimators=cvresult.shape[0]) clf.set_params(n_estimators=cvresult.shape[0])
...@@ -106,9 +105,9 @@ def predict(clf,df,features): ...@@ -106,9 +105,9 @@ def predict(clf,df,features):
def auc(clf, df, features, label):
    '''
    Score ``df`` with ``clf`` and report accuracy and ROC AUC.

    :param clf: fitted estimator, passed to ``predict``
    :param df: DataFrame holding the feature columns and the label column
    :param features: feature column names, passed to ``predict``
    :param label: name of the ground-truth label column
    :return: dict with keys ``'accuracy'`` and ``'auc'``
    '''
    # predict() is expected to add the 'predict' and 'predict_proba' columns.
    scored = predict(clf, df, features)
    accuracy = metrics.accuracy_score(scored[label].values, scored['predict'].values)
    roc_auc = metrics.roc_auc_score(scored[label], scored['predict_proba'])
    return {'accuracy': accuracy, 'auc': roc_auc}
def featureImportance(clf,features): def featureImportance(clf,features):
...@@ -121,4 +120,6 @@ def featureImportance(clf,features): ...@@ -121,4 +120,6 @@ def featureImportance(clf,features):
# Print Feature Importance: # Print Feature Importance:
feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last') feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
feat_imp = feat_imp[feat_imp > 0] feat_imp = feat_imp[feat_imp > 0]
feat_imp=feat_imp.to_frame().reset_index()
feat_imp.columns=['feature','weight']
return feat_imp return feat_imp
import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from data.analyis import datacal
if __name__ == '__main__':
    # Driver script: read the sample data, split it by application time and
    # generate the xgboost report document.

    # Model input columns.
    features = [
        'third_data_source#xy_pan_newapplyAcredibility',
        'third_data_source#xy_pan_newapplyAscore',
        'third_data_source#xy_pan_newconsfinAavgAlimit',
        'third_data_source#xy_pan_newconsfinAcredibility',
        'third_data_source#xy_pan_newconsfinAcreditAlimit',
        'third_data_source#xy_pan_newconsfinAmaxAlimit',
        'third_data_source#xy_pan_newconsfinAorgAcountq',
        'third_data_source#xy_pan_newconsfinAorgAcountx',
        'third_data_source#xy_pan_newconsfinAproductAcount',
        'third_data_source#xy_pan_newhistoryAfailAfee',
        'third_data_source#xy_pan_newhistoryAsucAfee',
        'third_data_source#xy_pan_newlatestAoneAmonthAfail',
        'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
        'third_data_source#xy_pan_newlatestAoneAmonthd',
        'third_data_source#xy_pan_newlatestAoneAmonthj',
        'third_data_source#xy_pan_newlatestAqueryAtime',
        'third_data_source#xy_pan_newlatestAsixAmontha',
        'third_data_source#xy_pan_newlatestAsixAmonthv',
        'third_data_source#xy_pan_newlatestAthreeAmonthb',
        'third_data_source#xy_pan_newlatestAthreeAmonthf',
        'third_data_source#xy_pan_newloansAavgAlimit',
        'third_data_source#xy_pan_newloansAcashAcount',
        'third_data_source#xy_pan_newloansAcount',
        'third_data_source#xy_pan_newloansAcredibilityh',
        'third_data_source#xy_pan_newloansAcredibilitys',
        'third_data_source#xy_pan_newloansAcreditAlimit',
        'third_data_source#xy_pan_newloansAlatestAtime',
        'third_data_source#xy_pan_newloansAlongAtime',
        'third_data_source#xy_pan_newloansAmaxAlimit',
        'third_data_source#xy_pan_newloansAorgAcounta',
        'third_data_source#xy_pan_newloansAorgAcountg',
        'third_data_source#xy_pan_newloansAoverdueAcount',
        'third_data_source#xy_pan_newloansAproductAcount',
        'third_data_source#xy_pan_newloansAscore',
        'third_data_source#xy_pan_newloansAsettleAcount',
        'third_data_source#xy_pan_newqueryAcashAcount',
        'third_data_source#xy_pan_newqueryAfinanceAcount',
        'third_data_source#xy_pan_newqueryAorgAcount',
        'third_data_source#xy_pan_newqueryAsumAcount',
    ]
    # Name of the target column in the CSV.
    label = 'y'

    df = pd.read_csv('test.csv')
    # Time-ordered split on 'applied_at' with a 0.8 train ratio.
    dftrain, dftest = datacal.split_train_val(
        df,
        trainsplit='timeSeries',
        trainsplitRatio=0.8,
        sort_col='applied_at',
    )
    # Empty path: the report file name is passed without a directory.
    xgbreport.report(dftrain, dftest, features, label, '', 'tmp.doc')
\ No newline at end of file
...@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt ...@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
from data.graph import drawplot from data.graph import drawplot
def report(dftrain,dftest,features,label,path,filename): def report(dftrain,dftest,features,label,path,filename):
'''
dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
:param dftrain:
:param dftest:
:param features:
:param label:
:param path:
:param filename:
:return:
'''
document=filetool.buildDocument(path,filename) document=filetool.buildDocument(path,filename)
document.add_heading('xgboost 算法运行报告') document.add_heading('xgboost 算法运行报告')
clf=xgboost.buildClf() clf=xgboost.buildClf()
...@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename): ...@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
min_child_weight=range(1,4,1) min_child_weight=range(1,4,1)
document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label) document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label)
# gamma # # gamma
gamma=[i/10 for i in range(0,5)] # gamma=[i/10 for i in range(0,5)]
document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label) # document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
#
# subsample colsample_bytree # # subsample colsample_bytree
subsample=[0.8,0.9,1] # subsample=[0.8,0.9,1]
colsample_bytree=[0.8,0.9,1] # colsample_bytree=[0.8,0.9,1]
document, clf = tun_params(document, clf, dftrain, dftest, # document, clf = tun_params(document, clf, dftrain, dftest,
{'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label) # {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
#
# reg_alpha # # reg_alpha
reg_alpha=[0.001,0.01,0.1,1,10] # reg_alpha=[0.001,0.01,0.1,1,10]
document, clf = tun_params(document, clf, dftrain, dftest, # document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_alpha': reg_alpha}, features, label) # {'reg_alpha': reg_alpha}, features, label)
#
# reg_lambda # # reg_lambda
reg_lambda = [0.001, 0.01, 0.1, 1, 10] # reg_lambda = [0.001, 0.01, 0.1, 1, 10]
document, clf = tun_params(document, clf, dftrain, dftest, # document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_lambda': reg_lambda}, features, label) # {'reg_lambda': reg_lambda}, features, label)
#==生成模型最后的报告,各个特征的单变量图,PDP,liftchart #==生成模型最后的报告,各个特征的单变量图,PDP,liftchart
dftrain=xgboost.predict(clf,dftrain,features) dftrain=xgboost.predict(clf,dftrain,features)
dftest=xgboost.predict(clf,dftest,features) dftest=xgboost.predict(clf,dftest,features)
featureimp=xgboost.featureImportance(clf,features).to_frame(name=['weight','feature']) #== 特征权重
featureimp=xgboost.featureImportance(clf,features)
fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance') fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance')
fig.savefig('tmp.png') fig.savefig('tmp.png')
document.add_paragraph('特征权重图,近前10个特征') document.add_paragraph('特征权重图,近前10个特征')
document.add_picture('tmp.png') document.add_picture('tmp.png')
#== 模型分同逾期率的关系图
dftrain['flag']='训练集'
dftest['flag']='测试集'
drawplot.liftchart(pd.concat([dftrain,dftest]), 'predict_proba', label, bin=10, classes='flag', title='liftchart',
xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
document.add_paragraph('整体--liftchart')
document.add_picture('tmp.png')
filetool.saveDocument(document,path,filename) #== 分月份查看-- 只看测试集
dftest=datacal.cal_month(dftest,'applied_at','applied_month')
drawplot.liftchart(dftest, 'predict_proba', label, bin=10, classes='applied_month', title='分月liftchart',
xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
document.add_paragraph('测试集分月--liftchart')
document.add_picture('tmp.png')
#== 分用户类型分月查看
drawplot.liftchart(dftest,'predict_proba',label,bin=10,classes='applied_type',title='分用户类型liftchart',xlabel='模型分',ylabel='逾期率').savefig('tmp.png')
document.add_paragraph('测试集分用户类型--liftchart')
document.add_picture('tmp.png')
#== 分渠道分月查看--取前5个渠道查看
channels=dftest.applied_channel.value_counts()[:5].index
drawplot.liftchart(dftest[dftest.applied_channel.isin(channels)], 'predict_proba', label, bin=10, classes='applied_channel', title='分渠道liftchart',
xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
document.add_paragraph('测试集分渠道--liftchart')
document.add_picture('tmp.png')
#== 各个特征的 单变量图 和 pdp 图
for i in featureimp.feature.tolist():
drawplot.univarchart(dftest, i, label, bin=10, title='单变量%s' % i,
ylabel='逾期率').savefig('tmp.png')
document.add_paragraph('单变量%s' % i)
document.add_picture('tmp.png')
#= pdp
drawplot.pdpchart(dftest, i, 'predict_proba', bin=10, title='pdp %s' % i,
ylabel='模型分').savefig('tmp.png')
document.add_paragraph('pdp %s' % i)
document.add_picture('tmp.png')
filetool.saveDocument(document, path, filename)
...@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label): ...@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
grid_search = xgboost.automodelfit(clf, params,dftrain, features, label) grid_search = xgboost.automodelfit(clf, params,dftrain, features, label)
clf = grid_search.best_estimator_ clf = grid_search.best_estimator_
document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params())) document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
clf = xgboost.modelfit(clf, dftrain, features, label) #==
# clf = xgboost.modelfit(clf, dftrain, features, label)
document.add_paragraph('寻找最优参数过程{}'.format(grid_search.cv_results_))
document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_,grid_search.best_score_)) document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_,grid_search.best_score_))
document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf, dftrain, features, label))) document.add_paragraph('模型训练集{}'.format(xgboost.auc(grid_search, dftrain, features, label)))
document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label))) document.add_paragraph('模型测试集{}'.format(xgboost.auc(grid_search, dftest, features, label)))
return document,clf return document,clf
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment