Commit 37c70174 authored by linfang.wang's avatar linfang.wang

xgb report done

parent 75c387db
......@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999,-0.00001]))
return feature_grid
def cal_accume(df,feature,target,bin=10):
df_out=cal_univar(df,feature,target,bin)
df_out['acmCnt']=df_out['count'].cumsum()
df_out['acmEvent']=df_out['sum'].cumsum()
def cal_accume(df,feature,target,bin=10,classes=[]):
'''
Bin the feature into quantile intervals and, within each interval, compute the
mean, count and sum of the target, plus the cumulative count, cumulative number
of bad samples and cumulative bad rate.
:param df: input dataframe
:param feature: feature column to bin
:param target: binary label column
:param bin: number of quantile bins
:param classes: optional list of grouping columns (e.g. a train/test flag)
:return: per-bin dataframe with mean, count, sum, acmCnt, acmEvent, acmEventRate
'''
df_out=cal_univar(df,feature,target,bin,classes=classes)
# fall back to a plain cumulative sum when no grouping columns are given
grouped = df_out.groupby(classes) if classes else df_out
df_out['acmCnt']=grouped['count'].cumsum()
df_out['acmEvent']=grouped['sum'].cumsum()
df_out['acmEventRate']=df_out['acmEvent']/df_out['acmCnt']
return df_out
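For reference, a self-contained sketch of the per-group cumulative bad-rate step these lines perform, on synthetic per-bin data (the column values below are illustrative only):
import pandas as pd
per_bin = pd.DataFrame({
    'flag':  ['train set'] * 3 + ['test set'] * 3,  # grouping column (classes)
    'grid':  [0, 1, 2, 0, 1, 2],                    # bin index
    'count': [100, 100, 100, 50, 50, 50],           # samples per bin
    'sum':   [10, 20, 30, 5, 10, 15],               # bad samples per bin
})
per_bin['acmCnt'] = per_bin.groupby('flag')['count'].cumsum()
per_bin['acmEvent'] = per_bin.groupby('flag')['sum'].cumsum()
per_bin['acmEventRate'] = per_bin['acmEvent'] / per_bin['acmCnt']
print(per_bin)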
def cal_univar(df,feature,target,bin=10,classes=[]):
......
from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt
from data.analyis import datacal
import seaborn as sns
import pandas as pd
plt.rc('figure',figsize=(8,6))
font_options={
......@@ -11,6 +12,67 @@ font_options={
plt.rc('font',**font_options)
def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
# #== single-feature TODO: output pending
# df_fig1=pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
# values=['count'], aggfunc=['mean'])
plt.cla()
if classes !='':
df_out = datacal.cal_accume(df, x, y, bin, classes=[classes])
plt.subplot(2, 1,1)
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out,'grid','acmEventRate',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else :
df_out = datacal.cal_accume(df, x, y, bin)
plt.subplot(2, 1, 1)
draw_lineplot(df_out, 'grid','mean', title=title, xlabel=xlabel, ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out, 'grid','acmEventRate', title=title, xlabel=xlabel, ylabel=ylabel)
plt.tight_layout()
# plt.show()
return plt
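A hedged usage sketch of liftchart, assuming this repo's data.graph module is importable and using synthetic data (the column names mirror how the report code calls it):
import numpy as np
import pandas as pd
from data.graph import drawplot  # module edited in this commit
rng = np.random.RandomState(0)
demo = pd.DataFrame({'predict_proba': rng.rand(500),
                     'y': rng.binomial(1, 0.2, 500),
                     'flag': rng.choice(['train set', 'test set'], 500)})
plot = drawplot.liftchart(demo, 'predict_proba', 'y', classes='flag', bin=10,
                          title='liftchart', xlabel='model score', ylabel='overdue rate')
plot.savefig('liftchart_demo.png')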
def univarchart(df,x,y,bin=10,classes='',title='',xlabel='',ylabel=''):
'''
Relationship between a feature and the label; y is the label column.
:param df:
:return:
'''
plt.cla()
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def pdpchart(df,x,y,bin=10,classes='',title='',xlabel='model score',ylabel='overdue rate'):
'''
Relationship between a feature and the model score (PDP-style chart); y is the predicted score column.
:param df:
:return:
'''
plt.cla()
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
'''
Dual y-axes
'''
......@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
'''
def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
def draw_barplot(df,x,y,hue='',title=''):
'''
:param df: dataframe
:param x: column used for the x-axis
......@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
return fig
def draw_lineplot(df,x,y,hue='',title=''):
def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
'''
:param df: dataframe
:param x: column used for the x-axis
......@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
'''
pltz = PyplotZ()
pltz.enable_chinese()
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# fig = plt.figure()
if hue != '':
for type in df[hue].unique().tolist():
# == plot one line per class
......@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
if xlabel !='':
pltz.xlabel(xlabel)
else:
pltz.xlabel(x)
if ylabel !='':
pltz.ylabel(ylabel)
else:
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
plt.show()
return fig
\ No newline at end of file
# plt.show()
return plt
\ No newline at end of file
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
from sklearn import metrics
......@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:return:XGBClassifier
'''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=1,silent=True,objective='binary:logistic',
verbosity=0,silent=0,objective='binary:logistic',
booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
......@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
:param kfold:
:return:
'''
kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kflod,verbose=0,iid=True,refit=True)
# kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
#== train the model
grid_search.fit(dftrain[features].values,dftrain[resp].values)
grid_search.fit(dftrain[features],dftrain[resp])
#== get the best parameters
return grid_search
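A minimal standalone sketch of the same grid-search pattern on synthetic data (the parameter grid below is illustrative, not the report's actual grids):
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
rng = np.random.RandomState(7)
X = rng.rand(300, 5)
y = rng.randint(0, 2, 300)
clf = xgb.XGBClassifier(objective='binary:logistic', n_estimators=50)
search = GridSearchCV(clf, {'max_depth': [2, 3, 4]}, scoring='roc_auc', cv=3, refit=True)
search.fit(X, y)
print(search.best_params_, search.best_score_)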
def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_metric='auc',early_stopping_rounds=20):
def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
'''
Model training
:type useTrainCV: object
......@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
:return:
'''
if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=cv_folds,
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
clf.set_params(n_estimators=cvresult.shape[0])
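The two lines above use xgb.cv with early stopping to choose n_estimators; a self-contained sketch of that pattern on synthetic data:
import numpy as np
import xgboost as xgb
rng = np.random.RandomState(7)
X = rng.rand(400, 4)
y = rng.randint(0, 2, 400)
dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'binary:logistic', 'max_depth': 3, 'eta': 0.1}
cvresult = xgb.cv(params, dtrain, num_boost_round=200, nfold=5,
                  metrics='auc', early_stopping_rounds=20, verbose_eval=False)
print('chosen n_estimators:', cvresult.shape[0])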
......@@ -106,9 +105,9 @@ def predict(clf,df,features):
def auc(clf,df,features,label):
#== compute accuracy, AUC and other metrics
df=predict(clf,df,features)
accu=metrics.accuracy_score(df[label], df['predict'])
accu=metrics.accuracy_score(df[label].values, df['predict'].values)
auc=metrics.roc_auc_score(df[label],df['predict_proba'])
return dict({'accuracy':accu,'auc':auc})
return {'accuracy':accu,'auc':auc}
def featureImportance(clf,features):
......@@ -121,4 +120,6 @@ def featureImportance(clf,features):
# Print Feature Importance:
feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
feat_imp = feat_imp[feat_imp > 0]
feat_imp=feat_imp.to_frame().reset_index()
feat_imp.columns=['feature','weight']
return feat_imp
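A self-contained sketch of the fscore-to-dataframe conversion above (synthetic data; the feature names are illustrative):
import numpy as np
import pandas as pd
import xgboost as xgb
rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(300, 3), columns=['f_a', 'f_b', 'f_c'])
y = rng.randint(0, 2, 300)
clf = xgb.XGBClassifier(n_estimators=30, max_depth=2).fit(X, y)
feat_imp = pd.Series(clf.get_booster().get_fscore(), index=X.columns).sort_values(ascending=False)
feat_imp = feat_imp[feat_imp > 0].to_frame().reset_index()
feat_imp.columns = ['feature', 'weight']
print(feat_imp)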
import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from data.analyis import datacal
if __name__ == '__main__':
features=[
'third_data_source#xy_pan_newapplyAcredibility',
'third_data_source#xy_pan_newapplyAscore',
'third_data_source#xy_pan_newconsfinAavgAlimit',
'third_data_source#xy_pan_newconsfinAcredibility',
'third_data_source#xy_pan_newconsfinAcreditAlimit',
'third_data_source#xy_pan_newconsfinAmaxAlimit',
'third_data_source#xy_pan_newconsfinAorgAcountq',
'third_data_source#xy_pan_newconsfinAorgAcountx',
'third_data_source#xy_pan_newconsfinAproductAcount',
'third_data_source#xy_pan_newhistoryAfailAfee',
'third_data_source#xy_pan_newhistoryAsucAfee',
'third_data_source#xy_pan_newlatestAoneAmonthAfail',
'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
'third_data_source#xy_pan_newlatestAoneAmonthd',
'third_data_source#xy_pan_newlatestAoneAmonthj',
'third_data_source#xy_pan_newlatestAqueryAtime',
'third_data_source#xy_pan_newlatestAsixAmontha',
'third_data_source#xy_pan_newlatestAsixAmonthv',
'third_data_source#xy_pan_newlatestAthreeAmonthb',
'third_data_source#xy_pan_newlatestAthreeAmonthf',
'third_data_source#xy_pan_newloansAavgAlimit',
'third_data_source#xy_pan_newloansAcashAcount',
'third_data_source#xy_pan_newloansAcount',
'third_data_source#xy_pan_newloansAcredibilityh',
'third_data_source#xy_pan_newloansAcredibilitys',
'third_data_source#xy_pan_newloansAcreditAlimit',
'third_data_source#xy_pan_newloansAlatestAtime',
'third_data_source#xy_pan_newloansAlongAtime',
'third_data_source#xy_pan_newloansAmaxAlimit',
'third_data_source#xy_pan_newloansAorgAcounta',
'third_data_source#xy_pan_newloansAorgAcountg',
'third_data_source#xy_pan_newloansAoverdueAcount',
'third_data_source#xy_pan_newloansAproductAcount',
'third_data_source#xy_pan_newloansAscore',
'third_data_source#xy_pan_newloansAsettleAcount',
'third_data_source#xy_pan_newqueryAcashAcount',
'third_data_source#xy_pan_newqueryAfinanceAcount',
'third_data_source#xy_pan_newqueryAorgAcount',
'third_data_source#xy_pan_newqueryAsumAcount'
]
label='y'
df=pd.read_csv('test.csv')
dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
xgbreport.report(dftrain,dftest,features,label,'','tmp.doc')
\ No newline at end of file
......@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
from data.graph import drawplot
def report(dftrain,dftest,features,label,path,filename):
'''
dftrain and dftest must contain the columns applied_at, applied_channel and applied_type
:param dftrain:
:param dftest:
:param features:
:param label:
:param path:
:param filename:
:return:
'''
document=filetool.buildDocument(path,filename)
document.add_heading('xgboost algorithm run report')
clf=xgboost.buildClf()
......@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
min_child_weight=range(1,4,1)
document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label)
# gamma
gamma=[i/10 for i in range(0,5)]
document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
# subsample colsample_bytree
subsample=[0.8,0.9,1]
colsample_bytree=[0.8,0.9,1]
document, clf = tun_params(document, clf, dftrain, dftest,
{'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
# reg_alpha
reg_alpha=[0.001,0.01,0.1,1,10]
document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_alpha': reg_alpha}, features, label)
# reg_lambda
reg_lambda = [0.001, 0.01, 0.1, 1, 10]
document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_lambda': reg_lambda}, features, label)
# # gamma
# gamma=[i/10 for i in range(0,5)]
# document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
#
# # subsample colsample_bytree
# subsample=[0.8,0.9,1]
# colsample_bytree=[0.8,0.9,1]
# document, clf = tun_params(document, clf, dftrain, dftest,
# {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
#
# # reg_alpha
# reg_alpha=[0.001,0.01,0.1,1,10]
# document, clf = tun_params(document, clf, dftrain, dftest,
# {'reg_alpha': reg_alpha}, features, label)
#
# # reg_lambda
# reg_lambda = [0.001, 0.01, 0.1, 1, 10]
# document, clf = tun_params(document, clf, dftrain, dftest,
# {'reg_lambda': reg_lambda}, features, label)
#== generate the final model report: per-feature univariate plots, PDP and lift charts
dftrain=xgboost.predict(clf,dftrain,features)
dftest=xgboost.predict(clf,dftest,features)
featureimp=xgboost.featureImportance(clf,features).to_frame(name=['weight','feature'])
#== feature importance
featureimp=xgboost.featureImportance(clf,features)
fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance')
fig.savefig('tmp.png')
document.add_paragraph('Feature importance plot, top 10 features only')
document.add_picture('tmp.png')
#== relationship between model score and overdue rate
dftrain['flag']='train set'
dftest['flag']='test set'
drawplot.liftchart(pd.concat([dftrain,dftest]), 'predict_proba', label, bin=10, classes='flag', title='liftchart',
xlabel='model score', ylabel='overdue rate').savefig('tmp.png')
document.add_paragraph('Overall -- liftchart')
document.add_picture('tmp.png')
filetool.saveDocument(document,path,filename)
#== monthly view -- test set only
dftest=datacal.cal_month(dftest,'applied_at','applied_month')
drawplot.liftchart(dftest, 'predict_proba', label, bin=10, classes='applied_month', title='liftchart by month',
xlabel='model score', ylabel='overdue rate').savefig('tmp.png')
document.add_paragraph('Test set by month -- liftchart')
document.add_picture('tmp.png')
#== view by user type
drawplot.liftchart(dftest,'predict_proba',label,bin=10,classes='applied_type',title='liftchart by user type',xlabel='model score',ylabel='overdue rate').savefig('tmp.png')
document.add_paragraph('Test set by user type -- liftchart')
document.add_picture('tmp.png')
#== view by channel -- top 5 channels only
channels=dftest.applied_channel.value_counts()[:5].index
drawplot.liftchart(dftest[dftest.applied_channel.isin(channels)], 'predict_proba', label, bin=10, classes='applied_channel', title='liftchart by channel',
xlabel='model score', ylabel='overdue rate').savefig('tmp.png')
document.add_paragraph('Test set by channel -- liftchart')
document.add_picture('tmp.png')
#== univariate and PDP plots for each feature
for i in featureimp.feature.tolist():
drawplot.univarchart(dftest, i, label, bin=10, title='univariate %s' % i,
ylabel='overdue rate').savefig('tmp.png')
document.add_paragraph('univariate %s' % i)
document.add_picture('tmp.png')
#== pdp
drawplot.pdpchart(dftest, i, 'predict_proba', bin=10, title='pdp %s' % i,
ylabel='model score').savefig('tmp.png')
document.add_paragraph('pdp %s' % i)
document.add_picture('tmp.png')
filetool.saveDocument(document, path, filename)
......@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
grid_search = xgboost.automodelfit(clf, params,dftrain, features, label)
clf = grid_search.best_estimator_
document.add_paragraph('Model training parameters {}'.format(clf.get_xgb_params()))
clf = xgboost.modelfit(clf, dftrain, features, label)
#==
# clf = xgboost.modelfit(clf, dftrain, features, label)
document.add_paragraph('Best-parameter search details {}'.format(grid_search.cv_results_))
document.add_paragraph('Best parameters {}, best score {}'.format(grid_search.best_params_,grid_search.best_score_))
document.add_paragraph('Model on training set {}'.format(xgboost.auc(clf, dftrain, features, label)))
document.add_paragraph('Model on test set {}'.format(xgboost.auc(clf, dftest, features, label)))
document.add_paragraph('Model on training set {}'.format(xgboost.auc(grid_search, dftrain, features, label)))
document.add_paragraph('Model on test set {}'.format(xgboost.auc(grid_search, dftest, features, label)))
return document,clf
......