xgb report done

37c70174 · linfang.wang · 75c387db · 37c70174 · 37c70174 · 37c70174
Commit 37c70174 authored Apr 18, 2019 by linfang.wang
Showing with 276 additions and 25 deletions

datacal.py data/analyis/datacal.py +14 -4

drawplot.py data/graph/drawplot.py +76 -9

xgboost.py models/xgboost.py +13 -12

report.py mvp/report.py +52 -0

xgbreport.py mvp/xgbreport.py +121 -0

No files found.
--- a/data/analyis/datacal.py
+++ b/data/analyis/datacal.py
@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
        feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999,-0.00001]))
    return feature_grid
-def cal_accume(df,feature,target,bin=10):
-    df_out=cal_univar(df,feature,target,bin)
-    df_out['acmCnt']=df_out['count'].cumsum()
-    df_out['acmEvent']=df_out['sum'].cumsum()
+def cal_accume(df,feature,target,bin=10,classes=[]):
+    '''
+    Bin ``feature`` and add cumulative statistics of ``target`` to the
+    per-bin table produced by ``cal_univar``.
+
+    Three columns are appended, computed from the ``count`` and ``sum``
+    columns of that table:
+    ``acmCnt`` (running count), ``acmEvent`` (running sum of the target,
+    described in the original note as the number of bad samples) and
+    ``acmEventRate`` (``acmEvent / acmCnt``).
+
+    :param df: input DataFrame
+    :param feature: name of the column to bin
+    :param target: name of the label column
+    :param bin: bin setting, passed through to ``cal_univar``
+    :param classes: list of grouping column names; when non-empty the
+        running totals restart for every group. The list is never mutated.
+    :return: the ``cal_univar`` table with the cumulative columns added
+    '''
+    df_out=cal_univar(df,feature,target,bin,classes=classes)
+    # NOTE(review): the running totals follow the row order returned by
+    # cal_univar; presumably that is ascending bin order - confirm.
+    if classes:
+        # Accumulate inside each group separately.
+        df_out['acmCnt']=df_out.groupby(classes)['count'].cumsum()
+        df_out['acmEvent']=df_out.groupby(classes)['sum'].cumsum()
+    else:
+        # groupby([]) raises "No group keys passed!", so a call without
+        # grouping columns accumulates over the whole table instead.
+        df_out['acmCnt']=df_out['count'].cumsum()
+        df_out['acmEvent']=df_out['sum'].cumsum()
    df_out['acmEventRate']=df_out['acmEvent']/df_out['acmCnt']
+    return df_out
 def cal_univar(df,feature,target,bin=10,classes=[]):

--- a/data/graph/drawplot.py
+++ b/data/graph/drawplot.py
 from pyplotz.pyplotz import PyplotZ
 from pyplotz.pyplotz import plt
+from data.analyis import datacal
 import seaborn as sns
+import pandas as pd
 plt.rc('figure',figsize=(8,6))
 font_options={
@@ -11,6 +12,67 @@ font_options={
 plt.rc('font',**font_options)
+def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
+    '''
+    Draw a two-panel lift chart of ``y`` against the binned values of ``x``.
+
+    The top panel plots the per-bin ``mean`` column and the bottom panel the
+    cumulative ``acmEventRate`` column of the table returned by
+    ``datacal.cal_accume``; both use the ``grid`` column as the x axis.
+
+    :param df: input DataFrame
+    :param x: name of the column to bin (e.g. the model score)
+    :param y: name of the label column
+    :param classes: optional column name; when given, one line is drawn per
+        distinct value of that column
+    :param bin: bin setting, passed through to ``datacal.cal_accume``
+    :param title: title used for both panels
+    :param xlabel: x-axis label (falls back to the column name when empty)
+    :param ylabel: y-axis label (falls back to the column name when empty)
+    :return: the ``plt`` module, so the caller can chain ``.savefig(...)``;
+        the figure is not shown
+    '''
+    # TODO: single-table output is still pending, e.g.
+    # df_fig1=pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
+    #                values=['count'], aggfunc=['mean'])
+    # Clear the whole figure rather than only the current axes:
+    # plt.subplot(2, 1, 1) hands back an already existing top panel as is,
+    # so with plt.cla() the lines of a previous call would stay on it.
+    plt.clf()
+    # An empty ``classes`` means "no grouping" for both helpers.
+    group_cols=[classes] if classes !='' else []
+    df_out = datacal.cal_accume(df, x, y, bin, classes=group_cols)
+    for position,column in enumerate(('mean','acmEventRate'),start=1):
+        plt.subplot(2, 1, position)
+        draw_lineplot(df_out,'grid',column,hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
+    plt.tight_layout()
+    return plt
+def univarchart(df,x,y,bin=10,classes='',title='',xlabel='',ylabel=''):
+    '''
+    Plot the per-bin mean of the label ``y`` against the binned feature ``x``.
+
+    The table comes from ``datacal.cal_univar``; its ``grid`` column is the
+    x axis and its ``mean`` column the y axis.
+
+    :param df: input DataFrame
+    :param x: name of the feature column to bin
+    :param y: name of the label column
+    :param bin: bin setting, passed through to ``datacal.cal_univar``
+    :param classes: optional column name; when given, one line is drawn per
+        distinct value of that column
+    :param title: chart title
+    :param xlabel: x-axis label (falls back to the column name when empty)
+    :param ylabel: y-axis label (falls back to the column name when empty)
+    :return: the ``plt`` module, so the caller can chain ``.savefig(...)``;
+        the figure is not shown
+    '''
+    # Start from an empty figure: plt.cla() would only clear the current
+    # axes, and panels left behind by an earlier multi-panel chart may
+    # otherwise remain in the figure (depends on the matplotlib version).
+    plt.clf()
+    plt.subplot(1, 1, 1)
+    # An empty ``classes`` means "no grouping" for both helpers.
+    group_cols=[classes] if classes !='' else []
+    df_out = datacal.cal_univar(df, x, y, bin, classes=group_cols)
+    draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
+    return plt
+def pdpchart(df,x,y,bin=10,classes='',title='',xlabel='模型分',ylabel='逾期率'):
+    '''
+    Plot the per-bin mean of column ``y`` against the binned feature ``x``.
+
+    The computation is the same as in ``univarchart``; only the default axis
+    labels differ. The report passes the model score column as ``y``, so the
+    chart shows how the average score moves with the feature.
+    NOTE(review): this averages observed values per bin; it does not
+    re-score the model as a textbook partial-dependence plot would.
+
+    :param df: input DataFrame
+    :param x: name of the feature column to bin
+    :param y: name of the column whose per-bin mean is plotted
+    :param bin: bin setting, passed through to ``datacal.cal_univar``
+    :param classes: optional column name; when given, one line is drawn per
+        distinct value of that column
+    :param title: chart title
+    :param xlabel: x-axis label (falls back to the column name when empty)
+    :param ylabel: y-axis label (falls back to the column name when empty)
+    :return: the ``plt`` module, so the caller can chain ``.savefig(...)``;
+        the figure is not shown
+    '''
+    # Start from an empty figure: plt.cla() would only clear the current
+    # axes, and panels left behind by an earlier multi-panel chart may
+    # otherwise remain in the figure (depends on the matplotlib version).
+    plt.clf()
+    plt.subplot(1, 1, 1)
+    # An empty ``classes`` means "no grouping" for both helpers.
+    group_cols=[classes] if classes !='' else []
+    df_out = datacal.cal_univar(df, x, y, bin, classes=group_cols)
+    draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
+    return plt
 '''
 双坐标轴
 '''
@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
    '''
-def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
+def draw_barplot(df,x,y,hue='',title=''):
    '''
    :param df: dataframe
    :param x: 横坐标
@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
    return fig
-def draw_lineplot(df,x,y,hue='',title=''):
+def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
    '''
    :param df: dataframe
    :param x: 横坐标
@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
    '''
    pltz = PyplotZ()
    pltz.enable_chinese()
-    fig = plt.figure()
+    # fig = plt.figure()
-    ax = fig.add_subplot(1, 1, 1)
    if hue != '':
        for type in df[hue].unique().tolist():
            # == 画图
@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
    else:
        plt.plot(df[x], df[y], linestyle='dashed', marker='o')
    # pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
+    if xlabel !='':
+        pltz.xlabel(xlabel)
+    else:
        pltz.xlabel(x)
+    if ylabel !='':
+        pltz.ylabel(ylabel)
+    else:
        pltz.ylabel(y)
    pltz.title(title)
    pltz.legend()
    plt.grid()
-    plt.show()
+    # plt.show()
-    return fig
+    return plt
\ No newline at end of file
--- a/models/xgboost.py
+++ b/models/xgboost.py
 import pandas as pd
 import numpy as np
 import xgboost as xgb
-from sklearn.model_selection import KFold, train_test_split, GridSearchCV,StratifiedKFold
+from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn import metrics
 def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
                  min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
                  scale_pos_weight=1, base_score=0.5):
@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
    :return:XGBClassifier
    '''
    return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
-                             verbosity=1,silent=True,objective='binary:logistic',
+                             verbosity=0,silent=0,objective='binary:logistic',
                             booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
    :param kfold:
    :return:
    '''
-    kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
+    # kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
-    grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kflod,verbose=0,iid=True,refit=True)
+    grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
    #== 模型训练
-    grid_search.fit(dftrain[features].values,dftrain[resp].values)
+    grid_search.fit(dftrain[features],dftrain[resp])
    #== 获取最优参数
    return grid_search
-def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_metric='auc',early_stopping_rounds=20):
+def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
    '''
    模型训练
    :type useTrainCV: object
@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
    :return:
    '''
    if useTrainCV:
+        # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        xgb_param = clf.get_xgb_params()
        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
-        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=cv_folds,
+        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
            metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
        clf.set_params(n_estimators=cvresult.shape[0])
@@ -106,9 +105,9 @@ def predict(clf,df,features):
 def auc(clf,df,features,label):
    # Compute accuracy and ROC AUC of clf on df and return them in a dict.
    # predict() is expected to add the 'predict' and 'predict_proba' columns read below (TODO confirm).
    df=predict(clf,df,features)
-    accu=metrics.accuracy_score(df[label], df['predict'])
+    accu=metrics.accuracy_score(df[label].values, df['predict'].values)
    auc=metrics.roc_auc_score(df[label],df['predict_proba'])  # local name shadows this function inside its own body
-    return dict({'accuracy':accu,'auc':auc})
+    return {'accuracy':accu,'auc':auc}
 def featureImportance(clf,features):
@@ -121,4 +120,6 @@ def featureImportance(clf,features):
    # Print Feature Importance:
    feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
    feat_imp = feat_imp[feat_imp > 0]
+    feat_imp=feat_imp.to_frame().reset_index()
+    feat_imp.columns=['feature','weight']
    return feat_imp
--- a/mvp/report.py
+++ b/mvp/report.py
+import pandas as pd
+import numpy as np
+import datetime
+from mvp import xgbreport
+from data.analyis import datacal
+if __name__ == '__main__':  # script entry point: load test.csv, split it, and generate the xgboost report
+    features=[  # model input columns; every name carries the 'third_data_source#xy_pan_new' prefix
+        'third_data_source#xy_pan_newapplyAcredibility',
+        'third_data_source#xy_pan_newapplyAscore',
+        'third_data_source#xy_pan_newconsfinAavgAlimit',
+        'third_data_source#xy_pan_newconsfinAcredibility',
+        'third_data_source#xy_pan_newconsfinAcreditAlimit',
+        'third_data_source#xy_pan_newconsfinAmaxAlimit',
+        'third_data_source#xy_pan_newconsfinAorgAcountq',
+        'third_data_source#xy_pan_newconsfinAorgAcountx',
+        'third_data_source#xy_pan_newconsfinAproductAcount',
+        'third_data_source#xy_pan_newhistoryAfailAfee',
+        'third_data_source#xy_pan_newhistoryAsucAfee',
+        'third_data_source#xy_pan_newlatestAoneAmonthAfail',
+        'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
+        'third_data_source#xy_pan_newlatestAoneAmonthd',
+        'third_data_source#xy_pan_newlatestAoneAmonthj',
+        'third_data_source#xy_pan_newlatestAqueryAtime',
+        'third_data_source#xy_pan_newlatestAsixAmontha',
+        'third_data_source#xy_pan_newlatestAsixAmonthv',
+        'third_data_source#xy_pan_newlatestAthreeAmonthb',
+        'third_data_source#xy_pan_newlatestAthreeAmonthf',
+        'third_data_source#xy_pan_newloansAavgAlimit',
+        'third_data_source#xy_pan_newloansAcashAcount',
+        'third_data_source#xy_pan_newloansAcount',
+        'third_data_source#xy_pan_newloansAcredibilityh',
+        'third_data_source#xy_pan_newloansAcredibilitys',
+        'third_data_source#xy_pan_newloansAcreditAlimit',
+        'third_data_source#xy_pan_newloansAlatestAtime',
+        'third_data_source#xy_pan_newloansAlongAtime',
+        'third_data_source#xy_pan_newloansAmaxAlimit',
+        'third_data_source#xy_pan_newloansAorgAcounta',
+        'third_data_source#xy_pan_newloansAorgAcountg',
+        'third_data_source#xy_pan_newloansAoverdueAcount',
+        'third_data_source#xy_pan_newloansAproductAcount',
+        'third_data_source#xy_pan_newloansAscore',
+        'third_data_source#xy_pan_newloansAsettleAcount',
+        'third_data_source#xy_pan_newqueryAcashAcount',
+        'third_data_source#xy_pan_newqueryAfinanceAcount',
+        'third_data_source#xy_pan_newqueryAorgAcount',
+        'third_data_source#xy_pan_newqueryAsumAcount'
+    ]
+    label='y'  # name of the target column
+    df=pd.read_csv('test.csv')  # path is relative to the current working directory
+    dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')  # split keyed on 'applied_at' with ratio 0.8; exact semantics are defined in datacal.split_train_val (not visible here)
+    xgbreport.report(dftrain,dftest,features,label,'','tmp.doc')  # path='' and filename='tmp.doc' are forwarded to the report writer
\ No newline at end of file
--- a/mvp/xgboostreport.py
+++ b/mvp/xgboostreport.py
@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
 from data.graph import drawplot
 def report(dftrain,dftest,features,label,path,filename):
+    '''
+    dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
+    :param dftrain:
+    :param dftest:
+    :param features:
+    :param label:
+    :param path:
+    :param filename:
+    :return:
+    '''
    document=filetool.buildDocument(path,filename)
    document.add_heading('xgboost 算法运行报告')
    clf=xgboost.buildClf()
@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
    min_child_weight=range(1,4,1)
    document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label)
-    # gamma
+    # # gamma
-    gamma=[i/10 for i in range(0,5)]
+    # gamma=[i/10 for i in range(0,5)]
-    document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
+    # document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
+    #
-    # subsample colsample_bytree
+    # # subsample colsample_bytree
-    subsample=[0.8,0.9,1]
+    # subsample=[0.8,0.9,1]
-    colsample_bytree=[0.8,0.9,1]
+    # colsample_bytree=[0.8,0.9,1]
-    document, clf = tun_params(document, clf, dftrain, dftest,
+    # document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
+    #                            {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
+    #
-    # reg_alpha
+    # # reg_alpha
-    reg_alpha=[0.001,0.01,0.1,1,10]
+    # reg_alpha=[0.001,0.01,0.1,1,10]
-    document, clf = tun_params(document, clf, dftrain, dftest,
+    # document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'reg_alpha': reg_alpha}, features, label)
+    #                            {'reg_alpha': reg_alpha}, features, label)
+    #
-    # reg_lambda
+    # # reg_lambda
-    reg_lambda = [0.001, 0.01, 0.1, 1, 10]
+    # reg_lambda = [0.001, 0.01, 0.1, 1, 10]
-    document, clf = tun_params(document, clf, dftrain, dftest,
+    # document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'reg_lambda': reg_lambda}, features, label)
+    #                            {'reg_lambda': reg_lambda}, features, label)
    #==生成模型最后的报告，各个特征的单变量图，PDP，liftchart
    dftrain=xgboost.predict(clf,dftrain,features)
    dftest=xgboost.predict(clf,dftest,features)
-    featureimp=xgboost.featureImportance(clf,features).to_frame(name=['weight','feature'])
+    #== 特征权重
+    featureimp=xgboost.featureImportance(clf,features)
    fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance')
    fig.savefig('tmp.png')
    document.add_paragraph('特征权重图，近前10个特征')
    document.add_picture('tmp.png')
+    #== 模型分同逾期率的关系图
+    dftrain['flag']='训练集'
+    dftest['flag']='测试集'
+    drawplot.liftchart(pd.concat([dftrain,dftest]), 'predict_proba', label, bin=10, classes='flag', title='liftchart',
+                       xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
+    document.add_paragraph('整体--liftchart')
+    document.add_picture('tmp.png')
-    filetool.saveDocument(document,path,filename)
+    #== 分月份查看-- 只看测试集
+    dftest=datacal.cal_month(dftest,'applied_at','applied_month')
+    drawplot.liftchart(dftest, 'predict_proba', label, bin=10, classes='applied_month', title='分月liftchart',
+                       xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
+    document.add_paragraph('测试集分月--liftchart')
+    document.add_picture('tmp.png')
+    #== 分用户类型分月查看
+    drawplot.liftchart(dftest,'predict_proba',label,bin=10,classes='applied_type',title='分用户类型liftchart',xlabel='模型分',ylabel='逾期率').savefig('tmp.png')
+    document.add_paragraph('测试集分用户类型--liftchart')
+    document.add_picture('tmp.png')
+    #== 分渠道分月查看--取前5个渠道查看
+    channels=dftest.applied_channel.value_counts()[:5].index
+    drawplot.liftchart(dftest[dftest.applied_channel.isin(channels)], 'predict_proba', label, bin=10, classes='applied_channel', title='分渠道liftchart',
+                       xlabel='模型分', ylabel='逾期率').savefig('tmp.png')
+    document.add_paragraph('测试集分渠道--liftchart')
+    document.add_picture('tmp.png')
+    #== 各个特征的 单变量图 和 pdp 图
+    for i in featureimp.feature.tolist():
+        drawplot.univarchart(dftest, i, label, bin=10, title='单变量%s' % i,
+                            ylabel='逾期率').savefig('tmp.png')
+        document.add_paragraph('单变量%s' % i)
+        document.add_picture('tmp.png')
+        #= pdp
+        drawplot.pdpchart(dftest, i, 'predict_proba', bin=10, title='pdp %s' % i,
+                             ylabel='模型分').savefig('tmp.png')
+        document.add_paragraph('pdp %s' % i)
+        document.add_picture('tmp.png')
+    filetool.saveDocument(document, path, filename)
@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
    grid_search = xgboost.automodelfit(clf, params,dftrain, features, label)
    clf = grid_search.best_estimator_
    document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
-    clf = xgboost.modelfit(clf, dftrain, features, label)
+    #==
+    # clf = xgboost.modelfit(clf, dftrain, features, label)
+    document.add_paragraph('寻找最优参数过程{}'.format(grid_search.cv_results_))
    document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_,grid_search.best_score_))
-    document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf, dftrain, features, label)))
+    document.add_paragraph('模型训练集{}'.format(xgboost.auc(grid_search, dftrain, features, label)))
-    document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label)))
+    document.add_paragraph('模型测试集{}'.format(xgboost.auc(grid_search, dftest, features, label)))
    return document,clf