Commit 75c387db authored by linfang.wang's avatar linfang.wang

报告xgboost

parent 76a74874
......@@ -3,7 +3,30 @@ import numpy as np
import datetime
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    """Split *df* into a training set and a validation set.

    :param df: input dataframe
    :param trainsplit: split strategy, ``'random'`` or ``'timeSeries'``
        (default ``'random'``)
    :param trainsplitRatio: fraction of rows assigned to the training set
        (default 0.8)
    :param sort_col: column to sort by when ``trainsplit == 'timeSeries'``
    :return: ``(train, val)``; for an unrecognized strategy the original
        dataframe is returned unchanged with ``val`` set to ``None``
    """
    work = df.reset_index()
    if trainsplit == 'random':
        # fixed seed keeps the random split reproducible across runs
        train = work.sample(frac=trainsplitRatio, random_state=7)
    elif trainsplit == 'timeSeries':
        # earliest rows (after sorting by sort_col) become the training set
        cutoff = int(len(work) * trainsplitRatio)
        train = work.sort_values(by=sort_col).head(cutoff)
    else:
        # unknown strategy: hand back the untouched frame, no validation set
        return df, None
    # validation set = everything not picked for training
    val = work[~work.index.isin(train.index)]
    return train, val
def cal_week(df,date_name,date_name_new):
'''
......@@ -36,6 +59,18 @@ def cal_month(df,date_name,date_name_new):
return df
# def cal_isometric(df,feature,bin=10,method=2):
# '''
# 等分计算,默认等频;等宽 1 ,等频 2 ,聚类 3
# :param df:
# :param feature:
# :param bin:
# :param method: 1:等宽;2:等频;3:聚类;默认2
# :return:
# '''
# if method==1:
#
def cal_feature_grid(df,feature,bin=10):
'''
定义 N分位切割区间,负数单独一个区间,非负数N 切割
......
......@@ -3,12 +3,13 @@ from docx import Document
from docx.shared import Inches
def buildDocument(path, filename):
    """Open an existing Word document, or create a new empty one.

    :param path: directory that may contain the document
    :param filename: document file name; must end with ``doc`` or ``docx``
    :return: a python-docx ``Document`` — loaded from disk when the file
        exists under *path*, otherwise a fresh empty document
    :raises ValueError: if *filename* does not end with ``doc``/``docx``
    """
    # endswith with a suffix tuple replaces the former nested double-negative
    # slice checks (and the earlier str.rfind misuse this diff removed).
    # NOTE: like the original, only the bare suffix is checked, not '.doc'.
    if not filename.endswith(('doc', 'docx')):
        raise ValueError('{} is not a word file'.format(filename))
    fullpath = os.path.join(path, filename)
    if os.path.exists(fullpath):
        return Document(fullpath)
    return Document()
def saveDocument(document,path,filename):
if str.rfind(filename,0,3)!='doc':
if str.rfind(filename,0,4) !='docx':
......
......@@ -27,7 +27,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
'''
def draw_barplot(df,x,y,hue='',title=''):
def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
'''
:param df: dataframe
:param x: 横坐标
......
......@@ -3,34 +3,11 @@ import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    """Partition *df* into train/validation subsets.

    :param df: source dataframe
    :param trainsplit: ``'random'`` (seeded sample) or ``'timeSeries'``
        (chronological head after sorting); anything else is a no-op split
    :param trainsplitRatio: share of rows kept for training, default 0.8
    :param sort_col: sort key used by the ``'timeSeries'`` strategy
    :return: tuple ``(train, val)``; ``(df, None)`` for unknown strategies
    """
    frame = df.reset_index()
    known = ('random', 'timeSeries')
    if trainsplit not in known:
        # fall-through: caller gets the original frame and no validation set
        return df, None
    if trainsplit == 'random':
        # deterministic sample thanks to the fixed random_state
        train = frame.sample(frac=trainsplitRatio, random_state=7)
    else:
        # chronological split: take the leading slice after sorting
        head_len = int(len(frame) * trainsplitRatio)
        train = frame.sort_values(by=sort_col).head(head_len)
    remainder = ~frame.index.isin(train.index)
    return train, frame[remainder]
def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
......@@ -61,27 +38,18 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:return:XGBClassifier
'''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=True,objective='binary:logistic',
verbosity=1,silent=True,objective='binary:logistic',
booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,seed=7
)
def buildParamGrid(learning_rate=None, gamma=None, max_depth=None,
                   min_child_weight=None, subsample=None, colsample_bytree=None,
                   reg_alpha=None, reg_lambda=None):
    """Build a hyper-parameter grid (for GridSearchCV over an XGBClassifier).

    Each argument is a list of candidate values; ``None`` selects the
    built-in default range shown below.

    :return: dict mapping parameter name -> list of candidate values
    """
    # Defaults are materialized per call: the previous version used mutable
    # list defaults, so one shared list object backed every invocation.
    if learning_rate is None:
        learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
    if gamma is None:
        gamma = [i / 10 for i in range(0, 5)]
    if max_depth is None:
        max_depth = [2, 3]
    if min_child_weight is None:
        min_child_weight = [1, 2, 3, 4, 5, 6]
    if subsample is None:
        subsample = [i / 10 for i in range(6, 10)]
    if colsample_bytree is None:
        colsample_bytree = [i / 10 for i in range(6, 10)]
    if reg_alpha is None:
        reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
    if reg_lambda is None:
        reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
    return dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
                min_child_weight=min_child_weight, subsample=subsample,
                colsample_bytree=colsample_bytree, reg_alpha=reg_alpha,
                reg_lambda=reg_lambda)
def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc'):
'''
模型自动调参
:param clf : XGBClassifier
:param param_grid : dict,调参的区间设定,buildParamGrid
:param param_grid : dict,调参的区间设定
:param scoring : 调参 评估标准 默认 roc_auc
:param dftrain:
:param features:
......@@ -90,7 +58,7 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
:return:
'''
kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kflod,verbose=3,iid=True,refit=True)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kflod,verbose=0,iid=True,refit=True)
#== 模型训练
grid_search.fit(dftrain[features].values,dftrain[resp].values)
#== 获取最优参数
......@@ -135,6 +103,14 @@ def predict(clf,df,features):
df['predict_proba']=clf.predict_proba(df[features])[:,1]
return df
def auc(clf, df, features, label):
    """Score *clf* on *df* and return accuracy and ROC-AUC.

    :param clf: fitted classifier
    :param df: dataframe holding *features* and the true *label* column
    :param features: feature column names fed to the model
    :param label: name of the ground-truth column
    :return: ``{'accuracy': ..., 'auc': ...}``
    """
    # predict() attaches 'predict' (hard label) and 'predict_proba' columns
    scored = predict(clf, df, features)
    accuracy = metrics.accuracy_score(scored[label], scored['predict'])
    roc_auc = metrics.roc_auc_score(scored[label], scored['predict_proba'])
    return {'accuracy': accuracy, 'auc': roc_auc}
def featureImportance(clf,features):
'''
获取模型 特征权重
......
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import xgboost
from matplotlib import pyplot as plt
from data.graph import drawplot
def report(dftrain,dftest,features,label,path,filename):
    """Train and tune an xgboost classifier, logging every step into a Word
    document that is saved under ``path``/``filename``.

    :param dftrain: training dataframe
    :param dftest: test dataframe (scored after each tuning round)
    :param features: feature column names
    :param label: ground-truth column name
    :param path: output directory for the Word report
    :param filename: report file name (doc/docx)
    """
    document=filetool.buildDocument(path,filename)
    document.add_heading('xgboost 算法运行报告')
    # baseline model with default construction parameters
    clf=xgboost.buildClf()
    document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params()))
    clf=xgboost.modelfit(clf,dftrain,features,label)
    # log train/test metrics for the untuned baseline
    document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf,dftrain,features,label)))
    document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label)))
    document.add_heading('调整参数')
    # tuning round 1: tree shape (max_depth / min_child_weight)
    max_depth=[2,3]
    min_child_weight=range(1,4,1)
    document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label)
    # tuning round 2: gamma (minimum split loss)
    gamma=[i/10 for i in range(0,5)]
    document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
    # tuning round 3: row/column subsampling
    subsample=[0.8,0.9,1]
    colsample_bytree=[0.8,0.9,1]
    document, clf = tun_params(document, clf, dftrain, dftest,
                               {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
    # tuning round 4: L1 regularization
    reg_alpha=[0.001,0.01,0.1,1,10]
    document, clf = tun_params(document, clf, dftrain, dftest,
                               {'reg_alpha': reg_alpha}, features, label)
    # tuning round 5: L2 regularization
    reg_lambda = [0.001, 0.01, 0.1, 1, 10]
    document, clf = tun_params(document, clf, dftrain, dftest,
                               {'reg_lambda': reg_lambda}, features, label)
    # Final section: score both sets with the tuned model and plot feature
    # importances (the original comment also planned PDP and lift charts).
    dftrain=xgboost.predict(clf,dftrain,features)
    dftest=xgboost.predict(clf,dftest,features)
    # NOTE(review): Series.to_frame expects a scalar `name`; a list is passed
    # here — confirm what xgboost.featureImportance actually returns.
    featureimp=xgboost.featureImportance(clf,features).to_frame(name=['weight','feature'])
    fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance')
    # chart goes through a temp PNG because docx embeds images from files
    fig.savefig('tmp.png')
    document.add_paragraph('特征权重图,近前10个特征')
    document.add_picture('tmp.png')
    filetool.saveDocument(document,path,filename)
def tun_params(document,clf,dftrain,dftest,params,features,label):
    """Grid-search *params* for *clf*, append the results to *document*.

    :param document: python-docx Document collecting the report
    :param clf: current classifier (replaced by the grid search's best)
    :param dftrain: training dataframe
    :param dftest: test dataframe
    :param params: dict of parameter name -> candidate values
    :param features: feature column names
    :param label: ground-truth column name
    :return: ``(document, clf)`` with clf refit on the best parameters

    NOTE(review): loop extent below was reconstructed from a
    whitespace-stripped diff — confirm against the original file.
    """
    # one log line per tuned parameter
    for i in dict(params).keys():
        document.add_paragraph('调参{},取值{}'.format(i,params[i]))
    # search the whole grid at once, then keep and refit the best estimator
    grid_search = xgboost.automodelfit(clf, params,dftrain, features, label)
    clf = grid_search.best_estimator_
    document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
    clf = xgboost.modelfit(clf, dftrain, features, label)
    document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_,grid_search.best_score_))
    # train/test metrics after this tuning round
    document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf, dftrain, features, label)))
    document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label)))
    return document,clf
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment