Commit 981da436 authored by linfang.wang

xgboost: introduce hyperparameter tuning

parent 9f42261f
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold
def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
    '''
    Split df into a training set and a validation set.
    :param df: dataframe
    :param trainsplit: split strategy, 'timeSeries' or 'random' (default 'random')
    :param trainsplitRatio: for a random split, the fraction of rows used for training (default 0.8)
    :param sort_col: for a time-based split, the column to sort by
    :return: (train, val); val is None when trainsplit is unrecognized
    '''
    dftrain = df.reset_index()
    #== split dftrain into training and validation sets
    if trainsplit == 'random':
        # random train/val split
        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
        val = dftrain[~dftrain.index.isin(train.index)]
    elif trainsplit == 'timeSeries':
        # chronological train/val split: the oldest rows become the training set
        train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
        val = dftrain[~dftrain.index.isin(train.index)]
    else:
        train = df
        val = None
    return train, val
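# Usage sketch (not part of the original commit), assuming a toy frame with a
# 'dt' timestamp column: with trainsplit='timeSeries' the oldest 80% of rows
# become the training set and the newest 20% the validation set.
#
#   df = pd.DataFrame({'dt': pd.date_range('2019-01-01', periods=10), 'y': range(10)})
#   train, val = split_train_val(df, trainsplit='timeSeries', sort_col='dt')
#   assert len(train) == 8 and len(val) == 2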
# Custom log-loss objective: returns the gradient and hessian of the binary
# logistic loss with respect to the raw margin predictions.
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid: margin -> probability
    grad = preds - labels                 # first-order gradient
    hess = preds * (1.0 - preds)          # second-order gradient (hessian)
    return grad, hess
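def train_with_custom_objective(dtrain, num_boost_round=100):
    '''
    Minimal sketch (not part of the original commit) of plugging logregobj into
    the low-level xgb.train API; dtrain is assumed to be an xgb.DMatrix built
    elsewhere. With a custom objective the booster outputs raw margin scores,
    so apply the sigmoid (as logregobj does internally) before thresholding.
    '''
    params = {'max_depth': 2, 'eta': 0.1, 'seed': 7}
    return xgb.train(params, dtrain, num_boost_round=num_boost_round, obj=logregobj)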
def buildClf(max_depth=2, learning_rate=0.1, n_estimators=5000, gamma=0,
             min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5):
    '''
    Create an XGBClassifier instance.
    :param max_depth: maximum tree depth; larger values overfit more easily. Tune with CV -- booster parameter
    :param learning_rate: learning rate, alias eta -- booster parameter
    :param n_estimators: number of trees
    :param gamma: minimum loss reduction required to split a node -- booster parameter
    :param min_child_weight: minimum sum of instance weights in a leaf. Guards against overfitting;
        tune with CV. Larger values prevent overfitting but may cause underfitting -- booster parameter
    :param max_delta_step: maximum step size for each tree's weight update. 0: no constraint; >0: more conservative -- booster parameter
    :param subsample: fraction of rows randomly sampled for each tree
    :param colsample_bytree: fraction of columns (each column is a feature) randomly sampled for each tree
    :param reg_alpha: L1 regularization weight
    :param reg_lambda: L2 regularization weight
    :param scale_pos_weight: typically (number of negative samples) / (number of positive samples)
    :param base_score: initial prediction score for all instances
    :return: XGBClassifier

    The following are fixed inside this function rather than exposed as parameters:
    - verbosity (replaces silent): 0 = silent, 3 = debug; controls iteration logging -- general parameter
    - objective: learning objective -- learning task parameter
        binary:logistic  logistic regression for binary classification; returns predicted probabilities (not classes)
        multi:softmax    multiclass classification via softmax; returns predicted classes (not probabilities); requires num_class
        multi:softprob   same as multi:softmax, but returns per-class probabilities for each row
    - booster: gbtree, gblinear, or dart -- general parameter
    - n_jobs (replaces nthread): number of parallel threads -- general parameter
    - random_state (replaces seed)
    - missing: value treated as missing
    '''
    return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             verbosity=2, objective='binary:logistic',
                             booster='gbtree', n_jobs=2, gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=7, missing=-9999999)
def automodelfit(dftrain, features, resp, kfold=10, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col=None):
    '''
    Automatic hyperparameter tuning, with roc_auc as the scoring metric.
    :param dftrain: training dataframe
    :param features: feature column names
    :param resp: label column name
    :param kfold: number of cross-validation folds
    :param trainsplit: (unused here) split strategy
    :param trainsplitRatio: (unused here) training fraction
    :param sort_col: (unused here) time-sort column
    :return: best estimator found by the grid search
    '''
    clf = buildClf()
    learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
    gamma = [i / 10 for i in range(0, 5)]
    max_depth = [2, 3]
    min_child_weight = [1, 2, 3, 4, 5, 6]
    subsample = [i / 10 for i in range(6, 10)]
    colsample_bytree = [i / 10 for i in range(6, 10)]
    reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
    reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
    param_grid = dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
                      min_child_weight=min_child_weight, subsample=subsample,
                      colsample_bytree=colsample_bytree, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    # Note: the original iid=True argument was dropped; it is deprecated and
    # has been removed in recent scikit-learn releases.
    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', n_jobs=-1, cv=skf, verbose=2, refit=True)
    grid_search.fit(dftrain[features].values, dftrain[resp].values)
    #== return the estimator refit with the best parameters
    return grid_search.best_estimator_
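def automodelfit_random(dftrain, features, resp, kfold=10, n_iter=200):
    '''
    Alternative sketch (not part of the original commit): the grid above has
    over 200,000 parameter combinations, so an exhaustive GridSearchCV is
    rarely practical. RandomizedSearchCV samples n_iter combinations from the
    same space and usually lands near the grid optimum at a fraction of the cost.
    '''
    from sklearn.model_selection import RandomizedSearchCV
    clf = buildClf()
    param_distributions = dict(
        learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
        gamma=[i / 10 for i in range(0, 5)],
        max_depth=[2, 3],
        min_child_weight=[1, 2, 3, 4, 5, 6],
        subsample=[i / 10 for i in range(6, 10)],
        colsample_bytree=[i / 10 for i in range(6, 10)],
        reg_alpha=[0.001, 0.01, 0.05, 0.1, 1, 10],
        reg_lambda=[0.001, 0.01, 0.05, 0.1, 1, 10])
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    search = RandomizedSearchCV(clf, param_distributions, n_iter=n_iter, scoring='roc_auc',
                                n_jobs=-1, cv=skf, verbose=2, random_state=7, refit=True)
    search.fit(dftrain[features].values, dftrain[resp].values)
    return search.best_estimator_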
def modelfit(clf, df, features, resp, useTrainCV=True, cv_folds=10, early_stopping_rounds=20,
             eval_metric='auc', trainsplit='random', trainsplitRatio=0.8, sort_col=None):
    '''
    Train the model.
    :param clf: XGBClassifier
    :param df: dataframe with features and label
    :param features: feature column names
    :param resp: label column name
    :param useTrainCV: if True, call the cv function first to tune n_estimators
    :param cv_folds: number of cross-validation folds
    :param early_stopping_rounds: stop training once the eval loss has not improved for this many consecutive rounds
    :param eval_metric: must match the learning objective; for valid values see
        https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :param trainsplit: split strategy passed to split_train_val
    :param trainsplitRatio: training fraction passed to split_train_val
    :param sort_col: time-sort column passed to split_train_val
    :return: fitted classifier
    '''
    dftrain, dfval = split_train_val(df, trainsplit, trainsplitRatio, sort_col)
    if useTrainCV:
        xgb_param = clf.get_xgb_params()
        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values, missing=-9999999)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=cv_folds,
                          metrics=['auc'], early_stopping_rounds=early_stopping_rounds, verbose_eval=100)
        clf.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data and save the model; early stopping needs a
    # validation set, so only enable it when dfval exists.
    if dfval is not None:
        clf.fit(dftrain[features].values, dftrain[resp].values,
                eval_set=[(dfval[features].values, dfval[resp].values)],
                eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds)
    else:
        clf.fit(dftrain[features].values, dftrain[resp].values, eval_metric=eval_metric)
    return clf
def predict(clf, df, features):
    '''
    Add model predictions to the dataframe.
    :param clf: fitted classifier
    :param df: dataframe to score
    :param features: feature column names
    :return: df with 'predict' (class) and 'predict_proba' (positive-class probability) columns
    '''
    df['predict'] = clf.predict(df[features].values)
    # predict_proba returns one column per class; keep the positive class
    df['predict_proba'] = clf.predict_proba(df[features].values)[:, 1]
    return df
def featureImportance(clf, features):
    '''
    Get the model's feature importances.
    :param clf: fitted classifier
    :param features: feature column names, in training order
    :return: Series of positive importances, sorted descending
    '''
    # The booster was trained on a numpy array, so get_fscore keys come back
    # as 'f0', 'f1', ...; map them back to the original column names.
    fscore = clf.get_booster().get_fscore()
    fscore = {features[int(k[1:])]: v for k, v in fscore.items()}
    feat_imp = pd.Series(fscore).sort_values(ascending=False)
    feat_imp = feat_imp[feat_imp > 0]
    return feat_imp
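if __name__ == '__main__':
    # End-to-end smoke test (not part of the original commit) on synthetic
    # data; the feature/label column names below are made up for the demo.
    rng = np.random.RandomState(7)
    n = 1000
    demo = pd.DataFrame({'x1': rng.randn(n), 'x2': rng.randn(n)})
    demo['label'] = (demo['x1'] + 0.5 * rng.randn(n) > 0).astype(int)
    features, resp = ['x1', 'x2'], 'label'
    clf = buildClf(n_estimators=200)
    clf = modelfit(clf, demo, features, resp, useTrainCV=True, cv_folds=5)
    scored = predict(clf, demo.copy(), features)
    print(scored[['predict', 'predict_proba']].head())
    print(featureImportance(clf, features))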