decision-science / model_mvp · Commits

Commit 981da436
authored Apr 09, 2019 by linfang.wang
parent 9f42261f

xgboost: introduce hyperparameter tuning

Showing 4 changed files with 175 additions and 0 deletions (+175, -0)
model/lightgbm/__init__.py        +0    -0
model/randomforest/__init__.py    +0    -0
model/xgboost.py                  +175  -0
model/xgboost/__init__.py         +0    -0
model/lightgbm/__init__.py    deleted    100644 → 0

model/randomforest/__init__.py    deleted    100644 → 0
model/xgboost.py    new file    0 → 100644
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error

def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
    '''
    Split df into a training set and a validation set.
    :param df: dataframe
    :param trainsplit: split strategy; supports 'timeSeries' and 'random', default 'random'
    :param trainsplitRatio: for a random split, the fraction of rows used for training (0.8 = 80% train)
    :param sort_col: for a time-based split, the column to sort by
    :return: (train, val)
    '''
    dftrain = df.reset_index()
    # == split dftrain into a training set and a validation set
    if trainsplit == 'random':
        # random train / val split
        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
        val = dftrain[~dftrain.index.isin(train.index)]
    elif trainsplit == 'timeSeries':
        # chronological train / val split
        train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
        val = dftrain[~dftrain.index.isin(train.index)]
    else:
        train = df
        val = None
    return train, val

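A quick usage sketch (illustrative only; the dataframe and the event_date column are placeholders, not part of this commit):

# Illustrative: chronological 80/20 split by a hypothetical date column.
train, val = split_train_val(df, trainsplit='timeSeries',
                             trainsplitRatio=0.8, sort_col='event_date')
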
# log-loss objective (custom XGBoost objective: gradient and hessian)
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid of the raw margin
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess

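For reference, the grad and hess above are the first and second derivatives of the binary log loss with respect to the raw margin. With label $y \in \{0,1\}$, margin $z$, and $p = \sigma(z) = 1/(1+e^{-z})$:

$$L(y,z) = -\bigl[\,y\log p + (1-y)\log(1-p)\,\bigr], \qquad \frac{\partial L}{\partial z} = p - y, \qquad \frac{\partial^{2} L}{\partial z^{2}} = p\,(1-p),$$

which is exactly the pair that logregobj returns.
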
def buildClf(max_depth=2, learning_rate=0.1, n_estimators=5000, gamma=0,
             min_child_weight=1, max_delta_step=0, subsample=0.8,
             colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, base_score=0.5):
    '''
    Create an XGBClassifier instance.
    :param max_depth: maximum tree depth; larger values overfit more easily, tune with CV -- booster parameter
    :param learning_rate: learning rate, alias eta -- booster parameter
    :param n_estimators: number of trees
    :param verbosity: 0: silent, 3: debug; replaces `silent`, controls per-iteration logging -- general parameter
    :param objective: learning objective -- learning-task parameter
        binary:logistic  logistic regression for binary classification, returns predicted probabilities (not classes).
        multi:softmax    multi-class classification via softmax, returns predicted classes (not probabilities); requires num_class (number of classes).
        multi:softprob   same as multi:softmax, but returns each row's probability for every class.
    :param booster: gbtree, gblinear or dart -- general parameter
    :param n_jobs: replaces nthread; number of parallel workers -- general parameter
    :param gamma: a node is split only if it reduces the loss; controls the minimum loss reduction required -- booster parameter
    :param min_child_weight: minimum sum of instance weights in a leaf; tune with CV, larger values guard against overfitting but may underfit -- booster parameter
    :param max_delta_step: maximum step size for each tree's weight update; 0: unconstrained, > 0: more conservative -- booster parameter
    :param subsample: fraction of rows randomly sampled for each tree
    :param colsample_bytree: fraction of columns randomly sampled for each tree (each column is a feature)
    :param reg_alpha: L1 regularization weight
    :param reg_lambda: L2 regularization weight
    :param scale_pos_weight: typically (number of negative samples) / (number of positive samples)
    :param base_score:
    :param random_state: replaces seed
    :param missing: value treated as missing
    :return: XGBClassifier
    '''
    return xgb.XGBClassifier(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             verbosity=2,
                             objective='binary:logistic',
                             booster='gbtree',
                             n_jobs=2,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=7,
                             missing=-9999999)

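The docstring above suggests setting scale_pos_weight to the negative/positive ratio. A minimal sketch of computing it from the labels; dftrain and resp here are placeholders for the caller's dataframe and label column:

# Sketch only: derive scale_pos_weight from class counts (dftrain / resp are placeholders).
n_pos = (dftrain[resp] == 1).sum()
n_neg = (dftrain[resp] == 0).sum()
clf = buildClf(scale_pos_weight=n_neg / n_pos)
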
def automodelfit(dftrain, features, resp, kfold=10, trainsplit='timeSeries',
                 trainsplitRatio=0.8, sort_col=None):
    '''
    Automatic hyperparameter tuning, scored by roc_auc.
    :param dftrain:
    :param features:
    :param resp:
    :param kfold:
    :param trainsplit:
    :param trainsplitRatio:
    :param sort_col:
    :return:
    '''
    # NOTE: trainsplit / trainsplitRatio / sort_col are accepted but unused here;
    # the grid search cross-validates on all of dftrain.
    clf = buildClf()
    learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
    gamma = [i / 10 for i in range(0, 5)]
    max_depth = [2, 3]
    min_child_weight = [1, 2, 3, 4, 5, 6]
    subsample = [i / 10 for i in range(6, 10)]
    colsample_bytree = [i / 10 for i in range(6, 10)]
    reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
    reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
    param_grid = dict(learning_rate=learning_rate,
                      gamma=gamma,
                      max_depth=max_depth,
                      min_child_weight=min_child_weight,
                      subsample=subsample,
                      colsample_bytree=colsample_bytree,
                      reg_alpha=reg_alpha,
                      reg_lambda=reg_lambda)
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', n_jobs=-1,
                               cv=skf, verbose=2, iid=True, refit=True)
    grid_search.fit(dftrain[features].values, dftrain[resp].values)
    # == return the best estimator found by the search
    return grid_search.best_estimator_

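The full grid above has 6 x 5 x 2 x 6 x 4 x 4 x 6 x 6 = 207,360 parameter combinations, i.e. more than two million fits at kfold=10. A cheaper alternative, sketched here over the same names (not part of this commit), is to sample the grid with sklearn's RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV

# Sketch only: evaluate 200 random draws from param_grid instead of the full grid.
random_search = RandomizedSearchCV(clf, param_distributions=param_grid,
                                   n_iter=200, scoring='roc_auc', n_jobs=-1,
                                   cv=skf, verbose=2, random_state=7, refit=True)
random_search.fit(dftrain[features].values, dftrain[resp].values)
best_clf = random_search.best_estimator_
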
def modelfit(clf, df, features, resp, useTrainCV=True, cv_folds=10,
             early_stopping_rounds=20, eval_metric='auc', trainsplit='random',
             trainsplitRatio=0.8, sort_col=None):
    '''
    Train the model.
    :param clf: XGBClassifier
    :param df:
    :param features: feature columns
    :param resp: label column
    :param useTrainCV: if True, run xgb.cv first to tune n_estimators
    :param cv_folds: N-fold cross-validation
    :param early_stopping_rounds: stop training once the eval loss has shown no meaningful improvement for this many consecutive rounds
    :param eval_metric: tied to the objective; for valid values see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
    dftrain, dfval = split_train_val(df, trainsplit=trainsplit,
                                     trainsplitRatio=trainsplitRatio,
                                     sort_col=sort_col)
    if useTrainCV:
        xgb_param = clf.get_xgb_params()
        xgtrain = xgb.DMatrix(dftrain[features].values,
                              label=dftrain[resp].values,
                              missing=-9999999)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=clf.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['auc'],
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=100)
        clf.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data and save the model
    if dfval is not None:
        clf.fit(dftrain[features].values, dftrain[resp].values,
                eval_set=[(dfval[features].values, dfval[resp].values)],
                eval_metric=eval_metric,
                early_stopping_rounds=early_stopping_rounds)
    else:
        # no validation set, so early stopping cannot be used
        clf.fit(dftrain[features].values, dftrain[resp].values)
    return clf

def predict(clf, df, features):
    '''
    Compute model predictions.
    :param clf:
    :param df:
    :param features:
    :return:
    '''
    df['predict'] = clf.predict(df[features].values)
    # column 1 of predict_proba is the positive-class probability
    df['predict_proba'] = clf.predict_proba(df[features].values)[:, 1]
    return df

def featureImportance(clf, features):
    '''
    Get the model's feature importances.
    :param clf:
    :param features:
    :return:
    '''
    # Print Feature Importance:
    # get_fscore() keys are positional names ('f0', 'f1', ...) because the model
    # is fit on .values, so map the scores back onto the real feature names
    fscore = clf.get_booster().get_fscore()
    feat_imp = pd.Series({features[int(k[1:])]: v for k, v in fscore.items()})
    feat_imp = feat_imp.sort_values(ascending=False, na_position='last')
    feat_imp = feat_imp[feat_imp > 0]
    return feat_imp
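An end-to-end usage sketch of this module; df, the feature names, and the 'label' column below are hypothetical placeholders, not part of the commit:

# Hypothetical usage of model/xgboost.py (df, f1..f3 and 'label' are placeholders).
features = ['f1', 'f2', 'f3']
clf = buildClf(max_depth=3, learning_rate=0.05)
clf = modelfit(clf, df, features, 'label', useTrainCV=True, cv_folds=5)
scored = predict(clf, df, features)
print(featureImportance(clf, features))
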
model/xgboost/__init__.py    deleted    100644 → 0