decision-science / model_mvp / Commits / Commit 313190e7

Authored Apr 11, 2019 by linfang.wang
Parent: 981da436

    Add MySQL connection

Showing 2 changed files with 36 additions and 49 deletions:

    data/graph/drawplot.py    +2   -2
    model/xgboost.py          +34  -47
data/graph/drawplot.py  (view file @ 313190e7)

@@ -48,7 +48,7 @@ def draw_barplot(df,x,y,hue='',title=''):
         sns.barplot(x, y, hue=hue, data=df, ax=ax)
     else:
         sns.barplot(x, y, data=df, ax=ax)
-    pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
+    # pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
     pltz.xlabel(x)
     pltz.ylabel(y)
     pltz.title(title)

@@ -78,7 +78,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
         plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o', label=type)
     else:
         plt.plot(df[x], df[y], linestyle='dashed', marker='o')
-    pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
+    # pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
     pltz.xlabel(x)
     pltz.ylabel(y)
     pltz.title(title)
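For context, a minimal usage sketch of the two plotting helpers touched above. Everything below is illustrative rather than part of the commit: the DataFrame, its columns, and the import path are assumptions, and the snippet presumes drawplot.py already imports seaborn (sns), matplotlib.pyplot (plt), and the pltz wrapper it references.

# Illustrative only -- column names and import path are assumed, not taken from the repository.
import pandas as pd
from data.graph import drawplot   # assumed import path mirroring the file location above

df = pd.DataFrame({
    'channel': ['A', 'A', 'B', 'B', 'C', 'C'],
    'segment': ['new', 'old', 'new', 'old', 'new', 'old'],
    'approval_rate': [0.42, 0.45, 0.38, 0.40, 0.51, 0.48],
})

# Grouped bars, split by the optional hue column.
drawplot.draw_barplot(df, 'channel', 'approval_rate', hue='segment', title='Approval rate by channel')

# Dashed line with circle markers over the same columns.
drawplot.draw_lineplot(df, 'channel', 'approval_rate', title='Approval rate by channel')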
model/xgboost.py  (view file @ 313190e7)

@@ -65,63 +65,52 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
     :param reg_lambda: L2 regularization parameter
     :param scale_pos_weight: typically the number of negative samples divided by the number of positive samples
     :param base_score:
-    :param random_state: replaces seed
-    :param missing: missing-value placeholder
+    :param random_state: replaces seed; fixed at 7 purely for reproducibility
     :return: XGBClassifier
     '''
     return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
-                             verbosity=2, objective='binary:logistic', booster='gbtree', n_jobs=2, gamma=gamma, min_child_weight=min_child_weight,
+                             verbosity=True, objective='binary:logistic', booster='gbtree', n_jobs=2, nthread=2, gamma=gamma, min_child_weight=min_child_weight,
                              max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
-                             base_score=base_score, random_state=7, missing=-9999999)
+                             base_score=base_score, random_state=7, seed=7)


-def automodelfit(dftrain, features, resp, kfold=10, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col=None):
+def buildParamGrid(learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
+                   gamma=[i / 10 for i in range(0, 5)],
+                   max_depth=[2, 3],
+                   min_child_weight=[1, 2, 3, 4, 5, 6],
+                   subsample=[i / 10 for i in range(6, 10)],
+                   colsample_bytree=[i / 10 for i in range(6, 10)],
+                   reg_alpha=[0.001, 0.01, 0.05, 0.1, 1, 10],
+                   reg_lambda=[0.001, 0.01, 0.05, 0.1, 1, 10]):
+    param_grid = dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
+                      min_child_weight=min_child_weight, subsample=subsample,
+                      colsample_bytree=colsample_bytree, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
+    return param_grid
+
+
+def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
     '''
-    Automatic hyper-parameter tuning; the scoring criterion is fixed to roc_auc
-    :param df:
+    Automatic hyper-parameter tuning
+    :param clf: XGBClassifier
+    :param param_grid: dict of search ranges, see buildParamGrid
+    :param scoring: tuning metric, defaults to roc_auc
     :param dftrain:
     :param features:
     :param resp:
     :param kfold:
-    :param trainsplit:
-    :param trainsplitRatio:
-    :param sort_col:
     :return:
     '''
-    clf = buildClf()
-    learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
-    gamma = [i / 10 for i in range(0, 5)]
-    max_depth = [2, 3]
-    min_child_weight = [1, 2, 3, 4, 5, 6]
-    subsample = [i / 10 for i in range(6, 10)]
-    colsample_bytree = [i / 10 for i in range(6, 10)]
-    reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
-    reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
-    param_grid = dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
-                      min_child_weight=min_child_weight, subsample=subsample,
-                      colsample_bytree=colsample_bytree, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
     kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
-    grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', n_jobs=-1, cv=kflod, verbose=2, iid=True, refit=True)
+    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kflod, verbose=2, iid=True, refit=True)
     # == train the model
     grid_search.fit(dftrain[features].values, dftrain[resp].values)
     # == retrieve the best parameters
-    return grid_search.best_estimator_
+    return grid_search


-def modelfit(clf, df, features, resp, useTrainCV=True, cv_folds=10, early_stopping_rounds=20,
-             eval_metric='auc', trainsplit='random', trainsplitRatio=0.8, sort_col=None):
+def modelfit(clf, dftrain, features, resp, dfval=None, useTrainCV=True, cv_folds=10,
+             eval_metric='auc', early_stopping_rounds=20):
     '''
     Model training
     :param clf: XGBClassifier
-    :param df:
+    :param dftrain: training set
+    :param dfval: validation set used during training, for early_stopping_rounds
     :param features: feature columns
    :param resp: label
     :param useTrainCV: if True, call the cv routine to tune n_estimators

@@ -130,26 +119,24 @@ def modelfit(clf, df, features, resp, useTrainCV = True, cv_folds=10, early_stop
     :param eval_metric: tied to the objective function; for valid values see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
     :return:
     '''
-    dftrain, dfval = split_train_val(df, features, resp, trainsplit, trainsplitRatio, sort_col)
     if useTrainCV:
         xgb_param = clf.get_xgb_params()
-        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values, missing=-9999999)
+        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=cv_folds,
-                          metrics=(['auc']), early_stopping_rounds=early_stopping_rounds, verbose_eval=100)
+                          metrics=eval_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
         clf.set_params(n_estimators=cvresult.shape[0])
     # Fit the algorithm on the data and save the model
-    if not dfval:
-        clf.fit(dftrain[features].values, dftrain[resp].values,
-                eval_set=(dfval[features].values, dfval[resp].values),
+    if not dfval == None:
+        # == if a validation set is given, cv is unnecessary; early_stopping_rounds is driven by the validation auc
+        clf.fit(dftrain[features], dftrain[resp],
+                eval_set=[(dftrain[features], dftrain[resp]), (dfval[features], dfval[resp])],
                 eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds)
     else:
-        clf.fit(dftrain[features].values, dftrain[resp].values, eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds)
+        clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
     return clf


 def predict(clf,df,features):
     '''
     Compute prediction values
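Taken together, the reworked entry points above (buildClf, buildParamGrid, automodelfit, modelfit) suggest a flow like the sketch below. It is a sketch only, assuming an importable model.xgboost module, placeholder CSV files, and illustrative feature/label column names; none of these come from the commit itself.

# Illustrative end-to-end flow -- import path, file names and columns are assumptions.
import pandas as pd
from model import xgboost as xgb_model   # assumed import path for model/xgboost.py

dftrain = pd.read_csv('train.csv')        # placeholder training split
dfval = pd.read_csv('val.csv')            # placeholder validation split
features = ['f1', 'f2', 'f3']             # illustrative feature columns
resp = 'label'                            # illustrative binary target

# 1. Base classifier and search space.
clf = xgb_model.buildClf(max_depth=3, learning_rate=0.05, n_estimators=500)
param_grid = xgb_model.buildParamGrid(max_depth=[2, 3], min_child_weight=[1, 3, 5])

# 2. Grid search; automodelfit now returns the fitted GridSearchCV object.
grid_search = xgb_model.automodelfit(clf, param_grid, dftrain, features, resp, kfold=5, scoring='roc_auc')
best_clf = grid_search.best_estimator_

# 3. Refit with early stopping on the validation set (the new dfval branch of modelfit).
best_clf = xgb_model.modelfit(best_clf, dftrain, features, resp, dfval=dfval,
                              useTrainCV=False, eval_metric='auc', early_stopping_rounds=20)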
@@ -158,8 +145,8 @@ def predict(clf,df,features):
     :param features:
     :return:
     '''
-    df['predict'] = clf.predict(df[features].values)
-    df['predict_proba'] = clf.predict_proba(df[features].values)[:1]
+    df['predict'] = clf.predict(df[features])
+    df['predict_proba'] = clf.predict_proba(df[features])[:, 1]
     return df


 def featureImportance(clf, features):
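Continuing the same sketch, scoring uses the updated predict, which now stores the positive-class probability via predict_proba(...)[:, 1]; the test file and its columns are again assumptions.

# Illustrative scoring step -- file name and columns are assumed.
dftest = pd.read_csv('test.csv')
scored = xgb_model.predict(best_clf, dftest, features)
print(scored[['predict', 'predict_proba']].head())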