decision-science / model_mvp · Commits

Commit 75c387db, authored Apr 18, 2019 by linfang.wang
xgboost report (报告xgboost)
Parent: 76a74874

Showing 6 changed files with 127 additions and 40 deletions (+127 / -40)
data/analyis/datacal.py    +36 / -1
data/analyis/filetool.py   +3 / -2
data/graph/drawplot.py     +1 / -1
models/__init__.py         +0 / -0
models/xgboost.py          +12 / -36
mvp/xgboostreport.py       +75 / -0
data/analyis/datacal.py  (+36 / -1)

@@ -3,7 +3,30 @@ import numpy as np
 import datetime

+def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
+    '''
+    Split df into a training set and a validation set.
+    :param df: dataframe
+    :param trainsplit: split strategy, 'timeSeries' or 'random' (default 'random')
+    :param trainsplitRatio: for a random split, the fraction kept for training (default 0.8)
+    :param sort_col: for a time-based split, the column to sort by
+    :return: (train, val)
+    '''
+    dftrain = df.reset_index()
+    # == split dftrain into training and validation sets
+    if trainsplit == 'random':
+        # random train / val split
+        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
+        val = dftrain[~dftrain.index.isin(train.index)]
+    elif trainsplit == 'timeSeries':
+        # time-ordered train / val split
+        train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
+        val = dftrain[~dftrain.index.isin(train.index)]
+    else:
+        train = df
+        val = None
+    return train, val
+
 def cal_week(df, date_name, date_name_new):
     '''

@@ -36,6 +59,18 @@ def cal_month(df,date_name,date_name_new):
     return df

+# def cal_isometric(df, feature, bin=10, method=2):
+#     '''
+#     Equal binning; defaults to equal frequency. 1: equal width, 2: equal frequency, 3: clustering.
+#     :param df:
+#     :param feature:
+#     :param bin:
+#     :param method: 1 = equal width; 2 = equal frequency; 3 = clustering; default 2
+#     :return:
+#     '''
+#     if method == 1:
+#
 def cal_feature_grid(df, feature, bin=10):
     '''
     Define N-quantile cut points: negative values form their own bin, non-negative values are cut into N bins.
...
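For orientation, a minimal usage sketch of the split_train_val helper added above, run on a small synthetic DataFrame (the column names 'apply_date' and 'label' are illustrative, and the repo root is assumed to be on PYTHONPATH):

import pandas as pd
from data.analyis import datacal

# hypothetical toy frame with a date column and a binary label
df = pd.DataFrame({
    'apply_date': pd.date_range('2019-01-01', periods=10, freq='D'),
    'label': [0, 1, 0, 0, 1, 1, 0, 1, 0, 1],
})

# random 80/20 split (the default strategy)
train, val = datacal.split_train_val(df, trainsplit='random', trainsplitRatio=0.8)

# time-ordered split: the earliest 80% of rows by apply_date become the training set
train_ts, val_ts = datacal.split_train_val(df, trainsplit='timeSeries',
                                           trainsplitRatio=0.8, sort_col='apply_date')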
data/analyis/filetool.py  (+3 / -2)

@@ -3,12 +3,13 @@ from docx import Document
 from docx.shared import Inches

 def buildDocument(path, filename):
-    if str.rfind(filename, 0, 3) != 'doc':
+    if filename[-3:] != 'doc':
-        if str.rfind(filename, 0, 4) != 'docx':
+        if filename[-4:] != 'docx':
             raise ValueError('{} is not a word file'.format(filename))
     if os.path.exists(os.path.join(path, filename)):
         return Document(os.path.join(path, filename))
     return Document()

 def saveDocument(document, path, filename):
     if str.rfind(filename, 0, 3) != 'doc':
         if str.rfind(filename, 0, 4) != 'docx':
...
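A short sketch of how these helpers are meant to be combined (python-docx must be installed; the './reports' path and 'report.docx' filename are placeholders, and saveDocument's body is truncated in the diff, so its write target is assumed here):

import os
from data.analyis import filetool

os.makedirs('./reports', exist_ok=True)

# opens ./reports/report.docx if it exists, otherwise starts a blank Document
document = filetool.buildDocument('./reports', 'report.docx')
document.add_heading('Example section')
document.add_paragraph('Generated via python-docx.')

# saveDocument re-checks the extension; presumably it writes back to os.path.join(path, filename)
filetool.saveDocument(document, './reports', 'report.docx')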
data/graph/drawplot.py  (+1 / -1)

@@ -27,7 +27,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
     '''

-def draw_barplot(df, x, y, hue='', title=''):
+def draw_barplot(df, x, y, hue='', title='', path=None, filename=None):
     '''
     :param df: dataframe
     :param x: x-axis column
...
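As used later in mvp/xgboostreport.py, draw_barplot appears to return a matplotlib figure; a hedged usage sketch with illustrative column names, leaving the new path and filename arguments at their defaults:

import pandas as pd
from data.graph import drawplot

# placeholder feature-importance frame
featureimp = pd.DataFrame({
    'feature': ['age', 'income', 'tenure'],
    'weight': [0.41, 0.35, 0.24],
})

# draw_barplot is assumed to return a matplotlib Figure, matching its use in xgboostreport.py
fig = drawplot.draw_barplot(featureimp, 'feature', 'weight', title='Feature importance')
fig.savefig('feature_importance.png')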
model/__init__.py → models/__init__.py  (file moved)

model/xgboost.py → models/xgboost.py  (+12 / -36)
@@ -3,34 +3,11 @@ import numpy as np
 import xgboost as xgb
 from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold
 from sklearn.metrics import confusion_matrix, mean_squared_error
+from sklearn import metrics

-def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
-    '''
-    Split df into a training set and a validation set.
-    :param df: dataframe
-    :param trainsplit: split strategy, 'timeSeries' or 'random' (default 'random')
-    :param trainsplitRatio: for a random split, the fraction kept for training (default 0.8)
-    :param sort_col: for a time-based split, the column to sort by
-    :return: (train, val)
-    '''
-    dftrain = df.reset_index()
-    # == split dftrain into training and validation sets
-    if trainsplit == 'random':
-        # random train / val split
-        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
-        val = dftrain[~dftrain.index.isin(train.index)]
-    elif trainsplit == 'timeSeries':
-        # time-ordered train / val split
-        train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
-        val = dftrain[~dftrain.index.isin(train.index)]
-    else:
-        train = df
-        val = None
-    return train, val
-
 def buildClf(max_depth=2, learning_rate=0.1, n_estimators=5000, gamma=0,
...

@@ -61,27 +38,18 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
     :return: XGBClassifier
     '''
     return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
-                             verbosity=True, objective='binary:logistic',
+                             verbosity=1, silent=True, objective='binary:logistic',
                              booster='gbtree', n_jobs=2, nthread=2, gamma=gamma, min_child_weight=min_child_weight,
                              max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                              base_score=base_score, random_state=7, seed=7
                              )

-def buildParamGrid(learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
-                   gamma=[i / 10 for i in range(0, 5)],
-                   max_depth=[2, 3],
-                   min_child_weight=[1, 2, 3, 4, 5, 6],
-                   subsample=[i / 10 for i in range(6, 10)],
-                   colsample_bytree=[i / 10 for i in range(6, 10)],
-                   reg_alpha=[0.001, 0.01, 0.05, 0.1, 1, 10],
-                   reg_lambda=[0.001, 0.01, 0.05, 0.1, 1, 10]):
-    param_grid = dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
-                      min_child_weight=min_child_weight, subsample=subsample,
-                      colsample_bytree=colsample_bytree, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
-    return param_grid

 def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
     '''
     Automatic hyper-parameter tuning for the model.
     :param clf: XGBClassifier
-    :param param_grid: dict, the tuning search space, see buildParamGrid
+    :param param_grid: dict, the tuning search space
     :param scoring: tuning metric, default roc_auc
     :param dftrain:
     :param features:
...

@@ -90,7 +58,7 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
     :return:
     '''
     kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
-    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=2, cv=kflod, verbose=3, iid=True, refit=True)
+    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=2, cv=kflod, verbose=0, iid=True, refit=True)
     # == train the model
     grid_search.fit(dftrain[features].values, dftrain[resp].values)
     # == get the best parameters
...

@@ -135,6 +103,14 @@ def predict(clf,df,features):
     df['predict_proba'] = clf.predict_proba(df[features])[:, 1]
     return df

+def auc(clf, df, features, label):
+    # == compute accuracy, auc and related metrics
+    df = predict(clf, df, features)
+    accu = metrics.accuracy_score(df[label], df['predict'])
+    auc = metrics.roc_auc_score(df[label], df['predict_proba'])
+    return dict({'accuracy': accu, 'auc': auc})
+
 def featureImportance(clf, features):
     '''
     Return the model's feature weights.
...
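A condensed sketch of the intended workflow around these helpers: build a classifier, grid-search a parameter dict with automodelfit, then score with the new auc function. The data is synthetic, the column names are placeholders, and modelfit is assumed from the unchanged part of the module:

import pandas as pd
import numpy as np
from models import xgboost
from data.analyis import datacal

# synthetic placeholder data: three numeric features and a binary response
rng = np.random.RandomState(7)
df = pd.DataFrame(rng.rand(200, 3), columns=['f1', 'f2', 'f3'])
df['label'] = (rng.rand(200) > 0.5).astype(int)

features = ['f1', 'f2', 'f3']
train, val = datacal.split_train_val(df, trainsplit='random', trainsplitRatio=0.8)

clf = xgboost.buildClf(max_depth=3, n_estimators=200)
grid_search = xgboost.automodelfit(clf, {'gamma': [0.0, 0.1, 0.2]},
                                   train, features, 'label', kfold=5)
clf = grid_search.best_estimator_

# auc() runs predict() internally and reports accuracy and ROC AUC
print(xgboost.auc(clf, val, features, 'label'))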
mvp/xgboostreport.py  (new file, mode 100644, +75 / -0)

import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import xgboost
from matplotlib import pyplot as plt
from data.graph import drawplot


def report(dftrain, dftest, features, label, path, filename):
    document = filetool.buildDocument(path, filename)
    document.add_heading('xgboost algorithm run report')
    clf = xgboost.buildClf()
    document.add_paragraph('Run with initial parameters {}'.format(clf.get_xgb_params()))
    clf = xgboost.modelfit(clf, dftrain, features, label)
    document.add_paragraph('Model on training set: {}'.format(xgboost.auc(clf, dftrain, features, label)))
    document.add_paragraph('Model on test set: {}'.format(xgboost.auc(clf, dftest, features, label)))

    document.add_heading('Parameter tuning')
    max_depth = [2, 3]
    min_child_weight = range(1, 4, 1)
    document, clf = tun_params(document, clf, dftrain, dftest,
                               {'max_depth': max_depth, 'min_child_weight': min_child_weight},
                               features, label)
    # gamma
    gamma = [i / 10 for i in range(0, 5)]
    document, clf = tun_params(document, clf, dftrain, dftest, {'gamma': gamma}, features, label)
    # subsample, colsample_bytree
    subsample = [0.8, 0.9, 1]
    colsample_bytree = [0.8, 0.9, 1]
    document, clf = tun_params(document, clf, dftrain, dftest,
                               {'subsample': subsample, 'colsample_bytree': colsample_bytree},
                               features, label)
    # reg_alpha
    reg_alpha = [0.001, 0.01, 0.1, 1, 10]
    document, clf = tun_params(document, clf, dftrain, dftest, {'reg_alpha': reg_alpha}, features, label)
    # reg_lambda
    reg_lambda = [0.001, 0.01, 0.1, 1, 10]
    document, clf = tun_params(document, clf, dftrain, dftest, {'reg_lambda': reg_lambda}, features, label)

    # == final model report: univariate plots per feature, PDP, lift chart
    dftrain = xgboost.predict(clf, dftrain, features)
    dftest = xgboost.predict(clf, dftest, features)
    featureimp = xgboost.featureImportance(clf, features).to_frame(name=['weight', 'feature'])
    fig = drawplot.draw_barplot(featureimp.head(10), 'feature', 'weight', title='Feature importance')
    fig.savefig('tmp.png')
    document.add_paragraph('Feature importance plot, top 10 features')
    document.add_picture('tmp.png')
    filetool.saveDocument(document, path, filename)


def tun_params(document, clf, dftrain, dftest, params, features, label):
    for i in dict(params).keys():
        document.add_paragraph('Tuning {}, candidate values {}'.format(i, params[i]))
    grid_search = xgboost.automodelfit(clf, params, dftrain, features, label)
    clf = grid_search.best_estimator_
    document.add_paragraph('Model training parameters {}'.format(clf.get_xgb_params()))
    clf = xgboost.modelfit(clf, dftrain, features, label)
    document.add_paragraph('Best parameters {}, best score {}'.format(grid_search.best_params_, grid_search.best_score_))
    document.add_paragraph('Model on training set: {}'.format(xgboost.auc(clf, dftrain, features, label)))
    document.add_paragraph('Model on test set: {}'.format(xgboost.auc(clf, dftest, features, label)))
    return document, clf
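Finally, a hedged sketch of how report might be driven end to end. The input file, date column, and label column are placeholders, the repo root is assumed to be on PYTHONPATH, and modelfit is assumed from models/xgboost.py:

import pandas as pd
from data.analyis import datacal
from mvp import xgboostreport

df = pd.read_csv('samples.csv')                      # placeholder input data
features = [c for c in df.columns if c not in ('label', 'apply_date')]

# time-ordered 80/20 split so the test set is strictly later than the training set
train, test = datacal.split_train_val(df, trainsplit='timeSeries',
                                      trainsplitRatio=0.8, sort_col='apply_date')

# writes a Word report with baseline metrics, the tuning steps, and a feature-importance plot
xgboostreport.report(train, test, features, 'label', './reports', 'xgboost_report.docx')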