Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
model_mvp
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
decision-science
model_mvp
Commits
37c70174
Commit
37c70174
authored
Apr 18, 2019
by
linfang.wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
xgb report done
parent
75c387db
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
276 additions
and
25 deletions
+276
-25
datacal.py
data/analyis/datacal.py
+14
-4
drawplot.py
data/graph/drawplot.py
+76
-9
xgboost.py
models/xgboost.py
+13
-12
report.py
mvp/report.py
+52
-0
xgbreport.py
mvp/xgbreport.py
+121
-0
No files found.
data/analyis/datacal.py
View file @
37c70174
...
...
@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
feature_grid
=
sorted
(
set
(
tmp
[
tmp
[
feature
]
>=
0
][
feature
]
.
quantile
(
bin_index
))
|
set
([
-
99999
,
-
0.00001
]))
return
feature_grid
def cal_accume(df, feature, target, bin=10, classes=None):
    '''
    Bin ``feature`` and append cumulative statistics of ``target`` per bin.

    The per-bin table comes from ``cal_univar``; this function reads its
    ``count`` and ``sum`` columns and adds three columns to it:

    * ``acmCnt``       -- running total of ``count``
    * ``acmEvent``     -- running total of ``sum``
    * ``acmEventRate`` -- ``acmEvent / acmCnt``

    :param df: input dataframe, forwarded to ``cal_univar``
    :param feature: name of the column to bin
    :param target: name of the label column
    :param bin: number of bins, forwarded to ``cal_univar``
    :param classes: optional list of grouping columns; when given, the running
        totals restart inside each group.  ``None`` or an empty list means
        "no grouping".
    :return: the ``cal_univar`` result with the three columns above added
    '''
    # A fresh list per call: a ``[]`` default would be shared between calls.
    if classes is None:
        classes = []
    df_out = cal_univar(df, feature, target, bin, classes=classes)
    if len(classes) > 0:
        grouped = df_out.groupby(classes)
        df_out['acmCnt'] = grouped['count'].cumsum()
        df_out['acmEvent'] = grouped['sum'].cumsum()
    else:
        # ``groupby([])`` raises "No group keys passed!", so without grouping
        # columns accumulate over the whole table instead.
        df_out['acmCnt'] = df_out['count'].cumsum()
        df_out['acmEvent'] = df_out['sum'].cumsum()
    df_out['acmEventRate'] = df_out['acmEvent'] / df_out['acmCnt']
    return df_out
def
cal_univar
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
...
...
data/graph/drawplot.py
View file @
37c70174
from
pyplotz.pyplotz
import
PyplotZ
from
pyplotz.pyplotz
import
plt
from
data.analyis
import
datacal
import
seaborn
as
sns
import
pandas
as
pd
plt
.
rc
(
'figure'
,
figsize
=
(
8
,
6
))
font_options
=
{
...
...
@@ -11,6 +12,67 @@ font_options={
plt
.
rc
(
'font'
,
**
font_options
)
def liftchart(df, x, y, classes='', bin=10, title='', xlabel='', ylabel=''):
    '''
    Draw a two-panel lift chart from the table built by ``datacal.cal_accume``.

    The upper panel plots the ``mean`` column against ``grid`` and the lower
    panel plots ``acmEventRate`` against ``grid``.

    :param df: input dataframe
    :param x: column that is binned (forwarded to ``datacal.cal_accume``)
    :param y: label column (forwarded to ``datacal.cal_accume``)
    :param classes: column used to split the curves; ``''`` draws one curve
    :param bin: number of bins
    :param title: title handed to ``draw_lineplot`` for both panels
    :param xlabel: x-axis label handed to ``draw_lineplot``
    :param ylabel: y-axis label handed to ``draw_lineplot``
    :return: the ``plt`` module, so the caller can save the current figure
    '''
    # TODO (from the original note): the single pivot-table output is still pending.
    plt.cla()

    # Only pass the grouping arguments when a grouping column was requested.
    if classes != '':
        accume_kwargs = {'classes': [classes]}
        line_kwargs = {'hue': classes}
    else:
        accume_kwargs = {}
        line_kwargs = {}

    stats = datacal.cal_accume(df, x, y, bin, **accume_kwargs)
    for position, column in enumerate(('mean', 'acmEventRate'), start=1):
        plt.subplot(2, 1, position)
        draw_lineplot(stats, 'grid', column,
                      title=title, xlabel=xlabel, ylabel=ylabel, **line_kwargs)

    plt.tight_layout()
    return plt
def univarchart(df, x, y, bin=10, classes='', title='', xlabel='', ylabel=''):
    '''
    Plot the relation between a feature and the label (``y`` is the label).

    The table comes from ``datacal.cal_univar``; its ``mean`` column is drawn
    against ``grid`` on a single subplot.

    :param df: input dataframe
    :param x: feature column to bin
    :param y: label column
    :param bin: number of bins
    :param classes: column used to split the curves; ``''`` draws one curve
    :param title: title handed to ``draw_lineplot``
    :param xlabel: x-axis label handed to ``draw_lineplot``
    :param ylabel: y-axis label handed to ``draw_lineplot``
    :return: the ``plt`` module
    '''
    plt.cla()
    plt.subplot(1, 1, 1)

    # Grouping arguments are only supplied when a grouping column is given.
    split_by_class = classes != ''
    univar_kwargs = {'classes': [classes]} if split_by_class else {}
    line_kwargs = {'hue': classes} if split_by_class else {}

    stats = datacal.cal_univar(df, x, y, bin, **univar_kwargs)
    draw_lineplot(stats, 'grid', 'mean',
                  title=title, xlabel=xlabel, ylabel=ylabel, **line_kwargs)
    return plt
def pdpchart(df, x, y, bin=10, classes='', title='', xlabel='模型分', ylabel='逾期率'):
    '''
    Plot the per-bin mean of ``y`` against the binned column ``x``.

    Same drawing steps as ``univarchart``; only the default axis labels differ.

    :param df: input dataframe
    :param x: column to bin
    :param y: column whose per-bin ``mean`` is drawn
    :param bin: number of bins
    :param classes: column used to split the curves; ``''`` draws one curve
    :param title: title handed to ``draw_lineplot``
    :param xlabel: x-axis label handed to ``draw_lineplot``
    :param ylabel: y-axis label handed to ``draw_lineplot``
    :return: the ``plt`` module
    '''
    plt.cla()
    plt.subplot(1, 1, 1)

    if classes == '':
        # No grouping column: one curve over the whole frame.
        table = datacal.cal_univar(df, x, y, bin)
        draw_lineplot(table, 'grid', 'mean',
                      title=title, xlabel=xlabel, ylabel=ylabel)
        return plt

    # One curve per distinct value of the grouping column.
    table = datacal.cal_univar(df, x, y, bin, classes=[classes])
    draw_lineplot(table, 'grid', 'mean', hue=classes,
                  title=title, xlabel=xlabel, ylabel=ylabel)
    return plt
'''
双坐标轴
'''
...
...
@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
'''
def
draw_barplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
,
path
=
None
,
filename
=
None
):
def
draw_barplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
):
'''
:param df: dataframe
:param x: 横坐标
...
...
@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
return
fig
def
draw_lineplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
):
def
draw_lineplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
,
xlabel
=
''
,
ylabel
=
''
):
'''
:param df: dataframe
:param x: 横坐标
...
...
@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
'''
pltz
=
PyplotZ
()
pltz
.
enable_chinese
()
fig
=
plt
.
figure
()
ax
=
fig
.
add_subplot
(
1
,
1
,
1
)
# fig = plt.figure()
if
hue
!=
''
:
for
type
in
df
[
hue
]
.
unique
()
.
tolist
():
# == 画图
...
...
@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
else
:
plt
.
plot
(
df
[
x
],
df
[
y
],
linestyle
=
'dashed'
,
marker
=
'o'
)
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz
.
xlabel
(
x
)
pltz
.
ylabel
(
y
)
if
xlabel
!=
''
:
pltz
.
xlabel
(
xlabel
)
else
:
pltz
.
xlabel
(
x
)
if
ylabel
!=
''
:
pltz
.
ylabel
(
ylabel
)
else
:
pltz
.
ylabel
(
y
)
pltz
.
title
(
title
)
pltz
.
legend
()
plt
.
grid
()
plt
.
show
()
return
fig
\ No newline at end of file
# plt.show()
return
plt
\ No newline at end of file
models/xgboost.py
View file @
37c70174
import
pandas
as
pd
import
numpy
as
np
import
xgboost
as
xgb
from
sklearn.model_selection
import
KFold
,
train_test_split
,
GridSearchCV
,
StratifiedKFold
from
sklearn.model_selection
import
GridSearchCV
from
sklearn.metrics
import
confusion_matrix
,
mean_squared_error
from
sklearn
import
metrics
from
sklearn
import
metrics
...
...
@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:return:XGBClassifier
'''
return
xgb
.
XGBClassifier
(
max_depth
=
max_depth
,
learning_rate
=
learning_rate
,
n_estimators
=
n_estimators
,
verbosity
=
1
,
silent
=
True
,
objective
=
'binary:logistic'
,
verbosity
=
0
,
silent
=
0
,
objective
=
'binary:logistic'
,
booster
=
'gbtree'
,
n_jobs
=
2
,
nthread
=
2
,
gamma
=
gamma
,
min_child_weight
=
min_child_weight
,
max_delta_step
=
max_delta_step
,
subsample
=
subsample
,
colsample_bytree
=
colsample_bytree
,
reg_alpha
=
reg_alpha
,
reg_lambda
=
reg_lambda
,
scale_pos_weight
=
scale_pos_weight
,
...
...
@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
:param kfold:
:return:
'''
kflod
=
StratifiedKFold
(
n_splits
=
kfold
,
shuffle
=
True
,
random_state
=
7
)
grid_search
=
GridSearchCV
(
clf
,
param_grid
,
scoring
=
scoring
,
n_jobs
=
2
,
cv
=
kf
lod
,
verbose
=
0
,
iid
=
True
,
refit
=
True
)
#
kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search
=
GridSearchCV
(
clf
,
param_grid
,
scoring
=
scoring
,
n_jobs
=
2
,
cv
=
kf
old
,
verbose
=
2
,
iid
=
True
,
refit
=
True
)
#== 模型训练
grid_search
.
fit
(
dftrain
[
features
]
.
values
,
dftrain
[
resp
]
.
values
)
grid_search
.
fit
(
dftrain
[
features
]
,
dftrain
[
resp
]
)
#== 获取最优参数
return
grid_search
def
modelfit
(
clf
,
dftrain
,
features
,
resp
,
useTrainCV
=
True
,
cv_folds
=
10
,
eval_metric
=
'auc'
,
early_stopping_rounds
=
20
):
def
modelfit
(
clf
,
dftrain
,
features
,
resp
,
useTrainCV
=
True
,
kfold
=
10
,
eval_metric
=
'auc'
,
early_stopping_rounds
=
20
):
'''
模型训练
:type useTrainCV: object
...
...
@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
:return:
'''
if
useTrainCV
:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param
=
clf
.
get_xgb_params
()
xgtrain
=
xgb
.
DMatrix
(
dftrain
[
features
]
.
values
,
label
=
dftrain
[
resp
]
.
values
)
cvresult
=
xgb
.
cv
(
xgb_param
,
xgtrain
,
num_boost_round
=
clf
.
get_params
()[
'n_estimators'
],
nfold
=
cv_folds
,
cvresult
=
xgb
.
cv
(
xgb_param
,
xgtrain
,
num_boost_round
=
clf
.
get_params
()[
'n_estimators'
],
nfold
=
kfold
,
metrics
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose_eval
=
True
)
clf
.
set_params
(
n_estimators
=
cvresult
.
shape
[
0
])
...
...
@@ -106,9 +105,9 @@ def predict(clf,df,features):
def auc(clf, df, features, label):
    '''
    Score ``df`` with ``clf`` and compute accuracy and ROC AUC.

    :param clf: fitted classifier, forwarded to ``predict``
    :param df: dataframe to score; must contain the ``label`` column
    :param features: feature columns, forwarded to ``predict``
    :param label: name of the ground-truth column
    :return: dict with keys ``'accuracy'`` and ``'auc'``
    '''
    # ``predict`` is expected to return the frame with ``predict`` and
    # ``predict_proba`` columns, which are the ones read below.
    scored = predict(clf, df, features)
    truth = scored[label]
    accuracy = metrics.accuracy_score(truth.values, scored['predict'].values)
    roc_auc = metrics.roc_auc_score(truth, scored['predict_proba'])
    return {'accuracy': accuracy, 'auc': roc_auc}
def
featureImportance
(
clf
,
features
):
...
...
@@ -121,4 +120,6 @@ def featureImportance(clf,features):
# Print Feature Importance:
feat_imp
=
pd
.
Series
(
clf
.
get_booster
()
.
get_fscore
(),
features
)
.
sort_values
(
ascending
=
False
,
na_position
=
'last'
)
feat_imp
=
feat_imp
[
feat_imp
>
0
]
feat_imp
=
feat_imp
.
to_frame
()
.
reset_index
()
feat_imp
.
columns
=
[
'feature'
,
'weight'
]
return
feat_imp
mvp/report.py
0 → 100644
View file @
37c70174
import
pandas
as
pd
import
numpy
as
np
import
datetime
from
mvp
import
xgbreport
from
data.analyis
import
datacal
if __name__ == '__main__':
    # Feature columns fed to the model.
    features = [
        'third_data_source#xy_pan_newapplyAcredibility',
        'third_data_source#xy_pan_newapplyAscore',
        'third_data_source#xy_pan_newconsfinAavgAlimit',
        'third_data_source#xy_pan_newconsfinAcredibility',
        'third_data_source#xy_pan_newconsfinAcreditAlimit',
        'third_data_source#xy_pan_newconsfinAmaxAlimit',
        'third_data_source#xy_pan_newconsfinAorgAcountq',
        'third_data_source#xy_pan_newconsfinAorgAcountx',
        'third_data_source#xy_pan_newconsfinAproductAcount',
        'third_data_source#xy_pan_newhistoryAfailAfee',
        'third_data_source#xy_pan_newhistoryAsucAfee',
        'third_data_source#xy_pan_newlatestAoneAmonthAfail',
        'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
        'third_data_source#xy_pan_newlatestAoneAmonthd',
        'third_data_source#xy_pan_newlatestAoneAmonthj',
        'third_data_source#xy_pan_newlatestAqueryAtime',
        'third_data_source#xy_pan_newlatestAsixAmontha',
        'third_data_source#xy_pan_newlatestAsixAmonthv',
        'third_data_source#xy_pan_newlatestAthreeAmonthb',
        'third_data_source#xy_pan_newlatestAthreeAmonthf',
        'third_data_source#xy_pan_newloansAavgAlimit',
        'third_data_source#xy_pan_newloansAcashAcount',
        'third_data_source#xy_pan_newloansAcount',
        'third_data_source#xy_pan_newloansAcredibilityh',
        'third_data_source#xy_pan_newloansAcredibilitys',
        'third_data_source#xy_pan_newloansAcreditAlimit',
        'third_data_source#xy_pan_newloansAlatestAtime',
        'third_data_source#xy_pan_newloansAlongAtime',
        'third_data_source#xy_pan_newloansAmaxAlimit',
        'third_data_source#xy_pan_newloansAorgAcounta',
        'third_data_source#xy_pan_newloansAorgAcountg',
        'third_data_source#xy_pan_newloansAoverdueAcount',
        'third_data_source#xy_pan_newloansAproductAcount',
        'third_data_source#xy_pan_newloansAscore',
        'third_data_source#xy_pan_newloansAsettleAcount',
        'third_data_source#xy_pan_newqueryAcashAcount',
        'third_data_source#xy_pan_newqueryAfinanceAcount',
        'third_data_source#xy_pan_newqueryAorgAcount',
        'third_data_source#xy_pan_newqueryAsumAcount',
    ]
    label = 'y'

    # Load the sample and split it 80/20 using the 'timeSeries' mode,
    # ordered by the application timestamp column.
    df = pd.read_csv('test.csv')
    dftrain, dftest = datacal.split_train_val(
        df,
        trainsplit='timeSeries',
        trainsplitRatio=0.8,
        sort_col='applied_at',
    )

    # Build the xgboost report document from the two splits.
    xgbreport.report(dftrain, dftest, features, label, path='', filename='tmp.doc')
\ No newline at end of file
mvp/xgb
oost
report.py
→
mvp/xgbreport.py
View file @
37c70174
...
...
@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
from
data.graph
import
drawplot
def
report
(
dftrain
,
dftest
,
features
,
label
,
path
,
filename
):
'''
dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
:param dftrain:
:param dftest:
:param features:
:param label:
:param path:
:param filename:
:return:
'''
document
=
filetool
.
buildDocument
(
path
,
filename
)
document
.
add_heading
(
'xgboost 算法运行报告'
)
clf
=
xgboost
.
buildClf
()
...
...
@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
min_child_weight
=
range
(
1
,
4
,
1
)
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'max_depth'
:
max_depth
,
'min_child_weight'
:
min_child_weight
},
features
,
label
)
# gamma
gamma
=
[
i
/
10
for
i
in
range
(
0
,
5
)]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,{
'gamma'
:
gamma
},
features
,
label
)
# subsample colsample_bytree
subsample
=
[
0.8
,
0.9
,
1
]
colsample_bytree
=
[
0.8
,
0.9
,
1
]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'subsample'
:
subsample
,
'colsample_bytree'
:
colsample_bytree
},
features
,
label
)
# reg_alpha
reg_alpha
=
[
0.001
,
0.01
,
0.1
,
1
,
10
]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'reg_alpha'
:
reg_alpha
},
features
,
label
)
# reg_lambda
reg_lambda
=
[
0.001
,
0.01
,
0.1
,
1
,
10
]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'reg_lambda'
:
reg_lambda
},
features
,
label
)
#
#
gamma
#
gamma=[i/10 for i in range(0,5)]
#
document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
#
#
#
subsample colsample_bytree
#
subsample=[0.8,0.9,1]
#
colsample_bytree=[0.8,0.9,1]
#
document, clf = tun_params(document, clf, dftrain, dftest,
#
{'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
#
#
#
reg_alpha
#
reg_alpha=[0.001,0.01,0.1,1,10]
#
document, clf = tun_params(document, clf, dftrain, dftest,
#
{'reg_alpha': reg_alpha}, features, label)
#
#
#
reg_lambda
#
reg_lambda = [0.001, 0.01, 0.1, 1, 10]
#
document, clf = tun_params(document, clf, dftrain, dftest,
#
{'reg_lambda': reg_lambda}, features, label)
#==生成模型最后的报告,各个特征的单变量图,PDP,liftchart
dftrain
=
xgboost
.
predict
(
clf
,
dftrain
,
features
)
dftest
=
xgboost
.
predict
(
clf
,
dftest
,
features
)
featureimp
=
xgboost
.
featureImportance
(
clf
,
features
)
.
to_frame
(
name
=
[
'weight'
,
'feature'
])
#== 特征权重
featureimp
=
xgboost
.
featureImportance
(
clf
,
features
)
fig
=
drawplot
.
draw_barplot
(
featureimp
.
head
(
10
),
'feature'
,
'weight'
,
title
=
'Feature importance'
)
fig
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'特征权重图,近前10个特征'
)
document
.
add_picture
(
'tmp.png'
)
#== 模型分同逾期率的关系图
dftrain
[
'flag'
]
=
'训练集'
dftest
[
'flag'
]
=
'测试集'
drawplot
.
liftchart
(
pd
.
concat
([
dftrain
,
dftest
]),
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'flag'
,
title
=
'liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'整体--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
filetool
.
saveDocument
(
document
,
path
,
filename
)
#== 分月份查看-- 只看测试集
dftest
=
datacal
.
cal_month
(
dftest
,
'applied_at'
,
'applied_month'
)
drawplot
.
liftchart
(
dftest
,
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_month'
,
title
=
'分月liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分月--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 分用户类型分月查看
drawplot
.
liftchart
(
dftest
,
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_type'
,
title
=
'分用户类型liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分用户类型--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 分渠道分月查看--取前5个渠道查看
channels
=
dftest
.
applied_channel
.
value_counts
()[:
5
]
.
index
drawplot
.
liftchart
(
dftest
[
dftest
.
applied_channel
.
isin
(
channels
)],
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_channel'
,
title
=
'分渠道liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分渠道--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 各个特征的 单变量图 和 pdp 图
for
i
in
featureimp
.
feature
.
tolist
():
drawplot
.
univarchart
(
dftest
,
i
,
label
,
bin
=
10
,
title
=
'单变量
%
s'
%
i
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'单变量
%
s'
%
i
)
document
.
add_picture
(
'tmp.png'
)
#= pdp
drawplot
.
pdpchart
(
dftest
,
i
,
'predict_proba'
,
bin
=
10
,
title
=
'pdp
%
s'
%
i
,
ylabel
=
'模型分'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'pdp
%
s'
%
i
)
document
.
add_picture
(
'tmp.png'
)
filetool
.
saveDocument
(
document
,
path
,
filename
)
...
...
@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
grid_search
=
xgboost
.
automodelfit
(
clf
,
params
,
dftrain
,
features
,
label
)
clf
=
grid_search
.
best_estimator_
document
.
add_paragraph
(
'模型训练参数{}'
.
format
(
clf
.
get_xgb_params
()))
clf
=
xgboost
.
modelfit
(
clf
,
dftrain
,
features
,
label
)
#==
# clf = xgboost.modelfit(clf, dftrain, features, label)
document
.
add_paragraph
(
'寻找最优参数过程{}'
.
format
(
grid_search
.
cv_results_
))
document
.
add_paragraph
(
'最优参数{},最优分{}'
.
format
(
grid_search
.
best_params_
,
grid_search
.
best_score_
))
document
.
add_paragraph
(
'模型训练集{}'
.
format
(
xgboost
.
auc
(
clf
,
dftrain
,
features
,
label
)))
document
.
add_paragraph
(
'模型测试集{}'
.
format
(
xgboost
.
auc
(
clf
,
dftest
,
features
,
label
)))
document
.
add_paragraph
(
'模型训练集{}'
.
format
(
xgboost
.
auc
(
grid_search
,
dftrain
,
features
,
label
)))
document
.
add_paragraph
(
'模型测试集{}'
.
format
(
xgboost
.
auc
(
grid_search
,
dftest
,
features
,
label
)))
return
document
,
clf
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment