Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
model_mvp
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
decision-science
model_mvp
Commits
37c70174
Commit
37c70174
authored
Apr 18, 2019
by
linfang.wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
xgb report done
parent
75c387db
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
276 additions
and
25 deletions
+276
-25
datacal.py
data/analyis/datacal.py
+14
-4
drawplot.py
data/graph/drawplot.py
+76
-9
xgboost.py
models/xgboost.py
+13
-12
report.py
mvp/report.py
+52
-0
xgbreport.py
mvp/xgbreport.py
+121
-0
No files found.
data/analyis/datacal.py
View file @
37c70174
...
@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
...
@@ -91,11 +91,21 @@ def cal_feature_grid(df,feature,bin=10):
feature_grid
=
sorted
(
set
(
tmp
[
tmp
[
feature
]
>=
0
][
feature
]
.
quantile
(
bin_index
))
|
set
([
-
99999
,
-
0.00001
]))
feature_grid
=
sorted
(
set
(
tmp
[
tmp
[
feature
]
>=
0
][
feature
]
.
quantile
(
bin_index
))
|
set
([
-
99999
,
-
0.00001
]))
return
feature_grid
return
feature_grid
def
cal_accume
(
df
,
feature
,
target
,
bin
=
10
):
def
cal_accume
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
df_out
=
cal_univar
(
df
,
feature
,
target
,
bin
)
'''
df_out
[
'acmCnt'
]
=
df_out
[
'count'
]
.
cumsum
()
df_out
[
'acmEvent'
]
=
df_out
[
'sum'
]
.
cumsum
()
:param df:
:param feature:
:param target:
:param bin:
:param classes:
:return: 对feature 进行分段;计算每个区间的mean,count,sum 累计 count,坏样本数量,坏样本比例
'''
df_out
=
cal_univar
(
df
,
feature
,
target
,
bin
,
classes
=
classes
)
df_out
[
'acmCnt'
]
=
df_out
.
groupby
(
classes
)[
'count'
]
.
cumsum
()
df_out
[
'acmEvent'
]
=
df_out
.
groupby
(
classes
)[
'sum'
]
.
cumsum
()
df_out
[
'acmEventRate'
]
=
df_out
[
'acmEvent'
]
/
df_out
[
'acmCnt'
]
df_out
[
'acmEventRate'
]
=
df_out
[
'acmEvent'
]
/
df_out
[
'acmCnt'
]
return
df_out
def
cal_univar
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
def
cal_univar
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
...
...
data/graph/drawplot.py
View file @
37c70174
from
pyplotz.pyplotz
import
PyplotZ
from
pyplotz.pyplotz
import
PyplotZ
from
pyplotz.pyplotz
import
plt
from
pyplotz.pyplotz
import
plt
from
data.analyis
import
datacal
import
seaborn
as
sns
import
seaborn
as
sns
import
pandas
as
pd
plt
.
rc
(
'figure'
,
figsize
=
(
8
,
6
))
plt
.
rc
(
'figure'
,
figsize
=
(
8
,
6
))
font_options
=
{
font_options
=
{
...
@@ -11,6 +12,67 @@ font_options={
...
@@ -11,6 +12,67 @@ font_options={
plt
.
rc
(
'font'
,
**
font_options
)
plt
.
rc
(
'font'
,
**
font_options
)
def
liftchart
(
df
,
x
,
y
,
classes
=
''
,
bin
=
10
,
title
=
''
,
xlabel
=
''
,
ylabel
=
''
):
# #== 单个TODO 待输出
# df_fig1=pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
# values=['count'], aggfunc=['mean'])
plt
.
cla
()
if
classes
!=
''
:
df_out
=
datacal
.
cal_accume
(
df
,
x
,
y
,
bin
,
classes
=
[
classes
])
plt
.
subplot
(
2
,
1
,
1
)
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
hue
=
classes
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
plt
.
subplot
(
2
,
1
,
2
)
draw_lineplot
(
df_out
,
'grid'
,
'acmEventRate'
,
hue
=
classes
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
else
:
df_out
=
datacal
.
cal_accume
(
df
,
x
,
y
,
bin
)
plt
.
subplot
(
2
,
1
,
1
)
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
plt
.
subplot
(
2
,
1
,
2
)
draw_lineplot
(
df_out
,
'grid'
,
'acmEventRate'
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
plt
.
tight_layout
()
# plt.show()
return
plt
def
univarchart
(
df
,
x
,
y
,
bin
=
10
,
classes
=
''
,
title
=
''
,
xlabel
=
''
,
ylabel
=
''
):
'''
特征与label的关系图,y为label
:param df:
:return:
'''
plt
.
cla
()
plt
.
subplot
(
1
,
1
,
1
)
if
classes
!=
''
:
df_out
=
datacal
.
cal_univar
(
df
,
x
,
y
,
bin
,
classes
=
[
classes
])
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
hue
=
classes
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
else
:
df_out
=
datacal
.
cal_univar
(
df
,
x
,
y
,
bin
)
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
# plt.show()
return
plt
def
pdpchart
(
df
,
x
,
y
,
bin
=
10
,
classes
=
''
,
title
=
''
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
):
'''
特征与label的关系图,y为label
:param df:
:return:
'''
plt
.
cla
()
plt
.
subplot
(
1
,
1
,
1
)
if
classes
!=
''
:
df_out
=
datacal
.
cal_univar
(
df
,
x
,
y
,
bin
,
classes
=
[
classes
])
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
hue
=
classes
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
else
:
df_out
=
datacal
.
cal_univar
(
df
,
x
,
y
,
bin
)
draw_lineplot
(
df_out
,
'grid'
,
'mean'
,
title
=
title
,
xlabel
=
xlabel
,
ylabel
=
ylabel
)
# plt.show()
return
plt
'''
'''
双坐标轴
双坐标轴
'''
'''
...
@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
...
@@ -27,7 +89,7 @@ def draw_lineplot_doubleaxes(df,x,y1,y2,y1_hue='',y2_hue='',title=''):
'''
'''
def
draw_barplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
,
path
=
None
,
filename
=
None
):
def
draw_barplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
):
'''
'''
:param df: dataframe
:param df: dataframe
:param x: 横坐标
:param x: 横坐标
...
@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
...
@@ -58,7 +120,7 @@ def draw_barplot(df,x,y,hue='',title='',path=None,filename=None):
return
fig
return
fig
def
draw_lineplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
):
def
draw_lineplot
(
df
,
x
,
y
,
hue
=
''
,
title
=
''
,
xlabel
=
''
,
ylabel
=
''
):
'''
'''
:param df: dataframe
:param df: dataframe
:param x: 横坐标
:param x: 横坐标
...
@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
...
@@ -69,8 +131,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
'''
'''
pltz
=
PyplotZ
()
pltz
=
PyplotZ
()
pltz
.
enable_chinese
()
pltz
.
enable_chinese
()
fig
=
plt
.
figure
()
# fig = plt.figure()
ax
=
fig
.
add_subplot
(
1
,
1
,
1
)
if
hue
!=
''
:
if
hue
!=
''
:
for
type
in
df
[
hue
]
.
unique
()
.
tolist
():
for
type
in
df
[
hue
]
.
unique
()
.
tolist
():
# == 画图
# == 画图
...
@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
...
@@ -79,10 +140,16 @@ def draw_lineplot(df,x,y,hue='',title=''):
else
:
else
:
plt
.
plot
(
df
[
x
],
df
[
y
],
linestyle
=
'dashed'
,
marker
=
'o'
)
plt
.
plot
(
df
[
x
],
df
[
y
],
linestyle
=
'dashed'
,
marker
=
'o'
)
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz
.
xlabel
(
x
)
if
xlabel
!=
''
:
pltz
.
ylabel
(
y
)
pltz
.
xlabel
(
xlabel
)
else
:
pltz
.
xlabel
(
x
)
if
ylabel
!=
''
:
pltz
.
ylabel
(
ylabel
)
else
:
pltz
.
ylabel
(
y
)
pltz
.
title
(
title
)
pltz
.
title
(
title
)
pltz
.
legend
()
pltz
.
legend
()
plt
.
grid
()
plt
.
grid
()
plt
.
show
()
# plt.show()
return
fig
return
plt
\ No newline at end of file
\ No newline at end of file
models/xgboost.py
View file @
37c70174
import
pandas
as
pd
import
pandas
as
pd
import
numpy
as
np
import
numpy
as
np
import
xgboost
as
xgb
import
xgboost
as
xgb
from
sklearn.model_selection
import
KFold
,
train_test_split
,
GridSearchCV
,
StratifiedKFold
from
sklearn.model_selection
import
GridSearchCV
from
sklearn.metrics
import
confusion_matrix
,
mean_squared_error
from
sklearn.metrics
import
confusion_matrix
,
mean_squared_error
from
sklearn
import
metrics
from
sklearn
import
metrics
...
@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
...
@@ -38,7 +36,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:return:XGBClassifier
:return:XGBClassifier
'''
'''
return
xgb
.
XGBClassifier
(
max_depth
=
max_depth
,
learning_rate
=
learning_rate
,
n_estimators
=
n_estimators
,
return
xgb
.
XGBClassifier
(
max_depth
=
max_depth
,
learning_rate
=
learning_rate
,
n_estimators
=
n_estimators
,
verbosity
=
1
,
silent
=
True
,
objective
=
'binary:logistic'
,
verbosity
=
0
,
silent
=
0
,
objective
=
'binary:logistic'
,
booster
=
'gbtree'
,
n_jobs
=
2
,
nthread
=
2
,
gamma
=
gamma
,
min_child_weight
=
min_child_weight
,
booster
=
'gbtree'
,
n_jobs
=
2
,
nthread
=
2
,
gamma
=
gamma
,
min_child_weight
=
min_child_weight
,
max_delta_step
=
max_delta_step
,
subsample
=
subsample
,
colsample_bytree
=
colsample_bytree
,
max_delta_step
=
max_delta_step
,
subsample
=
subsample
,
colsample_bytree
=
colsample_bytree
,
reg_alpha
=
reg_alpha
,
reg_lambda
=
reg_lambda
,
scale_pos_weight
=
scale_pos_weight
,
reg_alpha
=
reg_alpha
,
reg_lambda
=
reg_lambda
,
scale_pos_weight
=
scale_pos_weight
,
...
@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
...
@@ -57,15 +55,15 @@ def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc
:param kfold:
:param kfold:
:return:
:return:
'''
'''
kflod
=
StratifiedKFold
(
n_splits
=
kfold
,
shuffle
=
True
,
random_state
=
7
)
#
kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search
=
GridSearchCV
(
clf
,
param_grid
,
scoring
=
scoring
,
n_jobs
=
2
,
cv
=
kf
lod
,
verbose
=
0
,
iid
=
True
,
refit
=
True
)
grid_search
=
GridSearchCV
(
clf
,
param_grid
,
scoring
=
scoring
,
n_jobs
=
2
,
cv
=
kf
old
,
verbose
=
2
,
iid
=
True
,
refit
=
True
)
#== 模型训练
#== 模型训练
grid_search
.
fit
(
dftrain
[
features
]
.
values
,
dftrain
[
resp
]
.
values
)
grid_search
.
fit
(
dftrain
[
features
]
,
dftrain
[
resp
]
)
#== 获取最优参数
#== 获取最优参数
return
grid_search
return
grid_search
def
modelfit
(
clf
,
dftrain
,
features
,
resp
,
useTrainCV
=
True
,
cv_folds
=
10
,
eval_metric
=
'auc'
,
early_stopping_rounds
=
20
):
def
modelfit
(
clf
,
dftrain
,
features
,
resp
,
useTrainCV
=
True
,
kfold
=
10
,
eval_metric
=
'auc'
,
early_stopping_rounds
=
20
):
'''
'''
模型训练
模型训练
:type useTrainCV: object
:type useTrainCV: object
...
@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
...
@@ -80,9 +78,10 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, cv_folds=10, eval_m
:return:
:return:
'''
'''
if
useTrainCV
:
if
useTrainCV
:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param
=
clf
.
get_xgb_params
()
xgb_param
=
clf
.
get_xgb_params
()
xgtrain
=
xgb
.
DMatrix
(
dftrain
[
features
]
.
values
,
label
=
dftrain
[
resp
]
.
values
)
xgtrain
=
xgb
.
DMatrix
(
dftrain
[
features
]
.
values
,
label
=
dftrain
[
resp
]
.
values
)
cvresult
=
xgb
.
cv
(
xgb_param
,
xgtrain
,
num_boost_round
=
clf
.
get_params
()[
'n_estimators'
],
nfold
=
cv_folds
,
cvresult
=
xgb
.
cv
(
xgb_param
,
xgtrain
,
num_boost_round
=
clf
.
get_params
()[
'n_estimators'
],
nfold
=
kfold
,
metrics
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose_eval
=
True
)
metrics
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose_eval
=
True
)
clf
.
set_params
(
n_estimators
=
cvresult
.
shape
[
0
])
clf
.
set_params
(
n_estimators
=
cvresult
.
shape
[
0
])
...
@@ -106,9 +105,9 @@ def predict(clf,df,features):
...
@@ -106,9 +105,9 @@ def predict(clf,df,features):
def
auc
(
clf
,
df
,
features
,
label
):
def
auc
(
clf
,
df
,
features
,
label
):
#== 计算准确率,auc等指标
#== 计算准确率,auc等指标
df
=
predict
(
clf
,
df
,
features
)
df
=
predict
(
clf
,
df
,
features
)
accu
=
metrics
.
accuracy_score
(
df
[
label
]
,
df
[
'predict'
]
)
accu
=
metrics
.
accuracy_score
(
df
[
label
]
.
values
,
df
[
'predict'
]
.
values
)
auc
=
metrics
.
roc_auc_score
(
df
[
label
],
df
[
'predict_proba'
])
auc
=
metrics
.
roc_auc_score
(
df
[
label
],
df
[
'predict_proba'
])
return
dict
({
'accuracy'
:
accu
,
'auc'
:
auc
})
return
{
'accuracy'
:
accu
,
'auc'
:
auc
}
def
featureImportance
(
clf
,
features
):
def
featureImportance
(
clf
,
features
):
...
@@ -121,4 +120,6 @@ def featureImportance(clf,features):
...
@@ -121,4 +120,6 @@ def featureImportance(clf,features):
# Print Feature Importance:
# Print Feature Importance:
feat_imp
=
pd
.
Series
(
clf
.
get_booster
()
.
get_fscore
(),
features
)
.
sort_values
(
ascending
=
False
,
na_position
=
'last'
)
feat_imp
=
pd
.
Series
(
clf
.
get_booster
()
.
get_fscore
(),
features
)
.
sort_values
(
ascending
=
False
,
na_position
=
'last'
)
feat_imp
=
feat_imp
[
feat_imp
>
0
]
feat_imp
=
feat_imp
[
feat_imp
>
0
]
feat_imp
=
feat_imp
.
to_frame
()
.
reset_index
()
feat_imp
.
columns
=
[
'feature'
,
'weight'
]
return
feat_imp
return
feat_imp
mvp/report.py
0 → 100644
View file @
37c70174
import
pandas
as
pd
import
numpy
as
np
import
datetime
from
mvp
import
xgbreport
from
data.analyis
import
datacal
if
__name__
==
'__main__'
:
features
=
[
'third_data_source#xy_pan_newapplyAcredibility'
,
'third_data_source#xy_pan_newapplyAscore'
,
'third_data_source#xy_pan_newconsfinAavgAlimit'
,
'third_data_source#xy_pan_newconsfinAcredibility'
,
'third_data_source#xy_pan_newconsfinAcreditAlimit'
,
'third_data_source#xy_pan_newconsfinAmaxAlimit'
,
'third_data_source#xy_pan_newconsfinAorgAcountq'
,
'third_data_source#xy_pan_newconsfinAorgAcountx'
,
'third_data_source#xy_pan_newconsfinAproductAcount'
,
'third_data_source#xy_pan_newhistoryAfailAfee'
,
'third_data_source#xy_pan_newhistoryAsucAfee'
,
'third_data_source#xy_pan_newlatestAoneAmonthAfail'
,
'third_data_source#xy_pan_newlatestAoneAmonthAsuc'
,
'third_data_source#xy_pan_newlatestAoneAmonthd'
,
'third_data_source#xy_pan_newlatestAoneAmonthj'
,
'third_data_source#xy_pan_newlatestAqueryAtime'
,
'third_data_source#xy_pan_newlatestAsixAmontha'
,
'third_data_source#xy_pan_newlatestAsixAmonthv'
,
'third_data_source#xy_pan_newlatestAthreeAmonthb'
,
'third_data_source#xy_pan_newlatestAthreeAmonthf'
,
'third_data_source#xy_pan_newloansAavgAlimit'
,
'third_data_source#xy_pan_newloansAcashAcount'
,
'third_data_source#xy_pan_newloansAcount'
,
'third_data_source#xy_pan_newloansAcredibilityh'
,
'third_data_source#xy_pan_newloansAcredibilitys'
,
'third_data_source#xy_pan_newloansAcreditAlimit'
,
'third_data_source#xy_pan_newloansAlatestAtime'
,
'third_data_source#xy_pan_newloansAlongAtime'
,
'third_data_source#xy_pan_newloansAmaxAlimit'
,
'third_data_source#xy_pan_newloansAorgAcounta'
,
'third_data_source#xy_pan_newloansAorgAcountg'
,
'third_data_source#xy_pan_newloansAoverdueAcount'
,
'third_data_source#xy_pan_newloansAproductAcount'
,
'third_data_source#xy_pan_newloansAscore'
,
'third_data_source#xy_pan_newloansAsettleAcount'
,
'third_data_source#xy_pan_newqueryAcashAcount'
,
'third_data_source#xy_pan_newqueryAfinanceAcount'
,
'third_data_source#xy_pan_newqueryAorgAcount'
,
'third_data_source#xy_pan_newqueryAsumAcount'
]
label
=
'y'
df
=
pd
.
read_csv
(
'test.csv'
)
dftrain
,
dftest
=
datacal
.
split_train_val
(
df
,
trainsplit
=
'timeSeries'
,
trainsplitRatio
=
0.8
,
sort_col
=
'applied_at'
)
xgbreport
.
report
(
dftrain
,
dftest
,
features
,
label
,
''
,
'tmp.doc'
)
\ No newline at end of file
mvp/xgb
oost
report.py
→
mvp/xgbreport.py
View file @
37c70174
...
@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
...
@@ -8,6 +8,16 @@ from matplotlib import pyplot as plt
from
data.graph
import
drawplot
from
data.graph
import
drawplot
def
report
(
dftrain
,
dftest
,
features
,
label
,
path
,
filename
):
def
report
(
dftrain
,
dftest
,
features
,
label
,
path
,
filename
):
'''
dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
:param dftrain:
:param dftest:
:param features:
:param label:
:param path:
:param filename:
:return:
'''
document
=
filetool
.
buildDocument
(
path
,
filename
)
document
=
filetool
.
buildDocument
(
path
,
filename
)
document
.
add_heading
(
'xgboost 算法运行报告'
)
document
.
add_heading
(
'xgboost 算法运行报告'
)
clf
=
xgboost
.
buildClf
()
clf
=
xgboost
.
buildClf
()
...
@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
...
@@ -21,41 +31,75 @@ def report(dftrain,dftest,features,label,path,filename):
min_child_weight
=
range
(
1
,
4
,
1
)
min_child_weight
=
range
(
1
,
4
,
1
)
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'max_depth'
:
max_depth
,
'min_child_weight'
:
min_child_weight
},
features
,
label
)
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
{
'max_depth'
:
max_depth
,
'min_child_weight'
:
min_child_weight
},
features
,
label
)
# gamma
#
#
gamma
gamma
=
[
i
/
10
for
i
in
range
(
0
,
5
)]
#
gamma=[i/10 for i in range(0,5)]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,{
'gamma'
:
gamma
},
features
,
label
)
#
document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
#
# subsample colsample_bytree
#
#
subsample colsample_bytree
subsample
=
[
0.8
,
0.9
,
1
]
#
subsample=[0.8,0.9,1]
colsample_bytree
=
[
0.8
,
0.9
,
1
]
#
colsample_bytree=[0.8,0.9,1]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
#
document, clf = tun_params(document, clf, dftrain, dftest,
{
'subsample'
:
subsample
,
'colsample_bytree'
:
colsample_bytree
},
features
,
label
)
#
{'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
#
# reg_alpha
#
#
reg_alpha
reg_alpha
=
[
0.001
,
0.01
,
0.1
,
1
,
10
]
#
reg_alpha=[0.001,0.01,0.1,1,10]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
#
document, clf = tun_params(document, clf, dftrain, dftest,
{
'reg_alpha'
:
reg_alpha
},
features
,
label
)
#
{'reg_alpha': reg_alpha}, features, label)
#
# reg_lambda
#
#
reg_lambda
reg_lambda
=
[
0.001
,
0.01
,
0.1
,
1
,
10
]
#
reg_lambda = [0.001, 0.01, 0.1, 1, 10]
document
,
clf
=
tun_params
(
document
,
clf
,
dftrain
,
dftest
,
#
document, clf = tun_params(document, clf, dftrain, dftest,
{
'reg_lambda'
:
reg_lambda
},
features
,
label
)
#
{'reg_lambda': reg_lambda}, features, label)
#==生成模型最后的报告,各个特征的单变量图,PDP,liftchart
#==生成模型最后的报告,各个特征的单变量图,PDP,liftchart
dftrain
=
xgboost
.
predict
(
clf
,
dftrain
,
features
)
dftrain
=
xgboost
.
predict
(
clf
,
dftrain
,
features
)
dftest
=
xgboost
.
predict
(
clf
,
dftest
,
features
)
dftest
=
xgboost
.
predict
(
clf
,
dftest
,
features
)
featureimp
=
xgboost
.
featureImportance
(
clf
,
features
)
.
to_frame
(
name
=
[
'weight'
,
'feature'
])
#== 特征权重
featureimp
=
xgboost
.
featureImportance
(
clf
,
features
)
fig
=
drawplot
.
draw_barplot
(
featureimp
.
head
(
10
),
'feature'
,
'weight'
,
title
=
'Feature importance'
)
fig
=
drawplot
.
draw_barplot
(
featureimp
.
head
(
10
),
'feature'
,
'weight'
,
title
=
'Feature importance'
)
fig
.
savefig
(
'tmp.png'
)
fig
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'特征权重图,近前10个特征'
)
document
.
add_paragraph
(
'特征权重图,近前10个特征'
)
document
.
add_picture
(
'tmp.png'
)
document
.
add_picture
(
'tmp.png'
)
#== 模型分同逾期率的关系图
dftrain
[
'flag'
]
=
'训练集'
dftest
[
'flag'
]
=
'测试集'
drawplot
.
liftchart
(
pd
.
concat
([
dftrain
,
dftest
]),
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'flag'
,
title
=
'liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'整体--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
filetool
.
saveDocument
(
document
,
path
,
filename
)
#== 分月份查看-- 只看测试集
dftest
=
datacal
.
cal_month
(
dftest
,
'applied_at'
,
'applied_month'
)
drawplot
.
liftchart
(
dftest
,
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_month'
,
title
=
'分月liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分月--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 分用户类型分月查看
drawplot
.
liftchart
(
dftest
,
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_type'
,
title
=
'分用户类型liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分用户类型--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 分渠道分月查看--取前5个渠道查看
channels
=
dftest
.
applied_channel
.
value_counts
()[:
5
]
.
index
drawplot
.
liftchart
(
dftest
[
dftest
.
applied_channel
.
isin
(
channels
)],
'predict_proba'
,
label
,
bin
=
10
,
classes
=
'applied_channel'
,
title
=
'分渠道liftchart'
,
xlabel
=
'模型分'
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'测试集分渠道--liftchart'
)
document
.
add_picture
(
'tmp.png'
)
#== 各个特征的 单变量图 和 pdp 图
for
i
in
featureimp
.
feature
.
tolist
():
drawplot
.
univarchart
(
dftest
,
i
,
label
,
bin
=
10
,
title
=
'单变量
%
s'
%
i
,
ylabel
=
'逾期率'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'单变量
%
s'
%
i
)
document
.
add_picture
(
'tmp.png'
)
#= pdp
drawplot
.
pdpchart
(
dftest
,
i
,
'predict_proba'
,
bin
=
10
,
title
=
'pdp
%
s'
%
i
,
ylabel
=
'模型分'
)
.
savefig
(
'tmp.png'
)
document
.
add_paragraph
(
'pdp
%
s'
%
i
)
document
.
add_picture
(
'tmp.png'
)
filetool
.
saveDocument
(
document
,
path
,
filename
)
...
@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
...
@@ -65,10 +109,12 @@ def tun_params(document,clf,dftrain,dftest,params,features,label):
grid_search
=
xgboost
.
automodelfit
(
clf
,
params
,
dftrain
,
features
,
label
)
grid_search
=
xgboost
.
automodelfit
(
clf
,
params
,
dftrain
,
features
,
label
)
clf
=
grid_search
.
best_estimator_
clf
=
grid_search
.
best_estimator_
document
.
add_paragraph
(
'模型训练参数{}'
.
format
(
clf
.
get_xgb_params
()))
document
.
add_paragraph
(
'模型训练参数{}'
.
format
(
clf
.
get_xgb_params
()))
clf
=
xgboost
.
modelfit
(
clf
,
dftrain
,
features
,
label
)
#==
# clf = xgboost.modelfit(clf, dftrain, features, label)
document
.
add_paragraph
(
'寻找最优参数过程{}'
.
format
(
grid_search
.
cv_results_
))
document
.
add_paragraph
(
'最优参数{},最优分{}'
.
format
(
grid_search
.
best_params_
,
grid_search
.
best_score_
))
document
.
add_paragraph
(
'最优参数{},最优分{}'
.
format
(
grid_search
.
best_params_
,
grid_search
.
best_score_
))
document
.
add_paragraph
(
'模型训练集{}'
.
format
(
xgboost
.
auc
(
clf
,
dftrain
,
features
,
label
)))
document
.
add_paragraph
(
'模型训练集{}'
.
format
(
xgboost
.
auc
(
grid_search
,
dftrain
,
features
,
label
)))
document
.
add_paragraph
(
'模型测试集{}'
.
format
(
xgboost
.
auc
(
clf
,
dftest
,
features
,
label
)))
document
.
add_paragraph
(
'模型测试集{}'
.
format
(
xgboost
.
auc
(
grid_search
,
dftest
,
features
,
label
)))
return
document
,
clf
return
document
,
clf
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment