decision-science / model_mvp · Commits · e511a80c

Commit e511a80c, authored May 14, 2019 by 王家华
Commit message: Modified parts of the model refit and plotting code.
Parent: 38d4951f

Showing 10 changed files with 371 additions and 145 deletions (+371 / -145):
graph/drawplot.py               +1    -1
graph/matplot.py                +44   -17
models_kit/general_methods.py   +13   -0
models_kit/lightgbm.py          +13   -17
models_obj/dhb_obj.py           +65   -85
mvp/allocator.py                +81   -16
mvp/lgbreport.py                +4    -4
mvp/refit.py                    +100  -0
tools/datacal.py                +44   -5
tools/filetool.py               +6    -0
graph/drawplot.py

 from pyplotz.pyplotz import PyplotZ
 from pyplotz.pyplotz import plt
-from data.analyis import datacal
+from tools import datacal
 import seaborn as sns
 import pandas as pd
 ...
graph/matplot.py

 """
 Created on Thu Apr 18 11:32:06 2019
-@author: wangjiahua
+@author: Jason Wang
 """
 ...
@@ -10,54 +10,83 @@ import numpy as np
 import pandas as pd
 import seaborn as sns
-############# plot config ###############
-plt.rcParams['font.sans-serif'] = ['SimHei']
-plt.rcParams['axes.unicode_minus'] = False
-plt.rcParams['savefig.dpi'] = 226  # image pixel density
-plt.rcParams['figure.dpi'] = 200  # resolution
-def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None, saved_path=None):
+def topN_feature_importance(model, clf, title="untitled", save_path='./plots/', topN=20):
+    '''
+    plot the feature importance sequence
+    params:
+        classifier
+    '''
+    plt.rcParams['font.sans-serif'] = ['SimHei']
+    plt.rcParams['axes.unicode_minus'] = False
+    plt.rcParams['savefig.dpi'] = 226  # image pixel density
+    plt.rcParams['figure.dpi'] = 200  # resolution
+    plt.figure(figsize=(10, 6))
+    model.plot_importance(clf, max_num_features=topN)
+    plt.title("Feature Importances")
+    path = save_path + title + "featureImportance.png"
+    plt.savefig(path)
+    plt.show()
+    return path

+def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, tab_df=None, plot_tab=True, saved_path=None):
     '''
     instructions : visualization of a pivot table
     Params :
         dataset -
         auc - auc list / array
         title - title of the plot ('untitled' by default)
         X_label - X axis label of the plot
         y_label - y axis label of the plot
         plot_tab - whether to draw the table, True by default
         saved_path - save path; set to None when the plot does not need to be saved
     '''
     plt.rcParams['font.sans-serif'] = ['SimHei']
     plt.rcParams['axes.unicode_minus'] = False
     plt.rcParams['savefig.dpi'] = 226  # image pixel density
-    plt.rcParams['figure.dpi'] = 200  # resolution
-    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
+    plt.rcParams['figure.dpi'] = 100  # resolution
+    fig, axs = plt.subplots(1, 1, figsize=(6, 6), linewidth=0.1)
     table_rows = dataset.columns
     table_cols = dataset.index
     # traverse each column of the dataframe
     for i in table_rows:
         x = table_cols
         y = dataset[i]
-        axs.plot(x, y, maker='o', label=str(i) + ' AUC: ' + auc[i])
-    if plot_tab != False:
-        the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
+        axs.plot(x, y, label=str(i) + ' AUC: ' + str(auc[i]))
+    # if the table should be plotted
+    if plot_tab:
+        the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset))],
                               rowLabels=table_rows,
                               colLabels=table_cols,
                               colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
                               loc='bottom')
         plt.xticks([])
+    # otherwise, nothing to do here
     the_table.auto_set_font_size(False)
-    the_table.set_fontsize(8)
+    the_table.set_fontsize(6)
     fig.subplots_adjust(bottom=0.2)
     plt.grid()
     plt.ylabel(title)
     if y_label is not None:
         plt.ylabel(y_label)
     if X_label is not None:
         plt.xlabel(X_label)
     plt.legend()
     # plt.vlines(xrange(len(cols))[0], y, color='lightgrey', linestyle='--')
     plt.title(title)
     plt.show()
     return 1

 def plot_curve_singleCurve(dataset, x_label=None, y_label=None, table_tab=None, save_path=None,
                            figure_arrangement=11, fig_size=(4, 3), fig_title='General Plot',
                            fig_name='untitled', ...
 ...
@@ -144,8 +173,6 @@ def density_chart(dataset, title):
     plt.show()
-    #
-    # alpha = 0.98 / 4 * fig_ith + 0.01
-    # ax.set_title('%.3f' % alpha)
 ...
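A minimal usage sketch of the new topN_feature_importance, assuming a LightGBM booster trained elsewhere (the `booster` variable here is a placeholder); the lightgbm module itself is passed as `model`, so the call resolves to lgb.plot_importance(booster, max_num_features=topN):

    import lightgbm as lgb
    from graph import matplot

    # booster = lgb.train(params_lgb, lgb_train)  # trained elsewhere
    png_path = matplot.topN_feature_importance(lgb, booster, title='dhb_refit_',
                                               save_path='./plots/', topN=20)
    print('feature importance plot saved to', png_path)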
models_kit/general_methods.py (new file, 0 → 100644)

# note: this new file uses `plt` and the default argument `model=lgb`,
# so it needs these imports (absent in the commit) to run:
import lightgbm as lgb
import matplotlib.pyplot as plt

def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    '''
    plot the feature importance sequence
    (note: the `model` parameter is unused in the body)
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixel density
    plt.rcParams['figure.dpi'] = 200  # resolution
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    plt.show()
models_kit/lightgbm.py

 ...
@@ -6,7 +6,8 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import os, psutil
 from tools import datacal
+from graph import matplot

 params_lgb = {
     'task': 'train',          # purpose
     'application': 'binary',  # binary classification
 ...
@@ -43,7 +44,8 @@ def returnAUC(clf, training_set, validation_set, features, target='target'):
     return train_auc, val_auc

-def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
+def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target',
+               featureImportance_path='../mvp/plots/', topN_featureImportance=20,
+               featureImportance_title='lightgbm'):
     '''
     instructions : train a lightgbm model with the specified params
 ...
@@ -68,6 +70,8 @@ def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'
     lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
     train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
+    matplot.topN_feature_importance(lgb, lgbm, title=featureImportance_title,
+                                    save_path=featureImportance_path, topN=topN_featureImportance)
     # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
     return train_auc, val_auc, lgbm
 ...
@@ -117,9 +121,8 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
     # replace the worst parameter with a better combination
     para['max_depth'] = deepth
     para['num_leaves'] = leaves
     optimal_para[topn.argmin()] = para
-    return optimal_para, lgb_train, lgb_val, topn
+    return optimal_para, topn
     # training_curve.append(train_auc)
 ...
@@ -163,18 +166,11 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
     # return 1

-def topN_feature_importance(classifier, clf, topN=20, model=lgb):
-    '''
-    plot feature importance squence
-    '''
-    plt.rcParams['font.sans-serif'] = ['SimHei']
-    plt.rcParams['axes.unicode_minus'] = False
-    plt.rcParams['savefig.dpi'] = 226  # image pixel density
-    plt.rcParams['figure.dpi'] = 200  # resolution
-    plt.figure(figsize=(10, 6))
-    classifier.plot_importance(clf, max_num_features=topN)
-    plt.title("Featurer Importances")
-    plt.show()

 def predict(lgbm, df_test, features, target='target'):
     predictions = lgbm.predict(df_test[features])
     auc = roc_auc_score(predictions, df_test[target])
     return predictions, auc

 def buildClf(params=params_lgb):
 ...
@@ -183,7 +179,7 @@ def buildClf(params=params_lgb):
     Params :
     '''
-    return lgbm.LGBMClassifier(params)
+    return lgb.LGBMClassifier(params)

 def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
 ...
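Two details worth flagging in this file. First, verbose_eval was a valid lgb.train argument in 2019-era LightGBM (later versions moved this to callbacks). Second, sklearn's roc_auc_score expects (y_true, y_score), so the `predict` function above passes its arguments in reversed order; a corrected sketch:

    from sklearn.metrics import roc_auc_score

    def predict(lgbm, df_test, features, target='target'):
        predictions = lgbm.predict(df_test[features])
        # roc_auc_score takes the true labels first, then the predicted scores
        auc = roc_auc_score(df_test[target], predictions)
        return predictions, auc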
models_obj/dhb_obj.py

 ...
@@ -210,84 +210,73 @@ class dhb:
             and datediff(now(), deadline) > ''' + str(passdue_day) + '''
             '''

-    def dhb_features_extract(self):
+    def dhb_features_extract(self, df):  # note: the new `df` parameter is not used in the body shown here
+        try:
             value_map = {"近3天": 1, "近4-5天": 2, "近6-7天": 3, "近8-15天": 4, "近16-30天": 5,
                          "近31-60天": 6, "近61-90天": 7, "近91-120天": 8, "近121-150天": 9,
                          "近151-180天": 10, "180天前": 11, "无": 0}
             # print(self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period', self.end_time_period))
             # use risk_analysis to extract data
             print('sql: ', self.sql.replace('@start_time_period', self.start_time_period)
                                    .replace('@end_time_period', self.end_time_period))
             dhb_loan = pd.read_sql(self.sql.replace('@start_time_period', self.start_time_period)
                                            .replace('@end_time_period', self.end_time_period),
                                    mysqldb.engine_risk_analysis)
             # map the categorical recency buckets onto ordinal codes
             dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
                       "dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = \
                 dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
                           "dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(lambda x: value_map[x])
             # cap outliers at fixed per-column thresholds
             dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
             dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
             dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
             dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
             dhb_loan.loc[dhb_loan.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
             dhb_loan.loc[dhb_loan.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
             dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
             dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
             dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
             dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
             dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
             dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
             dhb_loan.loc[dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
             dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
             print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(time.time()))
                   + " extracted dhb samples for the period " + self.start_time_period + " to " + self.end_time_period)
+        # ignore exceptions such as "column doesn't exist"
+        except Exception as e:
+            print("data preprocessing ERR ", e)
+            pass
+        return dhb_loan

     '''
     instructions : build a comparison
     Params :
         df - the given test dataset
         score - score column
         target - label
         start_time_period -
         end_time_period -
         applied_type -
         applied_from -
     Returns :
         auc comparison
         liftchart plot
     '''
     def dhb_predict_with_pkl(self, test, pkl='./dhb_cuishou_jianzhi_v3.pkl', features=features):
         open_file = open(pkl, "rb")
 ...
@@ -327,7 +316,7 @@ class dhb:
     def dhb_comparasion(df, score_BM='model_exec_data_source#dhb', score_predict='predict',
                         target='target', applied_type=None, applied_from=None):
         '''
-        instructions : comparison of the previous dhb liftchart & auc
+        instructions : obtain the online dhb score from mongodb
         '''
         # splitting the data by appliedType & applied_channel
         df = df[df.applied_type == applied_type]
 ...
@@ -337,15 +326,6 @@ class dhb:
-        df['bins_BM'] = df.qcut(df[score_BM], q=10, percision=6, dupulicates='drop')
-        ## bins of predictions
-        df['bins_predict'] = df.qcut(df[score_predict], q=10, percision=6, dupulicates='drop')
-        pivot_BM = df[['bins_BM', target]].groupby('bins_BM')
-        pivot_predict = df[['bins_predict', target]].groupby('bins_predict')
-        # output liftchart & AUC
-        pivot_BM = pivot_BM.sum() / pivot_BM.count()
-        pivot_predict = pivot_predict.sum() / pivot_predict.count()
-        # concatenate the two pivots
-        pivot = pd.concat([pivot_BM, pivot_predict], axis=1)
-        # pivot table plot
-        pivot.plot()
-        return 1
 ...
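The long run of `.loc[... >= cap, col] = cap` statements in dhb_features_extract implements per-column winsorization at hand-picked thresholds. A more compact equivalent sketch using pandas' clip, with a hypothetical `caps` dict whose values are copied from the code above:

    import pandas as pd

    # hypothetical helper: cap each column at its hand-picked threshold
    caps = {
        "dhb_last_60_and_90_days_ntdun_call_avg_duration": 42,
        "dhb_overview_ntdun_call_duration_above60": 25,
        "dhb_last_30_and_60_days_ntdun_call_total_duration": 800,
        # ... remaining thresholds as in dhb_features_extract ...
    }

    def cap_outliers(df: pd.DataFrame, caps: dict) -> pd.DataFrame:
        for col, cap in caps.items():
            if col in df.columns:
                # equivalent to df.loc[df[col] >= cap, col] = cap
                df[col] = df[col].clip(upper=cap)
        return df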
mvp/allocator.py

 import pandas as pd
 import numpy as np
 import datetime
 from mvp import xgbreport
 from mvp import lgbreport
-from data.analyis import datacal
+from tools import datacal
 from models import xgboost
 from models import lightgbm
+from mvp import refit
+from mvp import rebuild
+from models_obj import dhb_obj

 ###### global variable ######
 # label
 target = 'target'
 #############################

-from mvp import dhb
+# from mvp import dhb
-from data.samples import dhb, sample
-dhb = dhb.dhb()
+dhb = dhb_obj.dhb()
 df_sample = dhb.dhb_features_extract()
 target = 'target'
 features = dhb.features
 df_sample[features] = df_sample[features].astype(float)
 df_sample['target'] = df_sample['target'].astype(int)
 ...
@@ -27,8 +35,6 @@ print('----no.', len(features), 'of samples of dhb----')
 # to save model performance
 if __name__ == '__main__':
     # data extraction
     ''' ## Old Edition here
 ...
@@ -40,10 +46,10 @@ if __name__ == '__main__':
     # else:
     #     df_train, df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
     '''
-    df_train, df_val, df_test = train_test_split_general()  # default sampling method
+    df_train, df_val, df_test = datacal.train_test_split_general()
     # data manipulation
     ## TODO
 ...
@@ -69,7 +75,7 @@ if __name__ == '__main__':
     #lgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)
     # merge into a single dataframe full of models
-    pd.DataFrame(xgb_model)
+    # pd.DataFrame(xgb_model)
 ...
@@ -109,4 +115,63 @@ if __name__ == '__main__':
     # test_min_date=dftest.applied_at.min(), test_max_date=dftest.applied_at.max(), test_cnt=dftest.shape[0])
     #== xgboost gbtree
     xgbreport.report(dftrain, dftest, dhb.get_feature(), 'label', '',
                      'xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'), kfold=2)
+    #################################################### report settings #############################################################################
+    applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
+    applied_type = {'1,2': '首贷', '1,2,3': '首付贷', '1': '首申', '2': '复申', '3': '复贷'}
+    # refit / rebuild sequence
+    # create a Dianhuabang (dhb) object with default parameters
+    dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15)
+    # extract the sample set
+    df_sample = dhb.dhb_features_extract()
+    # back up df_sample
+    df_sample.to_csv(str(datetime.date.today()) + "dhb_samples.xlsx")
+    # dhb data preprocessing
+    # report sequence
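One side effect to note: the object construction and dhb_features_extract() call near the top of allocator.py run at module import time, before the `if __name__ == '__main__':` guard, so merely importing the module triggers the SQL extraction. A sketch of moving that work under the guard, assuming no other module imports df_sample from here:

    from models_obj import dhb_obj
    from tools import datacal

    target = 'target'

    if __name__ == '__main__':
        dhb = dhb_obj.dhb()
        df_sample = dhb.dhb_features_extract()  # SQL extraction now only runs when executed directly
        df_sample[dhb.features] = df_sample[dhb.features].astype(float)
        df_sample[target] = df_sample[target].astype(int)
        df_train, df_val, df_test = datacal.train_test_split_general(df_sample)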
mvp/lgbreport.py

 import pandas as pd
 import numpy as np
 import datetime
-from data.analyis import filetool
-from data.analyis import datacal
-from models import lightgbm
+from tools import filetool
+from tools import datacal
+from models_kit import lightgbm
 from matplotlib import pyplot as plt
-from data.graph import matplot
+from graph import matplot
mvp/refit.py (new file, 0 → 100644)

from models_obj import dhb_obj
from tools import datacal
import datetime
from models_kit import lightgbm
from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool

dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15)
# extract the sample set
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\dhb_loan_sample——2019-04-23.csv', engine='python')
############################
# back up df_sample
df_sample.to_csv(str(datetime.date.today()) + "dhb_samples.xlsx")
# default train/val/test split
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2,
                                                             stratify='target', random_state=7,
                                                             split_methods='random', time_label='applied_at')
del df_sample
# cross-validate to obtain the optimal parameters (optimal_para) and the list of their best AUCs on the CV validation folds (topn)
optimal_para, topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val,
                                                target='target', topN=3, cv_fold=5)
print('topn - AUCs obtained by cross-validation on the training set: ', topn)
# train the model with the new parameters; adds_on is the dict of parameter overrides; also outputs feature importance
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
                                               adds_on=optimal_para, target='target')
predictions, test_auc = lightgbm.predict(lgbm, df_test, features=dhb.features)
df_test['predict'] = predictions
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2': '首贷', '1,2,3': '首付贷', '1': '首申', '2': '复申', '3': '复贷'}
####################################
### report
# plot feature importance
path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./plots/', topN=20)
# report file
report_path = "E:\\bla\\"
report_name = "lgb_report.docx"
document = filetool.buildDocument(report_path, report_name)
document.add_heading('lightGBM refit report')
filetool.Document.add_paragraph('feature importance chart')
filetool.add_picture(path)
filetool.Document.add_paragraph('univar_chart')
for i in dhb.features:
    univar = datacal.cal_univar(df_train, score='raw_score')
    univarChart = matplot.plot_table(univar, title=i + ' univar Chart', saved_path='./plots/cache')
    filetool.add_picture("./plots/cache" + i + ' univar Chart')
for i in dhb.features:
    pdp = datacal.cal_pdp(df_test, score='predict')
    pdpChart = matplot.plot_table(pdp, title=i + ' PDP Chart', saved_path='./plots/cache')
    filetool.add_picture("./plots/cache" + i + ' PDP Chart')
for i in dhb.features:
    lift = datacal.cal_liftchart(df_test, score='predict')
    liftChart = matplot.plot_table(lift, title=i + ' lift Chart', saved_path='./plots/cache')
    filetool.add_picture("./plots/cache" + i + ' lift Chart')
filetool.saveDocument(document, report_path, report_name)
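As written, `filetool.Document.add_paragraph(...)` calls python-docx's Document class attribute unbound rather than the `document` instance returned by buildDocument, and add_paragraph / add_picture are instance methods. A sketch of the presumably intended calls, assuming buildDocument returns a python-docx Document:

    from docx.shared import Inches

    document = filetool.buildDocument(report_path, report_name)
    document.add_heading('lightGBM refit report')
    document.add_paragraph('feature importance chart')
    document.add_picture(path, width=Inches(6))  # path returned by topN_feature_importance
    document.save(report_path + report_name)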
tools/datacal.py

 ...
@@ -4,6 +4,31 @@ import datetime
 from sklearn.model_selection import train_test_split

+def liftchart(df, target='target', qcut=10, retbins=True):
+    '''
+    instructions : return an overdue-rate liftchart dataframe built with qcut & a pivot
+    Params :
+        df - dataframe (note: it must be the disbursed-loan set!)
+        target - label column
+        qcut - quantiles
+        retbins - return the bin intervals when 'retbins' is True, else False
+    :return:
+        liftchart dataframe
+    '''
+    df = df.copy()
+    # create a bins column
+    df['bins'] = pd.qcut(df, q=10, precision=6, retbins=False, duplicates='drop')
+    pivot = df[['bins', 'target']].groupby('bins').agg(['mean', 'count'])
+    return pivot

 def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target',
                              random_state=7, split_methods='random', time_label='applied_at'):
     '''
 ...
@@ -92,6 +117,20 @@ def cal_month(df, date_name, date_name_new):
     return df

+def cal_feature_grid(df, feature, bin=10, method=2):
+    '''
+    define N-quantile cut intervals: negative values get a bin of their own, and non-negative values are cut into N bins
 ...
@@ -156,7 +195,7 @@ def cal_univar(df, feature, target, bin=10, classes=[]):
     :return:
     '''
     if df.shape[0] == 0:
-        raise ('no date')
+        raise ('no data')
     columns = df.columns.tolist()
     if target not in columns:
         raise ('not found %s' % target)
 ...
@@ -167,9 +206,9 @@ def cal_univar(df, feature, target, bin=10, classes=[]):
     tmp[feature].fillna(-1, inplace=True)
     # == bin split; the feature may be non-numeric
     try:
-        tmp[feature] = tmp[feature].astype(float)
-        feature_grid = cal_feature_grid(tmp, feature, bin)
-        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
+        tmp[feature] = tmp[feature].astype(float)
+        feature_grid = cal_feature_grid(tmp, feature, bin)
+        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
         tmp['grid'] = tmp['lbl'].cat.codes
     except ValueError:
         tmp['lbl'] = tmp[feature]
 ...
@@ -181,7 +220,7 @@ def cal_univar(df, feature, target, bin=10, classes=[]):
         df_out = df_gp
     else:
         df_all = tmp.groupby(['grid', 'lbl']).agg({target: ['count', 'mean', 'sum']}).reset_index()
-        df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
+        df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
         df_out = df_all
     return df_out
 ...
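Note that the new liftchart passes the whole dataframe to pd.qcut rather than a score column, and hard-codes q=10 and 'target' instead of using its qcut and target parameters. A corrected sketch, under the assumption that a score column is meant to drive the binning:

    import pandas as pd

    def liftchart(df, score, target='target', qcut=10):
        '''overdue-rate lift chart: mean label and count per score quantile'''
        df = df.copy()
        # bin the score column (not the whole frame) into quantiles
        df['bins'] = pd.qcut(df[score], q=qcut, precision=6, duplicates='drop')
        return df[['bins', target]].groupby('bins')[target].agg(['mean', 'count'])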
tools/filetool.py

 ...
@@ -3,6 +3,12 @@ from docx import Document
 from docx.shared import Inches

+def buildDocument(path, filename):
+    '''
+    instructions : build a document writer
+    :param path:
+    :param filename:
+    :return:
+    '''
     if filename[-3:] != 'doc':
         if filename[-4:] != 'docx':
             raise ValueError('{} is not a word file'.format(filename))
 ...
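The nested suffix checks do work ('docx'[-3:] is 'ocx', so .docx names fall through to the second test), but an explicit extension test reads more directly; a sketch:

    import os

    def is_word_file(filename):
        # accept both .doc and .docx
        return os.path.splitext(filename)[1].lower() in ('.doc', '.docx')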