decision-science / model_mvp · Commit f2e2d5cf
Authored May 07, 2019 by 王家华
新增readme,lgb调参 (Add README; LightGBM parameter tuning)
Parent: fe8f7148
Showing 11 changed files with 835 additions and 386 deletions (+835 −386)
README.md                                    +101    −0
data/analyis/datacal.py                       +34    −0
data/graph/matplot.py                         +40    −3
models/__pycache__/lightgbm.cpython-36.pyc     +0    −0
models/lightgbm.py                           +179   −64
models/xgboost.py                             +63    −2
mvp/allocator.py                              +63   −54
mvp/dhb.py                                   +284  −248
mvp/docxReport.py                             +54    −0
mvp/lgbreport.py                               +4    −4
mvp/xgbreport.py                              +13   −11
README.md — new file (mode 100644)
# PROJECT_MVP
## Data sources (datasource)
### MongoDB extraction (mongodb)
### MySQL (mysqldb)
### TBD
## General data-calculation tools (tools)
### Common general-purpose utilities (datacal)
- train_test_split_general
- univar
- pdp
- liftchart
- TBD
### docx report-generation tool (filetool)
### TBD
## Plotting packages (graph)
### Common matplotlib line-chart utilities (matplot)
### pyecharts plotting package
### TBD
## Online model objects
### Bairong (百融)
### dhb
- Extract the given features (defaults to the online feature set) (dhb_features_extract)
- Fetch online model performance (dhb_comparasion)
- dhb_xgb
- dhb_lgb (stores the images for the LightGBM model report)
- report_lgb (output path for the model report and PKL)
- report_xgb
- Online score changes
- Online score PSI
- Score the given features with the online model PKL
- Feature VLM
- TBD
### xy
### Others
## Modeling methods (models)
### Xgboost
- Default parameter table (params_xgb)
- Return train/validation AUC (returnAUC)
- xgb_train
- buildClf
- automodelfit
- predict
- featureImportance
### LightGBM
- Default parameter table (params_lgb)
- returnAUC
- topN_feature_importance
- buildClf
- Combined CV parameter-tuning module (lgb_params_tuning)
- Train a model and call returnAUC (train_lgbm)
## Feature engineering (features)
### Feature selection
- Univariate
- Information entropy
- Variance
- Dimensionality-reduction methods
### Missing-value handling
### Standardization (linear models)
### Outliers (linear models)
## mvp
### Program entry point (allocator)
### Fit xgboost (xgbreport)
- Calls the plotting / datacal / filetool packages and generates the report
### Fit lightgbm (lgbreport)

*XMind: ZEN - Trial Version*
\ No newline at end of file
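The outline above maps onto a small training pipeline. As a rough sketch (not part of the commit, and assuming only the signatures that appear in the diffs below), the modules compose like this:

# hypothetical wiring of the commit's own names; illustrative only
from data.analyis import datacal
from models import lightgbm
from mvp import dhb

source = dhb.dhb()                    # online model object
df = source.dhb_features_extract()    # pull the feature sample
df_train, df_val, df_test = datacal.train_test_split_general(
    df, val_size=0.2, test_size=0.2, split_methods='random')
train_auc, val_auc, booster = lightgbm.train_lgbm(
    lightgbm.params_lgb, df_train, df_val, source.features)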
data/analyis/datacal.py
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split


def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target',
                             random_state=7, split_methods='random', time_label='applied_at'):
    '''
    instructions - train-test split (splits only train & test when val_size is None)
    Params :
        dataset
        val_size - validation ratio
        test_size - test-set ratio
        stratify - stratification label
        random_state
        split_methods - 'random' or 'timeSeries'
        time_label - column that identifies date & time
    '''
    # random split
    if split_methods == 'random':
        df_train, df_test = train_test_split_general(dataset, val_size=None, stratify=None,
                                                     split_methods='timeSeries')
        # df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=random_state)
        if val_size is not None:
            size = val_size / (1 - test_size)
            df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
            return df_train, df_val, df_test
        # case when no validation set is requested
        return df_train, df_test
    # split the data in time order
    elif split_methods == 'timeSeries':
        data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
        df_test = data_tmp[:int(len(dataset) * test_size)]
        df_train = data_tmp[int(len(dataset) * test_size):]
        return df_train, df_test
def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
    '''
    '''
...
@@ -27,6 +60,7 @@ def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
        val = None
    return train, val


def cal_week(df, date_name, date_name_new):
    '''
    :param df: dataframe
...
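As a quick usage sketch for the splitter above (synthetic data; only the default column names `target` and `applied_at` come from the code, the rest is made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'x1': np.random.rand(1000),
    'target': np.random.randint(0, 2, 1000),
    'applied_at': pd.date_range('2019-01-01', periods=1000, freq='H'),
})

# 'random' default: newest 20% held out by time, validation carved from the rest at random
df_train, df_val, df_test = train_test_split_general(df, val_size=0.2, test_size=0.2)

# pure time-ordered two-way split
df_train2, df_test2 = train_test_split_general(df, split_methods='timeSeries')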
data/graph/matplot.py
...
@@ -17,6 +17,46 @@ plt.rcParams['savefig.dpi'] = 226  # image pixels
plt.rcParams['figure.dpi'] = 200  # resolution
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
               plot_tab=True, legend_list=None, saved_path=None):
    '''
    instructions : visualization of a pivot table
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels
    plt.rcParams['figure.dpi'] = 200  # resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    table_rows = dataset.columns
    table_cols = dataset.index
    # traverse each column of the dataframe
    for i in table_rows:
        x = table_cols
        y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + auc[i])
    if plot_tab != False:
        the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
                              rowLabels=table_rows, colLabels=table_cols,
                              colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
                              loc='bottom')
        plt.xticks([])
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(8)
        fig.subplots_adjust(bottom=0.2)
    plt.grid()
    plt.ylabel(title)
    plt.legend()
    # plt.vlines(xrange(len(cols))[0], y, color='lightgrey', linestyle='--')
    plt.title(title)
    plt.show()
    return 1
def plot_curve_singleCurve(dataset, x_label=None, y_label=None, table_tab=None,
                           save_path=None, figure_arrangement=11, fig_size=(4, 3),
...
@@ -103,9 +143,6 @@ def density_chart(dataset,title):
    plt.title(title)
    plt.show()

Removed in this commit:
def uniVarChart():
    return 1
...
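A hedged example of driving plot_table above: it expects a pivot-style DataFrame (index on the x-axis, one line per column) and a dict-like auc whose string values are concatenated into the legend labels. The data here is invented, and plot_tab=False sidesteps the embedded table, whose row/column handling assumes matching shapes:

import pandas as pd

pivot = pd.DataFrame(
    {'xgb': [0.71, 0.72, 0.70], 'lgb': [0.73, 0.74, 0.72]},
    index=['week1', 'week2', 'week3'])
auc = {'xgb': '0.71', 'lgb': '0.73'}  # strings, since labels are built by concatenation
plot_table(pivot, auc, title='weekly AUC', plot_tab=False)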
models/__pycache__/lightgbm.cpython-36.pyc — new file (mode 100644): file added
models/lightgbm.py
...
@@ -2,86 +2,201 @@ import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, psutil
# renamed from `params`; num_boost_round raised from 100 to 150
params_lgb = {
    'task': 'train',                  # purpose
    'application': 'binary',          # binary classification
    'boosting_type': 'gbdt',          # boosting type
    'num_boost_round': 150,           # number of boosting iterations
    'learning_rate': 0.01,            # learning rate
    'metric': {'logloss', 'auc'},     # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',      # objective function
    'max_depth': 4,
    'num_leaves': 20,                 # number of leaves
    'feature_fraction': 0.9,          # fraction of features sampled per tree
    'bagging_fraction': 0.8,          # fraction of samples bagged per tree
    'bagging_freq': 5,                # k means perform bagging every k iterations
    'verbose': 1                      # <0 fatal only, =0 errors (warnings), >0 info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return the AUC of the training and validation sets
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature columns of the training set
        target - name of the label column
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params - default params
        df_train - training set
        df_val - validation set
        features - feature list of the dataset
        adds_on - dict of parameters that override the training parameters
        target - target column or label list of the samples
    '''
    params = params.copy()
    print(type(df_train), type(df_val))
    # apply the parameter overrides
    if adds_on is not None:
        for i in adds_on.keys():
            params[i] = adds_on[i]
    # convert the DataFrames to lightgbm's binary Dataset format
    lgb_train = lgb.Dataset(df_train[features], df_train[target])
    lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
    lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
    train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
    # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
    return train_auc, val_auc, lgbm
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal parameters for lgbm
    Parameters :
        params - default parameters (dict format)
        target_params - parameters to be tuned
        features - feature list
        train - training set
        val - validation set
        target - target label
        topN - keep the top N parameter combinations
        cv_fold - k-fold CV
    '''
    # work on a copy
    params = params.copy()
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
    # ndarray of shape 1*n holding the best AUCs seen so far
    topn = np.zeros(topN)
    # make sure memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
    for deepth in np.arange(2, 7, 1):
        for leaves in np.arange(2, 2 ** deepth, 2):
            params['max_depth'] = deepth
            params['num_leaves'] = leaves
            print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
            # best (max) mean AUC across the CV rounds
            auc_score = pd.Series(cv_result['auc-mean']).max()
            print('auc ', auc_score)
            boost_round = pd.Series(cv_result['auc-mean']).argmax()
            # if the score beats any item currently in the topn list
            if (auc_score > topn).any():
                # find the worst one / lowest AUC before overwriting it
                worst = topn.argmin()
                topn[worst] = auc_score
                para = {}
                # replace the worst combination with this better one
                para['max_depth'] = deepth
                para['num_leaves'] = leaves
                optimal_para[worst] = para
    return optimal_para, lgb_train, lgb_val, topn

Removed in this commit:
'''
instructions : training lightgbm model with specified params
Parameters :
    dataset -
    features - feature list of dataset
    target - target column or label list of samples
'''
def lgb_train(params, training_set, features, target):
    lgb_train = lgb.Dataset(training_set[features], training_set[target])
    # lgb.train(params,)
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve), pd.Series(validation_curve)], index=['trainingAUC', 'validationAUC'], axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve, label='training')
# plt.plot(candidate_list, validation_curve, label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]

# pending here —— this function has not been tested
def lightGBM_gridCV(param_validation, params=params_lgb):
    # make sure memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    param_test = {
        'max_depth': np.arange(2, 7, 1),
        'num_leaves': np.arange(20, 200, 10),
    }
    estimator = lgb.LGBMRegressor(num_leaves=50, max_depth=13, learning_rate=0.1,
                                  n_estimators=1000, objective='binary', min_child_weight=1,
                                  metric=['auc', 'binary_logloss'],  # originally a stray `param['metric'] = [...]` inside the call
                                  subsample=0.8, colsample_bytree=0.8, nthread=7)
    gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
    gsearch.fit(values, labels)  # NOTE: `values` / `labels` are still undefined here
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
    return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    '''
    plot the feature-importance sequence
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels
    plt.rcParams['figure.dpi'] = 200  # resolution
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    plt.show()


def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
    '''
    return lgb.LGBMClassifier(**params)  # unpack the dict into keyword args


def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1,  # n_jobs raised from 2
                               cv=kfold, verbose=2, iid=True, refit=True)
    # == model training
    grid_search.fit(dftrain[features], dftrain[resp])
    # == return the fitted search holding the optimal parameters
    return grid_search
def modelfit(clf, dftrain, features, resp, useTrainCV=True, kfold=10,
             eval_metric='auc', early_stopping_rounds=20):
    '''
    ###################################################################################################################
    Model training.
    :type useTrainCV: object
    :param clf: XGBClassifier
    :param dftrain: training set
    :param features: features
    :param resp: label
    :param useTrainCV: if True, call the cv function; the goal is to tune n_estimators
    :param cv_folds: N-fold cross validation
    :param early_stopping_rounds: exit training once the loss has stayed nearly flat for this many rounds
    :param eval_metric: tied to the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
    if useTrainCV:
        # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        xgb_param = clf.get_xgb_params()
        # lgb has no DMatrix; use Dataset, and lgb.cv returns a dict of metric lists
        lgbtrain = lgb.Dataset(dftrain[features].values, label=dftrain[resp].values)
        cvresult = lgb.cv(xgb_param, lgbtrain, num_boost_round=clf.get_params()['n_estimators'],
                          nfold=kfold, metrics=eval_metric,
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        clf.set_params(n_estimators=len(cvresult[eval_metric + '-mean']))
    clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
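A minimal sketch of the intended call pattern for params_lgb, train_lgbm and lgb_params_tuning (df_train / df_val stand in for real frames with a binary 'target' column; the override values are illustrative):

features = [c for c in df_train.columns if c != 'target']

# train once with the defaults, overriding a couple of entries via adds_on
train_auc, val_auc, booster = train_lgbm(
    params_lgb, df_train, df_val, features,
    adds_on={'num_boost_round': 200, 'learning_rate': 0.05})

# grid the tree-shape parameters with 5-fold CV, keeping the top 3 combinations
optimal_para, lgb_tr, lgb_va, topn = lgb_params_tuning(
    params_lgb, features, df_train, df_val, topN=3, cv_fold=5)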
models/xgboost.py
...
@@ -2,13 +2,74 @@ import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics

target = 'target'

# default parameters
params_xgb = {
    'learning_rate': 0.1,
    'n_estimators': 200,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return the AUC of the training and validation sets
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature columns of the training set
        target - name of the label column
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        dataset -
        features - feature list of the dataset
        target - target column or label list of the samples
    '''
    dtrain = xgb.DMatrix(train[features], train[target])
    dval = xgb.DMatrix(val[features], val[target])
    # xgb_clf = xgb.XGBClassifier(params_xgb)
    xgb_clf = xgb.XGBClassifier(**params_xgb)  # unpack the dict into keyword args
    xgb_clf.fit(train[features], train['target'])
    # xgbm = xgb.train(params, dtrain)
    returnAUC(xgb_clf, train, val, features)
    # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
    return xgb_clf
#############################################################################
# defaults changed: max_depth 2 → 4, learning_rate 0.1 → 0.05
def buildClf(max_depth=4, learning_rate=0.05, n_estimators=5000, gamma=0,
             min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5):
    '''
...
@@ -37,7 +98,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
    '''
    return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             verbosity=0, silent=0, objective='binary:logistic',
                             booster='gbtree', n_jobs=-1, nthread=2,  # n_jobs raised from 2
                             gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=7, seed=7
...
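And the xgboost side, sketched under the same assumptions (placeholder frames with a binary 'target' column):

# one-shot fit with the module defaults
clf = xgb_train(params_xgb, df_train, df_val, features)

# or build a classifier with hand-picked knobs and evaluate it
clf = buildClf(max_depth=3, learning_rate=0.1, n_estimators=500)
clf.fit(df_train[features], df_train['target'])
train_auc, val_auc = returnAUC(clf, df_train, df_val, features)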
mvp/allocator.py
...
@@ -2,61 +2,70 @@ import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from mvp import lgbreport
from data.analyis import datacal
from models import xgboost
from models import lightgbm
from mvp import dhb
dhb = dhb.dhb()
df_sample = dhb.dhb_features_extract()
target = 'target'
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ', dhb.start_time_period, '-', dhb.end_time_period)
print('----no.', len(features), 'of samples of dhb----')
# to save model performance

if __name__ == '__main__':
    # data extraction
    ''' ## Old Edition here
    # if the total sample exceeds 30000, use a train-validation-test split;
    # otherwise use CV for parameter tuning
    # if len(df_sample) >= 30000:
    #     df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
    # else:
    #     df_train, df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
    '''
    df_train, df_val, df_test = datacal.train_test_split_general(df_sample)  # was a bare train_test_split_general() call

    # data manipulation
    ## TODO

    # model refit

    # xgboost
    xgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    xgb_model_auc['training_auc'] = None
    xgb_model_auc['val_auc'] = None
    # xgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)
    ## To be added: per-dataset AUC for xgb, plus AUC by KA channel / customer segment

    # lightgbm
    lgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    lgb_model_auc['training_auc'] = None
    lgb_model_auc['val_auc'] = None
    # dftrain, dftest = datacal.split_train_val(df_sample, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    # lgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)

    # merge into a single dataframe covering all the models
    pd.DataFrame(xgb_model_auc)  # was pd.DataFrame(xgb_model), an undefined name

Removed in this commit:
    # features=[
    # 'third_data_source#xy_pan_newapplyAcredibility',
    # 'third_data_source#xy_pan_newapplyAscore',
    # 'third_data_source#xy_pan_newconsfinAavgAlimit',
    # 'third_data_source#xy_pan_newconsfinAcredibility',
    # 'third_data_source#xy_pan_newconsfinAcreditAlimit',
    # 'third_data_source#xy_pan_newconsfinAmaxAlimit',
    # 'third_data_source#xy_pan_newconsfinAorgAcountq',
    # 'third_data_source#xy_pan_newconsfinAorgAcountx',
    # 'third_data_source#xy_pan_newconsfinAproductAcount',
    # 'third_data_source#xy_pan_newhistoryAfailAfee',
    # 'third_data_source#xy_pan_newhistoryAsucAfee',
    # 'third_data_source#xy_pan_newlatestAoneAmonthAfail',
    # 'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
    # 'third_data_source#xy_pan_newlatestAoneAmonthd',
    # 'third_data_source#xy_pan_newlatestAoneAmonthj',
    # 'third_data_source#xy_pan_newlatestAqueryAtime',
    # 'third_data_source#xy_pan_newlatestAsixAmontha',
    # 'third_data_source#xy_pan_newlatestAsixAmonthv',
    # 'third_data_source#xy_pan_newlatestAthreeAmonthb',
    # 'third_data_source#xy_pan_newlatestAthreeAmonthf',
    # 'third_data_source#xy_pan_newloansAavgAlimit',
    # 'third_data_source#xy_pan_newloansAcashAcount',
    # 'third_data_source#xy_pan_newloansAcount',
    # 'third_data_source#xy_pan_newloansAcredibilityh',
    # 'third_data_source#xy_pan_newloansAcredibilitys',
    # 'third_data_source#xy_pan_newloansAcreditAlimit',
    # 'third_data_source#xy_pan_newloansAlatestAtime',
    # 'third_data_source#xy_pan_newloansAlongAtime',
    # 'third_data_source#xy_pan_newloansAmaxAlimit',
    # 'third_data_source#xy_pan_newloansAorgAcounta',
    # 'third_data_source#xy_pan_newloansAorgAcountg',
    # 'third_data_source#xy_pan_newloansAoverdueAcount',
    # 'third_data_source#xy_pan_newloansAproductAcount',
    # 'third_data_source#xy_pan_newloansAscore',
    # 'third_data_source#xy_pan_newloansAsettleAcount',
    # 'third_data_source#xy_pan_newqueryAcashAcount',
    # 'third_data_source#xy_pan_newqueryAfinanceAcount',
    # 'third_data_source#xy_pan_newqueryAorgAcount',
    # 'third_data_source#xy_pan_newqueryAsumAcount'
    # ]
    dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00', end_time_period='2019-01-20 12:00:00')
    df = dhb.dhb_features_extract()
    print(df.columns.tolist())
    print(df.target.unique())
    label = 'target'
    features = dhb.get_feature()
    df[features] = df[features].astype(float)
    df['target'] = df['target'].astype(int)
    print('----feature---', len(features))
    # df=pd.read_csv('test.csv')
    dftrain, dftest = datacal.split_train_val(df, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    xgbreport.report(dftrain, dftest, features, label, '', 'tmp.doc', kfold=2)
mvp/dhb.py
(This diff is collapsed.)
mvp/docxReport.py — new file (mode 100644)
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot

# the selected topN features
mvp/lgbreport.py
...
@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm   # was: from models import xgboost
from matplotlib import pyplot as plt
from data.graph import matplot   # was: from data.graph import drawplot
from mvp import dhb
from data.datasource import mysqldb, mongodb
mvp/xgbreport.py
...
@@ -3,10 +3,11 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm   # was: from models import xgboost
from matplotlib import pyplot as plt
from data.graph import drawplot


def report(dftrain, dftest, features, label, path, filename, kfold=10):
    '''
    dftrain and dftest must contain the columns applied_at, applied_channel and applied_type
...
@@ -20,11 +21,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
    '''
    document = filetool.buildDocument(path, filename)
    document.add_heading('xgboost 算法运行报告')  # "xgboost run report"
    clf = lightgbm.buildClf()  # was: xgboost.buildClf()
    document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params()))  # "run with the initial parameters {}"
    clf = lightgbm.modelfit(clf, dftrain, features, label, kfold=kfold)
    document.add_paragraph('模型训练集{}'.format(lightgbm.auc(clf, dftrain, features, label)))  # training-set AUC
    document.add_paragraph('模型测试集{}'.format(lightgbm.auc(clf, dftest, features, label)))  # test-set AUC
    document.add_heading('调整参数')  # "parameter tuning"
    max_depth = [2, 3]
...
@@ -52,10 +53,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
                   {'reg_lambda': reg_lambda}, features, label, kfold=kfold)
    # == generate the final model report: a univariate chart per feature, PDP, lift chart
    dftrain = lightgbm.predict(clf, dftrain, features)
    dftest = lightgbm.predict(clf, dftest, features)
    # == feature weights
    featureimp = lightgbm.featureImportance(clf, features)
    fig = drawplot.draw_barplot(featureimp.head(10), 'feature', 'weight', title='Feature importance')
    fig.savefig('tmp.png')
    document.add_paragraph('特征权重图,近前10个特征')  # feature-weight chart, roughly the top 10 features
...
@@ -106,15 +108,15 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
def tun_params(document, clf, dftrain, dftest, params, features, label, kfold=10):
    for i in dict(params).keys():
        document.add_paragraph('调参{},取值{}'.format(i, params[i]))  # tuning parameter {i} over values {params[i]}
    grid_search = lightgbm.automodelfit(clf, params, dftrain, features, label, kfold=kfold)
    clf = grid_search.best_estimator_
    document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))  # the parameters the model was trained with
    # ==
    # clf = xgboost.modelfit(clf, dftrain, features, label)
    document.add_paragraph('寻找最优参数过程{}'.format(grid_search.cv_results_))  # the parameter-search history
    document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_, grid_search.best_score_))  # best parameters and best score
    document.add_paragraph('模型训练集{}'.format(lightgbm.auc(grid_search, dftrain, features, label)))  # training-set AUC
    document.add_paragraph('模型测试集{}'.format(lightgbm.auc(grid_search, dftest, features, label)))  # test-set AUC
    return document, clf
...
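For reference, a hedged sketch of invoking the report builder (the frame names and output filename are placeholders; per the docstring, the frames must carry applied_at, applied_channel and applied_type alongside the features):

from mvp import xgbreport

xgbreport.report(dftrain, dftest, features, 'target',
                 '', 'dhb_model_report.doc', kfold=5)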