Commit f2e2d5cf authored by 王家华

Add README; LightGBM parameter tuning

parent fe8f7148
# PROJECT_MVP
## Data sources (datasource)
### MongoDB extraction (mongodb)
### MySQL (mysqldb)
### TBD
## General data-computation utilities (tools)
### Common data-calculation toolkit (datacal)
- train_test_split_general
- univar
- pdp
- liftchart
- TBD
### DOCX report generator (filetool)
### TBD
## Plotting packages (graph)
### Common matplotlib line-chart toolkit (matplot)
### pyecharts plotting package
### TBD
## Production model objects
### Bairong
### dhb
- Extract given features, defaulting to the production feature set (dhb_features_extract)
- Fetch production model performance (dhb_comparasion)
- dhb_xgb
- Images for the LightGBM model report (dhb_lgb)
- Output path for the model report and PKL (report_lgb)
- report_xgb
- Production score drift
- Production score PSI
- Score given features with the production model PKL
- Feature VLM
- TBD
### xy
### Others
## Modeling methods (models)
### Xgboost
- Default parameter table (params_xgb)
- Return train/validation AUC (returnAUC)
- xgb_train
- buildClf
- automodelfit
- predict
- featureImportance
### LightGBM
- Default parameter table (params_lgb)
- returnAUC
- topN_feature_importance
- buildClf
- Combined CV parameter-tuning module (lgb_params_tuning)
- Train the model and call returnAUC (train_lgbm)
## Feature engineering (features)
### Feature selection
- Univariate
- Information entropy
- Variance
- Dimensionality reduction
### Missing-value handling
### Standardization (for linear models)
### Outliers (for linear models)
## mvp
### Program entry point (allocator)
### Fit xgboost (xgbreport)
- Calls the plotting package / datacal / filetool to generate the report
### Fit lightgbm (lgbreport)
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
                             split_methods='random', time_label='applied_at'):
    '''
    instructions - train/test split (splits only train & test when val_size is None)
    Params :
        dataset
        val_size - validation set RATIO
        test_size - test set RATIO
        stratify - stratify LABEL
        random_state
        split_methods - random or timeSeries
        time_label - column that identifies date & time
    '''
    # random split: the test set is still held out by time, then train/val are split at random
    if split_methods == 'random':
        df_train, df_test = train_test_split_general(dataset, val_size=None, stratify=None, split_methods='timeSeries')
        # df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=random_state)
        if val_size is not None:
            size = val_size / (1 - test_size)
            df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
            return df_train, df_val, df_test
        # case when no validation set is requested
        return df_train, df_test
    # split data in time sequence
    elif split_methods == 'timeSeries':
        data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
        df_test = data_tmp[: int(len(dataset) * test_size)]
        df_train = data_tmp[int(len(dataset) * test_size):]
        return df_train, df_test
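# A minimal usage sketch (hedged; assumes a DataFrame with 'target' and
# 'applied_at' columns, which are this function's defaults — the path below is illustrative):
#
#   df = pd.read_csv('samples.csv')
#   tr, va, te = train_test_split_general(df, val_size=0.2, test_size=0.2)
#   tr, te = train_test_split_general(df, val_size=None, split_methods='timeSeries')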
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
'''
......@@ -27,6 +60,7 @@ def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=N
val = None
return train,val
def cal_week(df,date_name,date_name_new):
'''
:param df: dataframe
......
......@@ -17,6 +17,46 @@ plt.rcParams['savefig.dpi'] = 226  # image pixels
plt.rcParams['figure.dpi'] = 200  # resolution
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
saved_path=None):
'''
instructions : visualization of pivot
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels
    plt.rcParams['figure.dpi'] = 200  # resolution
fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
# traverse each columns of dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + str(auc[i]))
    if plot_tab:
        the_table = plt.table(cellText=[list(dataset[col].values) for col in table_rows],
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
fig.subplots_adjust(bottom=0.2)
plt.grid()
plt.ylabel(title)
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
plt.show()
return 1
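# A hedged usage sketch for plot_table: one line per model over a weekly AUC
# pivot, with per-model AUCs echoed in the legend (all names are illustrative):
#
#   pivot = pd.DataFrame({'xgb': [0.71, 0.72, 0.70], 'lgb': [0.73, 0.74, 0.72]},
#                        index=['W01', 'W02', 'W03'])
#   plot_table(pivot, auc={'xgb': 0.71, 'lgb': 0.73}, title='weekly AUC')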
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
......@@ -103,9 +143,6 @@ def density_chart(dataset,title):
plt.title(title)
plt.show()
def uniVarChart():
    # TODO: not implemented yet
    return 1
......
......@@ -2,86 +2,201 @@ import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,psutil
params_lgb = {
    'task': 'train',               # task type
    'application': 'binary',       # binary classification
    'boosting_type': 'gbdt',       # boosting type
    'num_boost_round': 150,        # number of boosting iterations
    'learning_rate': 0.01,         # learning rate
    'metric': {'logloss', 'auc'},  # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',   # objective function
    'max_depth': 4,
    'num_leaves': 20,              # number of leaves
    'feature_fraction': 0.9,       # fraction of features sampled per tree
    'bagging_fraction': 0.8,       # fraction of samples used for bagging
    'bagging_freq': 5,             # perform bagging every k iterations
    'verbose': 1                   # <0: fatal only, =0: errors (warnings), >0: info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
'''
instructions : return AUC of training set & test set
Parameters :
clf - classifier training object
training_set - training dataset
        validation_set - validation dataset
        features - features of the training set
        target - label column name
'''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params - default params
        df_train - training set
        df_val - validation set
        features - feature list of the dataset
        adds_on - dict of parameters that overrides the training parameters
        target - target column or label list of samples
    '''
    params = params.copy()
    # override the defaults with the supplied training params
    if adds_on is not None:
        params.update(adds_on)
# convert DataFrame to binary format
lgb_train = lgb.Dataset(df_train[features], df_train[target])
lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
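# A usage sketch (assumes df_train/df_val from datacal.train_test_split_general
# and a feature list; adds_on overrides entries of params_lgb for a single run):
#
#   train_auc, val_auc, booster = train_lgbm(params_lgb, df_train, df_val, features,
#                                            adds_on={'max_depth': 5, 'num_leaves': 24})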
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal parameters with lgbm
    Parameters :
        params - default parameters (dict format)
        features - feature list
        train - training set
        val - validation set
        target - target label
        topN - top N optimal parameter combinations
        cv_fold - k-fold CV
    '''
    # work on a copy so the caller's dict is untouched
    params = params.copy()
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
    # create an ndarray of shape 1*n holding the current top-N AUCs
    topn = np.zeros(topN)
    # make sure that memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
    for depth in np.arange(2, 7, 1):
        for leaves in np.arange(2, 2 ** depth, 2):
            params['max_depth'] = depth
            params['num_leaves'] = leaves
            print("parameter combination : ", 'max_depth ', depth, 'num_leaves ', leaves)
            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
            # best (max) mean AUC across boosting rounds
            auc_score = pd.Series(cv_result['auc-mean']).max()
            print('auc ', auc_score)
            boost_round = pd.Series(cv_result['auc-mean']).argmax()
            # if this AUC beats any entry in the current top-N list
            if (auc_score > topn).any():
                # replace the worst (lowest-AUC) entry with this combination;
                # take the index before overwriting topn, or argmin moves
                idx = topn.argmin()
                topn[idx] = auc_score
                optimal_para[idx] = {'max_depth': depth, 'num_leaves': leaves}
    return optimal_para, lgb_train, lgb_val, topn
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve),pd.Series(validation_curve)],index=['trainingAUC','validationAUC'],axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve,label='training')
# plt.plot(candidate_list, validation_curve,label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]
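# A hedged usage sketch: scan max_depth/num_leaves by mean CV AUC and keep the
# top 3 combinations (dataset names are illustrative):
#
#   optimal_para, lgb_tr, lgb_va, top_auc = lgb_params_tuning(params_lgb, features,
#                                                             df_train, df_val, topN=3)
#   best = optimal_para[int(top_auc.argmax())]   # e.g. {'max_depth': 4, 'num_leaves': 14}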
# pending: this function has not been tested yet
def lightGBM_gridCV(values, labels, params=params_lgb):
    '''
    grid-search max_depth/num_leaves with the sklearn wrapper
    (values/labels are the training features and target)
    '''
    # make sure that memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    param_test = {
        'max_depth': np.arange(2, 7, 1),
        'num_leaves': np.arange(20, 200, 10),
    }
    estimator = lgb.LGBMClassifier(
        num_leaves=50,
        max_depth=13,
        learning_rate=0.1,
        n_estimators=1000,
        objective='binary',
        min_child_weight=1,
        metric=['auc', 'binary_logloss'],
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=7
    )
    gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
    gsearch.fit(values, labels)
    return gsearch
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    '''
    plot the feature importance ranking
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels
    plt.rcParams['figure.dpi'] = 200  # resolution
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    plt.show()
def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
        params - parameter dict forwarded to LGBMClassifier
    '''
    return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    # kfold = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == model training
    grid_search.fit(dftrain[features], dftrain[resp])
    # == return the fitted search holding the best parameters
return grid_search
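# A usage sketch: tune one parameter group at a time with the sklearn wrapper
# (the grid values are illustrative):
#
#   clf = buildClf()
#   gs = automodelfit(clf, {'num_leaves': [15, 31, 63]}, df_train, features, 'target', kfold=5)
#   print(gs.best_params_, gs.best_score_)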
def modelfit(clf, dftrain, features, resp, useTrainCV=True, kfold=10, eval_metric='auc', early_stopping_rounds=20):
    '''
    Model training.
    :type useTrainCV: object
    :param clf: LGBMClassifier
    :param dftrain: training set
    :param features: feature list
    :param resp: label
    :param useTrainCV: if True, call the cv function to tune n_estimators
    :param kfold: N-fold cross-validation
    :param early_stopping_rounds: stop once the eval metric has not improved for this many rounds
    :param eval_metric: depends on the objective; see https://lightgbm.readthedocs.io/en/latest/Parameters.html
    :return: the fitted classifier
    '''
    if useTrainCV:
        # kfold = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        # sklearn-style params are passed through to lgb.cv
        lgb_param = clf.get_params()
        lgbtrain = lgb.Dataset(dftrain[features].values, label=dftrain[resp].values)
        cvresult = lgb.cv(lgb_param, lgbtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
                          metrics=eval_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        # lgb.cv returns a dict of per-round means; its length is the best round count
        clf.set_params(n_estimators=len(cvresult['{}-mean'.format(eval_metric)]))
    clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
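# A usage sketch: let lgb.cv pick n_estimators, then refit on the training set:
#
#   clf = buildClf()
#   clf = modelfit(clf, df_train, features, 'target', useTrainCV=True, kfold=5)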
##############################################################################
......@@ -2,13 +2,74 @@ import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
target = 'target'
# default parameters
params_xgb = {
'learning_rate': 0.1,
'n_estimators': 200,
'max_depth': 3,
'min_child_weight': 1,
'gamma': 0,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'binary:logistic',
'nthread': 4,
'scale_pos_weight': 1,
'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
'''
instructions : return AUC of training set & test set
Parameters :
clf - classifier training object
training_set - training dataset
        validation_set - validation dataset
        features - features of the training set
        target - label column name
    '''
    # use predicted probabilities, since XGBClassifier.predict returns class labels
    train_auc = roc_auc_score(training_set[target], clf.predict_proba(training_set[features])[:, 1])
    val_auc = roc_auc_score(validation_set[target], clf.predict_proba(validation_set[features])[:, 1])
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        params - parameter dict for the classifier
        train / val - training and validation sets
        features - feature list of the dataset
        target - target column or label list of samples
    '''
    # dtrain = xgb.DMatrix(train[features], train[target])
    # dval = xgb.DMatrix(val[features], val[target])
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    # xgbm = xgb.train(params, dtrain)
    returnAUC(xgb_clf, train, val, features)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return xgb_clf
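# A usage sketch (df_train/df_val and features as produced by datacal;
# df_test is illustrative):
#
#   clf = xgb_train(params_xgb, df_train, df_val, features)
#   scores = clf.predict_proba(df_test[features])[:, 1]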
#############################################################################
def buildClf(max_depth=4, learning_rate=0.05, n_estimators=5000, gamma=0,
min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5):
'''
......@@ -37,7 +98,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
'''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=0,silent=0,objective='binary:logistic',
booster='gbtree',n_jobs=-1,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,seed=7
......
......@@ -2,61 +2,70 @@ import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from mvp import lgbreport
from data.analyis import datacal
from models import xgboost
from models import lightgbm
from mvp import dhb
dhb = dhb.dhb()
df_sample = dhb.dhb_features_extract()
target = 'target'
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ', dhb.start_time_period, '-', dhb.end_time_period)
print('---- no. of dhb features:', len(features), '----')
# to save model performance
if __name__ == '__main__':
# features=[
# 'third_data_source#xy_pan_newapplyAcredibility',
# 'third_data_source#xy_pan_newapplyAscore',
# 'third_data_source#xy_pan_newconsfinAavgAlimit',
# 'third_data_source#xy_pan_newconsfinAcredibility',
# 'third_data_source#xy_pan_newconsfinAcreditAlimit',
# 'third_data_source#xy_pan_newconsfinAmaxAlimit',
# 'third_data_source#xy_pan_newconsfinAorgAcountq',
# 'third_data_source#xy_pan_newconsfinAorgAcountx',
# 'third_data_source#xy_pan_newconsfinAproductAcount',
# 'third_data_source#xy_pan_newhistoryAfailAfee',
# 'third_data_source#xy_pan_newhistoryAsucAfee',
# 'third_data_source#xy_pan_newlatestAoneAmonthAfail',
# 'third_data_source#xy_pan_newlatestAoneAmonthAsuc',
# 'third_data_source#xy_pan_newlatestAoneAmonthd',
# 'third_data_source#xy_pan_newlatestAoneAmonthj',
# 'third_data_source#xy_pan_newlatestAqueryAtime',
# 'third_data_source#xy_pan_newlatestAsixAmontha',
# 'third_data_source#xy_pan_newlatestAsixAmonthv',
# 'third_data_source#xy_pan_newlatestAthreeAmonthb',
# 'third_data_source#xy_pan_newlatestAthreeAmonthf',
# 'third_data_source#xy_pan_newloansAavgAlimit',
# 'third_data_source#xy_pan_newloansAcashAcount',
# 'third_data_source#xy_pan_newloansAcount',
# 'third_data_source#xy_pan_newloansAcredibilityh',
# 'third_data_source#xy_pan_newloansAcredibilitys',
# 'third_data_source#xy_pan_newloansAcreditAlimit',
# 'third_data_source#xy_pan_newloansAlatestAtime',
# 'third_data_source#xy_pan_newloansAlongAtime',
# 'third_data_source#xy_pan_newloansAmaxAlimit',
# 'third_data_source#xy_pan_newloansAorgAcounta',
# 'third_data_source#xy_pan_newloansAorgAcountg',
# 'third_data_source#xy_pan_newloansAoverdueAcount',
# 'third_data_source#xy_pan_newloansAproductAcount',
# 'third_data_source#xy_pan_newloansAscore',
# 'third_data_source#xy_pan_newloansAsettleAcount',
# 'third_data_source#xy_pan_newqueryAcashAcount',
# 'third_data_source#xy_pan_newqueryAfinanceAcount',
# 'third_data_source#xy_pan_newqueryAorgAcount',
# 'third_data_source#xy_pan_newqueryAsumAcount'
# ]
# data extraction
''' ## Old Edition here
# if total sample more than 30000, it would use train-validation-test
# else use CV to parameters tuning
# if len(df_sample) >= 30000:
# df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
    df_train, df_val, df_test = datacal.train_test_split_general(df_sample)
# data manipulation
## TODO
# model refit
    # xgboost
    xgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    xgb_model_auc['training_auc'] = None
    xgb_model_auc['val_auc'] = None
    # xgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)
    ## TODO : per-dataset AUC for xgb, plus AUC by KA channel / customer segment
    # lightgbm
    lgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    lgb_model_auc['training_auc'] = None
    lgb_model_auc['val_auc'] = None
    # dftrain, dftest = datacal.split_train_val(df_sample, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    # lgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)
    # merge the per-model AUCs into a single dataframe
    model_auc = pd.DataFrame([xgb_model_auc, lgb_model_auc], index=['xgb', 'lgb'])
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
# selected top-N features
......@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from mvp import dhb
from data.datasource import mysqldb,mongodb
from data.graph import matplot
......@@ -3,10 +3,11 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import xgboost
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import drawplot
def report(dftrain,dftest,features,label,path,filename,kfold=10):
'''
dftrain/dftest must contain the columns applied_at, applied_channel, applied_type
......@@ -20,11 +21,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
'''
document=filetool.buildDocument(path,filename)
    document.add_heading('lightgbm algorithm run report')
    clf = lightgbm.buildClf()
    document.add_paragraph('Initial parameters: {}'.format(clf.get_params()))
    clf = lightgbm.modelfit(clf, dftrain, features, label, kfold=kfold)
    document.add_paragraph('Training set {}'.format(lightgbm.auc(clf, dftrain, features, label)))
    document.add_paragraph('Test set {}'.format(lightgbm.auc(clf, dftest, features, label)))
    document.add_heading('Parameter tuning')
max_depth=[2,3]
......@@ -52,10 +53,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
{'reg_lambda': reg_lambda}, features, label,kfold=kfold)
    #== generate the final model report: univariate charts per feature, PDP, lift chart
    dftrain = lightgbm.predict(clf, dftrain, features)
    dftest = lightgbm.predict(clf, dftest, features)
    #== feature weights
    featureimp = lightgbm.featureImportance(clf, features)
fig=drawplot.draw_barplot(featureimp.head(10),'feature','weight',title='Feature importance')
fig.savefig('tmp.png')
    document.add_paragraph('Feature weight chart, top 10 features')
......@@ -106,15 +108,15 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
def tun_params(document,clf,dftrain,dftest,params,features,label,kfold=10):
for i in dict(params).keys():
        document.add_paragraph('Tuning {}, values {}'.format(i, params[i]))
        grid_search = lightgbm.automodelfit(clf, params, dftrain, features, label, kfold=kfold)
        clf = grid_search.best_estimator_
        document.add_paragraph('Model training parameters {}'.format(clf.get_params()))
        #==
        # clf = lightgbm.modelfit(clf, dftrain, features, label)
        document.add_paragraph('Grid-search history {}'.format(grid_search.cv_results_))
        document.add_paragraph('Best parameters {}, best score {}'.format(grid_search.best_params_, grid_search.best_score_))
        document.add_paragraph('Training set {}'.format(lightgbm.auc(grid_search, dftrain, features, label)))
        document.add_paragraph('Test set {}'.format(lightgbm.auc(grid_search, dftest, features, label)))
return document,clf
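# A hedged example of driving the report end to end (path and filename are illustrative):
#
#   report(dftrain, dftest, features, 'target', './reports/', 'lgb_report.doc', kfold=5)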
......