Commit f2e2d5cf authored by 王家华

Add README and LightGBM parameter tuning

parent fe8f7148
# PROJECT_MVP
## Data sources (datasource)
### MongoDB extraction (mongodb)
### MySQL (mysqldb)
### TBD
## General data computation (tools)
### Common utility package (datacal)
- train_test_split_general
- univar
- pdp
- liftchart
- TBD
### docx report generation tool (filetool)
### TBD
## Plotting package (graph)
### Common matplotlib line-chart utilities (matplot)
### pyecharts plotting package
### TBD
## Online model objects
### Bairong (百融)
### dhb
- extract given features (defaults to the online feature set) (dhb_features_extract)
- fetch online model performance (dhb_comparasion)
- dhb_xgb
- dhb_lgb (stores the images for the lgb model report)
- report_lgb (output path for the model report and PKL)
- report_xgb
- online score changes
- online score PSI
- score given features with the online model pkl
- feature VLM
- TBD
### xy
### Others
## Modeling methods (models)
### Xgboost
- default parameter table (params_xgb)
- return train/validation AUC (returnAUC)
- xgb_train
- buildClf
- automodelfit
- predict
- featureImportance
### LightGBM
- default parameter table (params_lgb)
- returnAUC
- topN_feature_importance
- buildClf
- combined CV tuning module (lgb_params_tuning)
- train the model and call returnAUC (train_lgbm)
## Feature engineering (features)
### Feature selection
- univariate analysis
- information entropy
- variance
- dimensionality reduction
### Missing-value handling
### Standardization (linear models)
### Outliers (linear models)
## mvp
### Program entry point (allocator)
### Fit xgboost (xgbreport)
- calls the plotting / datacal / filetool packages to generate the report
### Fit lightgbm (lgbreport)
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
                             split_methods='random', time_label='applied_at'):
    '''
    instructions - train/test split (splits only into train & test when val_size is None)
    Params :
        dataset       - input DataFrame
        val_size      - validation set ratio
        test_size     - test set ratio
        stratify      - stratification label (column name), or None
        random_state  - random seed
        split_methods - 'random' or 'timeSeries'
        time_label    - column that identifies date & time
    '''
    # random split
    if split_methods == 'random':
        strat = dataset[stratify] if stratify is not None else None
        df_train, df_test = train_test_split(dataset, test_size=test_size, stratify=strat,
                                             random_state=random_state)
        if val_size is not None:
            # rescale val_size so it is a fraction of the remaining training rows
            size = val_size / (1 - test_size)
            strat = df_train[stratify] if stratify is not None else None
            df_train, df_val = train_test_split(df_train, test_size=size, stratify=strat,
                                                random_state=random_state)
            return df_train, df_val, df_test
        # case when no validation set is requested
        return df_train, df_test
    # split following the time sequence (newest rows become the test set)
    elif split_methods == 'timeSeries':
        data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
        df_test = data_tmp[: int(len(dataset) * test_size)]
        df_train = data_tmp[int(len(dataset) * test_size):]
        return df_train, df_test
def split_train_val(df, trainsplit='random', trainsplitRatio=0.8, sort_col=None):
    '''
@@ -27,6 +60,7 @@ def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
        val = None
    return train, val

def cal_week(df, date_name, date_name_new):
    '''
    :param df: dataframe
@@ -17,6 +17,46 @@ plt.rcParams['savefig.dpi'] = 226  # image DPI
plt.rcParams['figure.dpi'] = 200  # resolution
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
               saved_path=None):
    '''
    instructions : visualization of a pivot table
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image DPI
    plt.rcParams['figure.dpi'] = 200  # resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    table_cols = dataset.columns
    table_rows = dataset.index
    # traverse each column of the dataframe, one curve per column
    for i in table_cols:
        axs.plot(table_rows, dataset[i], marker='o', label=str(i) + ' AUC: ' + str(auc[i]))
    if plot_tab:
        the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset))],
                              rowLabels=list(table_rows),
                              colLabels=list(table_cols),
                              colWidths=[0.91 / len(table_cols)] * len(table_cols),
                              loc='bottom')
        plt.xticks([])
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(8)
        fig.subplots_adjust(bottom=0.2)
    plt.grid()
    plt.ylabel(title)
    plt.legend()
    plt.title(title)
    plt.show()
    return 1
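# A minimal usage sketch (hypothetical data, assuming pandas as pd): each
# column of `dataset` is one curve, the index is the x-axis, and `auc` maps
# column name -> an AUC shown in the legend. For example:
# pivot = pd.DataFrame({'xgb': [0.71, 0.69, 0.70], 'lgb': [0.72, 0.70, 0.71]},
#                      index=['w1', 'w2', 'w3'])
# plot_table(pivot, auc={'xgb': 0.70, 'lgb': 0.71}, title='weekly AUC')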
def plot_curve_singleCurve(dataset, x_label=None, y_label=None, table_tab=None,
                           save_path=None, figure_arrangement=11, fig_size=(4, 3),
@@ -103,9 +143,6 @@ def density_chart(dataset, title):
    plt.title(title)
    plt.show()
@@ -2,86 +2,201 @@ import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, psutil

params_lgb = {
    'task': 'train',                 # task type
    'application': 'binary',         # binary classification
    'boosting_type': 'gbdt',         # boosting type
    'num_boost_round': 150,          # number of boosting iterations
    'learning_rate': 0.01,           # learning rate
    'metric': {'logloss', 'auc'},    # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',     # objective function
    'max_depth': 4,
    'num_leaves': 20,                # number of leaves
    'feature_fraction': 0.9,         # feature sampling ratio per tree
    'bagging_fraction': 0.8,         # row sampling ratio per tree
    'bagging_freq': 5,               # k means bagging is performed every k iterations
    'verbose': 1                     # <0 fatal only, =0 errors (warnings), >0 info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf            - trained classifier object
        training_set   - training dataset
        validation_set - validation dataset
        features       - feature columns of the datasets
        target         - label column
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params   - default params
        df_train - training set
        df_val   - validation set
        features - feature list of the dataset
        adds_on  - dict of parameter overrides applied on top of params
        target   - target column / label of the samples
    '''
    params = params.copy()
    # apply parameter overrides
    if adds_on is not None:
        for i in adds_on.keys():
            params[i] = adds_on[i]
    # convert DataFrames to lightgbm's binary Dataset format
    lgb_train = lgb.Dataset(df_train[features], df_train[target])
    lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
    lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
    train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
    return train_auc, val_auc, lgbm
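# A minimal, self-contained usage sketch with synthetic data (illustration
# only): override a few defaults via `adds_on` and train on a pre-split sample.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame(rng.rand(600, 3), columns=['f1', 'f2', 'f3'])
    demo['target'] = (demo['f1'] + 0.2 * rng.rand(600) > 0.6).astype(int)
    tr, va = demo[:480], demo[480:]
    train_auc, val_auc, booster = train_lgbm(params_lgb, tr, va, ['f1', 'f2', 'f3'],
                                             adds_on={'metric': 'auc',
                                                      'max_depth': 3,
                                                      'num_leaves': 7})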
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal parameters for lgbm via a CV grid search
    Parameters :
        params   - default parameters (dict format)
        features - feature list
        train    - training set
        val      - validation set
        target   - target label
        topN     - keep the top N parameter combinations
        cv_fold  - k-fold CV
    '''
    # work on a copy so the caller's dict is untouched
    params = params.copy()
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
    # ndarray of shape (topN,) holding the best AUCs seen so far
    topn = np.zeros(topN)
    # make sure that memory can afford the search
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
    for deepth in np.arange(2, 7, 1):
        for leaves in np.arange(2, 2 ** deepth, 2):
            params['max_depth'] = deepth
            params['num_leaves'] = leaves
            print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
            # best (max) mean CV auc for this combination
            auc_score = pd.Series(cv_result['auc-mean']).max()
            print('auc ', auc_score)
            boost_round = pd.Series(cv_result['auc-mean']).argmax()
            # if this score beats any item in the current topn list
            if (auc_score > topn).any():
                # replace the worst (lowest-AUC) entry with this combination;
                # the index is taken before overwriting so both arrays stay aligned
                idx = topn.argmin()
                topn[idx] = auc_score
                para = {'max_depth': deepth, 'num_leaves': leaves}
                optimal_para[idx] = para
    return optimal_para, lgb_train, lgb_val, topn
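# A minimal usage sketch (hypothetical, pre-split DataFrames): search the
# depth/leaves grid, then refit with the best combination found, e.g.
# best, lgb_tr, lgb_va, top_aucs = lgb_params_tuning(params_lgb, features,
#                                                    df_train, df_val)
# train_lgbm(params_lgb, df_train, df_val, features,
#            adds_on=best[int(top_aucs.argmax())])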
# pending here -- this function has not been tested yet
def lightGBM_gridCV(param_validation, values, labels, params=params_lgb):
    # make sure that memory can afford the grid search
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    param_test = {
        'max_depth': np.arange(2, 7, 1),
        'num_leaves': np.arange(20, 200, 10),
    }
    estimator = lgb.LGBMRegressor(
        num_leaves=50,
        max_depth=13,
        learning_rate=0.1,
        n_estimators=1000,
        objective='binary',
        min_child_weight=1,
        metric=['auc', 'binary_logloss'],
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=7
    )
    gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
    # `values` / `labels` are the training features and targets (the original
    # referenced them without defining them, so they are taken as arguments here)
    gsearch.fit(values, labels)
    print(gsearch.best_params_, gsearch.best_score_)
    return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    '''
    plot the feature importance sequence
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image DPI
    plt.rcParams['figure.dpi'] = 200  # resolution
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    plt.show()

def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
        params - parameter dict unpacked into LGBMClassifier
    '''
    return lgb.LGBMClassifier(**params)

def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    # kfold = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == model training
    grid_search.fit(dftrain[features], dftrain[resp])
    # == return the fitted search object (best params inside)
    return grid_search
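# A minimal usage sketch (hypothetical, pre-split DataFrame): grid-search the
# sklearn-style classifier returned by buildClf.
# gs = automodelfit(buildClf(), {'max_depth': [3, 4], 'num_leaves': [15, 31]},
#                   df_train, features, 'target', kfold=5)
# best_clf = gs.best_estimator_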
##############################################################################
def modelfit(clf, dftrain, features, resp, useTrainCV=True, kfold=10, eval_metric='auc', early_stopping_rounds=20):
    '''
    model training
    :type useTrainCV: object
    :param clf: LGBMClassifier
    :param dftrain: training set
    :param features: feature columns
    :param resp: label
    :param useTrainCV: if True, call the cv function first to tune n_estimators
    :param kfold: N-fold cross validation
    :param early_stopping_rounds: stop once the loss has barely improved for this many consecutive rounds
    :param eval_metric: tied to the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return: the fitted classifier
    '''
    if useTrainCV:
        # kfold = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        # (the original used xgboost's DMatrix/get_xgb_params here, which lightgbm
        # does not provide; this sticks to the lightgbm equivalents)
        lgb_param = clf.get_params()
        lgtrain = lgb.Dataset(dftrain[features].values, label=dftrain[resp].values)
        cvresult = lgb.cv(lgb_param, lgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
                          metrics=eval_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        clf.set_params(n_estimators=len(cvresult[eval_metric + '-mean']))
    clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
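# A minimal usage sketch (hypothetical data): let cv pick n_estimators first,
# then fit on the full training set.
# clf = modelfit(lgb.LGBMClassifier(n_estimators=500), df_train, features,
#                'target', kfold=5)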
@@ -2,13 +2,74 @@ import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics

target = 'target'

# default parameters
params_xgb = {
    'learning_rate': 0.1,
    'n_estimators': 200,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf            - trained classifier object
        training_set   - training dataset
        validation_set - validation dataset
        features       - feature columns of the datasets
        target         - label column
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        params   - parameter dict
        train    - training set
        val      - validation set
        features - feature list of the dataset
        target   - target column / label of the samples
    '''
    # build the sklearn-style classifier from the parameter dict
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    returnAUC(xgb_clf, train, val, features)
    return xgb_clf
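# A minimal, self-contained usage sketch with synthetic data (illustration only):
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame(rng.rand(300, 3), columns=['f1', 'f2', 'f3'])
    demo['target'] = (demo['f1'] + 0.2 * rng.rand(300) > 0.6).astype(int)
    clf = xgb_train(params_xgb, demo[:240], demo[240:], ['f1', 'f2', 'f3'])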
#############################################################################
def buildClf(max_depth=4, learning_rate=0.05, n_estimators=5000, gamma=0,
             min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, base_score=0.5):
    '''
@@ -37,7 +98,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
    '''
    return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             verbosity=0, silent=0, objective='binary:logistic',
                             booster='gbtree', n_jobs=-1, nthread=2, gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=7, seed=7)
@@ -2,61 +2,70 @@ import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from mvp import lgbreport
from data.analyis import datacal
from models import xgboost
from models import lightgbm
from mvp import dhb

dhb = dhb.dhb()
df_sample = dhb.dhb_features_extract()
target = 'target'
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ', dhb.start_time_period, '-', dhb.end_time_period)
print('---- no.', len(features), 'features in the dhb sample ----')

# to save model performance
if __name__ == '__main__':

    # data extraction
    ''' ## Old Edition here
    # if the total sample holds more than 30000 rows, use a train-validation-test split
    # else use CV for parameter tuning

    # if len(df_sample) >= 30000:
    #     df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
    # else:
    #     df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
    '''
    df_train, df_val, df_test = datacal.train_test_split_general(df_sample)

    # data manipulation
    ## TODO

    # model refit

    # xgboost
    xgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    xgb_model_auc['training_auc'] = None
    xgb_model_auc['val_auc'] = None

    # xgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)

    ## TODO : add xgb AUC per dataset, plus AUC by KA channel / customer segment

    # lightgbm
    lgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    lgb_model_auc['training_auc'] = None
    lgb_model_auc['val_auc'] = None

    # dftrain,dftest = datacal.split_train_val(df_sample, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    # lgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)

    # merge the per-model metrics into a single dataframe (assumed intent: the
    # original referenced an undefined `xgb_model`)
    model_auc = pd.DataFrame([xgb_model_auc, lgb_model_auc], index=['xgb', 'lgb'])
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
# selected topN features
@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
from mvp import dhb
from data.datasource import mysqldb, mongodb
@@ -3,10 +3,11 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import drawplot

def report(dftrain, dftest, features, label, path, filename, kfold=10):
    '''
    dftrain and dftest must contain the columns applied_at, applied_channel, applied_type
@@ -20,11 +21,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
    '''
    document = filetool.buildDocument(path, filename)
    document.add_heading('xgboost algorithm run report')
    clf = lightgbm.buildClf()
    document.add_paragraph('run with initial parameters {}'.format(clf.get_xgb_params()))
    clf = lightgbm.modelfit(clf, dftrain, features, label, kfold=kfold)
    document.add_paragraph('model training set {}'.format(lightgbm.auc(clf, dftrain, features, label)))
    document.add_paragraph('model test set {}'.format(lightgbm.auc(clf, dftest, features, label)))
    document.add_heading('parameter tuning')
    max_depth = [2, 3]
@@ -52,10 +53,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
                      {'reg_lambda': reg_lambda}, features, label, kfold=kfold)
    # == generate the final model report: univariate chart per feature, PDP, liftchart
    dftrain = lightgbm.predict(clf, dftrain, features)
    dftest = lightgbm.predict(clf, dftest, features)
    # == feature importance
    featureimp = lightgbm.featureImportance(clf, features)
    fig = drawplot.draw_barplot(featureimp.head(10), 'feature', 'weight', title='Feature importance')
    fig.savefig('tmp.png')
    document.add_paragraph('feature importance chart, top 10 features')
@@ -106,15 +108,15 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
def tun_params(document, clf, dftrain, dftest, params, features, label, kfold=10):
    for i in dict(params).keys():
        document.add_paragraph('tuning {}, candidate values {}'.format(i, params[i]))
        grid_search = lightgbm.automodelfit(clf, params, dftrain, features, label, kfold=kfold)
        clf = grid_search.best_estimator_
        document.add_paragraph('model training parameters {}'.format(clf.get_xgb_params()))
        # ==
        # clf = xgboost.modelfit(clf, dftrain, features, label)
        document.add_paragraph('parameter search details {}'.format(grid_search.cv_results_))
        document.add_paragraph('best parameters {}, best score {}'.format(grid_search.best_params_, grid_search.best_score_))
        document.add_paragraph('model training set {}'.format(lightgbm.auc(grid_search, dftrain, features, label)))
        document.add_paragraph('model test set {}'.format(lightgbm.auc(grid_search, dftest, features, label)))
    return document, clf