Commit f2e2d5cf authored by 王家华

Add README and LightGBM parameter tuning

parent fe8f7148
# PROJECT_MVP
## Data sources (datasource)
### MongoDB extraction (mongodb)
### MySQL (mysqldb)
### TBD
## General data-processing utilities (tools)
### Common calculation toolkit (datacal)
- train_test_split_general
- univar
- pdp
- liftchart
- TBD
### docx report generation tool (filetool)
### TBD
## Plotting packages (graph)
### Common matplotlib line-chart toolkit (matplot)
### pyecharts plotting package
### TBD
## Online model objects
### Bairong (百融)
### dhb
- Extract the given features, defaulting to the online feature set (dhb_features_extract)
- Fetch online model performance (dhb_comparasion)
- dhb_xgb
- dhb_lgb (stores images for the LightGBM model report)
- report_lgb (output path for the model report and PKL)
- report_xgb
- Online score drift
- Online score PSI
- Score the given features with the online model PKL
- Feature VLM
- TBD
### xy
### Others
## Model methods (models)
### Xgboost
- Default parameter table (params_xgb)
- Return train/validation AUC (returnAUC)
- xgb_train
- buildClf
- automodelfit
- predict
- featureImportance
### LightGBM
- params_lgb (default parameter table)
- returnAUC
- topN_feature_importance
- buildClf
- lgb_params_tuning (combined CV parameter-tuning module)
- train_lgbm (trains the model and calls returnAUC)
## Feature engineering (features)
### Feature selection
- Univariate
- Information entropy
- Variance
- Dimensionality reduction
### Missing value handling
### Standardization (linear models)
### Outliers (linear models)
## mvp
### Program entry point (allocator)
### Fit xgboost (xgbreport)
- Calls the plotting / datacal / filetool packages to generate the report
### Fit lightgbm (lgbreport)
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
                             split_methods='random', time_label='applied_at'):
    '''
    instructions - train/test split (splits only train & test when val_size is None)
    Params :
        dataset
        val_size - validation set ratio
        test_size - test set ratio
        stratify - stratify label
        random_state
        split_methods - 'random' or 'timeSeries'
        time_label - column that identifies date & time
    '''
    # "random" mode: the test set is still taken by time, the train/validation split is random
    if split_methods == 'random':
        df_train, df_test = train_test_split_general(dataset, val_size=None, stratify=None, split_methods='timeSeries')
        # df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=random_state)
        if val_size is not None:
            size = val_size / (1 - test_size)
            df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
            return df_train, df_val, df_test
        # case when no validation set is requested
        return df_train, df_test
    # split data by time sequence
    elif split_methods == 'timeSeries':
        data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
        df_test = data_tmp[: int(len(dataset) * test_size)]
        df_train = data_tmp[int(len(dataset) * test_size):]
        return df_train, df_test
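# A minimal usage sketch (hypothetical df_sample; assumes a 'target' label column and an
# 'applied_at' timestamp column, matching the defaults above):
#   df_train, df_val, df_test = train_test_split_general(df_sample, val_size=0.2, test_size=0.2)
#   df_train, df_test = train_test_split_general(df_sample, val_size=None, split_methods='timeSeries')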
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    '''
...@@ -27,6 +60,7 @@ def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None
        val = None
    return train, val

def cal_week(df, date_name, date_name_new):
    '''
    :param df: dataframe
......
...@@ -17,6 +17,46 @@ plt.rcParams['savefig.dpi'] = 226  # saved-figure DPI
plt.rcParams['figure.dpi'] = 200  # figure resolution
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
               saved_path=None):
    '''
    instructions : visualization of a pivot table (one line per column, AUC shown in the legend)
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved-figure DPI
    plt.rcParams['figure.dpi'] = 200   # figure resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
    table_rows = dataset.columns
    table_cols = dataset.index
    # traverse each column of the dataframe and draw one line per column
    for i in table_rows:
        x = table_cols
        y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + str(auc[i]))
    if plot_tab:
        # one table row per plotted series, so rowLabels / colLabels line up with the cells
        the_table = plt.table(cellText=[list(dataset[col].values) for col in table_rows],
                              rowLabels=table_rows,
                              colLabels=table_cols,
                              colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
                              loc='bottom')
        plt.xticks([])
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(8)
        fig.subplots_adjust(bottom=0.2)
    plt.grid()
    plt.ylabel(title)
    plt.legend()
    # plt.vlines(xrange(len(cols))[0], y, color='lightgrey', linestyle='--')
    plt.title(title)
    plt.show()
    return 1
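# A hedged usage sketch (illustrative data only): the pivot's index becomes the x-axis,
# each column becomes one line, and `auc` maps column name -> AUC shown in the legend.
#   demo = pd.DataFrame({'train': [0.10, 0.12, 0.15], 'validation': [0.11, 0.14, 0.18]},
#                       index=['bin1', 'bin2', 'bin3'])
#   plot_table(demo, auc={'train': 0.75, 'validation': 0.71}, title='liftchart demo')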
def plot_curve_singleCurve(dataset, x_label = None, y_label = None, table_tab = None,
                           save_path = None, figure_arrangement = 11, fig_size = (4,3),
...@@ -103,9 +143,6 @@ def density_chart(dataset,title):
    plt.title(title)
    plt.show()
def uniVarChart():
    return 1
......
...@@ -2,86 +2,201 @@ import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, psutil
params_lgb = {
    'task': 'train',                    # task type
    'application': 'binary',            # binary classification
    'boosting_type': 'gbdt',            # boosting type
    'num_boost_round': 150,             # number of boosting iterations
    'learning_rate': 0.01,              # learning rate
    'metric': {'logloss', 'auc'},       # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',        # objective function
    'max_depth': 4,
    'num_leaves': 20,                   # number of leaves
    'feature_fraction': 0.9,            # feature sampling ratio per tree
    'bagging_fraction': 0.8,            # row sampling ratio per tree
    'bagging_freq': 5,                  # k means bagging is performed every k iterations
    'verbose': 1                        # <0 fatal only, =0 errors (warnings), >0 info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC of the training set & validation set
    Parameters :
        clf - trained classifier / booster
        training_set - training dataset
        validation_set - validation dataset
        features - feature columns used for prediction
        target - label column name
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params - default params
        df_train - training set
        df_val - validation set
        features - feature list of the dataset
        adds_on - dict of parameters that override the defaults for this run
        target - target column or label list of samples
    '''
    params = params.copy()
    print(type(df_train), type(df_val))
    # override training params with adds_on, if given
    if adds_on is not None:
        for i in adds_on.keys():
            params[i] = adds_on[i]
    # convert DataFrames to LightGBM's binary Dataset format
    lgb_train = lgb.Dataset(df_train[features], df_train[target])
    lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
    lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
    train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
    # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
    return train_auc, val_auc, lgbm
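# A hedged usage sketch (assumes df_train / df_val come from datacal.train_test_split_general
# and `features` is a list of column names; the num_leaves override is purely illustrative):
#   train_auc, val_auc, booster = train_lgbm(params_lgb, df_train, df_val, features,
#                                            adds_on={'num_leaves': 31})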
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal parameters with lgbm
    Parameters :
        params - default parameters (dict format)
        features - feature list
        train - training set
        val - validation set
        target - target label
        topN - keep the top N optimal parameter combinations
        cv_fold - k-fold CV
    '''
    # work on a copy so the default dict is not mutated
    params = params.copy()
    lgb_train = lgb.Dataset(train[features], train[target])
    lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
    # ndarray of shape (topN,) holding the best AUCs found so far
    topn = np.zeros(topN)
    # make sure that memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    optimal_para = list(topn)
    for deepth in np.arange(2, 7, 1):
        for leaves in np.arange(2, 2 ** deepth, 2):
            params['max_depth'] = deepth
            params['num_leaves'] = leaves
            print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
            cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
            # best (maximum) mean AUC across boosting rounds
            auc_score = pd.Series(cv_result['auc-mean']).max()
            print('auc ', auc_score)
            boost_round = pd.Series(cv_result['auc-mean']).argmax()
            # if this AUC beats any entry currently kept in the top-N list
            if (auc_score > topn).any():
                # locate the worst (lowest-AUC) slot once, so the score and the
                # parameter dict are written to the same position
                idx = topn.argmin()
                topn[idx] = auc_score
                para = {}
                para['max_depth'] = deepth
                para['num_leaves'] = leaves
                optimal_para[idx] = para
    return optimal_para, lgb_train, lgb_val, topn


# lgb_train = lgb.Dataset(training_set[features], training_set[target])
# lgb.train(params,)
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve),pd.Series(validation_curve)],index=['trainingAUC','validationAUC'],axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve,label='training')
# plt.plot(candidate_list, validation_curve,label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]
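# A hedged usage sketch for lgb_params_tuning (illustrative only): it grid-searches
# max_depth / num_leaves with lgb.cv and keeps the topN best AUC combinations.
#   optimal_para, lgb_train, lgb_val, topn = lgb_params_tuning(params_lgb, features,
#                                                              df_train, df_val, topN=3, cv_fold=5)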
# pending here: this function has not been tested yet
def lightGBM_gridCV(param_validation, params=params_lgb):
    # make sure that memory can afford it
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    param_test = {
        'max_depth': np.arange(2, 7, 1),
        'num_leaves': np.arange(20, 200, 10),
    }
    estimator = lgb.LGBMRegressor(
        num_leaves=50,
        max_depth=13,
        learning_rate=0.1,
        n_estimators=1000,
        objective='binary',
        min_child_weight=1,
        metric=['auc', 'binary_logloss'],
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=7
    )
    gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
    # NOTE: `values` / `labels` (training features and targets) still have to be wired in
    gsearch.fit(values, labels)
    gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
    return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
    '''
    plot the feature importance sequence
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved-figure DPI
    plt.rcParams['figure.dpi'] = 200   # figure resolution
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    plt.show()


def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
    '''
    # unpack the dict so each key becomes a keyword argument
    return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    # kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == model training
    grid_search.fit(dftrain[features], dftrain[resp])
    # == return the fitted search (best parameters are available on the object)
    return grid_search
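# A hedged usage sketch (hypothetical parameter grid): automodelfit wraps sklearn's
# GridSearchCV around a classifier built by buildClf.
#   grid = automodelfit(buildClf(), {'max_depth': [3, 4], 'num_leaves': [15, 31]},
#                       df_train, features, 'target', kfold=5)
#   print(grid.best_params_, grid.best_score_)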
##############################################################################
def modelfit(clf, dftrain, features, resp, useTrainCV=True, kfold=10, eval_metric='auc', early_stopping_rounds=20):
    '''
    Model training
    :type useTrainCV: object
    :param clf: LGBMClassifier (this helper was adapted from the xgboost version)
    :param dftrain: training set
    :param features: feature list
    :param resp: label column
    :param useTrainCV: if True, run cv first to tune n_estimators
    :param kfold: number of CV folds
    :param early_stopping_rounds: stop once the loss has not improved for this many rounds
    :param eval_metric: must match the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return: the fitted classifier
    '''
    if useTrainCV:
        # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        # NOTE: adapted to the LightGBM API (the original code used xgboost's DMatrix / get_xgb_params)
        lgb_param = clf.get_params()
        lgbtrain = lgb.Dataset(dftrain[features].values, label=dftrain[resp].values)
        cvresult = lgb.cv(lgb_param, lgbtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
                          metrics=eval_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        # use the number of rounds actually run by cv as n_estimators
        clf.set_params(n_estimators=len(cvresult['%s-mean' % eval_metric]))
    clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
...@@ -2,13 +2,74 @@ import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics

target = 'target'
# default parameters
params_xgb = {
    'learning_rate': 0.1,
    'n_estimators': 200,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC of the training set & validation set
    Parameters :
        clf - trained classifier
        training_set - training dataset
        validation_set - validation dataset
        features - feature columns used for prediction
        target - label column name
    '''
    train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
    val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
    print('training set AUC : ', train_auc)
    print('validation set AUC : ', val_auc)
    return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        params - parameter dict
        train / val - training and validation sets
        features - feature list of the dataset
        target - target column or label list of samples
    '''
    dtrain = xgb.DMatrix(train[features], train[target])
    dval = xgb.DMatrix(val[features], val[target])
    # use the sklearn wrapper; unpack the dict so each key becomes a keyword argument
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    # xgbm = xgb.train(params, dtrain)
    returnAUC(xgb_clf, train, val, features)
    # auc = roc_auc_score(dev['target'], gbm.predict(dev[features]))
    return xgb_clf
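# A hedged usage sketch (assumes df_train / df_val / df_test and a feature list are prepared):
#   clf = xgb_train(params_xgb, df_train, df_val, features, target='target')
#   test_scores = clf.predict_proba(df_test[features])[:, 1]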
#############################################################################
def buildClf(max_depth=4, learning_rate=0.05, n_estimators=5000, gamma=0,
             min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, base_score=0.5):
    '''
...@@ -37,7 +98,7 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
    '''
    return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             verbosity=0, silent=0, objective='binary:logistic',
                             booster='gbtree', n_jobs=-1, nthread=2, gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=7, seed=7
......
...@@ -2,61 +2,70 @@ import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from mvp import lgbreport
from data.analyis import datacal
from models import xgboost
from models import lightgbm
from mvp import dhb

dhb = dhb.dhb()
df_sample = dhb.dhb_features_extract()
target = 'target'
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ', dhb.start_time_period, '-', dhb.end_time_period)
print('----no.', len(features), 'features of dhb----')

# to save model performance
if __name__ == '__main__':

    # data extraction

    '''  ## Old Edition here
    # if the total sample is more than 30000, use a train-validation-test split
    # else use CV for parameter tuning

    # if len(df_sample) >= 30000:
    #     df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
    # else:
    #     df_train, df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
    '''
    df_train, df_val, df_test = datacal.train_test_split_general(df_sample)

    # data manipulation
    ## TODO

    # model refit

    # xgboost
    xgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    xgb_model_auc['training_auc'] = None
    xgb_model_auc['val_auc'] = None

    # xgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)

    ## TODO: add per-dataset AUC for xgb, plus AUC by KA channel / customer segment

    # lightgbm
    lgb_model_auc = {'training_auc': None, 'val_auc': None, 'test_auc': None}
    lgb_model_auc['training_auc'] = None
    lgb_model_auc['val_auc'] = None

    # dftrain, dftest = datacal.split_train_val(df_sample, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    # lgbreport.report(df_train, df_test, df_val, features, target, '', 'dhb模型迭代报告.doc', kfold=2)

    # merge the per-model AUC dicts into a single dataframe
    pd.DataFrame([xgb_model_auc, lgb_model_auc], index=['xgb', 'lgb'])
import pandas as pd
from data.datasource import mysqldb, mongodb
import time
from dateutil.relativedelta import relativedelta
import datetime

'''
...@@ -19,253 +19,255 @@ API :
'''
class dhb:
    features = ['dhb_last_30_and_60_days_dun_call_avg_duration', 'dhb_last_30_and_60_days_dun_call_duration_above60',
                'dhb_last_30_and_60_days_dun_call_duration_below15', 'dhb_last_30_and_60_days_dun_call_duration_between15_and_30',
                'dhb_last_30_and_60_days_dun_call_in_duration', 'dhb_last_30_and_60_days_dun_call_in_times',
                'dhb_last_30_and_60_days_dun_call_out_duration', 'dhb_last_30_and_60_days_dun_call_out_times',
                'dhb_last_30_and_60_days_dun_call_tel_total_nums', 'dhb_last_30_and_60_days_dun_call_total_duration',
                'dhb_last_30_and_60_days_dun_call_total_times', 'dhb_last_30_and_60_days_ntdun_call_avg_duration',
                'dhb_last_30_and_60_days_ntdun_call_duration_above60', 'dhb_last_30_and_60_days_ntdun_call_duration_below15',
                'dhb_last_30_and_60_days_ntdun_call_duration_between15_and_30', 'dhb_last_30_and_60_days_ntdun_call_in_duration',
                'dhb_last_30_and_60_days_ntdun_call_in_times', 'dhb_last_30_and_60_days_ntdun_call_out_duration',
                'dhb_last_30_and_60_days_ntdun_call_out_times', 'dhb_last_30_and_60_days_ntdun_call_tel_total_nums',
                'dhb_last_30_and_60_days_ntdun_call_total_duration', 'dhb_last_30_and_60_days_ntdun_call_total_times',
                'dhb_last_30_days_dun_call_avg_duration', 'dhb_last_30_days_dun_call_duration_above60',
                'dhb_last_30_days_dun_call_duration_below15', 'dhb_last_30_days_dun_call_duration_between15_and_30',
                'dhb_last_30_days_dun_call_in_duration', 'dhb_last_30_days_dun_call_in_times',
                'dhb_last_30_days_dun_call_out_duration', 'dhb_last_30_days_dun_call_out_times',
                'dhb_last_30_days_dun_call_tel_total_nums', 'dhb_last_30_days_dun_call_total_duration',
                'dhb_last_30_days_dun_call_total_times', 'dhb_last_30_days_ntdun_call_avg_duration',
                'dhb_last_30_days_ntdun_call_duration_above60', 'dhb_last_30_days_ntdun_call_duration_below15',
                'dhb_last_30_days_ntdun_call_duration_between15_and_30', 'dhb_last_30_days_ntdun_call_in_duration',
                'dhb_last_30_days_ntdun_call_in_times', 'dhb_last_30_days_ntdun_call_out_duration',
                'dhb_last_30_days_ntdun_call_out_times', 'dhb_last_30_days_ntdun_call_tel_total_nums',
                'dhb_last_30_days_ntdun_call_total_duration', 'dhb_last_30_days_ntdun_call_total_times',
                'dhb_last_60_and_90_days_dun_call_avg_duration', 'dhb_last_60_and_90_days_dun_call_duration_above60',
                'dhb_last_60_and_90_days_dun_call_duration_below15', 'dhb_last_60_and_90_days_dun_call_duration_between15_and_30',
                'dhb_last_60_and_90_days_dun_call_in_duration', 'dhb_last_60_and_90_days_dun_call_in_times',
                'dhb_last_60_and_90_days_dun_call_out_duration', 'dhb_last_60_and_90_days_dun_call_out_times',
                'dhb_last_60_and_90_days_dun_call_tel_total_nums', 'dhb_last_60_and_90_days_dun_call_total_duration',
                'dhb_last_60_and_90_days_dun_call_total_times', 'dhb_last_60_and_90_days_ntdun_call_avg_duration',
                'dhb_last_60_and_90_days_ntdun_call_duration_above60', 'dhb_last_60_and_90_days_ntdun_call_duration_below15',
                'dhb_last_60_and_90_days_ntdun_call_duration_between15_and_30', 'dhb_last_60_and_90_days_ntdun_call_in_duration',
                'dhb_last_60_and_90_days_ntdun_call_in_times', 'dhb_last_60_and_90_days_ntdun_call_out_duration',
                'dhb_last_60_and_90_days_ntdun_call_out_times', 'dhb_last_60_and_90_days_ntdun_call_tel_total_nums',
                'dhb_last_60_and_90_days_ntdun_call_total_duration', 'dhb_last_60_and_90_days_ntdun_call_total_times',
                'dhb_last_three_weeks_dun_call_avg_duration', 'dhb_last_three_weeks_dun_call_duration_above60',
                'dhb_last_three_weeks_dun_call_duration_below15', 'dhb_last_three_weeks_dun_call_duration_between15_and_30',
                'dhb_last_three_weeks_dun_call_in_duration', 'dhb_last_three_weeks_dun_call_in_times',
                'dhb_last_three_weeks_dun_call_out_duration', 'dhb_last_three_weeks_dun_call_out_times',
                'dhb_last_three_weeks_dun_call_tel_total_nums', 'dhb_last_three_weeks_dun_call_total_duration',
                'dhb_last_three_weeks_dun_call_total_times', 'dhb_last_three_weeks_ntdun_call_avg_duration',
                'dhb_last_three_weeks_ntdun_call_duration_above60', 'dhb_last_three_weeks_ntdun_call_duration_below15',
                'dhb_last_three_weeks_ntdun_call_duration_between15_and_30', 'dhb_last_three_weeks_ntdun_call_in_duration',
                'dhb_last_three_weeks_ntdun_call_in_times', 'dhb_last_three_weeks_ntdun_call_out_duration',
                'dhb_last_three_weeks_ntdun_call_out_times', 'dhb_last_three_weeks_ntdun_call_tel_total_nums',
                'dhb_last_three_weeks_ntdun_call_total_duration', 'dhb_last_three_weeks_ntdun_call_total_times',
                'dhb_last_two_weeks_dun_call_avg_duration', 'dhb_last_two_weeks_dun_call_duration_above60',
                'dhb_last_two_weeks_dun_call_duration_below15', 'dhb_last_two_weeks_dun_call_duration_between15_and_30',
                'dhb_last_two_weeks_dun_call_in_duration', 'dhb_last_two_weeks_dun_call_in_times',
                'dhb_last_two_weeks_dun_call_out_duration', 'dhb_last_two_weeks_dun_call_out_times',
                'dhb_last_two_weeks_dun_call_tel_total_nums', 'dhb_last_two_weeks_dun_call_total_duration',
                'dhb_last_two_weeks_dun_call_total_times', 'dhb_last_two_weeks_ntdun_call_avg_duration',
                'dhb_last_two_weeks_ntdun_call_duration_above60', 'dhb_last_two_weeks_ntdun_call_duration_below15',
                'dhb_last_two_weeks_ntdun_call_duration_between15_and_30', 'dhb_last_two_weeks_ntdun_call_in_duration',
                'dhb_last_two_weeks_ntdun_call_in_times', 'dhb_last_two_weeks_ntdun_call_out_duration',
                'dhb_last_two_weeks_ntdun_call_out_times', 'dhb_last_two_weeks_ntdun_call_tel_total_nums',
                'dhb_last_two_weeks_ntdun_call_total_duration', 'dhb_last_two_weeks_ntdun_call_total_times',
                'dhb_last_week_dun_call_avg_duration', 'dhb_last_week_dun_call_duration_above60',
                'dhb_last_week_dun_call_duration_below15', 'dhb_last_week_dun_call_duration_between15_and_30',
                'dhb_last_week_dun_call_in_duration', 'dhb_last_week_dun_call_in_times',
                'dhb_last_week_dun_call_out_duration', 'dhb_last_week_dun_call_out_times',
                'dhb_last_week_dun_call_tel_total_nums', 'dhb_last_week_dun_call_total_duration',
                'dhb_last_week_dun_call_total_times', 'dhb_last_week_ntdun_call_avg_duration',
                'dhb_last_week_ntdun_call_duration_above60', 'dhb_last_week_ntdun_call_duration_below15',
                'dhb_last_week_ntdun_call_duration_between15_and_30', 'dhb_last_week_ntdun_call_in_duration',
                'dhb_last_week_ntdun_call_in_times', 'dhb_last_week_ntdun_call_out_duration',
                'dhb_last_week_ntdun_call_out_times', 'dhb_last_week_ntdun_call_tel_total_nums',
                'dhb_last_week_ntdun_call_total_duration', 'dhb_last_week_ntdun_call_total_times',
                'dhb_overview_dun_call_avg_duration', 'dhb_overview_dun_call_duration_above60',
                'dhb_overview_dun_call_duration_below15', 'dhb_overview_dun_call_duration_between15_and_30',
                'dhb_overview_dun_call_in_duration', 'dhb_overview_dun_call_in_times',
                'dhb_overview_dun_call_out_duration', 'dhb_overview_dun_call_out_times',
                'dhb_overview_dun_call_tel_total_nums', 'dhb_overview_dun_call_total_duration',
                'dhb_overview_dun_call_total_times', 'dhb_overview_dun_first_call_time',
                'dhb_overview_dun_last_call_time', 'dhb_overview_ntdun_call_avg_duration',
                'dhb_overview_ntdun_call_duration_above60', 'dhb_overview_ntdun_call_duration_below15',
                'dhb_overview_ntdun_call_duration_between15_and_30', 'dhb_overview_ntdun_call_in_duration',
                'dhb_overview_ntdun_call_in_times', 'dhb_overview_ntdun_call_out_duration',
                'dhb_overview_ntdun_call_out_times', 'dhb_overview_ntdun_call_tel_total_nums',
                'dhb_overview_ntdun_call_total_duration', 'dhb_overview_ntdun_call_total_times',
                'dhb_overview_ntdun_first_call_time', 'dhb_overview_ntdun_last_call_time']

    sql = '''
    select ''' + str(features).replace('[', '').replace(']', '').replace('\'', '') + ''',applied_at,applied_from,applied_type,if(passdue_day>15,1,0) as target
    from risk_analysis
    where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
    and transacted = 1
    and dhb_flag =1
    and datediff(now(),deadline) > 15
    '''
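    # For illustration (assumed rendering): the expression above strips the brackets and quotes
    # from `features`, so the query expands roughly to
    #   select dhb_last_30_and_60_days_dun_call_avg_duration, ..., applied_at, applied_from,
    #          applied_type, if(passdue_day>15,1,0) as target from risk_analysis where ...
    # with @start_time_period / @end_time_period substituted at query time.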
    # default time interval
    # sql = '''
    #
    # '''
    start_time_period = (datetime.date.today() - relativedelta(months=+7)).strftime("%Y-%m-%d 00:00:00")
    end_time_period = (datetime.date.today() - relativedelta(days=+17)).strftime("%Y-%m-%d 00:00:00")
    def __init__(self, features=None, sql=None, start_time_period=None, end_time_period=None):
        try:
            # use the given feature list if one was passed in
            if features is not None:
                self.features = features
        except Exception as e:
            print("'features' parameter type Error, it should be list or Series")
            raise
        if start_time_period is not None:
            self.start_time_period = start_time_period
        if end_time_period is not None:
            self.end_time_period = end_time_period
        if sql is not None:
            self.sql = sql
    def dhb_features_extract(self):
        # map the Chinese recency buckets stored in the DB (e.g. "近3天" = within the last 3 days,
        # "无" = none) to ordinal codes
        value_map = {
            "近3天": 1,
            "近4-5天": 2,
            "近6-7天": 3,
            "近8-15天": 4,
            "近16-30天": 5,
            "近31-60天": 6,
            "近61-90天": 7,
            "近91-120天": 8,
            "近121-150天": 9,
            "近151-180天": 10,
            "180天前": 11,
            "无": 0
        }
        # print(self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period', self.end_time_period))
        # use risk_analysis to extract the data
        print('sql: ', self.sql.replace('@start_time_period', self.start_time_period)
                               .replace('@end_time_period', self.end_time_period))

        dhb_loan = pd.read_sql(
            self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period', self.end_time_period),
            mysqldb.engine_risk_analysis)

        # encode the first/last call time buckets with value_map
        time_cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
                     "dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]
        dhb_loan[time_cols] = dhb_loan[time_cols].applymap(lambda x: value_map[x])

        # cap extreme values at the thresholds used by the online model
        dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
        dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
        dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
        dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
        dhb_loan.loc[dhb_loan.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
        dhb_loan.loc[dhb_loan.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
        dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
        dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
        dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
        dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
        dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
        dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
        dhb_loan.loc[dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300

        dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
        print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(time.time())) +
              " extracted dhb samples for the period " + self.start_time_period + " to " + self.end_time_period)
        return dhb_loan
    '''
...@@ -284,30 +286,64 @@ class dhb:
    auc comparison
    liftchart plot
    '''
    from data.datasource import mongodb

    def dhb_dataSketch(self, df, given_dataset=None, start_time_period=start_time_period,
                       end_time_period=end_time_period, applied_type=None, applied_from=None):
        limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
        query = "{'order_id':1,'@key':1}"
        df_mongo = mongodb.querymongo(start_time_period, end_time_period,
                                      limit.replace('@start_date', start_time_period).replace('@end_date', end_time_period),
                                      "{'order_id':1,'model_exec_data_source#dhb':1}")
        # join the mongo scores onto the mysql sample by order id
        df = pd.merge(df, df_mongo, how='left', left_on='order_no', right_on='order_id')
        df = df.dropna(axis=0)
        df.to_csv('./mvp/dhb_target.csv')
        return df
    def dhb_comparasion(df, score_BM='model_exec_data_source#dhb', score_predict='predict', target='target',
                        applied_type=None, applied_from=None):
        '''
        instructions : comparison of the liftchart & auc against the previous online dhb score
        '''
        # restrict the data to the given applied_type & applied channel
        df = df[df.applied_type == applied_type]
        df = df[df.applied_from == applied_from]
        # cut into bins
        ## benchmark bins
        df['bins_BM'] = pd.qcut(df[score_BM], q=10, precision=6, duplicates='drop')
        ## bins of the predictions
        df['bins_predict'] = pd.qcut(df[score_predict], q=10, precision=6, duplicates='drop')
        pivot_BM = df[['bins_BM', target]].groupby('bins_BM')
        pivot_predict = df[['bins_predict', target]].groupby('bins_predict')
        # output liftchart & AUC: bad rate per bin
        pivot_BM = pivot_BM.sum() / pivot_BM.count()
        pivot_predict = pivot_predict.sum() / pivot_predict.count()
        # concatenate the two pivots
        pivot = pd.concat([pivot_BM, pivot_predict], axis=1)
        # pivot-table plot
        pivot.plot()
        return 1
    def data_merge(self):
        # merge data from mongodb & risk_analysis
        return 1

    def vlm(self, feature):
        return 1
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
# selected top-N features
...@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
from mvp import dhb
from data.datasource import mysqldb, mongodb
...@@ -3,10 +3,11 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import drawplot

def report(dftrain, dftest, features, label, path, filename, kfold=10):
    '''
    dftrain and dftest must contain the columns applied_at, applied_channel, applied_type
...@@ -20,11 +21,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
    '''
    document = filetool.buildDocument(path, filename)
    document.add_heading('xgboost 算法运行报告')
    clf = lightgbm.buildClf()
    document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params()))
    clf = lightgbm.modelfit(clf, dftrain, features, label, kfold=kfold)
    document.add_paragraph('模型训练集{}'.format(lightgbm.auc(clf, dftrain, features, label)))
    document.add_paragraph('模型测试集{}'.format(lightgbm.auc(clf, dftest, features, label)))
    document.add_heading('调整参数')
    max_depth = [2, 3]
...@@ -52,10 +53,11 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
                         {'reg_lambda': reg_lambda}, features, label, kfold=kfold)
    # == generate the final model report: a univariate chart for each feature, PDP, liftchart
    dftrain = lightgbm.predict(clf, dftrain, features)
    dftest = lightgbm.predict(clf, dftest, features)
    # == feature importance
    featureimp = lightgbm.featureImportance(clf, features)
    fig = drawplot.draw_barplot(featureimp.head(10), 'feature', 'weight', title='Feature importance')
    fig.savefig('tmp.png')
    document.add_paragraph('特征权重图,近前10个特征')
...@@ -106,15 +108,15 @@ def report(dftrain,dftest,features,label,path,filename,kfold=10):
def tun_params(document, clf, dftrain, dftest, params, features, label, kfold=10):
    for i in dict(params).keys():
        document.add_paragraph('调参{},取值{}'.format(i, params[i]))
        grid_search = lightgbm.automodelfit(clf, params, dftrain, features, label, kfold=kfold)
        clf = grid_search.best_estimator_
        document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
        # ==
        # clf = xgboost.modelfit(clf, dftrain, features, label)
        document.add_paragraph('寻找最优参数过程{}'.format(grid_search.cv_results_))
        document.add_paragraph('最优参数{},最优分{}'.format(grid_search.best_params_, grid_search.best_score_))
        document.add_paragraph('模型训练集{}'.format(lightgbm.auc(grid_search, dftrain, features, label)))
        document.add_paragraph('模型测试集{}'.format(lightgbm.auc(grid_search, dftest, features, label)))
    return document, clf
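# A hedged usage sketch (mirrors how report() appears to call it; the grid is illustrative):
#   document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': [2, 3]},
#                              features, label, kfold=kfold)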
......