Commit 45721de0 authored by 王家华's avatar 王家华

debug

parent b5a3f366
import lightgbm as lgb import lightgbm as lgb
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy import numpy
import pandas import pandas
...@@ -21,8 +22,9 @@ params = { ...@@ -21,8 +22,9 @@ params = {
'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息 'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
} }
'''
'''
instructions : training lightgbm model with specified params instructions : training lightgbm model with specified params
Parameters : Parameters :
...@@ -33,5 +35,53 @@ Parameters : ...@@ -33,5 +35,53 @@ Parameters :
''' '''
def lgb_train(params, training_set, features, target):
    """Build the LightGBM dataset for training; the training call is still a stub.

    Parameters:
        params: LightGBM parameters. Currently unused; reserved for the
            pending ``lgb.train`` call.
        training_set: table indexed with ``features`` and ``target`` to obtain
            the feature data and the labels.
        features: column selector for the feature data.
        target: column selector for the labels.

    Returns:
        The constant ``1``, a placeholder until training is implemented.
    """
    # Use a distinct local name so the dataset does not shadow this function.
    train_data = lgb.Dataset(training_set[features], training_set[target])
    # TODO: train with lgb.train(params, train_data) and return the booster.
    return 1
'''
instructions : build a lgb classifier
Params :
'''
def buildClf(params):
    """Build an unfitted LightGBM classifier from the given settings.

    Parameters:
        params: dict of ``LGBMClassifier`` keyword arguments. Any non-dict
            value is forwarded positionally, as before.

    Returns:
        A new ``lgb.LGBMClassifier`` instance.
    """
    if isinstance(params, dict):
        # Unpack the mapping: handing the dict over positionally would bind it
        # to the constructor's first parameter instead of applying the settings.
        return lgb.LGBMClassifier(**params)
    return lgb.LGBMClassifier(params)
'''
'''
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    """Grid-search ``param_grid`` for ``clf`` and return the fitted search object.

    Parameters:
        clf: estimator handed to ``GridSearchCV``.
        param_grid: parameter grid handed to ``GridSearchCV``.
        dftrain: training table, indexed with ``features`` and ``resp``.
        features: column selector for the feature data.
        resp: column selector for the label.
        kfold: forwarded as the ``cv`` argument of ``GridSearchCV``.
        scoring: forwarded as the ``scoring`` argument of ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` instance; read the best parameters and
        scores from it.
    """
    search_options = dict(scoring=scoring, n_jobs=2, cv=kfold, verbose=2, refit=True)
    try:
        grid_search = GridSearchCV(clf, param_grid, iid=True, **search_options)
    except TypeError:
        # scikit-learn 0.24 removed the ``iid`` argument; retry without it so
        # the helper works on both old and current releases.
        grid_search = GridSearchCV(clf, param_grid, **search_options)
    # Train the model over the whole grid.
    grid_search.fit(dftrain[features], dftrain[resp])
    return grid_search
def modelfit(clf, dftrain, features, resp, useTrainCV=True, kfold=10, eval_metric='auc', early_stopping_rounds=20):
    """Fit a LightGBM estimator, optionally sizing ``n_estimators`` by cross-validation.

    Parameters:
        clf: LightGBM scikit-learn style estimator; must provide
            ``get_params``, ``set_params`` and ``fit``.
        dftrain: training table, indexed with ``features`` and ``resp``.
        features: column selector for the feature data.
        resp: column selector for the label.
        useTrainCV: when true, run ``lgb.cv`` first and set ``n_estimators``
            to the number of boosting rounds it reports.
        kfold: number of cross-validation folds (``nfold`` of ``lgb.cv``).
        eval_metric: metric passed to both ``lgb.cv`` and ``clf.fit``.
        early_stopping_rounds: forwarded to ``lgb.cv``.

    Returns:
        ``clf`` after fitting on the full training table.
    """
    if useTrainCV:
        # Reuse the estimator's own settings for cross-validation. Entries set
        # to None are dropped so only explicitly configured values are passed.
        # NOTE(review): if clf leaves ``objective`` unset, confirm lgb.cv still
        # runs with the intended objective.
        cv_params = {key: value for key, value in clf.get_params().items() if value is not None}
        max_rounds = cv_params.pop('n_estimators')
        train_set = lgb.Dataset(dftrain[features].values, label=dftrain[resp].values)
        cvresult = lgb.cv(cv_params, train_set, num_boost_round=max_rounds, nfold=kfold,
                          metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True)
        # lgb.cv returns a dict of per-round score lists (not a DataFrame), so
        # the length of any one list is the number of rounds that were kept.
        best_rounds = len(next(iter(cvresult.values())))
        clf.set_params(n_estimators=best_rounds)
    clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
import pandas as pd import pandas as pd
from data.datasource import mysqldb,mongodb from data.datasource import mysqldb,mongodb
import time
from dateutil.relativedelta import relativedelta
import datetime import datetime
import dateutil
''' '''
model instructions : establishes a dhb object which contains the attributes of the dhb model
...@@ -20,222 +22,223 @@ API : ...@@ -20,222 +22,223 @@ API :
class dhb: class dhb:
# features as Series format # features as Series format
features = ['dhb_last_30_and_60_days_dun_call_avg_duration', #features = pd.read_excel()
'dhb_last_30_and_60_days_dun_call_duration_above60',
'dhb_last_30_and_60_days_dun_call_duration_below15', sql = '''
'dhb_last_30_and_60_days_dun_call_duration_between15_and_30', select dhb_last_30_and_60_days_dun_call_avg_duration,
'dhb_last_30_and_60_days_dun_call_in_duration', dhb_last_30_and_60_days_dun_call_duration_above60,
'dhb_last_30_and_60_days_dun_call_in_times', dhb_last_30_and_60_days_dun_call_duration_below15,
'dhb_last_30_and_60_days_dun_call_out_duration', dhb_last_30_and_60_days_dun_call_duration_between15_and_30,
'dhb_last_30_and_60_days_dun_call_out_times', dhb_last_30_and_60_days_dun_call_in_duration,
'dhb_last_30_and_60_days_dun_call_tel_total_nums', dhb_last_30_and_60_days_dun_call_in_times,
'dhb_last_30_and_60_days_dun_call_total_duration', dhb_last_30_and_60_days_dun_call_out_duration,
'dhb_last_30_and_60_days_dun_call_total_times', dhb_last_30_and_60_days_dun_call_out_times,
'dhb_last_30_and_60_days_ntdun_call_avg_duration', dhb_last_30_and_60_days_dun_call_tel_total_nums,
'dhb_last_30_and_60_days_ntdun_call_duration_above60', dhb_last_30_and_60_days_dun_call_total_duration,
'dhb_last_30_and_60_days_ntdun_call_duration_below15', dhb_last_30_and_60_days_dun_call_total_times,
'dhb_last_30_and_60_days_ntdun_call_duration_between15_and_30', dhb_last_30_and_60_days_ntdun_call_avg_duration,
'dhb_last_30_and_60_days_ntdun_call_in_duration', dhb_last_30_and_60_days_ntdun_call_duration_above60,
'dhb_last_30_and_60_days_ntdun_call_in_times', dhb_last_30_and_60_days_ntdun_call_duration_below15,
'dhb_last_30_and_60_days_ntdun_call_out_duration', dhb_last_30_and_60_days_ntdun_call_duration_between15_and_30,
'dhb_last_30_and_60_days_ntdun_call_out_times', dhb_last_30_and_60_days_ntdun_call_in_duration,
'dhb_last_30_and_60_days_ntdun_call_tel_total_nums', dhb_last_30_and_60_days_ntdun_call_in_times,
'dhb_last_30_and_60_days_ntdun_call_total_duration', dhb_last_30_and_60_days_ntdun_call_out_duration,
'dhb_last_30_and_60_days_ntdun_call_total_times', dhb_last_30_and_60_days_ntdun_call_out_times,
'dhb_last_30_days_dun_call_avg_duration', dhb_last_30_and_60_days_ntdun_call_tel_total_nums,
'dhb_last_30_days_dun_call_duration_above60', dhb_last_30_and_60_days_ntdun_call_total_duration,
'dhb_last_30_days_dun_call_duration_below15', dhb_last_30_and_60_days_ntdun_call_total_times,
'dhb_last_30_days_dun_call_duration_between15_and_30', dhb_last_30_days_dun_call_avg_duration,
'dhb_last_30_days_dun_call_in_duration', dhb_last_30_days_dun_call_duration_above60,
'dhb_last_30_days_dun_call_in_times', dhb_last_30_days_dun_call_duration_below15,
'dhb_last_30_days_dun_call_out_duration', dhb_last_30_days_dun_call_duration_between15_and_30,
'dhb_last_30_days_dun_call_out_times', dhb_last_30_days_dun_call_in_duration,
'dhb_last_30_days_dun_call_tel_total_nums', dhb_last_30_days_dun_call_in_times,
'dhb_last_30_days_dun_call_total_duration', dhb_last_30_days_dun_call_out_duration,
'dhb_last_30_days_dun_call_total_times', dhb_last_30_days_dun_call_out_times,
'dhb_last_30_days_ntdun_call_avg_duration', dhb_last_30_days_dun_call_tel_total_nums,
'dhb_last_30_days_ntdun_call_duration_above60', dhb_last_30_days_dun_call_total_duration,
'dhb_last_30_days_ntdun_call_duration_below15', dhb_last_30_days_dun_call_total_times,
'dhb_last_30_days_ntdun_call_duration_between15_and_30', dhb_last_30_days_ntdun_call_avg_duration,
'dhb_last_30_days_ntdun_call_in_duration', dhb_last_30_days_ntdun_call_duration_above60,
'dhb_last_30_days_ntdun_call_in_times', dhb_last_30_days_ntdun_call_duration_below15,
'dhb_last_30_days_ntdun_call_out_duration', dhb_last_30_days_ntdun_call_duration_between15_and_30,
'dhb_last_30_days_ntdun_call_out_times', dhb_last_30_days_ntdun_call_in_duration,
'dhb_last_30_days_ntdun_call_tel_total_nums', dhb_last_30_days_ntdun_call_in_times,
'dhb_last_30_days_ntdun_call_total_duration', dhb_last_30_days_ntdun_call_out_duration,
'dhb_last_30_days_ntdun_call_total_times', dhb_last_30_days_ntdun_call_out_times,
'dhb_last_60_and_90_days_dun_call_avg_duration', dhb_last_30_days_ntdun_call_tel_total_nums,
'dhb_last_60_and_90_days_dun_call_duration_above60', dhb_last_30_days_ntdun_call_total_duration,
'dhb_last_60_and_90_days_dun_call_duration_below15', dhb_last_30_days_ntdun_call_total_times,
'dhb_last_60_and_90_days_dun_call_duration_between15_and_30', dhb_last_60_and_90_days_dun_call_avg_duration,
'dhb_last_60_and_90_days_dun_call_in_duration', dhb_last_60_and_90_days_dun_call_duration_above60,
'dhb_last_60_and_90_days_dun_call_in_times', dhb_last_60_and_90_days_dun_call_duration_below15,
'dhb_last_60_and_90_days_dun_call_out_duration', dhb_last_60_and_90_days_dun_call_duration_between15_and_30,
'dhb_last_60_and_90_days_dun_call_out_times', dhb_last_60_and_90_days_dun_call_in_duration,
'dhb_last_60_and_90_days_dun_call_tel_total_nums', dhb_last_60_and_90_days_dun_call_in_times,
'dhb_last_60_and_90_days_dun_call_total_duration', dhb_last_60_and_90_days_dun_call_out_duration,
'dhb_last_60_and_90_days_dun_call_total_times', dhb_last_60_and_90_days_dun_call_out_times,
'dhb_last_60_and_90_days_ntdun_call_avg_duration', dhb_last_60_and_90_days_dun_call_tel_total_nums,
'dhb_last_60_and_90_days_ntdun_call_duration_above60', dhb_last_60_and_90_days_dun_call_total_duration,
'dhb_last_60_and_90_days_ntdun_call_duration_below15', dhb_last_60_and_90_days_dun_call_total_times,
'dhb_last_60_and_90_days_ntdun_call_duration_between15_and_30', dhb_last_60_and_90_days_ntdun_call_avg_duration,
'dhb_last_60_and_90_days_ntdun_call_in_duration', dhb_last_60_and_90_days_ntdun_call_duration_above60,
'dhb_last_60_and_90_days_ntdun_call_in_times', dhb_last_60_and_90_days_ntdun_call_duration_below15,
'dhb_last_60_and_90_days_ntdun_call_out_duration', dhb_last_60_and_90_days_ntdun_call_duration_between15_and_30,
'dhb_last_60_and_90_days_ntdun_call_out_times', dhb_last_60_and_90_days_ntdun_call_in_duration,
'dhb_last_60_and_90_days_ntdun_call_tel_total_nums', dhb_last_60_and_90_days_ntdun_call_in_times,
'dhb_last_60_and_90_days_ntdun_call_total_duration', dhb_last_60_and_90_days_ntdun_call_out_duration,
'dhb_last_60_and_90_days_ntdun_call_total_times', dhb_last_60_and_90_days_ntdun_call_out_times,
'dhb_last_three_weeks_dun_call_avg_duration', dhb_last_60_and_90_days_ntdun_call_tel_total_nums,
'dhb_last_three_weeks_dun_call_duration_above60', dhb_last_60_and_90_days_ntdun_call_total_duration,
'dhb_last_three_weeks_dun_call_duration_below15', dhb_last_60_and_90_days_ntdun_call_total_times,
'dhb_last_three_weeks_dun_call_duration_between15_and_30', dhb_last_three_weeks_dun_call_avg_duration,
'dhb_last_three_weeks_dun_call_in_duration', dhb_last_three_weeks_dun_call_duration_above60,
'dhb_last_three_weeks_dun_call_in_times', dhb_last_three_weeks_dun_call_duration_below15,
'dhb_last_three_weeks_dun_call_out_duration', dhb_last_three_weeks_dun_call_duration_between15_and_30,
'dhb_last_three_weeks_dun_call_out_times', dhb_last_three_weeks_dun_call_in_duration,
'dhb_last_three_weeks_dun_call_tel_total_nums', dhb_last_three_weeks_dun_call_in_times,
'dhb_last_three_weeks_dun_call_total_duration', dhb_last_three_weeks_dun_call_out_duration,
'dhb_last_three_weeks_dun_call_total_times', dhb_last_three_weeks_dun_call_out_times,
'dhb_last_three_weeks_ntdun_call_avg_duration', dhb_last_three_weeks_dun_call_tel_total_nums,
'dhb_last_three_weeks_ntdun_call_duration_above60', dhb_last_three_weeks_dun_call_total_duration,
'dhb_last_three_weeks_ntdun_call_duration_below15', dhb_last_three_weeks_dun_call_total_times,
'dhb_last_three_weeks_ntdun_call_duration_between15_and_30', dhb_last_three_weeks_ntdun_call_avg_duration,
'dhb_last_three_weeks_ntdun_call_in_duration', dhb_last_three_weeks_ntdun_call_duration_above60,
'dhb_last_three_weeks_ntdun_call_in_times', dhb_last_three_weeks_ntdun_call_duration_below15,
'dhb_last_three_weeks_ntdun_call_out_duration', dhb_last_three_weeks_ntdun_call_duration_between15_and_30,
'dhb_last_three_weeks_ntdun_call_out_times', dhb_last_three_weeks_ntdun_call_in_duration,
'dhb_last_three_weeks_ntdun_call_tel_total_nums', dhb_last_three_weeks_ntdun_call_in_times,
'dhb_last_three_weeks_ntdun_call_total_duration', dhb_last_three_weeks_ntdun_call_out_duration,
'dhb_last_three_weeks_ntdun_call_total_times', dhb_last_three_weeks_ntdun_call_out_times,
'dhb_last_two_weeks_dun_call_avg_duration', dhb_last_three_weeks_ntdun_call_tel_total_nums,
'dhb_last_two_weeks_dun_call_duration_above60', dhb_last_three_weeks_ntdun_call_total_duration,
'dhb_last_two_weeks_dun_call_duration_below15', dhb_last_three_weeks_ntdun_call_total_times,
'dhb_last_two_weeks_dun_call_duration_between15_and_30', dhb_last_two_weeks_dun_call_avg_duration,
'dhb_last_two_weeks_dun_call_in_duration', dhb_last_two_weeks_dun_call_duration_above60,
'dhb_last_two_weeks_dun_call_in_times', dhb_last_two_weeks_dun_call_duration_below15,
'dhb_last_two_weeks_dun_call_out_duration', dhb_last_two_weeks_dun_call_duration_between15_and_30,
'dhb_last_two_weeks_dun_call_out_times', dhb_last_two_weeks_dun_call_in_duration,
'dhb_last_two_weeks_dun_call_tel_total_nums', dhb_last_two_weeks_dun_call_in_times,
'dhb_last_two_weeks_dun_call_total_duration', dhb_last_two_weeks_dun_call_out_duration,
'dhb_last_two_weeks_dun_call_total_times', dhb_last_two_weeks_dun_call_out_times,
'dhb_last_two_weeks_ntdun_call_avg_duration', dhb_last_two_weeks_dun_call_tel_total_nums,
'dhb_last_two_weeks_ntdun_call_duration_above60', dhb_last_two_weeks_dun_call_total_duration,
'dhb_last_two_weeks_ntdun_call_duration_below15', dhb_last_two_weeks_dun_call_total_times,
'dhb_last_two_weeks_ntdun_call_duration_between15_and_30', dhb_last_two_weeks_ntdun_call_avg_duration,
'dhb_last_two_weeks_ntdun_call_in_duration', dhb_last_two_weeks_ntdun_call_duration_above60,
'dhb_last_two_weeks_ntdun_call_in_times', dhb_last_two_weeks_ntdun_call_duration_below15,
'dhb_last_two_weeks_ntdun_call_out_duration', dhb_last_two_weeks_ntdun_call_duration_between15_and_30,
'dhb_last_two_weeks_ntdun_call_out_times', dhb_last_two_weeks_ntdun_call_in_duration,
'dhb_last_two_weeks_ntdun_call_tel_total_nums', dhb_last_two_weeks_ntdun_call_in_times,
'dhb_last_two_weeks_ntdun_call_total_duration', dhb_last_two_weeks_ntdun_call_out_duration,
'dhb_last_two_weeks_ntdun_call_total_times', dhb_last_two_weeks_ntdun_call_out_times,
'dhb_last_week_dun_call_avg_duration', dhb_last_two_weeks_ntdun_call_tel_total_nums,
'dhb_last_week_dun_call_duration_above60', dhb_last_two_weeks_ntdun_call_total_duration,
'dhb_last_week_dun_call_duration_below15', dhb_last_two_weeks_ntdun_call_total_times,
'dhb_last_week_dun_call_duration_between15_and_30', dhb_last_week_dun_call_avg_duration,
'dhb_last_week_dun_call_in_duration', dhb_last_week_dun_call_duration_above60,
'dhb_last_week_dun_call_in_times', dhb_last_week_dun_call_duration_below15,
'dhb_last_week_dun_call_out_duration', dhb_last_week_dun_call_duration_between15_and_30,
'dhb_last_week_dun_call_out_times', dhb_last_week_dun_call_in_duration, dhb_last_week_dun_call_in_times,
'dhb_last_week_dun_call_tel_total_nums', dhb_last_week_dun_call_out_duration,
'dhb_last_week_dun_call_total_duration', dhb_last_week_dun_call_out_times,
'dhb_last_week_dun_call_total_times', dhb_last_week_dun_call_tel_total_nums,
'dhb_last_week_ntdun_call_avg_duration', dhb_last_week_dun_call_total_duration,
'dhb_last_week_ntdun_call_duration_above60', dhb_last_week_dun_call_total_times,
'dhb_last_week_ntdun_call_duration_below15', dhb_last_week_ntdun_call_avg_duration,
'dhb_last_week_ntdun_call_duration_between15_and_30', dhb_last_week_ntdun_call_duration_above60,
'dhb_last_week_ntdun_call_in_duration', dhb_last_week_ntdun_call_duration_below15,
'dhb_last_week_ntdun_call_in_times', dhb_last_week_ntdun_call_duration_between15_and_30,
'dhb_last_week_ntdun_call_out_duration', dhb_last_week_ntdun_call_in_duration,
'dhb_last_week_ntdun_call_out_times', dhb_last_week_ntdun_call_in_times,
'dhb_last_week_ntdun_call_tel_total_nums', dhb_last_week_ntdun_call_out_duration,
'dhb_last_week_ntdun_call_total_duration', dhb_last_week_ntdun_call_out_times,
'dhb_last_week_ntdun_call_total_times', dhb_last_week_ntdun_call_tel_total_nums,
'dhb_overview_dun_call_avg_duration', dhb_last_week_ntdun_call_total_duration,
'dhb_overview_dun_call_duration_above60', dhb_last_week_ntdun_call_total_times,
'dhb_overview_dun_call_duration_below15', dhb_overview_dun_call_avg_duration,
'dhb_overview_dun_call_duration_between15_and_30', dhb_overview_dun_call_duration_above60,
'dhb_overview_dun_call_in_duration', dhb_overview_dun_call_duration_below15,
'dhb_overview_dun_call_in_times', dhb_overview_dun_call_duration_between15_and_30,
'dhb_overview_dun_call_out_duration', dhb_overview_dun_call_in_duration, dhb_overview_dun_call_in_times,
'dhb_overview_dun_call_out_times', dhb_overview_dun_call_out_duration, dhb_overview_dun_call_out_times,
'dhb_overview_dun_call_tel_total_nums', dhb_overview_dun_call_tel_total_nums,
'dhb_overview_dun_call_total_duration', dhb_overview_dun_call_total_duration,
'dhb_overview_dun_call_total_times', dhb_overview_dun_call_total_times, dhb_overview_dun_first_call_time,
'dhb_overview_dun_first_call_time', dhb_overview_dun_last_call_time,
'dhb_overview_dun_last_call_time', dhb_overview_ntdun_call_avg_duration,
'dhb_overview_ntdun_call_avg_duration', dhb_overview_ntdun_call_duration_above60,
'dhb_overview_ntdun_call_duration_above60', dhb_overview_ntdun_call_duration_below15,
'dhb_overview_ntdun_call_duration_below15', dhb_overview_ntdun_call_duration_between15_and_30,
'dhb_overview_ntdun_call_duration_between15_and_30', dhb_overview_ntdun_call_in_duration,
'dhb_overview_ntdun_call_in_duration', dhb_overview_ntdun_call_in_times,
'dhb_overview_ntdun_call_in_times', dhb_overview_ntdun_call_out_duration,
'dhb_overview_ntdun_call_out_duration', dhb_overview_ntdun_call_out_times,
'dhb_overview_ntdun_call_out_times', dhb_overview_ntdun_call_tel_total_nums,
'dhb_overview_ntdun_call_tel_total_nums', dhb_overview_ntdun_call_total_duration,
'dhb_overview_ntdun_call_total_duration', dhb_overview_ntdun_call_total_times,
'dhb_overview_ntdun_call_total_times', dhb_overview_ntdun_first_call_time,
'dhb_overview_ntdun_first_call_time'] dhb_overview_ntdun_last_call_time,applied_at,applied_from,applied_type,if(passdue_day>15,1,0) as target
from risk_analysis
where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
and transacted = 1
and dhb_flag =1
and datediff(now(),deadline) > 15
'''
def __init__(self,overdue_days=15,features=None,sql=None,start_time_period=None,end_time_period=None):
# sql = '''
#
# '''
start_time_period = (datetime.date.today() - relativedelta(months=+7)).strftime("%Y-%m-%d 00:00:00")
end_time_period = (datetime.date.today() - relativedelta(days=+17)).strftime("%Y-%m-%d 00:00:00")
def __init__(self,features=None,sql=None,start_time_period=None,end_time_period=None):
try: try:
if features != None: # if the para was not Series
self.features = features if(type(features) != pd.core.series.Series):
if sql != None: self.features =pd.Series(features)
self.sql = sql
else:
sql = "select "+str(features).strip('[').strip(']')+''',if(passdue_day>'''+str(overdue_days)+''',1,0) as target, applied_at, applied_from, applied_type
from risk_analysis
where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
and transacted = 1
and dhb_flag =1
and datediff(now(),deadline) > '''+str(overdue_days)+'''
'''
if start_time_period != None:
self.start_time_period = start_time_period
else: else:
self.start_time_period =(datetime.date.today() - dateutil.relativedelta(months=+7)).strftime("%Y-%m-%d 00:00:00") self.features = features
if end_time_period != None:
self.end_time_period = end_time_period
else:
self.end_time_period = (datetime.date.today() - dateutil.relativedelta(days=+16)).strftime("%Y-%m-%d 00:00:00")
except Exception as e: except Exception as e:
print('Parameters Error:\n',e) print("'features' parameter type Error, it should be list or Series")
raise
if sql != None:
self.sql = sql
if start_time_period != None:
self.start_time_period = start_time_period
if end_time_period != None:
self.end_time_period = end_time_period
def dhb_features_extract(self): def dhb_features_extract(self):
'''
instrucions : extract dhb features from risk_analysis
:param self:
:return: dhb features
'''
value_map = { value_map = {
"近3天":1, "近3天":1,
"近4-5天":2, "近4-5天":2,
"近6-7天":3, "近6-7天":3,
"近8-15天":4, "近8-15天":4,
"近16-30天":5, "近16-30天":5,
"近31-60天":6, "近31-60天":6,
"近61-90天":7, "近61-90天":7,
"近91-120天":8, "近91-120天":8,
"近121-150天":9, "近121-150天":9,
"近151-180天":10, "近151-180天":10,
"180天前":11, "180天前":11,
"无":0 "无":0
} }
#print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period)) #print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data # use risk_analysis to extract data
sql=self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period) dhb_loan = pd.read_sql(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period),mysqldb.engine_risk_analysis)
# dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(lambda x : value_map[x])
dhb_loan = pd.read_sql(sql,mysqldb.engine_risk_analysis)
# dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(lambda x : value_map[x])
# manipul category datatype which includes sequences
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = pd.get_dummies(dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]],columns=["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"])
# limit the upper boundary
dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42,"dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42 dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42,"dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25,"dhb_overview_ntdun_call_duration_above60"] = 25 dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25,"dhb_overview_ntdun_call_duration_above60"] = 25
dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration>= 800,"dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800 dhb_loan.loc[dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration>= 800,"dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
...@@ -249,14 +252,41 @@ class dhb: ...@@ -249,14 +252,41 @@ class dhb:
dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums>= 22,"dhb_overview_dun_call_tel_total_nums"] = 22 dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums>= 22,"dhb_overview_dun_call_tel_total_nums"] = 22
dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_total_duration>= 1100,"dhb_last_30_days_dun_call_total_duration"] = 1100 dhb_loan.loc[dhb_loan.dhb_last_30_days_dun_call_total_duration>= 1100,"dhb_last_30_days_dun_call_total_duration"] = 1100
dhb_loan.loc[dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration>= 300,"dhb_last_two_weeks_ntdun_call_in_duration"] = 300 dhb_loan.loc[dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration>= 300,"dhb_last_two_weeks_ntdun_call_in_duration"] = 300
# dhb_loan.to_csv("./dhb_loan_sample——"+str(datetime.date.today())+".csv")
print( datetime.time.strftime('%Y.%m.%d %H:%M:%S',datetime.time.localtime(datetime.time.time())) +"提取了dhb {}+ ".format(str(self.overdue_days)) + self.start_time_period + "to" + self.end_time_period + "时段样本") dhb_loan.to_csv("./dhb_loan_sample——"+str(datetime.date.today())+".csv")
print( time.strftime('%Y.%m.%d %H:%M:%S',time.localtime(time.time())) +"提取了dhb "+ self.start_time_period + "to" + self.end_time_period + "时段样本")
return dhb_loan return dhb_loan
def dhb_comparasion(self,limit="{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}",df=None,applied_type = None,applied_from = None): '''
df_mongo = mongodb.pymongodb(self.start_time_period, self.end_time_period, limit, "{'order_id':1,'model_exec_data_source#dhb':1}") instructions : build a comparasion
Params :
df - test dataset which was given
score - score column
target - label
start_time_period -
end_time_period -
applied_tpye -
applied_from -
Returns :
auc comparasion
liftchart plot
'''
# def dhb_comparasion(df,score = 'model_exec_data_source#dhb' ,target = 'target', start_time_period = self.start_time_period, end_time_period = self.end_time_period, applied_type = None, applied_from = None):
# df_mongo = mongodb.pymongodb(start_time_period, end_time_period, limit, "{'order_id':1,'model_exec_data_source#dhb':1}")
# df = pd.merge(df,df_mongo,how='left',left_on='order_no',right_on='order_id')
# df['bins'] = df.qcut(df['target'], q = 10, percision = 6, dupulicates='drop')
# df.groupby
# return 1
...@@ -6,7 +6,6 @@ from data.analyis import datacal ...@@ -6,7 +6,6 @@ from data.analyis import datacal
from models import xgboost from models import xgboost
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
from data.graph import drawplot from data.graph import drawplot
import dhb from mvp import dhb
from data.datasource import mysqldb,mongodb
dhb = dhb()
df_dhb = dhb.dhb_features_extract()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment