from models_obj import dhb_obj
from tools import datacal
import datetime
from models_kit import lightgbm
from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool
from sklearn.metrics import roc_auc_score


# Project object for the "dhb" model; `dhb.features` is the feature list passed
# to the tuning and training helpers further down.
dhb = dhb_obj.dhb(
    features=None,
    sql=None,
    start_time_period=None,
    end_time_period=None,
    passdue_day=15,
)
# Sample extraction through the dhb object is currently disabled:
# df_sample = dhb.dhb_features_extract()


######### temp #############
# Temporary shortcut: the sample is read from a local CSV file instead.
import pandas as pd

# Column names used throughout the script:
#   target     - label column (stratification key and y_true for the AUCs)
#   score      - score column of the existing online model
#   prediction - column that receives the new model's predictions on df_test
target = 'target'
score = 'score'
prediction = 'predict'

df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv', engine='python')

############################
# Backup of df_sample (disabled):
# df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")


# Default train / validation / test split: 20% validation, 20% test,
# stratified on the label column, fixed random seed.
df_train, df_val, df_test = datacal.train_test_split_general(
    df_sample,
    val_size=0.2,
    test_size=0.2,
    stratify=target,
    random_state=7,
    split_methods='random',
    time_label='applied_at',
)
# The full sample is not referenced again below; drop the name.
del df_sample
# Hyper-parameter search. Per the original author's note, cross-validation
# yields the best parameter sets (`optimal_para`) and `topn`, the list of the
# corresponding best AUCs on the CV validation folds.
optimal_para, topn = lightgbm.lgb_params_tuning(
    lightgbm.params_lgb, dhb.features, df_train, df_val,
    target=target, topN=3, cv_fold=5,
)
print('topn 通过train交叉验证得到的auc ',topn)

# Model comparison matrix: one column per model, rows given by
# model_matrix_index. The first column is the baseline built from the existing
# online score; it has no parameters, hence the 'NULL' placeholders.
model_matrix_index = ['name','Params','trainAUC','validationAUC']
baseline_column = pd.DataFrame(
    [
        'NULL',
        'NULL',
        roc_auc_score(df_train[target], df_train[score]),
        # Fix: the validation AUC must be measured on df_val. The original
        # repeated the df_train computation here, so the baseline's
        # validationAUC cell actually held its train AUC.
        roc_auc_score(df_val[target], df_val[score]),
    ],
    index=model_matrix_index,
    columns=['线上模型'],
)

# Train one LightGBM model per candidate parameter set and record its AUCs.
# Columns are labelled 0, 1, 2, ... in candidate order.
matrix_columns = [baseline_column]
for pointer, param in enumerate(optimal_para):
    train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
                                                   adds_on=param, target=target)
    matrix_columns.append(
        pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])
    )
model_matrix = pd.concat(matrix_columns, axis=1)

# Pick the parameter set with the highest validation AUC. Only the lightGBM
# columns are eligible: the baseline's 'Params' cell is the placeholder string
# 'NULL', which must never be handed to train_lgbm as `adds_on`.
candidate_rows = model_matrix.T
candidate_rows = candidate_rows[candidate_rows['name'] == 'lightGBM']
if candidate_rows.empty:
    raise ValueError('lgb_params_tuning returned no candidate parameter sets')
best_params = candidate_rows.sort_values(by='validationAUC', ascending=False).iloc[0].loc['Params']

# Refit the model with the selected parameters; `adds_on` is the dict of
# parameters to override on top of lightgbm.params_lgb (per the original
# author's note).
# Fix: pass the shared `target` variable instead of the hard-coded 'target'
# literal, so this call cannot drift from the label column used everywhere else.
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
                                               adds_on=best_params, target=target)

# Score the test set with the refitted model.
predictions, test_auc = lightgbm.predict(lgbm, df_test, dhb.features, target)
# Store the new predictions on df_test, next to the online score column.
df_test[prediction] = predictions



####### allocator cache ############
# Display names used in lift-chart titles, keyed by comma-separated id strings
# that are matched against the applied_from / applied_type columns of df_test.
applied_from = {
    '1,214,217,198': '内部',
    '333': '融360',
    '159537': '360金融',
}
applied_type = {
    '1,2': '首贷',
    '1,2,3': '全量客群',
    '1': '首申',
    '2': '复申',
    '3': '复贷',
}
####################################

### report
import os

# The relative './mvp/plots/cache/' paths below are resolved from this directory.
os.chdir("E:/bla/model_mvp/")

# Plot the top-20 feature importances; the helper's return value is later
# inserted into the report as a picture.
topnfeat_path = matplot.topN_feature_importance(
    lgb,
    lgbm,
    title="untitled",
    save_path='./mvp/plots/cache/',
    topN=20,
)

# Names of the three most important features, most important first.
importanct_feat = (
    pd.DataFrame({
        'column': dhb.features,
        'importance': lgbm.feature_importance(),
    })
    .sort_values(by='importance', ascending=False)['column']
    .head(3)
    .tolist()
)

# Location and file name of the generated report.
report_path = "E:/bla/model_mvp/"
report_name = "lgb_report.docx"

# Create the docx document, then add the title and the feature-importance
# section (paragraph followed by the importance plot).
document = filetool.buildDocument(report_path, report_name)
document.add_heading('lightGBM 算法refit报告')
document.add_paragraph('特征权重图')
document.add_picture(topnfeat_path)

# Univariate section: one chart per top feature, computed on the train,
# validation and test sets (series labelled 1, 2, 3) with qcut=10.
document.add_paragraph('univar_chart')
for feat_name in importanct_feat:
    univar_train, univar_val, univar_test = (
        datacal.cal_univar(subset, feat_name, target, qcut=10)
        for subset in (df_train, df_val, df_test)
    )
    # Same three tables again, passed as tab_df_list (plot_tab is False here).
    tab_df_list = [univar_train, univar_val, univar_test]
    univarChart = matplot.plot_table_list(
        [univar_train, univar_val, univar_test],
        [1, 2, 3],
        datalist_description=None,
        title=feat_name + ' univar Chart',
        X_label=None,
        y_label=None,
        tab_df_list=tab_df_list,
        plot_tab=False,
        saved_path='./mvp/plots/cache/',
    )
    document.add_picture(univarChart)

# PDP section: one chart per top feature, built from the new model's
# predictions on the test set.
document.add_paragraph('PDP_chart')
for feat_name in importanct_feat:
    pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=feat_name, qcut=10)
    pdpChart = matplot.plot_table_df(
        pdp,
        ['1'],
        title=feat_name + ' PDP Chart',
        X_label=None,
        y_label=None,
        tab_df=None,
        plot_tab=True,
        saved_path='./mvp/plots/cache/',
    )
    document.add_picture(pdpChart)


# Intermediate save of the report built so far.
filetool.saveDocument(document, report_path, report_name)

# Lift section: for every (channel, applicant type) segment of the test set,
# plot the lift of the new predictions against the online score.
# Chart generation is best-effort: a failing segment is reported and skipped
# instead of silently aborting all remaining charts, which is what the original
# bare `except: pass` around the whole loop did.
document.add_paragraph('lift_chart')

lift_channels = ['333', '159537', '1,214,217,198']
lift_types = ['1', '2', '3']

# Series labels: AUC of the new predictions and of the online score on the
# full test set. They do not depend on the segment, so compute them once.
# NOTE(review): these are whole-test-set AUCs shown on per-segment charts;
# confirm that this is the intended legend.
try:
    lift_labels = [roc_auc_score(df_test[target], df_test[prediction]),
                   roc_auc_score(df_test[target], df_test[score])]
except Exception as exc:
    print('lift charts skipped, test-set AUC could not be computed: {!r}'.format(exc))
    lift_channels = []

for channel in lift_channels:
    # `type_key` rather than `type`: the original loop variable shadowed the
    # builtin at module level.
    for type_key in lift_types:
        try:
            segment_name = applied_from[channel] + applied_type[type_key]
            # Keys are comma-separated id lists; a row belongs to the segment
            # when its stringified id is one of them.
            type_mask = df_test.applied_type.astype(str).isin(type_key.split(','))
            channel_mask = df_test.applied_from.astype(str).isin(channel.split(','))
            df_sliced = df_test[type_mask & channel_mask]
            if df_sliced.empty:
                print('lift chart skipped, no test rows for segment: {}'.format(segment_name))
                continue
            lift_pred = datacal.cal_lift(df_sliced, score=prediction)
            lift_online = datacal.cal_lift(df_sliced, score=score)
            liftChart = matplot.plot_table_list(
                [lift_pred, lift_online], lift_labels, datalist_description=None,
                title=segment_name + ' lift Chart', X_label=None, y_label=None,
                # Pass this chart's own tables; the original passed the stale
                # univar tables left over from the univar loop.
                tab_df_list=[lift_pred, lift_online], plot_tab=False,
                saved_path='./mvp/plots/cache/')
            document.add_picture(liftChart)
        except Exception as exc:
            print('lift chart skipped for channel {} / type {}: {!r}'.format(channel, type_key, exc))

# Final save, including whatever lift charts were produced.
filetool.saveDocument(document, report_path, report_name)













