from models_obj import dhb_obj
from tools import datacal
import datetime
from models_kit import lightgbm
from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool
from sklearn.metrics import roc_auc_score
from models_kit import general_methods
from docx.shared import Inches
import math
import pandas as pd


def model_fit(df_train, df_val, df_test, model_obj, target, score):
    '''
    Tune LightGBM on the training data, compare the candidates with the
    online model and refit a classifier with the best candidate parameters.

    :param df_train: training set
    :param df_val: validation set
    :param df_test: test set (not used by this function; kept so existing
        callers keep working)
    :param model_obj: online model object; only its ``features`` attribute
        is read here
    :param target: name of the label column (overdue flag, 1 / 0)
    :param score: name of the column holding the online model's score
    :return:
        model_matrix - DataFrame with rows ``name``, ``Params``, ``trainAUC``,
            ``validationAUC``; the first column is the online model, followed
            by one column (0, 1, 2, ...) per tuned LightGBM candidate
        lgbm - classifier refitted with the candidate parameters that scored
            the highest validation AUC
    :raises ValueError: if parameter tuning returns no candidate at all
    '''

    # Cross-validated search: per the original author's notes this returns the
    # best parameter combinations and their CV AUC values.
    optimal_para, topn = lightgbm.lgb_params_tuning(
        lightgbm.params_lgb, model_obj.features, df_train, df_val,
        target=target, topN=3, cv_fold=5)

    print('topn 通过train交叉验证得到的auc ', topn)

    # Comparison table: one column per model, one row per metric.
    model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
    online_column = pd.DataFrame(
        ['NULL', 'NULL',
         roc_auc_score(df_train[target], df_train[score]),
         # The validation AUC must be measured on the validation set; the
         # previous code repeated the training-set expression here.
         roc_auc_score(df_val[target], df_val[score])],
        index=model_matrix_index, columns=['线上模型'])

    columns = [online_column]
    best_params = None
    best_val_auc = None
    for pointer, param in enumerate(optimal_para):
        train_auc, val_auc, _ = lightgbm.train_lgbm(
            lightgbm.params_lgb, df_train, df_val, model_obj.features,
            adds_on=param, target=target)
        columns.append(pd.DataFrame(['lightGBM', param, train_auc, val_auc],
                                    index=model_matrix_index, columns=[pointer]))
        # Track the best candidate directly. The online model is deliberately
        # excluded: its 'Params' cell is the placeholder 'NULL' and must never
        # be fed back into training, even when its AUC is the highest.
        if best_val_auc is None or val_auc > best_val_auc:
            best_val_auc = val_auc
            best_params = param

    model_matrix = pd.concat(columns, axis=1)

    if best_val_auc is None:
        raise ValueError('lgb_params_tuning returned no parameter candidates')

    # Refit with the winning parameters; adds_on is the dict of overrides
    # applied on top of lightgbm.params_lgb. Use the caller's label column
    # (the previous code passed the literal string 'target').
    train_auc, val_auc, lgbm = lightgbm.train_lgbm(
        lightgbm.params_lgb, df_train, df_val, model_obj.features,
        adds_on=best_params, target=target)

    return model_matrix, lgbm






###################################### 生成报告 ################################################################
def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
                 score, prediction, report_path, report_name, applied_from, applied_type, topN=3):
    '''
    Build the docx refit report: feature importance, univariate charts,
    PDP charts and lift charts (overall, per channel, per applicant type).

    :param clf: fitted classifier
    :param df_train: training set
    :param df_val: validation set
    :param df_test: test set; must contain ``applied_from`` and
        ``applied_type`` columns. NOTE: a new column named ``prediction`` is
        written into this frame in place.
    :param model_obj: online model object; only ``features`` is read here
    :param target: name of the label column (overdue flag)
    :param model_matrix: model comparison table (currently not used here)
    :param score: name of the column holding the online model's score
    :param prediction: name of the column that receives the new model's score
    :param report_path: directory the report is written to
    :param report_name: file name of the report
    :param applied_from: dict of channels to report on; each key is a
        comma-separated list of raw ``applied_from`` values, each value is the
        display name used in the chart title
    :param applied_type: dict of applicant types to report on, same layout as
        ``applied_from`` but matched against the ``applied_type`` column
    :param topN: number of most important features to chart
    :return:
        status: 1 once the report has been saved
    '''
    cache_dir = './mvp/plots/cache/'

    # Score the test set with the new model and keep the scores next to the
    # online score so both can be charted from the same frame.
    # TODO: an xgboost classifier would additionally need predict_proba here.
    predictions, _ = lightgbm.predict(clf, df_test, model_obj.features, target)
    df_test[prediction] = predictions

    # Feature-importance plot (saved to disk, path returned).
    topnfeat_path = general_methods.topN_feature_importance_plot(lgb, clf, title="untitled", save_path=cache_dir, topN=20)

    # Names of the topN most important features; honours the topN argument
    # (it used to be hard-coded to 3, which is still the default).
    important_feat = list(general_methods.topN_feature_importance_list(model_obj.features, clf, topN=topN))

    document = filetool.buildDocument(report_path, report_name)
    document.add_heading('lightGBM 算法refit报告')

    # --- feature importance section ---
    document.add_paragraph('特征权重图')
    document.add_picture(topnfeat_path)

    # --- univariate chart section: one chart per important feature ---
    document.add_paragraph('univar_chart')
    for feature in important_feat:
        univar_chart = matplot.uniVarChart(df_train, feature, target, n_bins=10, dfltValue=-99999,
                                           dftrain=df_val, dftest=df_test, drawAll=True,
                                           drawTrTe=False, saved_path=cache_dir)
        document.add_picture(univar_chart, width=Inches(7))

    # --- PDP section ---
    # Feed the features to pdpCharts9 nine at a time so every page shows a
    # different group (previously the full list was passed on every iteration,
    # repeating the same chart).
    # NOTE(review): assumes pdpCharts9 draws at most 9 features per call - confirm.
    document.add_paragraph('PDP_chart')
    for start in range(0, len(important_feat), 9):
        pdp_chart = matplot.pdpCharts9(clf, df_test, important_feat[start:start + 9], model_obj.features,
                                       n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path=cache_dir)
        document.add_picture(pdp_chart, width=Inches(7))

    # --- lift chart section ---
    document.add_paragraph('lift_chart')

    # Whole test set, all channels and applicant types.
    _add_lift_chart(document, df_test, target, score, prediction,
                    title='全渠道全量客群测试集上的 lift Chart', width=Inches(7))

    # One chart per channel. Each slice is handled independently, so a channel
    # with too little data only loses its own chart.
    for channel, channel_name in applied_from.items():
        print('lift ', channel)
        in_channel = df_test.applied_from.map(str).isin(channel.split(','))
        _add_lift_chart(document, df_test[in_channel], target, score, prediction,
                        title=channel_name + ' lift Chart', width=Inches(5.5))

    # One chart per applicant type. The filter uses the type only; the old code
    # also filtered on whatever channel the previous loop happened to end on.
    for type_key, type_name in applied_type.items():
        print('lift ', type_key)
        in_type = df_test.applied_type.map(str).isin(type_key.split(','))
        _add_lift_chart(document, df_test[in_type], target, score, prediction,
                        title=type_name + ' lift Chart', width=Inches(5.5))

    filetool.saveDocument(document, report_path, report_name)
    return 1


def _add_lift_chart(document, df, target, score, prediction, title, width):
    '''
    Append one lift chart comparing the new model with the online model.

    Lift tables and the AUC values shown with the chart are both computed on
    ``df``, so a sliced frame is labelled with its own AUC. Any failure (for
    example a slice too small to bin, or containing a single class so AUC is
    undefined) is reported on stdout and the chart is skipped; the report
    itself is not aborted.

    :param document: docx document being built
    :param df: frame to chart (whole test set or a slice of it)
    :param target: name of the label column
    :param score: name of the online-score column
    :param prediction: name of the new-model-score column
    :param title: chart title
    :param width: picture width inside the document
    :return: True if the chart was added, False if it was skipped
    '''
    try:
        lift_pred = datacal.cal_lift(df, score=prediction)
        lift_online = datacal.cal_lift(df, score=score)
        lift_chart = matplot.plot_table_list(
            [lift_pred['mean'], lift_online['mean']],
            [roc_auc_score(df[target], df[prediction]),
             roc_auc_score(df[target], df[score])],
            datalist_description=['新模型预测', '线上模型'],
            title=title,
            X_label=None, y_label='逾期率',
            tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=False,
            saved_path='./mvp/plots/cache/')
        document.add_picture(lift_chart, width=width)
    except Exception as e:
        # Best effort by design: keep building the report, but say which
        # chart was dropped and why.
        print('lift chart skipped:', title, e)
        return False
    return True













