仍然没有加表格好气呀

085d706c · 王家华 · 9b10189a · 085d706c · 085d706c · 085d706c
Commit 085d706c authored May 20, 2019 by 王家华
19 changed files
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
--- a/graph/__pycache__/matplot.cpython-36.pyc
+++ b/graph/__pycache__/matplot.cpython-36.pyc
--- a/graph/matplot.py
+++ b/graph/matplot.py
@@ -57,7 +57,7 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled',
        # 每个table需要只有一个index，一个values
        x = range(len(datalist[table_index].index))
        y = datalist[table_index].values
-        axs.plot(x, y, label=datalist_description[table_index])
+        axs.plot(x, y, label=datalist_description[table_index] + "AUC: "+ str(round(auc[table_index],3)))
        if len(x) == 1:
            plot_tab = False
    if plot_tab:
@@ -100,7 +100,7 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled',
        plt.xticks([])
    # otherwise, nothing to do here
        the_table.auto_set_font_size(False)
-        the_table.set_fontsize(8)
+        the_table.set_fontsize(6)
    fig.subplots_adjust(bottom=0.2)
    plt.grid()
    if y_label is not None:

--- a/lgb_report.docx
+++ b/lgb_report.docx
--- a/models_kit/__pycache__/general_methods.cpython-36.pyc
+++ b/models_kit/__pycache__/general_methods.cpython-36.pyc
--- a/models_kit/general_methods.py
+++ b/models_kit/general_methods.py
@@ -39,24 +39,26 @@ def topN_feature_importance_list(features, clf, topN=3):
    return importanct_feat


-def model_selection(algorthm,clf,df_train,df_val,df_test,target,score,optimal_model,model_obj):
-    # model matrix 存储不同模型指标的矩阵
-    model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
-    model_matrix = pd.DataFrame(['NULL', 'NULL', roc_auc_score(df_train[target], df_train[score]),
-                                 roc_auc_score(df_train[target], df_train[score])], index=model_matrix_index,
-                                columns=['线上模型'])
+#TODO here

-    # 定义最优参指针
-    pointer = 0
-    # 遍历最优参组合
-    for param in optimal_para:
-        if algorthm == "lightGBM":
-            train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
-                                                       adds_on=param, target=target)
-        model_matrix = pd.concat([model_matrix,
-                                  pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index,
-                                               columns=[pointer])], axis=1)
-        pointer += 1
-
-    # 简单选取一下validation set auc 最高的 params
-    best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
\ No newline at end of file
+# def model_selection(algorthm,clf,df_train,df_val,df_test,target,score,optimal_model,model_obj):
+#     # model matrix 存储不同模型指标的矩阵
+#     model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
+#     model_matrix = pd.DataFrame(['NULL', 'NULL', roc_auc_score(df_train[target], df_train[score]),
+#                                  roc_auc_score(df_train[target], df_train[score])], index=model_matrix_index,
+#                                 columns=['线上模型'])
+#
+#     # 定义最优参指针
+#     pointer = 0
+#     # 遍历最优参组合
+#     for param in optimal_para:
+#         if algorthm == "lightGBM":
+#             train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
+#                                                        adds_on=param, target=target)
+#         model_matrix = pd.concat([model_matrix,
+#                                   pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index,
+#                                                columns=[pointer])], axis=1)
+#         pointer += 1
+#
+#     # 简单选取一下validation set auc 最高的 params
+#     best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
\ No newline at end of file
--- a/mvp/__pycache__/refit.cpython-36.pyc
+++ b/mvp/__pycache__/refit.cpython-36.pyc
--- a/mvp/allocator.py
+++ b/mvp/allocator.py
@@ -62,11 +62,11 @@ df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size
 del df_sample

 # 模型refit
-model_matrix, lgbm = refit.model_fit(df_sample, dhb, target, score)
+model_matrix, lgbm = refit.model_fit(df_train, df_val, df_test, dhb, target, score)

 print(model_matrix)
 # 生成报告
-status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target,
+status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target, model_matrix,
                 score, prediction, report_path, report_name, applied_from, applied_type, topN=3)



--- a/mvp/plots/cache/Univariate Chart of dhb_overview_dun_call_total_duration.png
+++ b/mvp/plots/cache/Univariate Chart of dhb_overview_dun_call_total_duration.png
--- a/mvp/plots/cache/Univariate Chart of dhb_overview_ntdun_call_avg_duration.png
+++ b/mvp/plots/cache/Univariate Chart of dhb_overview_ntdun_call_avg_duration.png
--- a/mvp/plots/cache/Univariate Chart of dhb_overview_ntdun_call_duration_below15.png
+++ b/mvp/plots/cache/Univariate Chart of dhb_overview_ntdun_call_duration_below15.png
--- a/mvp/plots/cache/pdp Chart with 9 1.png
+++ b/mvp/plots/cache/pdp Chart with 9 1.png
--- a/mvp/plots/cache/全渠道全量客群测试集上的 lift Chart.png
+++ b/mvp/plots/cache/全渠道全量客群测试集上的 lift Chart.png
--- a/mvp/plots/cache/内部复申 lift Chart.png
+++ b/mvp/plots/cache/内部复申 lift Chart.png
--- a/mvp/plots/cache/内部首付贷 lift Chart.png
+++ b/mvp/plots/cache/内部首付贷 lift Chart.png
--- a/mvp/plots/cache/内部首申 lift Chart.png
+++ b/mvp/plots/cache/内部首申 lift Chart.png
--- a/mvp/plots/cache/内部首贷 lift Chart.png
+++ b/mvp/plots/cache/内部首贷 lift Chart.png
--- a/mvp/refit.py
+++ b/mvp/refit.py
@@ -83,6 +83,9 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
    :return:
        status ： 返回1表示执行完成
    '''
+    # 样本分布
+    df_train_ = len(df_train)
+

    # 用新模型预测结果 xgb还需要加一个proba (TODO here)
    predictions ,test_auc = lightgbm.predict(clf,df_test,model_obj.features,target)
@@ -126,7 +129,7 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
        #univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
        #                tab_df_list=tab_df_list, plot_tab=False,
        #                saved_path='./mvp/plots/cache/')
-        document.add_picture(univar_chart,width=Inches(8))
+        document.add_picture(univar_chart,width=Inches(7))

    # 新增pdp段
    document.add_paragraph('PDP_chart')
@@ -136,45 +139,65 @@ def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
        # pdpChart = matplot.plot_table_df(pdp, ['1'], title=i + ' PDP Chart', X_label=None, y_label=None,
        #               tab_df=None, plot_tab=True, saved_path='./mvp/plots/cache/')
        pdpChart = matplot.pdpCharts9(clf, df_test, importanct_feat, model_obj.features, n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path="./mvp/plots/cache/")
-        document.add_picture(pdpChart,width=Inches(8))
+        document.add_picture(pdpChart,width=Inches(7))

    # 新增liftchart段
    document.add_paragraph('lift_chart')
    # 遍历给定渠道 & 客群 默认等频画出liftchart
+
    try:
        lift_pred = datacal.cal_lift(df_test, score=prediction)
        lift_online = datacal.cal_lift(df_test, score=score)
        # liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
        liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']],
                                            [roc_auc_score(df_test[target], df_test[prediction]),
-                                             roc_auc_score(df_test[target], df_test[score])], datalist_description=None,
+                                             roc_auc_score(df_test[target], df_test[score])], datalist_description=['新模型预测','线上模型'],
                                            title='全渠道全量客群测试集上的 lift Chart',
-                                            X_label=None, y_label=None,
-                                            tab_df_list=None, plot_tab=False,
+                                            X_label=None, y_label='逾期率',
+                                            tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=False,
                                            saved_path='./mvp/plots/cache/')
-        document.add_picture(liftChart, width=Inches(8))
-        # 遍历渠道
+
+        document.add_picture(liftChart, width=Inches(7))
+    # 遍历渠道
        for channel in applied_from.keys():
-            # 遍历客群类型
-            for type in applied_type.keys():
-                print('lift ',type,channel)
+            print('lift ',channel)
+            # 数据切片
+            df_sliced = df_test[
+                df_test.applied_type.map(lambda x: True if str(x) in type.split(',') else False) & df_test.applied_from.map(
+                    lambda x: True if str(x) in channel.split(',') else False)]
+            #
+            lift_pred = datacal.cal_lift(df_sliced, score=prediction)
+            lift_online = datacal.cal_lift(df_sliced, score=score)
+            # liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
+            liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']],
+                                                [roc_auc_score(df_test[target], df_test[prediction]),
+                                                 roc_auc_score(df_test[target], df_test[score])],
+                                                datalist_description=['新模型预测', '线上模型'],
+                                                title=applied_from[channel] + ' lift Chart',
+                                                X_label=None, y_label='逾期率',
+                                                tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=False,
+                                                saved_path='./mvp/plots/cache/')
+            document.add_picture(liftChart, width=Inches(5.5))
+        # 遍历客群类型
+        for type in applied_type.keys():
+                print('lift ',type)
                # 数据切片
                df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)]
                #
                lift_pred = datacal.cal_lift(df_sliced,score=prediction)
                lift_online = datacal.cal_lift(df_sliced,score=score)
                #liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
-                liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=None,
-                                        title= applied_from[channel]+applied_type[type]+ ' lift Chart', X_label=None, y_label=None,
-                                        tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=True,
+                liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=['新模型预测','线上模型'],
+                                        title= applied_type[type]+ ' lift Chart', X_label=None, y_label='逾期率',
+                                        tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=False,
                                        saved_path='./mvp/plots/cache/')
-                document.add_picture(liftChart,width=Inches(8))
-
-    # 存在某些渠道量很少的情况，加入try catch异常处理
+                document.add_picture(liftChart,width=Inches(5.5))
    except Exception as e:
-        print('Exception: ',e)
+        print(e)
        pass

+    # 存在某些渠道量很少的情况，加入try catch异常处理
+
    # docx 保存
    filetool.saveDocument(document, report_path, report_name)
    return 1

--- a/tools/__pycache__/filetool.cpython-36.pyc
+++ b/tools/__pycache__/filetool.cpython-36.pyc