Commit 9b10189a authored by 王家华's avatar 王家华

plot函数加表格出异常暂时没法全部解决,调用了model tools的方法画图

parent bd18c3b0
...@@ -2,9 +2,12 @@ ...@@ -2,9 +2,12 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment=""> <list default="true" id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/graph/matplot.py" beforeDir="false" afterPath="$PROJECT_DIR$/graph/matplot.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/graph/matplot.py" beforeDir="false" afterPath="$PROJECT_DIR$/graph/matplot.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_kit/general_methods.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_kit/general_methods.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_obj/dhb_obj.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_obj/dhb_obj.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/mvp/allocator.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/allocator.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/mvp/refit.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/refit.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/mvp/refit.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/refit.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/tools/filetool.py" beforeDir="false" afterPath="$PROJECT_DIR$/tools/filetool.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
...@@ -15,53 +18,46 @@ ...@@ -15,53 +18,46 @@
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/mvp/allocator.py"> <entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state> <state relative-caret-position="306">
<caret line="51" column="30" lean-forward="true" selection-start-line="51" selection-start-column="30" selection-end-line="51" selection-end-column="30" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#31#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/mvp/refit.py"> <entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="222"> <state relative-caret-position="119">
<caret line="70" column="19" selection-start-line="70" selection-start-column="14" selection-end-line="70" selection-end-column="19" /> <caret line="58" column="22" lean-forward="true" selection-start-line="58" selection-start-column="22" selection-end-line="58" selection-end-column="22" />
<folding> <folding>
<element signature="e#0#30#0" expanded="true" /> <element signature="e#4120#4150#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py"> <entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-291">
<caret line="639" column="21" lean-forward="true" selection-start-line="639" selection-start-column="21" selection-end-line="639" selection-end-column="21" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102"> <state relative-caret-position="204">
<caret line="171" lean-forward="true" selection-start-line="171" selection-end-line="171" />
<folding> <folding>
<element signature="e#0#9#0" expanded="true" /> <element signature="e#0#30#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/datacal.py"> <entry file="file://$PROJECT_DIR$/mvp/lgbreport.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1139"> <state relative-caret-position="187">
<caret line="67" column="36" lean-forward="true" selection-start-line="67" selection-start-column="36" selection-end-line="67" selection-end-column="36" /> <caret line="11" lean-forward="true" selection-start-line="11" selection-end-line="11" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
...@@ -70,22 +66,22 @@ ...@@ -70,22 +66,22 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/datasource/mongodb.py"> <entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="493"> <state relative-caret-position="473">
<caret line="29" selection-start-line="29" selection-end-line="29" /> <caret line="46" lean-forward="true" selection-start-line="46" selection-end-line="46" />
<folding> <folding>
<element signature="e#0#14#0" expanded="true" /> <element signature="e#0#9#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py"> <entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952"> <state relative-caret-position="595">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" /> <caret line="35" column="36" lean-forward="true" selection-start-line="35" selection-start-column="36" selection-end-line="35" selection-end-column="36" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
...@@ -94,10 +90,13 @@ ...@@ -94,10 +90,13 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/graph/matplot.py"> <entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442"> <state relative-caret-position="34">
<caret line="29" column="21" lean-forward="true" selection-start-line="29" selection-start-column="21" selection-end-line="29" selection-end-column="21" /> <caret line="212" column="33" selection-start-line="212" selection-start-column="8" selection-end-line="212" selection-end-column="33" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
...@@ -105,22 +104,17 @@ ...@@ -105,22 +104,17 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py"> <entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="214"> <state relative-caret-position="-34">
<caret line="167" column="10" lean-forward="true" selection-start-line="167" selection-start-column="10" selection-end-line="167" selection-end-column="52" /> <caret line="1" selection-start-line="1" selection-end-line="1" selection-end-column="41" />
<folding>
<element signature="e#0#22#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/xgboost.py"> <entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1173"> <state relative-caret-position="152">
<folding> <caret line="377" lean-forward="true" selection-start-line="377" selection-end-line="377" />
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
...@@ -133,11 +127,14 @@ ...@@ -133,11 +127,14 @@
<component name="IdeDocumentHistory"> <component name="IdeDocumentHistory">
<option name="CHANGED_PATHS"> <option name="CHANGED_PATHS">
<list> <list>
<option value="$PROJECT_DIR$/models_kit/general_methods.py" />
<option value="$PROJECT_DIR$/models_kit/lightgbm.py" /> <option value="$PROJECT_DIR$/models_kit/lightgbm.py" />
<option value="$PROJECT_DIR$/models_obj/dhb_obj.py" />
<option value="$PROJECT_DIR$/models_kit/general_methods.py" />
<option value="$PROJECT_DIR$/tools/filetool.py" />
<option value="$PROJECT_DIR$/tools/datacal.py" /> <option value="$PROJECT_DIR$/tools/datacal.py" />
<option value="$PROJECT_DIR$/mvp/refit.py" /> <option value="$PROJECT_DIR$/mvp/refit.py" />
<option value="$PROJECT_DIR$/graph/matplot.py" /> <option value="$PROJECT_DIR$/graph/matplot.py" />
<option value="$PROJECT_DIR$/mvp/allocator.py" />
</list> </list>
</option> </option>
</component> </component>
...@@ -152,8 +149,8 @@ ...@@ -152,8 +149,8 @@
</option> </option>
</component> </component>
<component name="ProjectFrameBounds" extendedState="6"> <component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="174" /> <option name="x" value="261" />
<option name="y" value="167" /> <option name="y" value="251" />
<option name="width" value="1400" /> <option name="width" value="1400" />
<option name="height" value="831" /> <option name="height" value="831" />
</component> </component>
...@@ -170,16 +167,6 @@ ...@@ -170,16 +167,6 @@
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="datasource" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="feature" type="462c0819:PsiDirectoryNode" />
</path>
<path> <path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
...@@ -190,11 +177,6 @@ ...@@ -190,11 +177,6 @@
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_kit" type="462c0819:PsiDirectoryNode" /> <item name="models_kit" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_obj" type="462c0819:PsiDirectoryNode" />
</path>
<path> <path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
...@@ -245,7 +227,28 @@ ...@@ -245,7 +227,28 @@
</list> </list>
</option> </option>
</component> </component>
<component name="RunManager" selected="Python.refit"> <component name="RunManager" selected="Python.allocator">
<configuration name="allocator" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/mvp" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/mvp/allocator.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="lightgbm" type="PythonConfigurationType" factoryName="Python" temporary="true"> <configuration name="lightgbm" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" /> <module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" /> <option name="INTERPRETER_OPTIONS" value="" />
...@@ -290,6 +293,7 @@ ...@@ -290,6 +293,7 @@
</configuration> </configuration>
<recent_temporary> <recent_temporary>
<list> <list>
<item itemvalue="Python.allocator" />
<item itemvalue="Python.refit" /> <item itemvalue="Python.refit" />
<item itemvalue="Python.lightgbm" /> <item itemvalue="Python.lightgbm" />
</list> </list>
...@@ -312,12 +316,12 @@ ...@@ -312,12 +316,12 @@
<frame x="-8" y="-8" width="1936" height="1066" extended-state="6" /> <frame x="-8" y="-8" width="1936" height="1066" extended-state="6" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.28556374" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.17860906" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" /> <window_info anchor="bottom" id="Find" order="1" />
<window_info active="true" anchor="bottom" id="Run" order="2" visible="true" weight="0.3290461" /> <window_info anchor="bottom" id="Run" order="2" weight="0.3290461" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.39978564" /> <window_info anchor="bottom" id="Debug" order="3" weight="0.39978564" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" /> <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
...@@ -325,7 +329,7 @@ ...@@ -325,7 +329,7 @@
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.3290461" /> <window_info anchor="bottom" id="Terminal" order="8" weight="0.3290461" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.31511253" /> <window_info active="true" anchor="bottom" id="Python Console" order="10" visible="true" weight="0.46623793" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" /> <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" /> <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
...@@ -339,22 +343,6 @@ ...@@ -339,22 +343,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="17" selection-start-line="17" selection-end-line="17" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py"> <entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289"> <state relative-caret-position="289">
...@@ -372,15 +360,6 @@ ...@@ -372,15 +360,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102">
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/pandas/core/generic.py"> <entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/pandas/core/generic.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="373"> <state relative-caret-position="373">
...@@ -397,60 +376,115 @@ ...@@ -397,60 +376,115 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/tools/datacal.py"> <entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2140">
<caret line="662" column="38" lean-forward="true" selection-start-line="662" selection-start-column="38" selection-end-line="662" selection-end-column="38" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1139"> <state relative-caret-position="34">
<caret line="67" column="36" lean-forward="true" selection-start-line="67" selection-start-column="36" selection-end-line="67" selection-end-column="36" /> <caret line="212" column="33" selection-start-line="212" selection-start-column="8" selection-end-line="212" selection-end-column="33" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py"> <entry file="file://$PROJECT_DIR$/mvp/xgbreport.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-291"> <state relative-caret-position="170">
<caret line="639" column="21" lean-forward="true" selection-start-line="639" selection-start-column="21" selection-end-line="639" selection-end-column="21" /> <caret line="10" column="4" selection-start-line="10" selection-start-column="4" selection-end-line="10" selection-end-column="4" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py"> <entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="214"> <state relative-caret-position="473">
<caret line="167" column="10" lean-forward="true" selection-start-line="167" selection-start-column="10" selection-end-line="167" selection-end-column="52" /> <caret line="46" lean-forward="true" selection-start-line="46" selection-end-line="46" />
<folding> <folding>
<element signature="e#0#22#0" expanded="true" /> <element signature="e#0#9#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py"> <entry file="file://$PROJECT_DIR$/README.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor />
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952"> <state relative-caret-position="595">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" /> <caret line="35" column="36" lean-forward="true" selection-start-line="35" selection-start-column="36" selection-end-line="35" selection-end-column="36" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<caret line="51" column="30" lean-forward="true" selection-start-line="51" selection-start-column="30" selection-end-line="51" selection-end-column="30" />
<folding>
<element signature="e#0#31#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-34">
<caret line="1" selection-start-line="1" selection-end-line="1" selection-end-column="41" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/graph/matplot.py"> <entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442"> <state relative-caret-position="152">
<caret line="29" column="21" lean-forward="true" selection-start-line="29" selection-start-column="21" selection-end-line="29" selection-end-column="21" /> <caret line="377" lean-forward="true" selection-start-line="377" selection-end-line="377" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/mvp/refit.py"> <entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="222"> <state relative-caret-position="204">
<caret line="70" column="19" selection-start-line="70" selection-start-column="14" selection-end-line="70" selection-end-column="19" /> <caret line="171" lean-forward="true" selection-start-line="171" selection-end-line="171" />
<folding> <folding>
<element signature="e#0#30#0" expanded="true" /> <element signature="e#0#30#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/mvp/lgbreport.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="187">
<caret line="11" lean-forward="true" selection-start-line="11" selection-end-line="11" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="119">
<caret line="58" column="22" lean-forward="true" selection-start-line="58" selection-start-column="22" selection-end-line="58" selection-end-column="22" />
<folding>
<element signature="e#4120#4150#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component> </component>
<component name="masterDetails"> <component name="masterDetails">
<states> <states>
......
...@@ -48,12 +48,11 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled', ...@@ -48,12 +48,11 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled',
''' '''
fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1) fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1)
# datalist description # datalist description
if datalist_description is None: if datalist_description is None:
datalist_description = range(len(datalist)) datalist_description = range(len(datalist))
for table_index in range(len(datalist)): for table_index in range(len(datalist)):
# 每个table需要只有一个index,一个values # 每个table需要只有一个index,一个values
x = range(len(datalist[table_index].index)) x = range(len(datalist[table_index].index))
...@@ -302,3 +301,648 @@ def density_chart(dataset, title): ...@@ -302,3 +301,648 @@ def density_chart(dataset, title):
## plt.subplots(,figsize=fig_size,linewidth=0.1) ## plt.subplots(,figsize=fig_size,linewidth=0.1)
# #
# return 1 # return 1
##############################################################
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 17:45:08 2018
@author: olivia_deyu
"""
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pylab import rcParams
from sklearn import metrics
from xgboost import XGBClassifier
from xgboost import cv, DMatrix
warnings.filterwarnings('ignore')
# Fit Model ------------------------------------------------------------------------------------------
def train_model(df, predictors, resp, params, idcol, useTrainCV=True, trvalsplit='random', trvalsplitRatio=0.8,
                sort_col='applied_at'):
    """Split `df` into train/validation sets, fit an XGBClassifier, and return it.

    Parameters
    ----------
    df : pd.DataFrame
        Modelling sample; must contain `predictors`, `resp`, `idcol` and `sort_col`.
    predictors : list of str, feature column names
    resp : str, binary response column (0/1 — assumed, since its mean is reported as an event rate)
    params : dict, keyword arguments forwarded to XGBClassifier
    idcol : str, unique row identifier used to derive the validation set by exclusion
    useTrainCV : bool, passed through to `modelfit` to tune n_estimators via xgb.cv
    trvalsplit : {'random', 'timeSeries'} or anything else
        'random'     -> random `trvalsplitRatio` fraction for train (fixed random_state=1);
        'timeSeries' -> earliest rows by `sort_col` for train;
        any other value -> train on the full sample with no validation set.
    trvalsplitRatio : float, train fraction
    sort_col : str, timestamp column used for the chronological split and range reporting

    Returns
    -------
    (model, fts_imp) : fitted estimator and its non-zero feature-importance Series,
        as produced by `modelfit`.
    """
    print('Train/Val evnetRate over all: %s' % resp, df[resp].mean())
    if trvalsplit not in ('random', 'timeSeries'):
        # Unrecognised strategy: use everything for training, no validation set.
        train, val = df, None
        # print ('Specify methods of train/val split !')
        print('---------- train, no val -------------')
        print('eventRate on train: ', train[resp].mean(), '; sampleSize on train: ', train.shape, train[sort_col].min(),
              train[sort_col].max())
    else:
        if trvalsplit == 'random':
            # Random split (deterministic via fixed seed).
            train = df.sample(frac=trvalsplitRatio, random_state=1)
        else:
            # Chronological split: earliest rows become the training set.
            train = df.sort_values(by=sort_col).head(int(len(df) * trvalsplitRatio))
        # Validation set = rows whose id is not in the training set.
        val = df[~df[idcol].isin(train[idcol])]
        print('---------- train/val -------------')
        print('eventRate on train: ', train[resp].mean(), '; sampleSize on train: ', train.shape, train[sort_col].min(),
              train[sort_col].max())
        print('eventRate on val: ', val[resp].mean(), '; sampleSize on val: ', val.shape, val[sort_col].min(),
              val[sort_col].max())
    clf = XGBClassifier(**params)
    model, fts_imp = modelfit(clf, train, val, predictors, resp, useTrainCV=useTrainCV)
    return model, fts_imp
def modelfit(alg, dtrain, dval, predictors, resp, useTrainCV=True, cv_folds=10, early_stopping_rounds=20):
    """Fit `alg` on `dtrain`, report train/validation AUC, and return the model
    plus its feature importances.

    Parameters
    ----------
    alg : XGBClassifier instance (mutated in place: n_estimators may be reset by CV)
    dtrain : pd.DataFrame, training sample containing `predictors` and `resp`
    dval : pd.DataFrame or None, optional validation sample; skipped when None
    predictors : list of str, feature column names
    resp : str, binary response column
    useTrainCV : bool, if True run xgb.cv first to pick n_estimators via early stopping
    cv_folds : int, number of CV folds
    early_stopping_rounds : int, early-stopping patience for the CV run

    Returns
    -------
    (alg, feat_imp) : the fitted estimator and a pd.Series of importances
        (fscore indexed by predictor name), descending, zero-importance features dropped.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = DMatrix(dtrain[predictors].values, label=dtrain[resp].values)
        # Cross-validate with early stopping; the number of rows in cvresult is the
        # boosting-round count actually reached, which becomes the new n_estimators.
        cvresult = cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                      metrics=(['auc']), early_stopping_rounds=early_stopping_rounds, verbose_eval=100)  # True, )
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult, cvresult.shape)
    # Fit the algorithm on the data and save the model
    alg.fit(dtrain[predictors], dtrain[resp], eval_metric='auc')
    print('Model params: -----------')
    print(alg.n_estimators, alg.max_depth, alg.learning_rate)
    # joblib.dump(alg, '%s.pkl' %pklname)
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print Model Report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[resp].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[resp], dtrain_predprob))
    if dval is not None:
        # Predict validation Set:
        dval_predprob = alg.predict_proba(dval[predictors])[:, 1]
        print("AUC Score (Validation): %f" % metrics.roc_auc_score(dval[resp], dval_predprob))
    # Print Feature Importance:
    # get_fscore() keys are feature names (fit received a DataFrame), reindexed over
    # all predictors so unused features appear as NaN before being filtered out.
    feat_imp = pd.Series(alg.get_booster().get_fscore(), predictors).sort_values(ascending=False, na_position='last')
    # feat_imp = pd.Series(alg.booster().get_fscore(), predictors).sort(ascending=False)
    feat_imp = feat_imp[feat_imp > 0]
    print('----------- Feature importance -------------')
    print(feat_imp)
    return alg, feat_imp
# Univariate Chart ------------------------------------------------------------------------------------------
# Module-level side effect: sets the default matplotlib figure size (width, height in inches)
# for every chart drawn below.
rcParams['figure.figsize'] = 12, 8
def uniVarChart(df, feature, resp, n_bins=10, dfltValue=-99999, dftrain=False, dftest=False, drawAll=True,
                drawTrTe=False,saved_path='./mvp/plots/cache/'):
    """
    Draw a univariate chart (event rate per feature bin) for one feature on the
    full sample and, optionally, on train/test samples, then save it as a PNG.

    Parameters
    ----------
    df : pd.DataFrame
        Full sample; must contain `feature` and `resp`.
        NOTE: helper columns 'tmp' / 'tmp_lbl' are added as a side effect in
        the numeric-binning branch (same for dftrain/dftest when drawTrTe).
    feature : str
        Feature column to draw.
    resp : str
        Response column containing only 0/1 values.
    n_bins : int, default 10
        Only applies to numeric features with more distinct values than n_bins;
        otherwise one point per distinct value is drawn.
    dfltValue : numeric, default -99999
        Sentinel/default feature value; excluded from binning and drawn as a
        separate horizontal reference line.
    dftrain : pd.DataFrame or False
        Train sample (used only when drawTrTe is True).
    dftest : pd.DataFrame or False
        Test sample (used only when drawTrTe is True).
    drawAll : bool
        If True, draw the curve computed on the full sample.
    drawTrTe : bool
        If True, also draw curves computed on train and test samples.

    Returns
    -------
    str
        Path of the saved PNG file.
    """
    # Rows holding a real (non-sentinel) feature value.
    idx = (df[feature] != dfltValue)
    if n_bins > df[feature].nunique():
        # Few distinct values: treat the feature as categorical, one point per value.
        predictions, predictionsTr, predictionsTe = [], [], []
        qq, qqTr, qqTe = [], [], []
        n_bins = df[feature].nunique()
        feature_grid = sorted(df.loc[idx, feature].unique().tolist())
        for feature_val in feature_grid:
            predictions.append(df.loc[df[feature] == feature_val, resp].mean())
            qq.append(df.loc[df[feature] == feature_val, resp].count())
        if drawTrTe:
            for feature_val in feature_grid:
                predictionsTr.append(dftrain.loc[dftrain[feature] == feature_val, resp].mean())
                predictionsTe.append(dftest.loc[dftest[feature] == feature_val, resp].mean())
                qqTr.append(dftrain.loc[dftrain[feature] == feature_val, resp].count())
                qqTe.append(dftest.loc[dftest[feature] == feature_val, resp].count())
            predictionsTr = np.round(predictionsTr, 3)
            predictionsTe = np.round(predictionsTe, 3)
        else:
            pass
        fig1 = plt.figure(11)
        xindex = list(range(1, len(feature_grid) + 1))
        if drawAll:
            plt.plot(xindex, predictions, 'bo-', label='%s' % 'all')
            # NOTE(review): this text reports counts of the FULL sample (qq)
            # although the label says "training Sample" -- confirm intent.
            plt.gcf().text(0.6, 0.60, 'training Sample: %s' % qq, fontsize=9)
        else:
            pass
        if drawTrTe:
            plt.plot(xindex, predictionsTr, 'co-', label='%s' % 'train')
            plt.plot(xindex, predictionsTe, 'mo-', label='%s' % 'test')
            plt.gcf().text(0.6, 0.55, 'Validation Data Sample: %s' % qqTr, fontsize=9)
            plt.gcf().text(0.6, 0.50, 'Validation Data eventR: %s' % predictionsTr, fontsize=9)
            plt.gcf().text(0.6, 0.45, 'Test Data Sample: %s' % qqTe, fontsize=9)
            plt.gcf().text(0.6, 0.40, 'Test Data eventR: %s' % predictionsTe, fontsize=9)
        else:
            pass
        # Reference lines: overall event rate and event rate at the sentinel value.
        plt.axhline(y=df[resp].mean(), color='k', linestyle='-.', label='eventR_all')
        plt.axhline(y=df.loc[df[feature] == dfltValue, resp].mean(), color='r', linestyle='--', label='dflVal_eventR')
        plt.gcf().text(0.6, 0.7, 'Categorical value:', fontsize=9)
        plt.gcf().text(0.6, 0.65, 'feature grid: %s' % [str(int(x)) for x in feature_grid], fontsize=9)
        plt.subplots_adjust(right=0.59)
    else:
        # Numeric feature: bin by the deciles of the non-sentinel values.
        feature_grid = sorted(
            list(set(df.loc[idx, feature].describe(percentiles=[.1, .2, .3, .4, .5, .6, .7, .8, .9])[3:].values)))
        feature_grid[-1] = feature_grid[-1] + 1  # widen the last edge so the max value is included
        df['tmp'] = 99999
        _tmp = pd.cut(df.loc[idx, feature], feature_grid, include_lowest=True)
        df.loc[idx, 'tmp'] = _tmp
        df.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
        tt = df[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
        tt.rename(columns={'mean': 'allEvntR', 'count': 'allSpl', 'sum': 'allEvnt'}, inplace=True)
        if drawTrTe:
            # Train sample
            dftrain['tmp'] = 99999
            _tmp = pd.cut(dftrain.loc[idx, feature], feature_grid, include_lowest=True)
            dftrain.loc[idx, 'tmp'] = _tmp
            dftrain.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
            ttr = dftrain[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
            ttr.rename(columns={'mean': 'trEvntR', 'count': 'trSpl', 'sum': 'trEvnt'}, inplace=True)
            # Test sample
            dftest['tmp'] = 99999
            _tmp = pd.cut(dftest.loc[idx, feature], feature_grid, include_lowest=True)
            dftest.loc[idx, 'tmp'] = _tmp
            dftest.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
            tte = dftest[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
            tte.rename(columns={'mean': 'teEvntR', 'count': 'teSpl', 'sum': 'teEvnt'}, inplace=True)
            _aa = pd.concat([tt, ttr, tte], axis=1)
        else:
            _aa = tt
        # BUGFIX: DataFrame.sortlevel() was deprecated and removed from pandas;
        # sort_index(level=...) is the supported equivalent.
        _aa = _aa.sort_index(level=1)
        if len(feature_grid) != len(_aa['allEvntR']) + 1:
            strss = '\n有的分段内没有数据!!!-----------------------------------'
        else:
            strss = '\n'
        print(strss)
        fig1 = plt.figure(11)
        xindex = list(_aa.index.get_level_values('tmp_lbl'))
        if drawAll:
            plt.plot(xindex, _aa['allEvntR'], 'bo-', label='%s' % 'all')
        else:
            pass
        if drawTrTe:
            plt.plot(xindex, _aa['trEvntR'], 'co-', label='%s' % 'train')
            plt.plot(xindex, _aa['teEvntR'], 'mo-', label='%s' % 'test')
        else:
            pass
        plt.axhline(y=df[resp].mean(), color='k', linestyle='-.', label='eventR_all')
        plt.axhline(y=df.loc[df[feature] == dfltValue, resp].mean(), color='r', linestyle='--', label='dflVal_eventR')
        plt.gcf().text(0.6, 0.7, '%s' % strss, fontsize=10)
        plt.gcf().text(0.6, 0.3, '%s' % _aa, fontsize=10)
        plt.subplots_adjust(right=0.59)
    # (a redundant second subplots_adjust call was removed; both branches
    # above already apply it)
    plt.title('Univariate Chart of %s' % feature)
    plt.ylabel('evnet Rate')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
    plt.savefig(saved_path + 'Univariate Chart of %s' % feature + ".png")
    plt.show()
    return (saved_path + 'Univariate Chart of %s' % feature + ".png")
# PDP_chart --------------------------------------------------------------------------------------------------
def pdpChart(model, df, var, predictors, n_bins, dfltValue, maxVal, saved_path="./mvp/plots/cache/"):
    """
    Draw a PDP (partial-dependence) chart for one feature on the current axes.

    The feature is swept over a grid of values; for each grid value the whole
    (sub)sample is scored with the feature forced to that value, and the mean
    score is plotted.

    Parameters
    ----------
    model : trained model
        predict_proba is tried first; predict is used as a fallback.
    df : pd.DataFrame
        Contains all features used in the model.
    var : str
        Feature to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int
        Number of grid points; only works with numeric data.
    dfltValue : numeric
        Sentinel/default value for this feature (drawn as a reference line).
    maxVal : boolean or numeric
        Designed max value for this feature; when falsy the observed max is used.
    saved_path : str
        NOTE(review): currently unused -- this function draws but never saves;
        confirm whether a savefig call was intended here.

    Returns
    -------
    None (draws on the current matplotlib axes)
    """
    # Rows holding a real (non-sentinel) feature value.
    idx = (df[var] != dfltValue)
    if n_bins > df[var].nunique():
        # Few distinct values: use each distinct value as its own grid point.
        n_bins = df[var].nunique()
        feature_grid = [dfltValue] + sorted(df.loc[idx, var].unique().tolist())
    else:
        feature_grid = range(n_bins)
        if maxVal:
            # Evenly spaced grid from the observed min up to the designed max.
            feature_grid = [dfltValue] + [df.loc[idx, var].min() + val * (maxVal - df.loc[idx, var].min()) / n_bins for
                                          val in feature_grid]
        else:
            # Evenly spaced grid from the observed min up to the observed max.
            feature_grid = [dfltValue] + [
                df.loc[idx, var].min() + val * (df.loc[idx, var].max() - df.loc[idx, var].min()) / n_bins for val in
                feature_grid]
    # print (var, feature_grid)
    # Cap the scoring sample at 10k rows for speed (deterministic seed).
    if df.shape[0] > 10000:
        x_small = df.sample(n=10000, random_state=77)
    else:
        x_small = df
    predictions = []
    for feature_val in feature_grid:
        x_copy = x_small.copy()
        x_copy[var] = feature_val
        try:
            predictions.append(model.predict_proba(x_copy[predictors])[:, 1].mean())
        except Exception:
            # Regressors / models without predict_proba.
            predictions.append(model.predict(x_copy[predictors]).mean())
    # Index 0 of the grid is the sentinel value; it is drawn as a line, not a point.
    xindex = feature_grid[1:]
    plt.plot(xindex, predictions[1:], 'bo-', label='%s' % var)
    try:
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--', label='scoreAvg')
    except Exception:
        plt.axhline(y=model.predict(x_small[predictors]).mean(), color='k', linestyle='--',
                    label='scoreAvg')
    plt.axhline(y=predictions[0], color='r', linestyle='--', label='dfltValue')
    plt.title('pdp Chart of %s' % var)
    plt.ylabel('Score')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
def pdpCharts9(model, df, collist, predictors, n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path="./mvp/plots/cache/"):
    """
    Draw PDP charts for several features, 9 subplots per figure, and save a PNG.

    Parameters
    ----------
    model : trained model
    df : pd.DataFrame
        Contains all features used in the model.
    collist : list of str
        Features to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int, default 10
        Grid points per feature (numeric data only).
    dfltValue : numeric, default -99999
        Sentinel/default value for the features.
    maxValRatio : numeric, default 1
        Quantile used as each feature's max grid value.
    saved_path : str
        Directory the PNG is written to.

    Returns
    -------
    str
        Path of the saved PNG.
        NOTE(review): savefig runs once after the loop, so when more than one
        figure is created only the last one is saved, and the filename embeds
        the figure COUNT (cntPlt), not a figure index -- confirm intent.
    """
    lenth = len(collist)
    # One figure per 9 features.
    cntPlt = int(np.ceil(lenth / 9))
    figlist = []
    for i in list(range(1, cntPlt + 1)):
        fig = plt.figure(i)
        figlist.append(fig)
        j = 1
        for col in collist[(i - 1) * 9:i * 9]:
            plt.subplot(3, 3, j)
            pdpChart(model, df, col, predictors, n_bins, dfltValue=dfltValue, maxVal=df[col].quantile(maxValRatio))
            j += 1
        plt.tight_layout()
    #plt.show()
    plt.savefig(saved_path + 'pdp Chart with 9 {}'.format(str(cntPlt)) + ".png")
    plt.show()
    return (saved_path + 'pdp Chart with 9 {}'.format(str(cntPlt)) + ".png")
def pdpChart_new(model, df, var, predictors, n_bins, dfltValue, maxValRatio=1):
    """
    Draw a PDP chart for one feature using a quantile-based grid.

    Unlike pdpChart, the grid points are quantiles of the non-sentinel values,
    and the x axis is the quantile position (ticks show the actual values).

    Parameters
    ----------
    model : trained model exposing predict_proba
    df : pd.DataFrame
        Contains all features used in the model.
    var : str
        Feature to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int
        Number of quantile bins; only works with numeric data.
    dfltValue : numeric
        Sentinel/default value; values <= dfltValue are treated as defaults.
    maxValRatio : numeric, default 1
        Quantile used as the feature's max value.

    Returns
    -------
    None (draws on the current matplotlib axes)
    """
    maxVal = df[var][df[var] > dfltValue].quantile(maxValRatio)
    # feature_grid
    idx = ((df[var] > dfltValue) & (df[var] <= maxVal))
    # Give the sentinel value its own grid point only if it actually occurs.
    if sum((df[var] <= dfltValue)) > 0:
        feature_grid = [dfltValue]
    else:
        feature_grid = []
    bin_index = []
    for i in range(0, n_bins + 1):
        bin_index.append(i * 1.0 * maxValRatio / n_bins)
    # Grid = quantiles of the real values (+ the sentinel, when present).
    feature_grid = sorted(list(df.loc[idx, var].quantile(bin_index)) + feature_grid)
    print(var, len(df.loc[idx, var]), feature_grid)
    # Cap the scoring sample at 10k rows for speed (deterministic seed).
    if df.shape[0] > 10000:
        x_small = df.sample(n=10000, random_state=77)
    else:
        x_small = df
    # Mean score with the feature forced to each grid value.
    predictions = []
    for feature_val in feature_grid:
        x_copy = x_small.copy()
        x_copy[var] = feature_val
        predictions.append(model.predict_proba(x_copy[predictors])[:, 1].mean())
    # Plot: when the sentinel is part of the grid it is skipped on the curve
    # and drawn as the 'dfltValue' reference line instead.
    if feature_grid[0] != dfltValue:
        xindex = feature_grid[:]
        plt.plot(bin_index, predictions[:], 'bo-', label='%s' % var)
        plt.xticks(bin_index, ['%.2f' % i for i in feature_grid])
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--',
                    label='scoreAvg')
    else:
        xindex = feature_grid[1:]
        plt.plot(bin_index, predictions[1:], 'bo-', label='%s' % var)
        plt.xticks(bin_index, ['%.2f' % i for i in feature_grid[1:]])
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--',
                    label='scoreAvg')
    plt.axhline(y=predictions[0], color='r', linestyle='--', label='dfltValue')
    plt.title('pdp Chart of %s' % var)
    plt.ylabel('Score')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
def pdpCharts9_new(model, df, collist, predictors, n_bins=10, dfltValue=-99999, maxValRatio=1):
    """
    Draw quantile-based PDP charts for several features, 9 subplots per figure.

    Parameters
    ----------
    model : trained model exposing predict_proba
    df : pd.DataFrame
        Contains all features used in the model.
    collist : list of str
        Features to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int, default 10
        Number of quantile bins (numeric data only).
    dfltValue : numeric, default -99999
        Sentinel/default value for the features.
    maxValRatio : numeric, default 1
        Quantile used as each feature's max value.

    Returns
    -------
    list of matplotlib figures, at most 9 subplots each
    """
    lenth = len(collist)
    # One figure per 9 features.
    cntPlt = int(np.ceil(lenth / 9))
    figlist = []
    # BUGFIX: the loop previously ran over range(1, cntPlt + 2), which always
    # produced one extra, empty trailing figure (and is inconsistent with
    # pdpCharts9). ceil() already accounts for the remainder.
    for i in range(1, cntPlt + 1):
        fig = plt.figure(i)
        figlist.append(fig)
        j = 1
        for col in collist[(i - 1) * 9:min(i * 9, lenth)]:
            plt.subplot(3, 3, j)
            pdpChart_new(model, df, col, predictors, n_bins, dfltValue=dfltValue, maxValRatio=maxValRatio)
            j += 1
        plt.tight_layout()
    plt.show()
    return figlist
# liftChart ------------------------------------------------------------------------------------------
# Module-level default: wider 16x8 inch figures for the two-panel lift charts below.
rcParams['figure.figsize'] = 16, 8
def cal_rate(df, resp, lenth):
    """
    Summarize one score bucket as a single-row DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Rows belonging to one bucket; must contain `resp`.
    resp : str
        0/1 response column.
    lenth : int
        Total sample size. Kept for interface compatibility; only the
        commented-out 'rate' metric would use it.

    Returns
    -------
    pd.DataFrame
        One row with columns cntLoan / event / eventRate.
    """
    bucket_stats = {
        'cntLoan': len(df),
        'event': df[resp].sum(),
        # 'rate' : len(df)/lenth,
        'eventRate': df[resp].mean(),
    }
    return pd.DataFrame.from_dict(bucket_stats, orient='index').T
def show_result(df, var, resp, n_bins, label=None):
    """
    Draw a lift chart (per-quantile event rate) and an accumulative lift chart
    for one score column, side by side on the current figure.

    Parameters
    ----------
    df : pd.DataFrame
        At least contains `var` and `resp`.
        NOTE: a helper column 'bkl_<var>' is added to df as a side effect.
    var : str
        Score column to draw.
    resp : str
        Response column containing only 0/1 values.
    n_bins : int
        Number of quantile buckets; duplicate quantile edges are dropped, so
        fewer buckets may actually appear than the title claims.
    label : str, optional
        Legend name for the score; defaults to `var`.

    Returns
    -------
    None (draws two subplots on the current figure)
    """
    # BUGFIX: identity comparison instead of '== None' (PEP 8; also safe for
    # objects overriding __eq__).
    if label is None:
        label = var
    df['bkl_%s' % var] = pd.qcut(df[var], n_bins, duplicates='drop')
    lenth = len(df)
    # Per-bucket counts/event rates, one row per bucket.
    r1 = df.groupby('bkl_%s' % var).apply(lambda x: cal_rate(x, resp, lenth)).reset_index(level=1, drop=True)
    # r1['accumRate'] = r1['rate'].cumsum()
    r1['acmLoan'] = r1['cntLoan'].cumsum()
    r1['acmEvent'] = r1['event'].cumsum()
    r1['acmEventRate'] = r1['acmEvent'] / r1['acmLoan']
    print(label)
    print(r1)
    # Compute the AUC once; the same legend text is used in both panels.
    auc_label = 'Auc of %s:%.3f' % (label, np.round(metrics.roc_auc_score(df[resp], df[var]), 3))
    # plot lift_chart - marginal
    plt.subplot(1, 2, 1)
    # xtickss = r1.index
    r1.reset_index(drop=True, inplace=True)
    r1.index = r1.index + 1  # number buckets from 1
    # r1.index = range(1, n_bins+1)
    plt.plot(r1.index, r1['eventRate'], marker='o', label=auc_label)  # linestyle='--'
    plt.title('EventRate in %d Quantiles' % n_bins)
    plt.ylabel('eventRate')
    plt.grid(True)
    # plt.xticks(r1.index, xtickss, rotation = 70)
    plt.legend(fontsize=13, loc=2, framealpha=0.5)
    # plot lift_chart - accumulative
    plt.subplot(1, 2, 2)
    plt.plot(r1.index, r1['acmEventRate'], marker='o', label=auc_label)  # linestyle='--'
    plt.title('Accum-EventRate in %d Quantiles' % n_bins)
    plt.ylabel('accumEventRate')
    # plt.xticks(r1.index, xtickss, rotation = 70)
    plt.grid(True)
    plt.legend(fontsize=13, loc=2, framealpha=0.5)
    plt.tight_layout()
# TDR_analysis ------------------------------------------------------------------------------------------
from collections import Counter
def tdr_rule(df, predictors, score, n_bins=10, dfltValue=-99999):
    '''
    Build turn-down rules on the full sample.

    For every predictor the sample is split into (roughly) equal-frequency
    bins and the mean model score per bin is recorded, so low-score bins can
    later be quoted as turn-down reasons (see tdr_result).

    Parameters
    ----------
    df : pd.DataFrame
        Full sample; not modified (a copy is used).
    predictors : list of str
        Feature names.
    score : str
        Model-score column.
    n_bins : int, default 10
        Number of equal-frequency bins per feature.
    dfltValue : numeric, default -99999
        Sentinel/default feature value; always used as the left-most bin edge.

    Returns
    -------
    dict_rule : dict
        {feature_name: {'lst': list of bin edges,
                        'mean': {bin position -> mean score in that bin},
                        'min': min of the per-bin mean scores}}
    '''
    data = df.copy()
    dict_rule = {}
    for col in predictors:
        temp_dict = {}
        # Equal-frequency edges: sort by the feature, then cut the positional index.
        data.sort_values(col, inplace=True)
        data.reset_index(drop=True, inplace=True)
        bins = pd.qcut(data.index, n_bins)
        # "max" as a string avoids pandas' builtin-callable deprecation; the
        # resulting column is still named 'max'.
        group = data.groupby(bins)[col].agg(["max"]).reset_index(level=[0])
        group["max"] = group["max"].apply(lambda x: round(x, 2))
        lst = sorted(list(set(group["max"])))
        # Ensure the sentinel value is the left-most edge exactly once.
        # (Previously this was a no-op reassignment when lst[0] == dfltValue.)
        if lst[0] != dfltValue:
            lst.insert(0, dfltValue)
        temp_dict["lst"] = lst
        # Mean score per feature bin.
        bins = pd.cut(data[col], lst)
        group = data.groupby(bins)[score].agg(["mean", "count"]).reset_index(level=[0])
        group["mean"] = group["mean"].apply(lambda x: np.round(x, 4))
        temp_dict["min"] = group["mean"].min()
        temp_dict["mean"] = group[["mean"]].to_dict()["mean"]
        dict_rule[col] = temp_dict
    return dict_rule
def tdr_result(df, predictors, idcol, score, dict_rule, dfltValue, topX=10):
    """
    List the turn-down reasons for each sample.

    For every row, each predictor's value is located in its rule bins (from
    tdr_rule) and the gap between that bin's mean score and the feature's best
    (minimum) bin mean is used as the feature's "contribution" to the
    turn-down. Features are ranked by that gap, descending.

    Parameters
    ----------
    df : pd.DataFrame
        Normally the turned-down samples.
    predictors : list of str
    idcol : str
        Name of the id column, e.g. loan_id.
    score : str
        Model-score column.
    dict_rule : dict
        Turn-down rules generated by tdr_rule on the full sample.
    dfltValue : numeric
        Default value for these predictors (kept for interface compatibility).
    topX : int, default 10
        How many top turn-down reasons to keep per sample.

    Returns
    -------
    dict_result : dict
        {id value: {score: score value,
                    'top<topX>Rsns': [(feature, contribution), ...]}}
    """
    dict_result = {}
    for _, sample in df.copy().iterrows():
        contrib = {}
        for feat in predictors:
            rule = dict_rule[feat]
            # Find the first bin edge the value falls under.
            for pos, edge in enumerate(rule["lst"]):
                if sample[feat] <= edge:
                    # Values at/below the very first edge count as the first bin.
                    bin_idx = max(pos, 1)
                    contrib[feat] = rule["mean"][bin_idx - 1] - rule["min"]
                    break
        ranked = sorted(contrib.items(), key=lambda kv: kv[1], reverse=True)
        entry = {score: sample[score], 'top%dRsns' % topX: ranked[:topX]}
        dict_result[sample[idcol]] = entry
    return dict_result
def tdr_analysis(df, predictors, idcol, score, dict_rule, dfltValue, topX=10):
    """
    For each turned-down sample take its ranked turn-down reasons (via
    tdr_result) and print, for rank 1/2/3 separately, the 3 features that most
    often occupy that rank together with their sample frequency.

    Parameters
    ----------
    df : pd.DataFrame, normally the turned-down samples
    predictors : list of str
    idcol : str, name of the id column, e.g. loan_id
    score : str, model-score column
    dict_rule : dict, turn-down rules built by tdr_rule on the full sample
    dfltValue : numeric, default value for these predictors
    topX : int, default 10, how many reasons tdr_result keeps per sample

    Returns
    -------
    dict_result : dict
        The per-sample result of tdr_result, unchanged.
        NOTE(review): assumes topX >= 3 and that every sample has at least 3
        ranked reasons; otherwise the rank-2/3 lookups raise IndexError.
    """
    dict_result = tdr_result(df, predictors, idcol, score, dict_rule, dfltValue, topX=topX)
    # print (dict_result)
    lenth = len(dict_result)
    top1var, top2var, top3var = [], [], []
    # Collect which feature occupies rank 1/2/3 for every sample.
    for i in dict_result.keys():
        top1var.append(dict_result[i]['top%dRsns' % topX][0][0])
        top2var.append(dict_result[i]['top%dRsns' % topX][1][0])
        top3var.append(dict_result[i]['top%dRsns' % topX][2][0])
    # Share of samples for the 3 most common features at each rank.
    top1Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top1var).most_common(3)]
    top2Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top2var).most_common(3)]
    top3Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top3var).most_common(3)]
    print('3 most-common candidates in top1Reason (variable, frequency): -------- \n', top1Rsn)
    print('3 most-common candidates in top2Reason (variable, frequency): -------- \n', top2Rsn)
    print('3 most-common candidates in top3Reason (variable, frequency): -------- \n', top3Rsn)
    return dict_result
\ No newline at end of file
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_auc_score
from models_kit import lightgbm
from models_kit import xgboost
def topN_feature_importance_plot(model, clf, title="untitled", save_path='./mvp/plots/', topN=20):
def topN_feature_importance(classifier, clf ,mode , topN=20):
''' '''
plot feature importance squence plot feature importance squence
params:
classifier
''' '''
plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226 # 图片像素 plt.rcParams['savefig.dpi'] = 226 # 图片像素
plt.rcParams['figure.dpi'] = 200 # 分辨率 plt.rcParams['figure.dpi'] = 200 # 分辨率
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
classifier.plot_importance(clf, max_num_features=topN) model.plot_importance(clf, max_num_features=topN)
plt.title("Feature Importances") plt.title("Feature Importances")
path = save_path + title + "_featureImportance.png"
plt.savefig(path)
plt.show() plt.show()
return path
def topN_feature_importance_list(features, clf, topN=3):
'''
instructions : return topN_feature_importance dataframe
:param features:
:param clf:
:param topN:
:return:
'''
importanct_feat = pd.DataFrame({
'column': features,
'importance': clf.feature_importance(),
}).sort_values(by='importance', ascending=False).column.tolist()[:3]
return importanct_feat
def model_selection(algorthm,clf,df_train,df_val,df_test,target,score,optimal_model,model_obj):
# model matrix 存储不同模型指标的矩阵
model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
model_matrix = pd.DataFrame(['NULL', 'NULL', roc_auc_score(df_train[target], df_train[score]),
roc_auc_score(df_train[target], df_train[score])], index=model_matrix_index,
columns=['线上模型'])
# 定义最优参指针
pointer = 0
# 遍历最优参组合
for param in optimal_para:
if algorthm == "lightGBM":
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
adds_on=param, target=target)
model_matrix = pd.concat([model_matrix,
pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index,
columns=[pointer])], axis=1)
pointer += 1
# 简单选取一下validation set auc 最高的 params
best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
\ No newline at end of file
...@@ -210,7 +210,7 @@ class dhb: ...@@ -210,7 +210,7 @@ class dhb:
and datediff(now(),deadline) > ''' + str(passdue_day) + ''' and datediff(now(),deadline) > ''' + str(passdue_day) + '''
''' '''
def dhb_features_extract(self,df): def dhb_features_prepocessing(self,dhb_loan):
try: try:
value_map = { value_map = {
"近3天": 1, "近3天": 1,
...@@ -229,12 +229,12 @@ class dhb: ...@@ -229,12 +229,12 @@ class dhb:
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period)) # print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data # use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period', # print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period)) # self.end_time_period))
dhb_loan = pd.read_sql( # dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period), # self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis) # mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[ "dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
...@@ -267,9 +267,9 @@ class dhb: ...@@ -267,9 +267,9 @@ class dhb:
dhb_loan.loc[ dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300 dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv") # dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime( # print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本") # time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# ignore exceptions such as "colmns doesn't exist" # ignore exceptions such as "colmns doesn't exist"
except Exception as e: except Exception as e:
print("data preprocessing ERR ",e) print("data preprocessing ERR ",e)
......
import pandas as pd '''
import numpy as np @allocator V1.0
'''
#################################################### report settings ###################################################
from models_obj import dhb_obj
import datetime import datetime
from tools import datacal import pandas as pd
import os
from mvp import refit from mvp import refit
from mvp import rebuild from tools import datacal
from models_obj import dhb_obj
###### global variable ######
# label
target = 'target'
#############################
dhb = dhb_obj.dhb()
df_sample = dhb.dhb_features_extract()
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ',dhb.start_time_period,'-',dhb.end_time_period)
print('----no.',len(features),'of samples of dhb----')
# to save model performance
if __name__ == '__main__':
# data extraction
''' ## Old Edition here
# if total sample more than 30000, it would use train-validation-test
# else use CV to parameters tuning
# if len(df_sample) >= 30000:
# df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
# 默认取样本方法
df_train, df_val, df_test = datacal.train_test_split_general()
# model refit
#xgboost
xgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
xgb_model_auc['training_auc'] = None
xgb_model_auc['val_auc'] = None
#xgbreport.report(df_train, df_test, df_val, features, target, '','dhb模型迭代报告.doc', kfold = 2)
## 待加入 : xgb 各dataset的 auc, KA 渠道 / 客群 的 auc
#ligthtgbm
lgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
lgb_model_auc['training_auc'] = None
lgb_model_auc['val_auc'] = None
#dftrain,dftest = datacal.split_train_val(df_sample,trainsplit = 'timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
#lgbreport.report(df_train, df_test, df_val, features, target,'','dhb模型迭代报告.doc', kfold = 2)
# merge as single dataframe full of models
#pd.DataFrame(xgb_model)
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
#== 模型名称
model_name='dhb'
#== 目标是15天
passdue_day=15
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# 记录样本信息
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
# 渠道列表
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
# 申请类型列表
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# workspace 路径
worksapce = 'E:\\bla\\model_mvp\\'
# 样本路径
sample_path = 'E:\\model\\model_mvp\\mvp\\sample.csv'
# N+标签
target = 'target'
#################################################### report settings ############################################################################# # 线上模型分字段
score = 'score'
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'} # 预测模型分字段
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'} prediction = 'predict'
# refit / rebuild sequence # 报告生成路径
report_path = worksapce
# 报告名称
report_name = "lgb_report.docx"
# 切换到workspace目录下 避免相对路径不能识别问题
os.chdir(worksapce)
#################################################### training settings #################################################
# 生成电话帮对象(使用默认参数) # 生成电话帮对象(使用默认参数)
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15) dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# 需要对特征进行调整时,在这里直接dhb.features = 赋值即可
# 提取样本 # 提取样本
df_sample = dhb.dhb_features_extract() #df_sample = dhb.dhb_features_extract()
# 这里直接使用csv读入样本
# 备份df_sample df_sample = pd.read_csv(sample_path,engine='python')
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# 电话帮数据处理 # 电话帮数据处理
# report sequence # 自定义方法 / 默认数据处理方法
df_sample = dhb.dhb_features_prepocessing(df_sample)
# 备份df_sample
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# 默认样本划分
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
random_state=7, split_methods='random',
time_label='applied_at')
del df_sample
# 模型refit
model_matrix, lgbm = refit.model_fit(df_sample, dhb, target, score)
print(model_matrix)
# 生成报告
status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target,
score, prediction, report_path, report_name, applied_from, applied_type, topN=3)
......
...@@ -7,137 +7,177 @@ import lightgbm as lgb ...@@ -7,137 +7,177 @@ import lightgbm as lgb
from graph import matplot from graph import matplot
from tools import filetool from tools import filetool
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
from models_kit import general_methods
from docx.shared import Inches
import math
import pandas as pd
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15) def model_fit(df_train, df_val, df_test, model_obj, target, score):
# 提取样本 '''
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv',engine='python')
target = 'target'
score = 'score'
prediction = 'predict'
############################ :param df_train: 训练集
# 备份df_sample :param df_val: 验证集
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx") :param df_test: 测试集
:param model_obj: 线上模型对象
:param target: 目标列标签(逾期率标签 1 and 0)
# 默认样本划分 :param score: 线上分字段
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target, :return:
random_state=7,split_methods='random', model_matrix - 不同模型的同一算法运行结果指标二维表
time_label='applied_at') lgbm - 验证集上选择的最优分类器
del df_sample
# 用交叉验证获取最优参optimal_para和对应参数在CV验证集上最优AUC列表topn '''
optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target=target,
topN=3, cv_fold=5) # 用交叉验证获取最优参optimal_para和对应参数在CV验证集上最优AUC列表topn
print('topn 通过train交叉验证得到的auc ',topn) optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, model_obj.features, df_train, df_val, target=target,
topN=3, cv_fold=5)
# model matrix
model_matrix_index = ['name','Params','trainAUC','validationAUC'] print('topn 通过train交叉验证得到的auc ',topn)
model_matrix = pd.DataFrame(['NULL','NULL',roc_auc_score(df_train[target],df_train[score]),roc_auc_score(df_train[target],df_train[score])],index=model_matrix_index,columns=['线上模型'])
# model matrix 存储不同模型指标的矩阵
pointer = 0 model_matrix_index = ['name','Params','trainAUC','validationAUC']
for param in optimal_para: model_matrix = pd.DataFrame(['NULL','NULL',roc_auc_score(df_train[target],df_train[score]),roc_auc_score(df_train[target],df_train[score])],index=model_matrix_index,columns=['线上模型'])
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
adds_on=param, target=target) # 定义最优参指针
model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])],axis=1) pointer = 0
pointer += 1 # 遍历最优参组合
for param in optimal_para:
# 简单选取一下validation set auc 最高的 params train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
best_params = model_matrix.T.sort_values(by='validationAUC',ascending=False).iloc[0,:].loc['Params'] adds_on=param, target=target)
model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])],axis=1)
# 用新参数(optimal_para)训练模型,adds_on是需要修改的参数字典,输出feature Importance pointer += 1
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
adds_on=best_params, target='target') # 简单选取一下validation set auc 最高的 params
best_params = model_matrix.T.sort_values(by='validationAUC',ascending=False).iloc[0,:].loc['Params']
# 用新模型预测结果
predictions ,test_auc = lightgbm.predict(lgbm,df_test,dhb.features,target) # 用新参数(optimal_para)训练模型,adds_on是需要修改的参数字典,输出feature Importance
# 把新的预测结果加入test train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
df_test[prediction] = predictions adds_on=best_params, target='target')
return model_matrix, lgbm
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
####################################
### report ###################################### 生成报告 ################################################################
import os def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
os.chdir("E:/bla/model_mvp/") score, prediction, report_path, report_name, applied_from, applied_type, topN=3):
'''
# plot feature importance
topnfeat_path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./mvp/plots/cache/', topN=20) :param clf: 模型分类器对象
:param df_train: 训练集
importanct_feat = pd.DataFrame({ :param df_val: validation set
'column': dhb.features, :param df_test: 测试集
'importance': lgbm.feature_importance(), :param model_obj: 线上模型对象
}).sort_values(by='importance',ascending=False).column.tolist()[:3] :param target: 目标列标签(逾期标签)
:param model_matrix: 模型对比二维表
# report file :param score: 线上模型分数字段
report_path = "E:/bla/model_mvp/" :param prediction: 模型预测分数字段
report_name = "lgb_report.docx" :param report_path: 报告生成路径
:param report_name: 报告生成名
# 生成docx Documents :param applied_from: 报告中包含的渠道字典
document = filetool.buildDocument(report_path, report_name) :param applied_type: 报告中包含的申请类型字典
:param topN: 前N个特征(重要性)
# docx加入title :return:
document.add_heading('lightGBM 算法refit报告') status : 返回1表示执行完成
'''
# docx新增 特征权重段
document.add_paragraph('特征权重图') # 用新模型预测结果 xgb还需要加一个proba (TODO here)
predictions ,test_auc = lightgbm.predict(clf,df_test,model_obj.features,target)
# docx加入特征权重图像
document.add_picture(topnfeat_path) # 把新的预测结果加入test
df_test[prediction] = predictions
# 新增 univar_chart段
document.add_paragraph('univar_chart') # plot feature importance
topnfeat_path = general_methods.topN_feature_importance_plot(lgb, clf, title="untitled", save_path='./mvp/plots/cache/', topN=20)
# 遍历目标features画出univarchart
for i in importanct_feat: # 获取前N个权重特征列表
univar_train = datacal.cal_univar(df_train, i, target, qcut=10) importanct_feat = general_methods.topN_feature_importance_list(model_obj.features, clf, topN=3)
univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
univar_test = datacal.cal_univar(df_test, i, target, qcut=10) # 生成docx Documents
tab_df_list = [univar_train,univar_val,univar_test] document = filetool.buildDocument(report_path, report_name)
univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
tab_df_list=tab_df_list, plot_tab=False, # docx加入title
saved_path='./mvp/plots/cache/') document.add_heading('lightGBM 算法refit报告')
document.add_picture(univarChart)
# docx新增 特征权重段
document.add_paragraph('PDP_chart') document.add_paragraph('特征权重图')
# 遍历目标features 画出对应PDP
for i in importanct_feat: # docx加入特征权重图像
pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10) document.add_picture(topnfeat_path)
pdpChart = matplot.plot_table_df(pdp, ['1'], title=i + ' PDP Chart', X_label=None, y_label=None,
tab_df=None, plot_tab=True, saved_path='./mvp/plots/cache/') # 新增 univar_chart段
document.add_picture(pdpChart) document.add_paragraph('univar_chart')
# 遍历目标features画出univarchart
for i in importanct_feat:
# 训练集 univar
filetool.saveDocument(document, report_path, report_name) univar_chart = matplot.uniVarChart(df_train, i, target, n_bins=10, dfltValue=-99999, dftrain=df_val, dftest=df_test, drawAll=True,
drawTrTe=False, saved_path='./mvp/plots/cache/')
document.add_paragraph('lift_chart') # univar_train = datacal.cal_univar(df_train, i, target, qcut=10)
# 遍历给定渠道 & 客群 默认等频画出liftchart # # validation univar
try: # univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
for channel in ['333','159537','1,214,217,198']: # # test集 univar
for type in ['1','2','3']: # univar_test = datacal.cal_univar(df_test, i, target, qcut=10)
df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)] # 用于univarChart画图的参数列表
lift_pred = datacal.cal_lift(df_sliced,score=prediction) #tab_df_list = [univar_train,univar_val,univar_test]
lift_online = datacal.cal_lift(df_sliced,score=score) # 调用plot_table_list
#liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/') #univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
liftChart = matplot.plot_table_list([lift_pred, lift_online], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=None, # tab_df_list=tab_df_list, plot_tab=False,
title= applied_from[channel]+applied_type[type]+ ' lift Chart', X_label=None, y_label=None, # saved_path='./mvp/plots/cache/')
tab_df_list=tab_df_list, plot_tab=False, document.add_picture(univar_chart,width=Inches(8))
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart) # 新增pdp段
except: document.add_paragraph('PDP_chart')
pass # 遍历目标features 画出对应PDP
for i in range(math.ceil(len(importanct_feat)/9)):
filetool.saveDocument(document, report_path, report_name) # pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
# pdpChart = matplot.plot_table_df(pdp, ['1'], title=i + ' PDP Chart', X_label=None, y_label=None,
# tab_df=None, plot_tab=True, saved_path='./mvp/plots/cache/')
pdpChart = matplot.pdpCharts9(clf, df_test, importanct_feat, model_obj.features, n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path="./mvp/plots/cache/")
document.add_picture(pdpChart,width=Inches(8))
# 新增liftchart段
document.add_paragraph('lift_chart')
# 遍历给定渠道 & 客群 默认等频画出liftchart
try:
lift_pred = datacal.cal_lift(df_test, score=prediction)
lift_online = datacal.cal_lift(df_test, score=score)
# liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']],
[roc_auc_score(df_test[target], df_test[prediction]),
roc_auc_score(df_test[target], df_test[score])], datalist_description=None,
title='全渠道全量客群测试集上的 lift Chart',
X_label=None, y_label=None,
tab_df_list=None, plot_tab=False,
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart, width=Inches(8))
# 遍历渠道
for channel in applied_from.keys():
# 遍历客群类型
for type in applied_type.keys():
print('lift ',type,channel)
# 数据切片
df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)]
#
lift_pred = datacal.cal_lift(df_sliced,score=prediction)
lift_online = datacal.cal_lift(df_sliced,score=score)
#liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=None,
title= applied_from[channel]+applied_type[type]+ ' lift Chart', X_label=None, y_label=None,
tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=True,
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart,width=Inches(8))
# 存在某些渠道量很少的情况,加入try catch异常处理
except Exception as e:
print('Exception: ',e)
pass
# docx 保存
filetool.saveDocument(document, report_path, report_name)
return 1
......
...@@ -22,9 +22,16 @@ def saveDocument(document,path,filename): ...@@ -22,9 +22,16 @@ def saveDocument(document,path,filename):
raise ValueError('{} is not a word file'.format(filename)) raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename)) return document.save(os.path.join(path,filename))
def insert_table(document, cols, values): def insert_table(document,df):
# cols 为列名 '''
# values 为值,list instructions : plot table which insert into docx
:param document: document obj
:param df: dataframe
:return:
'''
cols = df.columns
values = df.values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1') table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells hdr_cells = table.rows[0].cells
for i in range(len(cols)): for i in range(len(cols)):
...@@ -32,5 +39,8 @@ def insert_table(document, cols, values): ...@@ -32,5 +39,8 @@ def insert_table(document, cols, values):
for value in values: for value in values:
row_cells = table.add_row().cells row_cells = table.add_row().cells
for i in range(len(cols)): for i in range(len(cols)):
row_cells[i].text = str(value[i]) if type(value[i])==str:
return document row_cells[i].text = value[i]
\ No newline at end of file else:
row_cells[i].text = str(value[i])
return document
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment