Commit 9b10189a authored by 王家华's avatar 王家华

plot函数加表格出异常暂时没法全部解决,调用了model tools的方法画图

parent bd18c3b0
...@@ -2,9 +2,12 @@ ...@@ -2,9 +2,12 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment=""> <list default="true" id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/graph/matplot.py" beforeDir="false" afterPath="$PROJECT_DIR$/graph/matplot.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/graph/matplot.py" beforeDir="false" afterPath="$PROJECT_DIR$/graph/matplot.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_kit/general_methods.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_kit/general_methods.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_obj/dhb_obj.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_obj/dhb_obj.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/mvp/allocator.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/allocator.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/mvp/refit.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/refit.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/mvp/refit.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/refit.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/tools/filetool.py" beforeDir="false" afterPath="$PROJECT_DIR$/tools/filetool.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
...@@ -15,53 +18,46 @@ ...@@ -15,53 +18,46 @@
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/mvp/allocator.py"> <entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state> <state relative-caret-position="306">
<caret line="51" column="30" lean-forward="true" selection-start-line="51" selection-start-column="30" selection-end-line="51" selection-end-column="30" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#31#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="true"> <file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/mvp/refit.py"> <entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="222"> <state relative-caret-position="119">
<caret line="70" column="19" selection-start-line="70" selection-start-column="14" selection-end-line="70" selection-end-column="19" /> <caret line="58" column="22" lean-forward="true" selection-start-line="58" selection-start-column="22" selection-end-line="58" selection-end-column="22" />
<folding> <folding>
<element signature="e#0#30#0" expanded="true" /> <element signature="e#4120#4150#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py"> <entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-291">
<caret line="639" column="21" lean-forward="true" selection-start-line="639" selection-start-column="21" selection-end-line="639" selection-end-column="21" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102"> <state relative-caret-position="204">
<caret line="171" lean-forward="true" selection-start-line="171" selection-end-line="171" />
<folding> <folding>
<element signature="e#0#9#0" expanded="true" /> <element signature="e#0#30#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/datacal.py"> <entry file="file://$PROJECT_DIR$/mvp/lgbreport.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1139"> <state relative-caret-position="187">
<caret line="67" column="36" lean-forward="true" selection-start-line="67" selection-start-column="36" selection-end-line="67" selection-end-column="36" /> <caret line="11" lean-forward="true" selection-start-line="11" selection-end-line="11" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
...@@ -70,22 +66,22 @@ ...@@ -70,22 +66,22 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/datasource/mongodb.py"> <entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="493"> <state relative-caret-position="473">
<caret line="29" selection-start-line="29" selection-end-line="29" /> <caret line="46" lean-forward="true" selection-start-line="46" selection-end-line="46" />
<folding> <folding>
<element signature="e#0#14#0" expanded="true" /> <element signature="e#0#9#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py"> <entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952"> <state relative-caret-position="595">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" /> <caret line="35" column="36" lean-forward="true" selection-start-line="35" selection-start-column="36" selection-end-line="35" selection-end-column="36" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
...@@ -94,10 +90,13 @@ ...@@ -94,10 +90,13 @@
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/graph/matplot.py"> <entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442"> <state relative-caret-position="34">
<caret line="29" column="21" lean-forward="true" selection-start-line="29" selection-start-column="21" selection-end-line="29" selection-end-column="21" /> <caret line="212" column="33" selection-start-line="212" selection-start-column="8" selection-end-line="212" selection-end-column="33" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
...@@ -105,22 +104,17 @@ ...@@ -105,22 +104,17 @@
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py"> <entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="214"> <state relative-caret-position="-34">
<caret line="167" column="10" lean-forward="true" selection-start-line="167" selection-start-column="10" selection-end-line="167" selection-end-column="52" /> <caret line="1" selection-start-line="1" selection-end-line="1" selection-end-column="41" />
<folding>
<element signature="e#0#22#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
</file> </file>
<file pinned="false" current-in-tab="false"> <file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/xgboost.py"> <entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1173"> <state relative-caret-position="152">
<folding> <caret line="377" lean-forward="true" selection-start-line="377" selection-end-line="377" />
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
...@@ -133,11 +127,14 @@ ...@@ -133,11 +127,14 @@
<component name="IdeDocumentHistory"> <component name="IdeDocumentHistory">
<option name="CHANGED_PATHS"> <option name="CHANGED_PATHS">
<list> <list>
<option value="$PROJECT_DIR$/models_kit/general_methods.py" />
<option value="$PROJECT_DIR$/models_kit/lightgbm.py" /> <option value="$PROJECT_DIR$/models_kit/lightgbm.py" />
<option value="$PROJECT_DIR$/models_obj/dhb_obj.py" />
<option value="$PROJECT_DIR$/models_kit/general_methods.py" />
<option value="$PROJECT_DIR$/tools/filetool.py" />
<option value="$PROJECT_DIR$/tools/datacal.py" /> <option value="$PROJECT_DIR$/tools/datacal.py" />
<option value="$PROJECT_DIR$/mvp/refit.py" /> <option value="$PROJECT_DIR$/mvp/refit.py" />
<option value="$PROJECT_DIR$/graph/matplot.py" /> <option value="$PROJECT_DIR$/graph/matplot.py" />
<option value="$PROJECT_DIR$/mvp/allocator.py" />
</list> </list>
</option> </option>
</component> </component>
...@@ -152,8 +149,8 @@ ...@@ -152,8 +149,8 @@
</option> </option>
</component> </component>
<component name="ProjectFrameBounds" extendedState="6"> <component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="174" /> <option name="x" value="261" />
<option name="y" value="167" /> <option name="y" value="251" />
<option name="width" value="1400" /> <option name="width" value="1400" />
<option name="height" value="831" /> <option name="height" value="831" />
</component> </component>
...@@ -170,16 +167,6 @@ ...@@ -170,16 +167,6 @@
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="datasource" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="feature" type="462c0819:PsiDirectoryNode" />
</path>
<path> <path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
...@@ -190,11 +177,6 @@ ...@@ -190,11 +177,6 @@
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_kit" type="462c0819:PsiDirectoryNode" /> <item name="models_kit" type="462c0819:PsiDirectoryNode" />
</path> </path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_obj" type="462c0819:PsiDirectoryNode" />
</path>
<path> <path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" /> <item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" /> <item name="model_mvp" type="462c0819:PsiDirectoryNode" />
...@@ -245,7 +227,28 @@ ...@@ -245,7 +227,28 @@
</list> </list>
</option> </option>
</component> </component>
<component name="RunManager" selected="Python.refit"> <component name="RunManager" selected="Python.allocator">
<configuration name="allocator" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/mvp" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/mvp/allocator.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="lightgbm" type="PythonConfigurationType" factoryName="Python" temporary="true"> <configuration name="lightgbm" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" /> <module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" /> <option name="INTERPRETER_OPTIONS" value="" />
...@@ -290,6 +293,7 @@ ...@@ -290,6 +293,7 @@
</configuration> </configuration>
<recent_temporary> <recent_temporary>
<list> <list>
<item itemvalue="Python.allocator" />
<item itemvalue="Python.refit" /> <item itemvalue="Python.refit" />
<item itemvalue="Python.lightgbm" /> <item itemvalue="Python.lightgbm" />
</list> </list>
...@@ -312,12 +316,12 @@ ...@@ -312,12 +316,12 @@
<frame x="-8" y="-8" width="1936" height="1066" extended-state="6" /> <frame x="-8" y="-8" width="1936" height="1066" extended-state="6" />
<editor active="true" /> <editor active="true" />
<layout> <layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.28556374" /> <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.17860906" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" /> <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" /> <window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" /> <window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" /> <window_info anchor="bottom" id="Find" order="1" />
<window_info active="true" anchor="bottom" id="Run" order="2" visible="true" weight="0.3290461" /> <window_info anchor="bottom" id="Run" order="2" weight="0.3290461" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.39978564" /> <window_info anchor="bottom" id="Debug" order="3" weight="0.39978564" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" /> <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
...@@ -325,7 +329,7 @@ ...@@ -325,7 +329,7 @@
<window_info anchor="bottom" id="Version Control" order="7" /> <window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.3290461" /> <window_info anchor="bottom" id="Terminal" order="8" weight="0.3290461" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" weight="0.31511253" /> <window_info active="true" anchor="bottom" id="Python Console" order="10" visible="true" weight="0.46623793" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" /> <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" /> <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
...@@ -339,22 +343,6 @@ ...@@ -339,22 +343,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="17" selection-start-line="17" selection-end-line="17" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py"> <entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289"> <state relative-caret-position="289">
...@@ -372,15 +360,6 @@ ...@@ -372,15 +360,6 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102">
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/pandas/core/generic.py"> <entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/pandas/core/generic.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="373"> <state relative-caret-position="373">
...@@ -397,60 +376,115 @@ ...@@ -397,60 +376,115 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/tools/datacal.py"> <entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2140">
<caret line="662" column="38" lean-forward="true" selection-start-line="662" selection-start-column="38" selection-end-line="662" selection-end-column="38" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1139"> <state relative-caret-position="34">
<caret line="67" column="36" lean-forward="true" selection-start-line="67" selection-start-column="36" selection-end-line="67" selection-end-column="36" /> <caret line="212" column="33" selection-start-line="212" selection-start-column="8" selection-end-line="212" selection-end-column="33" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/table.py"> <entry file="file://$PROJECT_DIR$/mvp/xgbreport.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-291"> <state relative-caret-position="170">
<caret line="639" column="21" lean-forward="true" selection-start-line="639" selection-start-column="21" selection-end-line="639" selection-end-column="21" /> <caret line="10" column="4" selection-start-line="10" selection-start-column="4" selection-end-line="10" selection-end-column="4" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py"> <entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="214"> <state relative-caret-position="473">
<caret line="167" column="10" lean-forward="true" selection-start-line="167" selection-start-column="10" selection-end-line="167" selection-end-column="52" /> <caret line="46" lean-forward="true" selection-start-line="46" selection-end-line="46" />
<folding> <folding>
<element signature="e#0#22#0" expanded="true" /> <element signature="e#0#9#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py"> <entry file="file://$PROJECT_DIR$/README.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor />
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952"> <state relative-caret-position="595">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" /> <caret line="35" column="36" lean-forward="true" selection-start-line="35" selection-start-column="36" selection-end-line="35" selection-end-column="36" />
<folding> <folding>
<element signature="e#0#19#0" expanded="true" /> <element signature="e#0#19#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="306">
<caret line="51" column="30" lean-forward="true" selection-start-line="51" selection-start-column="30" selection-end-line="51" selection-end-column="30" />
<folding>
<element signature="e#0#31#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-34">
<caret line="1" selection-start-line="1" selection-end-line="1" selection-end-column="41" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/graph/matplot.py"> <entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442"> <state relative-caret-position="152">
<caret line="29" column="21" lean-forward="true" selection-start-line="29" selection-start-column="21" selection-end-line="29" selection-end-column="21" /> <caret line="377" lean-forward="true" selection-start-line="377" selection-end-line="377" />
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/mvp/refit.py"> <entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="222"> <state relative-caret-position="204">
<caret line="70" column="19" selection-start-line="70" selection-start-column="14" selection-end-line="70" selection-end-column="19" /> <caret line="171" lean-forward="true" selection-start-line="171" selection-end-line="171" />
<folding> <folding>
<element signature="e#0#30#0" expanded="true" /> <element signature="e#0#30#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/mvp/lgbreport.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="187">
<caret line="11" lean-forward="true" selection-start-line="11" selection-end-line="11" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="119">
<caret line="58" column="22" lean-forward="true" selection-start-line="58" selection-start-column="22" selection-end-line="58" selection-end-column="22" />
<folding>
<element signature="e#4120#4150#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component> </component>
<component name="masterDetails"> <component name="masterDetails">
<states> <states>
......
...@@ -48,12 +48,11 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled', ...@@ -48,12 +48,11 @@ def plot_table_list(datalist, auc, datalist_description=None, title='untitled',
''' '''
fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1) fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1)
# datalist description # datalist description
if datalist_description is None: if datalist_description is None:
datalist_description = range(len(datalist)) datalist_description = range(len(datalist))
for table_index in range(len(datalist)): for table_index in range(len(datalist)):
# 每个table需要只有一个index,一个values # 每个table需要只有一个index,一个values
x = range(len(datalist[table_index].index)) x = range(len(datalist[table_index].index))
...@@ -302,3 +301,648 @@ def density_chart(dataset, title): ...@@ -302,3 +301,648 @@ def density_chart(dataset, title):
## plt.subplots(,figsize=fig_size,linewidth=0.1) ## plt.subplots(,figsize=fig_size,linewidth=0.1)
# #
# return 1 # return 1
##############################################################
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 17:45:08 2018
@author: olivia_deyu
"""
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pylab import rcParams
from sklearn import metrics
from xgboost import XGBClassifier
from xgboost import cv, DMatrix
warnings.filterwarnings('ignore')
# Fit Model ------------------------------------------------------------------------------------------
def train_model(df, predictors, resp, params, idcol, useTrainCV=True, trvalsplit='random', trvalsplitRatio=0.8,
                sort_col='applied_at'):
    """Split `df` into train/validation sets, fit an XGBClassifier, and return it.

    Parameters
    ----------
    df : pd.DataFrame
        Modelling sample; must contain `predictors`, `resp`, `idcol` and `sort_col`.
    predictors : list of str, feature column names
    resp : str, binary response column (0/1 — assumed, since its mean is reported as an event rate)
    params : dict, keyword arguments forwarded to XGBClassifier
    idcol : str, unique row identifier used to derive the validation set by exclusion
    useTrainCV : bool, passed through to `modelfit` to tune n_estimators via xgb.cv
    trvalsplit : {'random', 'timeSeries'} or anything else
        'random'     -> random `trvalsplitRatio` fraction for train (fixed random_state=1);
        'timeSeries' -> earliest rows by `sort_col` for train;
        any other value -> train on the full sample with no validation set.
    trvalsplitRatio : float, train fraction
    sort_col : str, timestamp column used for the chronological split and range reporting

    Returns
    -------
    (model, fts_imp) : fitted estimator and its non-zero feature-importance Series,
        as produced by `modelfit`.
    """
    print('Train/Val evnetRate over all: %s' % resp, df[resp].mean())
    if trvalsplit not in ('random', 'timeSeries'):
        # Unrecognised strategy: use everything for training, no validation set.
        train, val = df, None
        # print ('Specify methods of train/val split !')
        print('---------- train, no val -------------')
        print('eventRate on train: ', train[resp].mean(), '; sampleSize on train: ', train.shape, train[sort_col].min(),
              train[sort_col].max())
    else:
        if trvalsplit == 'random':
            # Random split (deterministic via fixed seed).
            train = df.sample(frac=trvalsplitRatio, random_state=1)
        else:
            # Chronological split: earliest rows become the training set.
            train = df.sort_values(by=sort_col).head(int(len(df) * trvalsplitRatio))
        # Validation set = rows whose id is not in the training set.
        val = df[~df[idcol].isin(train[idcol])]
        print('---------- train/val -------------')
        print('eventRate on train: ', train[resp].mean(), '; sampleSize on train: ', train.shape, train[sort_col].min(),
              train[sort_col].max())
        print('eventRate on val: ', val[resp].mean(), '; sampleSize on val: ', val.shape, val[sort_col].min(),
              val[sort_col].max())
    clf = XGBClassifier(**params)
    model, fts_imp = modelfit(clf, train, val, predictors, resp, useTrainCV=useTrainCV)
    return model, fts_imp
def modelfit(alg, dtrain, dval, predictors, resp, useTrainCV=True, cv_folds=10, early_stopping_rounds=20):
    """Fit `alg` on `dtrain`, report train/validation AUC, and return the model
    plus its feature importances.

    Parameters
    ----------
    alg : XGBClassifier instance (mutated in place: n_estimators may be reset by CV)
    dtrain : pd.DataFrame, training sample containing `predictors` and `resp`
    dval : pd.DataFrame or None, optional validation sample; skipped when None
    predictors : list of str, feature column names
    resp : str, binary response column
    useTrainCV : bool, if True run xgb.cv first to pick n_estimators via early stopping
    cv_folds : int, number of CV folds
    early_stopping_rounds : int, early-stopping patience for the CV run

    Returns
    -------
    (alg, feat_imp) : the fitted estimator and a pd.Series of importances
        (fscore indexed by predictor name), descending, zero-importance features dropped.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = DMatrix(dtrain[predictors].values, label=dtrain[resp].values)
        # Cross-validate with early stopping; the number of rows in cvresult is the
        # boosting-round count actually reached, which becomes the new n_estimators.
        cvresult = cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                      metrics=(['auc']), early_stopping_rounds=early_stopping_rounds, verbose_eval=100)  # True, )
        alg.set_params(n_estimators=cvresult.shape[0])
        print(cvresult, cvresult.shape)
    # Fit the algorithm on the data and save the model
    alg.fit(dtrain[predictors], dtrain[resp], eval_metric='auc')
    print('Model params: -----------')
    print(alg.n_estimators, alg.max_depth, alg.learning_rate)
    # joblib.dump(alg, '%s.pkl' %pklname)
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print Model Report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[resp].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[resp], dtrain_predprob))
    if dval is not None:
        # Predict validation Set:
        dval_predprob = alg.predict_proba(dval[predictors])[:, 1]
        print("AUC Score (Validation): %f" % metrics.roc_auc_score(dval[resp], dval_predprob))
    # Print Feature Importance:
    # get_fscore() keys are feature names (fit received a DataFrame), reindexed over
    # all predictors so unused features appear as NaN before being filtered out.
    feat_imp = pd.Series(alg.get_booster().get_fscore(), predictors).sort_values(ascending=False, na_position='last')
    # feat_imp = pd.Series(alg.booster().get_fscore(), predictors).sort(ascending=False)
    feat_imp = feat_imp[feat_imp > 0]
    print('----------- Feature importance -------------')
    print(feat_imp)
    return alg, feat_imp
# Univariate Chart ------------------------------------------------------------------------------------------
# Module-level side effect: sets the default matplotlib figure size (width, height in inches)
# for every chart drawn below.
rcParams['figure.figsize'] = 12, 8
def uniVarChart(df, feature, resp, n_bins=10, dfltValue=-99999, dftrain=False, dftest=False, drawAll=True,
                drawTrTe=False,saved_path='./mvp/plots/cache/'):
    """
    Draw a univariate chart (event rate per feature bin) for one feature on the
    full sample and, optionally, on train/test samples, then save it as a PNG.

    Parameters
    ----------
    df : pd.DataFrame
        Full sample; must contain `feature` and `resp`.
        NOTE: helper columns 'tmp' / 'tmp_lbl' are added as a side effect in
        the numeric-binning branch (same for dftrain/dftest when drawTrTe).
    feature : str
        Feature column to draw.
    resp : str
        Response column containing only 0/1 values.
    n_bins : int, default 10
        Only applies to numeric features with more distinct values than n_bins;
        otherwise one point per distinct value is drawn.
    dfltValue : numeric, default -99999
        Sentinel/default feature value; excluded from binning and drawn as a
        separate horizontal reference line.
    dftrain : pd.DataFrame or False
        Train sample (used only when drawTrTe is True).
    dftest : pd.DataFrame or False
        Test sample (used only when drawTrTe is True).
    drawAll : bool
        If True, draw the curve computed on the full sample.
    drawTrTe : bool
        If True, also draw curves computed on train and test samples.

    Returns
    -------
    str
        Path of the saved PNG file.
    """
    # Rows holding a real (non-sentinel) feature value.
    idx = (df[feature] != dfltValue)
    if n_bins > df[feature].nunique():
        # Few distinct values: treat the feature as categorical, one point per value.
        predictions, predictionsTr, predictionsTe = [], [], []
        qq, qqTr, qqTe = [], [], []
        n_bins = df[feature].nunique()
        feature_grid = sorted(df.loc[idx, feature].unique().tolist())
        for feature_val in feature_grid:
            predictions.append(df.loc[df[feature] == feature_val, resp].mean())
            qq.append(df.loc[df[feature] == feature_val, resp].count())
        if drawTrTe:
            for feature_val in feature_grid:
                predictionsTr.append(dftrain.loc[dftrain[feature] == feature_val, resp].mean())
                predictionsTe.append(dftest.loc[dftest[feature] == feature_val, resp].mean())
                qqTr.append(dftrain.loc[dftrain[feature] == feature_val, resp].count())
                qqTe.append(dftest.loc[dftest[feature] == feature_val, resp].count())
            predictionsTr = np.round(predictionsTr, 3)
            predictionsTe = np.round(predictionsTe, 3)
        else:
            pass
        fig1 = plt.figure(11)
        xindex = list(range(1, len(feature_grid) + 1))
        if drawAll:
            plt.plot(xindex, predictions, 'bo-', label='%s' % 'all')
            # NOTE(review): this text reports counts of the FULL sample (qq)
            # although the label says "training Sample" -- confirm intent.
            plt.gcf().text(0.6, 0.60, 'training Sample: %s' % qq, fontsize=9)
        else:
            pass
        if drawTrTe:
            plt.plot(xindex, predictionsTr, 'co-', label='%s' % 'train')
            plt.plot(xindex, predictionsTe, 'mo-', label='%s' % 'test')
            plt.gcf().text(0.6, 0.55, 'Validation Data Sample: %s' % qqTr, fontsize=9)
            plt.gcf().text(0.6, 0.50, 'Validation Data eventR: %s' % predictionsTr, fontsize=9)
            plt.gcf().text(0.6, 0.45, 'Test Data Sample: %s' % qqTe, fontsize=9)
            plt.gcf().text(0.6, 0.40, 'Test Data eventR: %s' % predictionsTe, fontsize=9)
        else:
            pass
        # Reference lines: overall event rate and event rate at the sentinel value.
        plt.axhline(y=df[resp].mean(), color='k', linestyle='-.', label='eventR_all')
        plt.axhline(y=df.loc[df[feature] == dfltValue, resp].mean(), color='r', linestyle='--', label='dflVal_eventR')
        plt.gcf().text(0.6, 0.7, 'Categorical value:', fontsize=9)
        plt.gcf().text(0.6, 0.65, 'feature grid: %s' % [str(int(x)) for x in feature_grid], fontsize=9)
        plt.subplots_adjust(right=0.59)
    else:
        # Numeric feature: bin by the deciles of the non-sentinel values.
        feature_grid = sorted(
            list(set(df.loc[idx, feature].describe(percentiles=[.1, .2, .3, .4, .5, .6, .7, .8, .9])[3:].values)))
        feature_grid[-1] = feature_grid[-1] + 1  # widen the last edge so the max value is included
        df['tmp'] = 99999
        _tmp = pd.cut(df.loc[idx, feature], feature_grid, include_lowest=True)
        df.loc[idx, 'tmp'] = _tmp
        df.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
        tt = df[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
        tt.rename(columns={'mean': 'allEvntR', 'count': 'allSpl', 'sum': 'allEvnt'}, inplace=True)
        if drawTrTe:
            # Train sample
            dftrain['tmp'] = 99999
            _tmp = pd.cut(dftrain.loc[idx, feature], feature_grid, include_lowest=True)
            dftrain.loc[idx, 'tmp'] = _tmp
            dftrain.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
            ttr = dftrain[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
            ttr.rename(columns={'mean': 'trEvntR', 'count': 'trSpl', 'sum': 'trEvnt'}, inplace=True)
            # Test sample
            dftest['tmp'] = 99999
            _tmp = pd.cut(dftest.loc[idx, feature], feature_grid, include_lowest=True)
            dftest.loc[idx, 'tmp'] = _tmp
            dftest.loc[idx, 'tmp_lbl'] = _tmp.cat.codes
            tte = dftest[idx].groupby(['tmp', 'tmp_lbl'])[resp].agg({'mean', 'count', 'sum'})
            tte.rename(columns={'mean': 'teEvntR', 'count': 'teSpl', 'sum': 'teEvnt'}, inplace=True)
            _aa = pd.concat([tt, ttr, tte], axis=1)
        else:
            _aa = tt
        # BUGFIX: DataFrame.sortlevel() was deprecated and removed from pandas;
        # sort_index(level=...) is the supported equivalent.
        _aa = _aa.sort_index(level=1)
        if len(feature_grid) != len(_aa['allEvntR']) + 1:
            strss = '\n有的分段内没有数据!!!-----------------------------------'
        else:
            strss = '\n'
        print(strss)
        fig1 = plt.figure(11)
        xindex = list(_aa.index.get_level_values('tmp_lbl'))
        if drawAll:
            plt.plot(xindex, _aa['allEvntR'], 'bo-', label='%s' % 'all')
        else:
            pass
        if drawTrTe:
            plt.plot(xindex, _aa['trEvntR'], 'co-', label='%s' % 'train')
            plt.plot(xindex, _aa['teEvntR'], 'mo-', label='%s' % 'test')
        else:
            pass
        plt.axhline(y=df[resp].mean(), color='k', linestyle='-.', label='eventR_all')
        plt.axhline(y=df.loc[df[feature] == dfltValue, resp].mean(), color='r', linestyle='--', label='dflVal_eventR')
        plt.gcf().text(0.6, 0.7, '%s' % strss, fontsize=10)
        plt.gcf().text(0.6, 0.3, '%s' % _aa, fontsize=10)
        plt.subplots_adjust(right=0.59)
    # (a redundant second subplots_adjust call was removed; both branches
    # above already apply it)
    plt.title('Univariate Chart of %s' % feature)
    plt.ylabel('evnet Rate')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
    plt.savefig(saved_path + 'Univariate Chart of %s' % feature + ".png")
    plt.show()
    return (saved_path + 'Univariate Chart of %s' % feature + ".png")
# PDP_chart --------------------------------------------------------------------------------------------------
def pdpChart(model, df, var, predictors, n_bins, dfltValue, maxVal, saved_path="./mvp/plots/cache/"):
    """
    Draw a PDP (partial-dependence) chart for one feature on the current axes.

    The feature is swept over a grid of values; for each grid value the whole
    (sub)sample is scored with the feature forced to that value, and the mean
    score is plotted.

    Parameters
    ----------
    model : trained model
        predict_proba is tried first; predict is used as a fallback.
    df : pd.DataFrame
        Contains all features used in the model.
    var : str
        Feature to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int
        Number of grid points; only works with numeric data.
    dfltValue : numeric
        Sentinel/default value for this feature (drawn as a reference line).
    maxVal : boolean or numeric
        Designed max value for this feature; when falsy the observed max is used.
    saved_path : str
        NOTE(review): currently unused -- this function draws but never saves;
        confirm whether a savefig call was intended here.

    Returns
    -------
    None (draws on the current matplotlib axes)
    """
    # Rows holding a real (non-sentinel) feature value.
    idx = (df[var] != dfltValue)
    if n_bins > df[var].nunique():
        # Few distinct values: use each distinct value as its own grid point.
        n_bins = df[var].nunique()
        feature_grid = [dfltValue] + sorted(df.loc[idx, var].unique().tolist())
    else:
        feature_grid = range(n_bins)
        if maxVal:
            # Evenly spaced grid from the observed min up to the designed max.
            feature_grid = [dfltValue] + [df.loc[idx, var].min() + val * (maxVal - df.loc[idx, var].min()) / n_bins for
                                          val in feature_grid]
        else:
            # Evenly spaced grid from the observed min up to the observed max.
            feature_grid = [dfltValue] + [
                df.loc[idx, var].min() + val * (df.loc[idx, var].max() - df.loc[idx, var].min()) / n_bins for val in
                feature_grid]
    # print (var, feature_grid)
    # Cap the scoring sample at 10k rows for speed (deterministic seed).
    if df.shape[0] > 10000:
        x_small = df.sample(n=10000, random_state=77)
    else:
        x_small = df
    predictions = []
    for feature_val in feature_grid:
        x_copy = x_small.copy()
        x_copy[var] = feature_val
        try:
            predictions.append(model.predict_proba(x_copy[predictors])[:, 1].mean())
        except Exception:
            # Regressors / models without predict_proba.
            predictions.append(model.predict(x_copy[predictors]).mean())
    # Index 0 of the grid is the sentinel value; it is drawn as a line, not a point.
    xindex = feature_grid[1:]
    plt.plot(xindex, predictions[1:], 'bo-', label='%s' % var)
    try:
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--', label='scoreAvg')
    except Exception:
        plt.axhline(y=model.predict(x_small[predictors]).mean(), color='k', linestyle='--',
                    label='scoreAvg')
    plt.axhline(y=predictions[0], color='r', linestyle='--', label='dfltValue')
    plt.title('pdp Chart of %s' % var)
    plt.ylabel('Score')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
def pdpCharts9(model, df, collist, predictors, n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path="./mvp/plots/cache/"):
    """
    Draw PDP charts for several features, 9 subplots per figure, and save a PNG.

    Parameters
    ----------
    model : trained model
    df : pd.DataFrame
        Contains all features used in the model.
    collist : list of str
        Features to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int, default 10
        Grid points per feature (numeric data only).
    dfltValue : numeric, default -99999
        Sentinel/default value for the features.
    maxValRatio : numeric, default 1
        Quantile used as each feature's max grid value.
    saved_path : str
        Directory the PNG is written to.

    Returns
    -------
    str
        Path of the saved PNG.
        NOTE(review): savefig runs once after the loop, so when more than one
        figure is created only the last one is saved, and the filename embeds
        the figure COUNT (cntPlt), not a figure index -- confirm intent.
    """
    lenth = len(collist)
    # One figure per 9 features.
    cntPlt = int(np.ceil(lenth / 9))
    figlist = []
    for i in list(range(1, cntPlt + 1)):
        fig = plt.figure(i)
        figlist.append(fig)
        j = 1
        for col in collist[(i - 1) * 9:i * 9]:
            plt.subplot(3, 3, j)
            pdpChart(model, df, col, predictors, n_bins, dfltValue=dfltValue, maxVal=df[col].quantile(maxValRatio))
            j += 1
        plt.tight_layout()
    #plt.show()
    plt.savefig(saved_path + 'pdp Chart with 9 {}'.format(str(cntPlt)) + ".png")
    plt.show()
    return (saved_path + 'pdp Chart with 9 {}'.format(str(cntPlt)) + ".png")
def pdpChart_new(model, df, var, predictors, n_bins, dfltValue, maxValRatio=1):
    """
    Draw a PDP chart for one feature using a quantile-based grid.

    Unlike pdpChart, the grid points are quantiles of the non-sentinel values,
    and the x axis is the quantile position (ticks show the actual values).

    Parameters
    ----------
    model : trained model exposing predict_proba
    df : pd.DataFrame
        Contains all features used in the model.
    var : str
        Feature to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int
        Number of quantile bins; only works with numeric data.
    dfltValue : numeric
        Sentinel/default value; values <= dfltValue are treated as defaults.
    maxValRatio : numeric, default 1
        Quantile used as the feature's max value.

    Returns
    -------
    None (draws on the current matplotlib axes)
    """
    maxVal = df[var][df[var] > dfltValue].quantile(maxValRatio)
    # feature_grid
    idx = ((df[var] > dfltValue) & (df[var] <= maxVal))
    # Give the sentinel value its own grid point only if it actually occurs.
    if sum((df[var] <= dfltValue)) > 0:
        feature_grid = [dfltValue]
    else:
        feature_grid = []
    bin_index = []
    for i in range(0, n_bins + 1):
        bin_index.append(i * 1.0 * maxValRatio / n_bins)
    # Grid = quantiles of the real values (+ the sentinel, when present).
    feature_grid = sorted(list(df.loc[idx, var].quantile(bin_index)) + feature_grid)
    print(var, len(df.loc[idx, var]), feature_grid)
    # Cap the scoring sample at 10k rows for speed (deterministic seed).
    if df.shape[0] > 10000:
        x_small = df.sample(n=10000, random_state=77)
    else:
        x_small = df
    # Mean score with the feature forced to each grid value.
    predictions = []
    for feature_val in feature_grid:
        x_copy = x_small.copy()
        x_copy[var] = feature_val
        predictions.append(model.predict_proba(x_copy[predictors])[:, 1].mean())
    # Plot: when the sentinel is part of the grid it is skipped on the curve
    # and drawn as the 'dfltValue' reference line instead.
    if feature_grid[0] != dfltValue:
        xindex = feature_grid[:]
        plt.plot(bin_index, predictions[:], 'bo-', label='%s' % var)
        plt.xticks(bin_index, ['%.2f' % i for i in feature_grid])
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--',
                    label='scoreAvg')
    else:
        xindex = feature_grid[1:]
        plt.plot(bin_index, predictions[1:], 'bo-', label='%s' % var)
        plt.xticks(bin_index, ['%.2f' % i for i in feature_grid[1:]])
        plt.axhline(y=model.predict_proba(x_small[predictors])[:, 1].mean(), color='k', linestyle='--',
                    label='scoreAvg')
    plt.axhline(y=predictions[0], color='r', linestyle='--', label='dfltValue')
    plt.title('pdp Chart of %s' % var)
    plt.ylabel('Score')
    plt.legend(fontsize=10, loc=4, framealpha=0.5)
    plt.grid()
def pdpCharts9_new(model, df, collist, predictors, n_bins=10, dfltValue=-99999, maxValRatio=1):
    """
    Draw quantile-based PDP charts for several features, 9 subplots per figure.

    Parameters
    ----------
    model : trained model exposing predict_proba
    df : pd.DataFrame
        Contains all features used in the model.
    collist : list of str
        Features to draw.
    predictors : list of str
        All features used in the model.
    n_bins : int, default 10
        Number of quantile bins (numeric data only).
    dfltValue : numeric, default -99999
        Sentinel/default value for the features.
    maxValRatio : numeric, default 1
        Quantile used as each feature's max value.

    Returns
    -------
    list of matplotlib figures, at most 9 subplots each
    """
    lenth = len(collist)
    # One figure per 9 features.
    cntPlt = int(np.ceil(lenth / 9))
    figlist = []
    # BUGFIX: the loop previously ran over range(1, cntPlt + 2), which always
    # produced one extra, empty trailing figure (and is inconsistent with
    # pdpCharts9). ceil() already accounts for the remainder.
    for i in range(1, cntPlt + 1):
        fig = plt.figure(i)
        figlist.append(fig)
        j = 1
        for col in collist[(i - 1) * 9:min(i * 9, lenth)]:
            plt.subplot(3, 3, j)
            pdpChart_new(model, df, col, predictors, n_bins, dfltValue=dfltValue, maxValRatio=maxValRatio)
            j += 1
        plt.tight_layout()
    plt.show()
    return figlist
# liftChart ------------------------------------------------------------------------------------------
# Module-level default: wider 16x8 inch figures for the two-panel lift charts below.
rcParams['figure.figsize'] = 16, 8
def cal_rate(df, resp, lenth):
    """
    Summarize one score bucket as a single-row DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Rows belonging to one bucket; must contain `resp`.
    resp : str
        0/1 response column.
    lenth : int
        Total sample size. Kept for interface compatibility; only the
        commented-out 'rate' metric would use it.

    Returns
    -------
    pd.DataFrame
        One row with columns cntLoan / event / eventRate.
    """
    bucket_stats = {
        'cntLoan': len(df),
        'event': df[resp].sum(),
        # 'rate' : len(df)/lenth,
        'eventRate': df[resp].mean(),
    }
    return pd.DataFrame.from_dict(bucket_stats, orient='index').T
def show_result(df, var, resp, n_bins, label=None):
    """
    Draw a lift chart (per-quantile event rate) and an accumulative lift chart
    for one score column, side by side on the current figure.

    Parameters
    ----------
    df : pd.DataFrame
        At least contains `var` and `resp`.
        NOTE: a helper column 'bkl_<var>' is added to df as a side effect.
    var : str
        Score column to draw.
    resp : str
        Response column containing only 0/1 values.
    n_bins : int
        Number of quantile buckets; duplicate quantile edges are dropped, so
        fewer buckets may actually appear than the title claims.
    label : str, optional
        Legend name for the score; defaults to `var`.

    Returns
    -------
    None (draws two subplots on the current figure)
    """
    # BUGFIX: identity comparison instead of '== None' (PEP 8; also safe for
    # objects overriding __eq__).
    if label is None:
        label = var
    df['bkl_%s' % var] = pd.qcut(df[var], n_bins, duplicates='drop')
    lenth = len(df)
    # Per-bucket counts/event rates, one row per bucket.
    r1 = df.groupby('bkl_%s' % var).apply(lambda x: cal_rate(x, resp, lenth)).reset_index(level=1, drop=True)
    # r1['accumRate'] = r1['rate'].cumsum()
    r1['acmLoan'] = r1['cntLoan'].cumsum()
    r1['acmEvent'] = r1['event'].cumsum()
    r1['acmEventRate'] = r1['acmEvent'] / r1['acmLoan']
    print(label)
    print(r1)
    # Compute the AUC once; the same legend text is used in both panels.
    auc_label = 'Auc of %s:%.3f' % (label, np.round(metrics.roc_auc_score(df[resp], df[var]), 3))
    # plot lift_chart - marginal
    plt.subplot(1, 2, 1)
    # xtickss = r1.index
    r1.reset_index(drop=True, inplace=True)
    r1.index = r1.index + 1  # number buckets from 1
    # r1.index = range(1, n_bins+1)
    plt.plot(r1.index, r1['eventRate'], marker='o', label=auc_label)  # linestyle='--'
    plt.title('EventRate in %d Quantiles' % n_bins)
    plt.ylabel('eventRate')
    plt.grid(True)
    # plt.xticks(r1.index, xtickss, rotation = 70)
    plt.legend(fontsize=13, loc=2, framealpha=0.5)
    # plot lift_chart - accumulative
    plt.subplot(1, 2, 2)
    plt.plot(r1.index, r1['acmEventRate'], marker='o', label=auc_label)  # linestyle='--'
    plt.title('Accum-EventRate in %d Quantiles' % n_bins)
    plt.ylabel('accumEventRate')
    # plt.xticks(r1.index, xtickss, rotation = 70)
    plt.grid(True)
    plt.legend(fontsize=13, loc=2, framealpha=0.5)
    plt.tight_layout()
# TDR_analysis ------------------------------------------------------------------------------------------
from collections import Counter
def tdr_rule(df, predictors, score, n_bins=10, dfltValue=-99999):
    '''
    Build turn-down rules on the full sample.

    For every predictor the sample is split into (roughly) equal-frequency
    bins and the mean model score per bin is recorded, so low-score bins can
    later be quoted as turn-down reasons (see tdr_result).

    Parameters
    ----------
    df : pd.DataFrame
        Full sample; not modified (a copy is used).
    predictors : list of str
        Feature names.
    score : str
        Model-score column.
    n_bins : int, default 10
        Number of equal-frequency bins per feature.
    dfltValue : numeric, default -99999
        Sentinel/default feature value; always used as the left-most bin edge.

    Returns
    -------
    dict_rule : dict
        {feature_name: {'lst': list of bin edges,
                        'mean': {bin position -> mean score in that bin},
                        'min': min of the per-bin mean scores}}
    '''
    data = df.copy()
    dict_rule = {}
    for col in predictors:
        temp_dict = {}
        # Equal-frequency edges: sort by the feature, then cut the positional index.
        data.sort_values(col, inplace=True)
        data.reset_index(drop=True, inplace=True)
        bins = pd.qcut(data.index, n_bins)
        # "max" as a string avoids pandas' builtin-callable deprecation; the
        # resulting column is still named 'max'.
        group = data.groupby(bins)[col].agg(["max"]).reset_index(level=[0])
        group["max"] = group["max"].apply(lambda x: round(x, 2))
        lst = sorted(list(set(group["max"])))
        # Ensure the sentinel value is the left-most edge exactly once.
        # (Previously this was a no-op reassignment when lst[0] == dfltValue.)
        if lst[0] != dfltValue:
            lst.insert(0, dfltValue)
        temp_dict["lst"] = lst
        # Mean score per feature bin.
        bins = pd.cut(data[col], lst)
        group = data.groupby(bins)[score].agg(["mean", "count"]).reset_index(level=[0])
        group["mean"] = group["mean"].apply(lambda x: np.round(x, 4))
        temp_dict["min"] = group["mean"].min()
        temp_dict["mean"] = group[["mean"]].to_dict()["mean"]
        dict_rule[col] = temp_dict
    return dict_rule
def tdr_result(df, predictors, idcol, score, dict_rule, dfltValue, topX=10):
    """
    List the turn-down reasons for each sample.

    For every row, each predictor's value is located in its rule bins (from
    tdr_rule) and the gap between that bin's mean score and the feature's best
    (minimum) bin mean is used as the feature's "contribution" to the
    turn-down. Features are ranked by that gap, descending.

    Parameters
    ----------
    df : pd.DataFrame
        Normally the turned-down samples.
    predictors : list of str
    idcol : str
        Name of the id column, e.g. loan_id.
    score : str
        Model-score column.
    dict_rule : dict
        Turn-down rules generated by tdr_rule on the full sample.
    dfltValue : numeric
        Default value for these predictors (kept for interface compatibility).
    topX : int, default 10
        How many top turn-down reasons to keep per sample.

    Returns
    -------
    dict_result : dict
        {id value: {score: score value,
                    'top<topX>Rsns': [(feature, contribution), ...]}}
    """
    dict_result = {}
    for _, sample in df.copy().iterrows():
        contrib = {}
        for feat in predictors:
            rule = dict_rule[feat]
            # Find the first bin edge the value falls under.
            for pos, edge in enumerate(rule["lst"]):
                if sample[feat] <= edge:
                    # Values at/below the very first edge count as the first bin.
                    bin_idx = max(pos, 1)
                    contrib[feat] = rule["mean"][bin_idx - 1] - rule["min"]
                    break
        ranked = sorted(contrib.items(), key=lambda kv: kv[1], reverse=True)
        entry = {score: sample[score], 'top%dRsns' % topX: ranked[:topX]}
        dict_result[sample[idcol]] = entry
    return dict_result
def tdr_analysis(df, predictors, idcol, score, dict_rule, dfltValue, topX=10):
    """
    For each turned-down sample take its ranked turn-down reasons (via
    tdr_result) and print, for rank 1/2/3 separately, the 3 features that most
    often occupy that rank together with their sample frequency.

    Parameters
    ----------
    df : pd.DataFrame, normally the turned-down samples
    predictors : list of str
    idcol : str, name of the id column, e.g. loan_id
    score : str, model-score column
    dict_rule : dict, turn-down rules built by tdr_rule on the full sample
    dfltValue : numeric, default value for these predictors
    topX : int, default 10, how many reasons tdr_result keeps per sample

    Returns
    -------
    dict_result : dict
        The per-sample result of tdr_result, unchanged.
        NOTE(review): assumes topX >= 3 and that every sample has at least 3
        ranked reasons; otherwise the rank-2/3 lookups raise IndexError.
    """
    dict_result = tdr_result(df, predictors, idcol, score, dict_rule, dfltValue, topX=topX)
    # print (dict_result)
    lenth = len(dict_result)
    top1var, top2var, top3var = [], [], []
    # Collect which feature occupies rank 1/2/3 for every sample.
    for i in dict_result.keys():
        top1var.append(dict_result[i]['top%dRsns' % topX][0][0])
        top2var.append(dict_result[i]['top%dRsns' % topX][1][0])
        top3var.append(dict_result[i]['top%dRsns' % topX][2][0])
    # Share of samples for the 3 most common features at each rank.
    top1Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top1var).most_common(3)]
    top2Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top2var).most_common(3)]
    top3Rsn = [(i, float(cnt) / float(lenth)) for (i, cnt) in Counter(top3var).most_common(3)]
    print('3 most-common candidates in top1Reason (variable, frequency): -------- \n', top1Rsn)
    print('3 most-common candidates in top2Reason (variable, frequency): -------- \n', top2Rsn)
    print('3 most-common candidates in top3Reason (variable, frequency): -------- \n', top3Rsn)
    return dict_result
\ No newline at end of file
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_auc_score
from models_kit import lightgbm
from models_kit import xgboost
def topN_feature_importance_plot(model, clf, title="untitled", save_path='./mvp/plots/', topN=20):
def topN_feature_importance(classifier, clf ,mode , topN=20):
''' '''
plot feature importance squence plot feature importance squence
params:
classifier
''' '''
plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226 # 图片像素 plt.rcParams['savefig.dpi'] = 226 # 图片像素
plt.rcParams['figure.dpi'] = 200 # 分辨率 plt.rcParams['figure.dpi'] = 200 # 分辨率
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
classifier.plot_importance(clf, max_num_features=topN) model.plot_importance(clf, max_num_features=topN)
plt.title("Feature Importances") plt.title("Feature Importances")
path = save_path + title + "_featureImportance.png"
plt.savefig(path)
plt.show() plt.show()
return path
def topN_feature_importance_list(features, clf, topN=3):
'''
instructions : return topN_feature_importance dataframe
:param features:
:param clf:
:param topN:
:return:
'''
importanct_feat = pd.DataFrame({
'column': features,
'importance': clf.feature_importance(),
}).sort_values(by='importance', ascending=False).column.tolist()[:3]
return importanct_feat
def model_selection(algorthm,clf,df_train,df_val,df_test,target,score,optimal_model,model_obj):
# model matrix 存储不同模型指标的矩阵
model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
model_matrix = pd.DataFrame(['NULL', 'NULL', roc_auc_score(df_train[target], df_train[score]),
roc_auc_score(df_train[target], df_train[score])], index=model_matrix_index,
columns=['线上模型'])
# 定义最优参指针
pointer = 0
# 遍历最优参组合
for param in optimal_para:
if algorthm == "lightGBM":
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
adds_on=param, target=target)
model_matrix = pd.concat([model_matrix,
pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index,
columns=[pointer])], axis=1)
pointer += 1
# 简单选取一下validation set auc 最高的 params
best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
\ No newline at end of file
...@@ -210,7 +210,7 @@ class dhb: ...@@ -210,7 +210,7 @@ class dhb:
and datediff(now(),deadline) > ''' + str(passdue_day) + ''' and datediff(now(),deadline) > ''' + str(passdue_day) + '''
''' '''
def dhb_features_extract(self,df): def dhb_features_prepocessing(self,dhb_loan):
try: try:
value_map = { value_map = {
"近3天": 1, "近3天": 1,
...@@ -229,12 +229,12 @@ class dhb: ...@@ -229,12 +229,12 @@ class dhb:
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period)) # print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data # use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period', # print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period)) # self.end_time_period))
dhb_loan = pd.read_sql( # dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period), # self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis) # mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[ "dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
...@@ -267,9 +267,9 @@ class dhb: ...@@ -267,9 +267,9 @@ class dhb:
dhb_loan.loc[ dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300 dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv") # dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime( # print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本") # time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# ignore exceptions such as "colmns doesn't exist" # ignore exceptions such as "colmns doesn't exist"
except Exception as e: except Exception as e:
print("data preprocessing ERR ",e) print("data preprocessing ERR ",e)
......
import pandas as pd '''
import numpy as np @allocator V1.0
'''
#################################################### report settings ###################################################
from models_obj import dhb_obj
import datetime import datetime
from tools import datacal import pandas as pd
import os
from mvp import refit from mvp import refit
from mvp import rebuild from tools import datacal
from models_obj import dhb_obj
###### global variable ######
# label
target = 'target'
#############################
dhb = dhb_obj.dhb()
df_sample = dhb.dhb_features_extract()
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ',dhb.start_time_period,'-',dhb.end_time_period)
print('----no.',len(features),'of samples of dhb----')
# to save model performance
if __name__ == '__main__':
# data extraction
''' ## Old Edition here
# if total sample more than 30000, it would use train-validation-test
# else use CV to parameters tuning
# if len(df_sample) >= 30000:
# df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
# 默认取样本方法
df_train, df_val, df_test = datacal.train_test_split_general()
# model refit
#xgboost
xgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
xgb_model_auc['training_auc'] = None
xgb_model_auc['val_auc'] = None
#xgbreport.report(df_train, df_test, df_val, features, target, '','dhb模型迭代报告.doc', kfold = 2)
## 待加入 : xgb 各dataset的 auc, KA 渠道 / 客群 的 auc
#ligthtgbm
lgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
lgb_model_auc['training_auc'] = None
lgb_model_auc['val_auc'] = None
#dftrain,dftest = datacal.split_train_val(df_sample,trainsplit = 'timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
#lgbreport.report(df_train, df_test, df_val, features, target,'','dhb模型迭代报告.doc', kfold = 2)
# merge as single dataframe full of models
#pd.DataFrame(xgb_model)
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
#== 模型名称
model_name='dhb'
#== 目标是15天
passdue_day=15
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# 记录样本信息
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
# 渠道列表
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
# 申请类型列表
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# workspace 路径
worksapce = 'E:\\bla\\model_mvp\\'
# 样本路径
sample_path = 'E:\\model\\model_mvp\\mvp\\sample.csv'
# N+标签
target = 'target'
#################################################### report settings ############################################################################# # 线上模型分字段
score = 'score'
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'} # 预测模型分字段
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'} prediction = 'predict'
# refit / rebuild sequence # 报告生成路径
report_path = worksapce
# 报告名称
report_name = "lgb_report.docx"
# 切换到workspace目录下 避免相对路径不能识别问题
os.chdir(worksapce)
#################################################### training settings #################################################
# 生成电话帮对象(使用默认参数) # 生成电话帮对象(使用默认参数)
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15) dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# 需要对特征进行调整时,在这里直接dhb.features = 赋值即可
# 提取样本 # 提取样本
df_sample = dhb.dhb_features_extract() #df_sample = dhb.dhb_features_extract()
# 这里直接使用csv读入样本
# 备份df_sample df_sample = pd.read_csv(sample_path,engine='python')
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# 电话帮数据处理 # 电话帮数据处理
# report sequence # 自定义方法 / 默认数据处理方法
df_sample = dhb.dhb_features_prepocessing(df_sample)
# 备份df_sample
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# 默认样本划分
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
random_state=7, split_methods='random',
time_label='applied_at')
del df_sample
# 模型refit
model_matrix, lgbm = refit.model_fit(df_sample, dhb, target, score)
print(model_matrix)
# 生成报告
status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target,
score, prediction, report_path, report_name, applied_from, applied_type, topN=3)
......
...@@ -7,137 +7,177 @@ import lightgbm as lgb ...@@ -7,137 +7,177 @@ import lightgbm as lgb
from graph import matplot from graph import matplot
from tools import filetool from tools import filetool
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
from models_kit import general_methods
from docx.shared import Inches
import math
import pandas as pd
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15) def model_fit(df_train, df_val, df_test, model_obj, target, score):
# 提取样本 '''
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv',engine='python')
target = 'target'
score = 'score'
prediction = 'predict'
############################ :param df_train: 训练集
# 备份df_sample :param df_val: 验证集
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx") :param df_test: 测试集
:param model_obj: 线上模型对象
:param target: 目标列标签(逾期率标签 1 and 0)
# 默认样本划分 :param score: 线上分字段
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target, :return:
random_state=7,split_methods='random', model_matrix - 不同模型的同一算法运行结果指标二维表
time_label='applied_at') lgbm - 验证集上选择的最优分类器
del df_sample
# 用交叉验证获取最优参optimal_para和对应参数在CV验证集上最优AUC列表topn '''
optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target=target,
topN=3, cv_fold=5) # 用交叉验证获取最优参optimal_para和对应参数在CV验证集上最优AUC列表topn
print('topn 通过train交叉验证得到的auc ',topn) optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, model_obj.features, df_train, df_val, target=target,
topN=3, cv_fold=5)
# model matrix
model_matrix_index = ['name','Params','trainAUC','validationAUC'] print('topn 通过train交叉验证得到的auc ',topn)
model_matrix = pd.DataFrame(['NULL','NULL',roc_auc_score(df_train[target],df_train[score]),roc_auc_score(df_train[target],df_train[score])],index=model_matrix_index,columns=['线上模型'])
# model matrix 存储不同模型指标的矩阵
pointer = 0 model_matrix_index = ['name','Params','trainAUC','validationAUC']
for param in optimal_para: model_matrix = pd.DataFrame(['NULL','NULL',roc_auc_score(df_train[target],df_train[score]),roc_auc_score(df_train[target],df_train[score])],index=model_matrix_index,columns=['线上模型'])
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
adds_on=param, target=target) # 定义最优参指针
model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])],axis=1) pointer = 0
pointer += 1 # 遍历最优参组合
for param in optimal_para:
# 简单选取一下validation set auc 最高的 params train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
best_params = model_matrix.T.sort_values(by='validationAUC',ascending=False).iloc[0,:].loc['Params'] adds_on=param, target=target)
model_matrix = pd.concat([model_matrix, pd.DataFrame(['lightGBM', param, train_auc, val_auc], index=model_matrix_index, columns=[pointer])],axis=1)
# 用新参数(optimal_para)训练模型,adds_on是需要修改的参数字典,输出feature Importance pointer += 1
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
adds_on=best_params, target='target') # 简单选取一下validation set auc 最高的 params
best_params = model_matrix.T.sort_values(by='validationAUC',ascending=False).iloc[0,:].loc['Params']
# 用新模型预测结果
predictions ,test_auc = lightgbm.predict(lgbm,df_test,dhb.features,target) # 用新参数(optimal_para)训练模型,adds_on是需要修改的参数字典,输出feature Importance
# 把新的预测结果加入test train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, model_obj.features,
df_test[prediction] = predictions adds_on=best_params, target='target')
return model_matrix, lgbm
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
####################################
### report ###################################### 生成报告 ################################################################
import os def model_report(clf, df_train, df_val, df_test, model_obj, target,model_matrix,
os.chdir("E:/bla/model_mvp/") score, prediction, report_path, report_name, applied_from, applied_type, topN=3):
'''
# plot feature importance
topnfeat_path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./mvp/plots/cache/', topN=20) :param clf: 模型分类器对象
:param df_train: 训练集
importanct_feat = pd.DataFrame({ :param df_val: validation set
'column': dhb.features, :param df_test: 测试集
'importance': lgbm.feature_importance(), :param model_obj: 线上模型对象
}).sort_values(by='importance',ascending=False).column.tolist()[:3] :param target: 目标列标签(逾期标签)
:param model_matrix: 模型对比二维表
# report file :param score: 线上模型分数字段
report_path = "E:/bla/model_mvp/" :param prediction: 模型预测分数字段
report_name = "lgb_report.docx" :param report_path: 报告生成路径
:param report_name: 报告生成名
# 生成docx Documents :param applied_from: 报告中包含的渠道字典
document = filetool.buildDocument(report_path, report_name) :param applied_type: 报告中包含的申请类型字典
:param topN: 前N个特征(重要性)
# docx加入title :return:
document.add_heading('lightGBM 算法refit报告') status : 返回1表示执行完成
'''
# docx新增 特征权重段
document.add_paragraph('特征权重图') # 用新模型预测结果 xgb还需要加一个proba (TODO here)
predictions ,test_auc = lightgbm.predict(clf,df_test,model_obj.features,target)
# docx加入特征权重图像
document.add_picture(topnfeat_path) # 把新的预测结果加入test
df_test[prediction] = predictions
# 新增 univar_chart段
document.add_paragraph('univar_chart') # plot feature importance
topnfeat_path = general_methods.topN_feature_importance_plot(lgb, clf, title="untitled", save_path='./mvp/plots/cache/', topN=20)
# 遍历目标features画出univarchart
for i in importanct_feat: # 获取前N个权重特征列表
univar_train = datacal.cal_univar(df_train, i, target, qcut=10) importanct_feat = general_methods.topN_feature_importance_list(model_obj.features, clf, topN=3)
univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
univar_test = datacal.cal_univar(df_test, i, target, qcut=10) # 生成docx Documents
tab_df_list = [univar_train,univar_val,univar_test] document = filetool.buildDocument(report_path, report_name)
univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
tab_df_list=tab_df_list, plot_tab=False, # docx加入title
saved_path='./mvp/plots/cache/') document.add_heading('lightGBM 算法refit报告')
document.add_picture(univarChart)
# docx新增 特征权重段
document.add_paragraph('PDP_chart') document.add_paragraph('特征权重图')
# 遍历目标features 画出对应PDP
for i in importanct_feat: # docx加入特征权重图像
pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10) document.add_picture(topnfeat_path)
pdpChart = matplot.plot_table_df(pdp, ['1'], title=i + ' PDP Chart', X_label=None, y_label=None,
tab_df=None, plot_tab=True, saved_path='./mvp/plots/cache/') # 新增 univar_chart段
document.add_picture(pdpChart) document.add_paragraph('univar_chart')
# 遍历目标features画出univarchart
for i in importanct_feat:
# 训练集 univar
filetool.saveDocument(document, report_path, report_name) univar_chart = matplot.uniVarChart(df_train, i, target, n_bins=10, dfltValue=-99999, dftrain=df_val, dftest=df_test, drawAll=True,
drawTrTe=False, saved_path='./mvp/plots/cache/')
document.add_paragraph('lift_chart') # univar_train = datacal.cal_univar(df_train, i, target, qcut=10)
# 遍历给定渠道 & 客群 默认等频画出liftchart # # validation univar
try: # univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
for channel in ['333','159537','1,214,217,198']: # # test集 univar
for type in ['1','2','3']: # univar_test = datacal.cal_univar(df_test, i, target, qcut=10)
df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)] # 用于univarChart画图的参数列表
lift_pred = datacal.cal_lift(df_sliced,score=prediction) #tab_df_list = [univar_train,univar_val,univar_test]
lift_online = datacal.cal_lift(df_sliced,score=score) # 调用plot_table_list
#liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/') #univarChart = matplot.plot_table_list([univar_train,univar_val,univar_test], [1,2,3], datalist_description=None, title= i +' univar Chart', X_label=None, y_label=None,
liftChart = matplot.plot_table_list([lift_pred, lift_online], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=None, # tab_df_list=tab_df_list, plot_tab=False,
title= applied_from[channel]+applied_type[type]+ ' lift Chart', X_label=None, y_label=None, # saved_path='./mvp/plots/cache/')
tab_df_list=tab_df_list, plot_tab=False, document.add_picture(univar_chart,width=Inches(8))
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart) # 新增pdp段
except: document.add_paragraph('PDP_chart')
pass # 遍历目标features 画出对应PDP
for i in range(math.ceil(len(importanct_feat)/9)):
filetool.saveDocument(document, report_path, report_name) # pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
# pdpChart = matplot.plot_table_df(pdp, ['1'], title=i + ' PDP Chart', X_label=None, y_label=None,
# tab_df=None, plot_tab=True, saved_path='./mvp/plots/cache/')
pdpChart = matplot.pdpCharts9(clf, df_test, importanct_feat, model_obj.features, n_bins=10, dfltValue=-99999, maxValRatio=1, saved_path="./mvp/plots/cache/")
document.add_picture(pdpChart,width=Inches(8))
# 新增liftchart段
document.add_paragraph('lift_chart')
# 遍历给定渠道 & 客群 默认等频画出liftchart
try:
lift_pred = datacal.cal_lift(df_test, score=prediction)
lift_online = datacal.cal_lift(df_test, score=score)
# liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']],
[roc_auc_score(df_test[target], df_test[prediction]),
roc_auc_score(df_test[target], df_test[score])], datalist_description=None,
title='全渠道全量客群测试集上的 lift Chart',
X_label=None, y_label=None,
tab_df_list=None, plot_tab=False,
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart, width=Inches(8))
# 遍历渠道
for channel in applied_from.keys():
# 遍历客群类型
for type in applied_type.keys():
print('lift ',type,channel)
# 数据切片
df_sliced = df_test[df_test.applied_type.map(lambda x : True if str(x) in type.split(',') else False) & df_test.applied_from.map(lambda x : True if str(x) in channel.split(',') else False)]
#
lift_pred = datacal.cal_lift(df_sliced,score=prediction)
lift_online = datacal.cal_lift(df_sliced,score=score)
#liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./mvp/plots/cache/')
liftChart = matplot.plot_table_list([lift_pred['mean'], lift_online['mean']], [roc_auc_score(df_test[target],df_test[prediction]),roc_auc_score(df_test[target],df_test[score])], datalist_description=None,
title= applied_from[channel]+applied_type[type]+ ' lift Chart', X_label=None, y_label=None,
tab_df_list=[lift_pred['count'], lift_online['count']], plot_tab=True,
saved_path='./mvp/plots/cache/')
document.add_picture(liftChart,width=Inches(8))
# 存在某些渠道量很少的情况,加入try catch异常处理
except Exception as e:
print('Exception: ',e)
pass
# docx 保存
filetool.saveDocument(document, report_path, report_name)
return 1
......
...@@ -22,9 +22,16 @@ def saveDocument(document,path,filename): ...@@ -22,9 +22,16 @@ def saveDocument(document,path,filename):
raise ValueError('{} is not a word file'.format(filename)) raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename)) return document.save(os.path.join(path,filename))
def insert_table(document, cols, values): def insert_table(document,df):
# cols 为列名 '''
# values 为值,list instructions : plot table which insert into docx
:param document: document obj
:param df: dataframe
:return:
'''
cols = df.columns
values = df.values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1') table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells hdr_cells = table.rows[0].cells
for i in range(len(cols)): for i in range(len(cols)):
...@@ -32,5 +39,8 @@ def insert_table(document, cols, values): ...@@ -32,5 +39,8 @@ def insert_table(document, cols, values):
for value in values: for value in values:
row_cells = table.add_row().cells row_cells = table.add_row().cells
for i in range(len(cols)): for i in range(len(cols)):
row_cells[i].text = str(value[i]) if type(value[i])==str:
return document row_cells[i].text = value[i]
\ No newline at end of file else:
row_cells[i].text = str(value[i])
return document
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment