Commit 589bfcb3 authored by 王家华

对画图组件做了一些bug修正,新增了多个子图的支持

parent e511a80c
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (model_mvp)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (model_mvp)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/model_mvp.iml" filepath="$PROJECT_DIR$/.idea/model_mvp.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/graph/matplot.py" beforeDir="false" afterPath="$PROJECT_DIR$/graph/matplot.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_kit/general_methods.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_kit/general_methods.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/models_kit/lightgbm.py" beforeDir="false" afterPath="$PROJECT_DIR$/models_kit/lightgbm.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/mvp/refit.py" beforeDir="false" afterPath="$PROJECT_DIR$/mvp/refit.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/tools/datacal.py" beforeDir="false" afterPath="$PROJECT_DIR$/tools/datacal.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="17" selection-start-line="17" selection-end-line="17" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-986">
<caret line="26" column="13" lean-forward="true" selection-start-line="26" selection-start-column="13" selection-end-line="26" selection-end-column="13" />
<folding>
<element signature="e#0#30#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102">
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="68" column="46" lean-forward="true" selection-start-line="68" selection-start-column="46" selection-end-line="68" selection-end-column="46" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/datasource/mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="493">
<caret line="29" selection-start-line="29" selection-end-line="29" />
<folding>
<element signature="e#0#14#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="452">
<caret line="101" column="43" lean-forward="true" selection-start-line="101" selection-start-column="43" selection-end-line="101" selection-end-column="43" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="408">
<caret line="165" selection-start-line="165" selection-end-line="169" />
<folding>
<element signature="e#0#22#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/models_kit/xgboost.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/models_kit/general_methods.py" />
<option value="$PROJECT_DIR$/models_kit/lightgbm.py" />
<option value="$PROJECT_DIR$/mvp/refit.py" />
<option value="$PROJECT_DIR$/graph/matplot.py" />
<option value="$PROJECT_DIR$/tools/datacal.py" />
</list>
</option>
</component>
<component name="ProjectConfigurationFiles">
<option name="files">
<list>
<option value="$PROJECT_DIR$/.idea/model_mvp.iml" />
<option value="$PROJECT_DIR$/.idea/vcs.xml" />
<option value="$PROJECT_DIR$/.idea/misc.xml" />
<option value="$PROJECT_DIR$/.idea/modules.xml" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="174" />
<option name="y" value="167" />
<option name="width" value="1400" />
<option name="height" value="831" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="datasource" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="feature" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="graph" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_kit" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="models_obj" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="mvp" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="mvp" type="462c0819:PsiDirectoryNode" />
<item name="plots" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="mvp" type="462c0819:PsiDirectoryNode" />
<item name="refit_pkls" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="model_mvp" type="462c0819:PsiDirectoryNode" />
<item name="tools" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="model_mvp" type="b2602c69:ProjectViewProjectNode" />
<item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="ASKED_SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="restartRequiresConfirmation" value="false" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.lightgbm">
<configuration name="lightgbm" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/models_kit" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/models_kit/lightgbm.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="refit" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="model_mvp" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/mvp" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/mvp/refit.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.lightgbm" />
<item itemvalue="Python.refit" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="c45d2e80-934e-41cc-8f01-c6d0d282db9d" name="Default Changelist" comment="" />
<created>1557804124990</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1557804124990</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1936" height="1066" extended-state="6" />
<layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.17492098" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.3290461" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.39978564" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="8" weight="0.3290461" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info active="true" anchor="bottom" id="Python Console" order="10" visible="true" weight="0.31511253" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
</layout>
</component>
<component name="editorHistoryManager">
<entry file="file://C:/ProgramData/Anaconda3/Lib/site-packages/matplotlib/backends/qt_compat.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="729">
<caret line="203" selection-start-line="203" selection-end-line="203" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_kit/general_methods.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="17" selection-start-line="17" selection-end-line="17" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/allocator.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_imps/_pydev_execfile.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="17" column="57" selection-start-line="17" selection-start-column="57" selection-end-line="17" selection-end-column="57" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_kit/xgboost.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_kit/lightgbm.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="408">
<caret line="165" selection-start-line="165" selection-end-line="169" />
<folding>
<element signature="e#0#22#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/datasource/mongodb.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="493">
<caret line="29" selection-start-line="29" selection-end-line="29" />
<folding>
<element signature="e#0#14#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/models_obj/dhb_obj.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-952">
<caret line="19" column="137" selection-start-line="19" selection-start-column="125" selection-end-line="19" selection-end-column="137" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tools/filetool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-102">
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/graph/matplot.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="452">
<caret line="101" column="43" lean-forward="true" selection-start-line="101" selection-start-column="43" selection-end-line="101" selection-end-column="43" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/tools/datacal.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="289">
<caret line="68" column="46" lean-forward="true" selection-start-line="68" selection-start-column="46" selection-end-line="68" selection-end-column="46" />
<folding>
<element signature="e#0#19#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/mvp/refit.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-986">
<caret line="26" column="13" lean-forward="true" selection-start-line="26" selection-start-column="13" selection-end-line="26" selection-end-column="13" />
<folding>
<element signature="e#0#30#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
<state key="ScopeChooserConfigurable.UI">
<settings>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>
\ No newline at end of file
......@@ -4,43 +4,123 @@ Created on Thu Apr 18 11:32:06 2019
@author: Jason Wang
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import seaborn as sns
############# plot config ###############
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226 #图片像素
plt.rcParams['figure.dpi'] = 200 #分辨率
plt.rcParams['savefig.dpi'] = 226 # 图片像素
plt.rcParams['figure.dpi'] = 200 # 分辨率
def topN_feature_importance(model, clf, title="untitled", save_path='./mvp/plots/', topN=20):
    '''
    Plot the top-N feature importances of a trained model, save the figure and show it.

    Params :
        model - object exposing ``plot_importance(clf, ax=..., max_num_features=...)``,
                e.g. the lightgbm module
        clf - trained model handed to ``model.plot_importance``
        title - prefix of the saved file name ('untitled' as default)
        save_path - string prepended verbatim to the file name; it is concatenated,
                    not joined, so it should end with a path separator
        topN - maximum number of features to draw
    Returns :
        path of the saved image: save_path + title + " featureImportance.png"
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # dpi of the saved image
    plt.rcParams['figure.dpi'] = 200  # dpi of the on-screen figure

    # Draw on an explicit axes: lightgbm/xgboost ``plot_importance`` open a figure of
    # their own when no ``ax`` is given, which left the 10x6 figure created here empty
    # and ignored the requested size.
    fig, ax = plt.subplots(figsize=(10, 6))
    model.plot_importance(clf, ax=ax, max_num_features=topN)
    ax.set_title("Feature Importances")

    path = save_path + title + " featureImportance.png"
    fig.savefig(path)
    plt.show()
    return path
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
tab_df=None, plot_tab=True, saved_path=None):
def plot_table_list(datalist, auc, datalist_description=None, title='untitled', X_label=None, y_label=None,
tab_df_list=None, plot_tab=True,
tab_rows=None, saved_path=None):
'''
instructions : visualization of pivot
instructions : visualization of pivot with given list of dataframe
Params :
dataset -
auc - auc list / array
title - title of plot('untitled' as default)
x_label - X axis label of plot
y_label - y axis label of plot
plot_tab - plot table or not , default as True
saved_path - saved path, set as None as there has no download needs
'''
fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1)
# datalist description
if datalist_description is None:
datalist_description = range(len(datalist))
for table_index in range(len(datalist)):
# 每个table需要只有一个index,一个values
x = range(len(datalist[table_index].index))
y = datalist[table_index].values
axs.plot(x, y, label=datalist_description[table_index])
if len(x) == 1:
plot_tab = False
if plot_tab:
table_rows = []
table_cols = range(len(datalist[table_index].index))
tab_df = []
if tab_df_list is None:
for data in datalist:
tab_df.append(
pd.Series(data.index).astype(str).map(lambda x: x.replace(' ', '').replace('0.', '.')).tolist())
tab_df.append(
pd.Series(data.values).astype(str).map(lambda x: x.replace(' ', '').replace('0.', '.')).tolist())
# validate tab_rows
if tab_rows is None:
table_rows.append('index');
table_rows.append('values')
else:
# tab_rows was given by
table_rows = table_rows + tab_rows
else:
for data in tab_df_list:
tab_df.append(
pd.Series(data.index).astype(str).map(lambda x: x.replace(' ', '').replace('0.', '.')).tolist())
tab_df.append(
pd.Series(data.values).astype(str).map(lambda x: x.replace(' ', '').replace('0.', '.')).tolist())
# validate tab_rows
if tab_rows is None:
table_rows.append('index')
table_rows.append('values')
else:
# tab_rows was given by
table_rows = table_rows + tab_rows
the_table = plt.table(cellText=tab_df,
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
# otherwise, nothing to do here
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
fig.subplots_adjust(bottom=0.2)
plt.grid()
if y_label is not None:
plt.ylabel(y_label)
if X_label is not None:
plt.xlabel(X_label)
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
if saved_path is not None:
plt.savefig(saved_path + title + ".png")
plt.show()
return 1
def plot_table_df(dataset, auc, title='untitled', X_label=None, y_label=None,
tab_df=None, plot_tab=True, saved_path=None):
print(tab_df)
'''
instructions : visualization of pivot with single dataframe
Params :
dataset -
auc - auc list / array
......@@ -50,21 +130,27 @@ def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
plot_tab - plot table or not , default as True
saved_path - saved path, set as None as there has no download needs
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226 # 图片像素
plt.rcParams['figure.dpi'] = 100 # 分辨率
fig, axs = plt.subplots(1, 1, figsize=(6, 6), linewidth=0.1)
fig, axs = plt.subplots(1, 1, figsize=(13, 9), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
table_cols = pd.Series(dataset.index).astype(str).map(lambda x: x.replace(' ', '')).map(
lambda x : x.replace('0.', '.'))
# traverse each columns of dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
axs.plot(x, y, label=str(i) + ' AUC: ' + str(auc[i]))
for i in range(len(table_rows)):
x = range(len(table_cols))
y = dataset.iloc[:, i]
axs.plot(x, y, label=str(table_rows[i]) + ' AUC: ' + str(auc[i]))
# if table should be plot
if plot_tab:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset))],
if tab_df is None:
tab_df = [list(dataset.iloc[:, 1].values) for i in range(len(table_rows))]
else:
table_rows = tab_df.columns
table_cols = tab_df.index
tab_df = [list(tab_df.iloc[:, 1].values) for i in range(len(table_rows))]
the_table = plt.table(cellText=tab_df,
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
......@@ -72,7 +158,7 @@ def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
plt.xticks([])
# otherwise, nothing to do here
the_table.auto_set_font_size(False)
the_table.set_fontsize(6)
the_table.set_fontsize(9)
fig.subplots_adjust(bottom=0.2)
plt.grid()
if y_label is not None:
......@@ -82,98 +168,99 @@ def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
if saved_path is not None:
plt.savefig(saved_path + title + ".png")
plt.show()
return 1
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
def plot_curve_singleCurve(dataset, x_label=None, y_label=None, table_tab=None,
save_path=None, figure_arrangement=11, fig_size=(4, 3),
fig_title='General Plot', fig_name='untitled',
fig_path=None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
metric = figure_arrangement // 10 * figure_arrangement % 10
for i in range(int(np.ceil(len(col) // metric))):
cols = col[i * metric:]
for fig_ith in range(len(cols)):
axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
axs.plot(index,dataset.loc[cols[fig_ith]])
axs.set_title(cols[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
axs.plot(index, dataset.loc[cols[fig_ith]])
axs.set_title(cols[fig_ith], fontsize=7)
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
axs.set_xlabel(x_label, fontsize=5)
if y_label != None:
axs.set_ylabel(y_label, fontsize=5)
plt.tight_layout()
plt.show()
return 1
#fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
# fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
#
#for fig_ith in range(len(df.columns)):
# for fig_ith in range(len(df.columns)):
# axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
# axs.plot(df.index,df.iloc[fig_ith])
# axs.set_title(col[])
#plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
# plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label=None, y_label=None, table_tab=None,
save_path=None, figure_arrangement=11, fig_size=(4, 3),
fig_title='General Plot', fig_name='untitled',
fig_path=None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
#metric = figure_arrangement // 10 * figure_arrangement % 10
#cols = col[i * metric:]
# metric = figure_arrangement // 10 * figure_arrangement % 10
# cols = col[i * metric:]
axs = plt.subplot(111)
for fig_ith in range(len(col)):
axs.plot(index,dataset.loc[col[fig_ith]],label=col[fig_ith])
axs.set_title(col[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
for fig_ith in range(len(col)):
axs.plot(index, dataset.loc[col[fig_ith]], label=col[fig_ith])
axs.set_title(col[fig_ith], fontsize=7)
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
axs.set_xlabel(x_label, fontsize=5)
if y_label != None:
axs.set_ylabel(y_label, fontsize=5)
plt.legend()
plt.tight_layout()
plt.show()
return 1
'''
'''
def plot_curve_mingle():
return 1
def density_chart(dataset, title):
    '''
    Draw one kernel-density curve per column of ``dataset`` on the current axes.

    Params :
        dataset - DataFrame; every column is passed to ``sns.kdeplot`` in turn
        title - title of the plot
    '''
    for col in dataset.columns:
        sns.kdeplot(dataset.loc[:, col], label=col)
    # Recent seaborn releases do not draw a legend for ``label=`` on their own;
    # without this call the curves cannot be told apart.
    plt.legend()
    plt.title(title)
    plt.show()
#
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
# t1 = np.arange(0.0, 1.0, 0.01)
......@@ -194,4 +281,4 @@ def density_chart(dataset,title):
## for i in range(figure_arrangement%10):
## plt.subplots(,figsize=fig_size,linewidth=0.1)
#
# return 1
\ No newline at end of file
# return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
import matplotlib.pyplot as plt
def topN_feature_importance(classifier, clf ,mode , topN=20):
'''
plot feature importance squence
'''
......@@ -11,3 +15,4 @@ def topN_feature_importance(classifier, clf, topN=20, model=lgb):
plt.title("Feature Importances")
plt.show()
......@@ -44,8 +44,7 @@ def returnAUC(clf, training_set, validation_set, features, target='target'):
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target',
featureImportance_path = '../mvp/plots/', topN_featureImportance=20, featureImportance_title='lightgbm'):
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
'''
instructions : training lightgbm model with specified params
......@@ -70,8 +69,6 @@ def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
matplot.topN_feature_importance(lgb, lgbm, title=featureImportance_title,
save_path = featureImportance_path, topN=topN_featureImportance)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
......@@ -102,12 +99,12 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
optimal_para = list(topn)
for deepth in np.arange(2, 7, 1):
for leaves in np.arange(2, 2 ** deepth, 2):
for deepth in np.arange(2, 4, 1):
for leaves in np.arange(2, 2 ** deepth, 4):
params['max_depth'] = deepth
params['num_leaves'] = leaves
print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=30)
# return max auc(best performance)
auc_score = pd.Series(cv_result['auc-mean']).max()
print('auc ', auc_score)
......@@ -122,7 +119,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
para['max_depth'] = deepth
para['num_leaves'] = leaves
optimal_para[topn.argmin()] = para
return optimal_para, topn
return optimal_para, list(topn)
# training_curve.append(train_auc)
......@@ -168,7 +165,7 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
def predict(lgbm, df_test, features, target='target'):
    '''
    Score a data set with a trained model and compute its AUC.

    Params :
        lgbm - trained model exposing ``predict``
        df_test - DataFrame holding the feature columns and the ``target`` column
        features - list of feature column names fed to the model
        target - name of the label column ('target' as default)
    Returns :
        (predictions, auc)
    '''
    predictions = lgbm.predict(df_test[features])
    # roc_auc_score expects (y_true, y_score): labels first, model scores second.
    auc = roc_auc_score(df_test[target], predictions)
    return predictions, auc
......
......@@ -6,85 +6,122 @@ from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool
from sklearn.metrics import roc_auc_score
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# 提取样本
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\dhb_loan_sample——2019-04-23.csv',engine='python')
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\sample.csv',engine='python')
target = 'target'
score = 'score'
prediction = 'predict'
############################
# 备份df_sample
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# Default sample split into train / validation / test, stratified on the label column.
# The label name comes from the `target` variable defined above instead of a repeated literal.
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                             random_state=7, split_methods='random',
                                                             time_label='applied_at')
# The full sample is no longer needed once it has been split.
del df_sample
# Cross-validate on the training set: optimal_para holds the best parameter sets and
# topn the matching best CV AUC values.
optimal_para, topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target=target,
                                                topN=3, cv_fold=5)
print('topn 通过train交叉验证得到的auc ', topn)
# model matrix
# One column per model: the first column is the production model, evaluated from the
# scores already stored in the `score` column, followed by one column per tuned candidate.
model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
model_matrix = pd.DataFrame(['NULL', 'NULL',
                             roc_auc_score(df_train[target], df_train[score]),
                             # The validation baseline must be measured on the validation split.
                             roc_auc_score(df_val[target], df_val[score])],
                            index=model_matrix_index, columns=['线上模型'])
# optimal_para is a list of parameter dicts, so each candidate is trained on its own.
for pointer, param in enumerate(optimal_para):
    train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
                                                   adds_on=param, target=target)
    candidate_column = pd.DataFrame(['lightGBM', param, train_auc, val_auc],
                                    index=model_matrix_index, columns=[pointer])
    model_matrix = pd.concat([model_matrix, candidate_column], axis=1)
# Pick the candidate with the highest validation AUC. The production-model column is
# excluded because its 'Params' entry is only the 'NULL' placeholder.
candidates = model_matrix.T
candidates = candidates[candidates['name'] == 'lightGBM']
best_params = candidates.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
# Retrain with the selected parameters; adds_on is the dict of parameters to override.
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
                                               adds_on=best_params, target=target)
# Score the test set with the retrained model.
predictions, test_auc = lightgbm.predict(lgbm, df_test, dhb.features, target)
# Store the new predictions on the test set for the report charts.
df_test[prediction] = predictions
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
applied_type = {'1,2':'首贷','1,2,3':'全量客群','1':'首申','2':'复申','3':'复贷'}
####################################
### report
# Plot the top-N feature importances; the return value is used below as the image path.
topnfeat_path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./mvp/plots/', topN=20)
# Report file location.
report_path = "E:/bla/model_mvp/"
report_name = "lgb_report.docx"
# Directory that receives the intermediate chart images.
chart_dir = './mvp/plots/cache/'
# Build the docx document object; all content is added through this object.
document = filetool.buildDocument(report_path, report_name)
# Report title.
document.add_heading('lightGBM 算法refit报告')
# Feature-importance section: heading paragraph followed by the image.
document.add_paragraph('特征权重图')
document.add_picture(topnfeat_path)
# Univariate chart section.
document.add_paragraph('univar_chart')
# NOTE(review): only the first three features are charted - confirm this limit is intended.
for i in dhb.features[:3]:
    univar_title = i + ' univar Chart'
    univar_train = datacal.cal_univar(df_train, i, target, qcut=10)
    univar_val = datacal.cal_univar(df_val, i, target, qcut=10)
    univar_test = datacal.cal_univar(df_test, i, target, qcut=10)
    # Train / validation / test curves are drawn together in one chart.
    matplot.plot_table_list([univar_train, univar_val, univar_test], [1, 2, 3], datalist_description=None,
                            title=univar_title, X_label=None, y_label=None,
                            tab_df_list=None, plot_tab=True,
                            saved_path=chart_dir)
    # Presumably the plot helper writes <saved_path><title>.png - verify against matplot.
    document.add_picture(chart_dir + univar_title + ".png")
# Partial-dependence chart section.
document.add_paragraph('PDP_chart')
for i in dhb.features[:3]:
    pdp_title = i + ' PDP Chart'
    pdp = datacal.cal_pdp(df=df_test, score=prediction, feature=i, qcut=10)
    matplot.plot_table(pdp, title=pdp_title, saved_path=chart_dir)
    document.add_picture(chart_dir + pdp_title + ".png")
document.add_paragraph('lift_chart')
# One equal-frequency lift chart per (channel, customer segment) combination.
# The dictionary keys are comma-separated code lists; the values are display names.
for channel, channel_name in applied_from.items():
    channel_codes = channel.split(',')
    for segment, segment_name in applied_type.items():
        segment_codes = segment.split(',')
        in_slice = (df_test.applied_type.astype(str).isin(segment_codes)
                    & df_test.applied_from.astype(str).isin(channel_codes))
        df_sliced = df_test[in_slice]
        # A combination with no rows cannot be binned; skip it instead of crashing.
        if df_sliced.empty:
            continue
        # Title by channel and segment so every chart gets its own image file; the
        # previous title reused the loop variable left over from the feature loops.
        chart_title = channel_name + ' ' + segment_name + ' lift Chart'
        lift = datacal.cal_liftchart(df_sliced, score=prediction)
        matplot.plot_table(lift, title=chart_title, saved_path='./mvp/plots/cache/')
        # Presumably the plot helper writes <saved_path><title>.png - verify against matplot.
        document.add_picture('./mvp/plots/cache/' + chart_title + '.png')
filetool.saveDocument(document, report_path, report_name)
......
......@@ -4,26 +4,79 @@ import datetime
from sklearn.model_selection import train_test_split
def liftchart(df,target='target',qcut=10,retbins=True):
def cal_lift(df_list, score, target='target', qcut=10, retbin=False):
    '''
    Build a lift-chart pivot: the score is cut into equal-frequency bins and the
    mean and count of the label are computed per bin.

    Params :
    df_list - a DataFrame, or a list/tuple of DataFrames (these should be loan-issued samples)
    score - name of the model score column
    target - name of the label column
    qcut - number of quantile bins
    retbin - when True, also return the bin edges
    :return:
    the pivot with 'mean' and 'count' columns (repeated once per input frame);
    when retbin is True a tuple (pivot, edges), where edges is an array for a
    single DataFrame and a list of arrays for a list input
    '''
    single = isinstance(df_list, pd.DataFrame)
    if single:
        frames = [df_list]
    elif isinstance(df_list, (list, tuple)):
        frames = list(df_list)
    else:
        raise TypeError('df_list must be a DataFrame or a list of DataFrames')
    if not frames:
        raise ValueError('cal_lift needs at least one DataFrame')

    pivots = []
    edges = []
    for frame in frames:
        # NOTE(review): only the single-DataFrame path fills missing values with -1;
        # list inputs are binned as-is. This difference is kept from the original code.
        pivot_tmp, bin_edges = _lift_pivot(frame, score, target, qcut, fill_missing=single)
        pivots.append(pivot_tmp)
        edges.append(bin_edges)

    pivot = pd.concat(pivots, axis=1)[target]
    if retbin:
        return pivot, (edges[0] if single else edges)
    return pivot


def _lift_pivot(df, score, target, qcut, fill_missing):
    '''Bin one frame on score and return (mean/count pivot of target, bin edges).'''
    # Slice first and copy, so the caller's frame is never modified.
    data = df[[score, target]].copy()
    if fill_missing:
        data = data.fillna(-1)
    # Always ask qcut for the edges; passing the caller's retbin flag straight through
    # made qcut return a tuple that cannot be stored as a column.
    bins, bin_edges = pd.qcut(data[score], q=qcut, precision=6, retbins=True, duplicates='drop')
    data['bins'] = bins
    pivot = data[['bins', target]].groupby('bins', observed=False).agg(['mean', 'count'])
    return pivot, bin_edges
def cal_univar(df, feature, target, qcut=10):
    '''
    Build a univariate pivot: the feature is cut into equal-frequency bins and the
    mean of the label is computed per bin.

    Params:
    :param df: DataFrame holding the feature column and the label column
    :param feature: name of the single feature to bin
    :param target: name of the label column
    :param qcut: number of equal-frequency bins
    :return: Series of the mean label per bin, indexed by bin interval
    '''
    # Work on a two-column copy so the caller's frame is untouched; missing values in
    # these two columns are replaced with -1 before binning.
    data = df[[feature, target]].fillna(-1)
    # Bin the feature column only, using the caller's qcut and label name.
    data['bins'] = pd.qcut(data[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
    # Mean per bin is the sum / count ratio.
    pivot = data[[target, 'bins']].groupby('bins', observed=False).mean()
    return pivot[target]
def cal_pdp(df, score, feature, qcut=10):
    '''
    Build a PDP pivot: the feature is cut into equal-frequency bins and the mean
    model score is computed per bin.

    :param df: DataFrame of the test set
    :param score: name of the column holding the model's predicted score
    :param feature: name of the feature column to bin
    :param qcut: number of equal-frequency bins
    :return: Series of the mean score per bin, indexed by bin interval
    '''
    # Slice first, then copy: adding the 'bins' column neither touches the caller's
    # frame nor writes to a slice of another frame.
    data = df[[feature, score]].copy()
    data['bins'] = pd.qcut(data[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
    # Mean per bin is the sum / count ratio.
    pivot = data[[score, 'bins']].groupby('bins', observed=False).mean()
    return pivot[score]
......@@ -184,45 +237,45 @@ def cal_accume(df,feature,target,bin=10,classes=[]):
return df_out
def cal_univar(df,feature,target,bin=10,classes=[]):
    '''
    Group by classes (optional), split feature into bin buckets and compute
    count, mean and sum of target for every bucket.

    :param df: dataframe
    :param feature: feature in df.columns
    :param target: in df.columns eg: count(target) mean(target)
    :param bin: number of buckets, default 10
    :param classes: list of extra grouping columns
    :return: DataFrame with columns classes + ['grid', 'lbl', 'count', 'mean', 'sum']
    :raises ValueError: when df has no rows
    :raises KeyError: when target or feature is not a column of df
    '''
    # Raise real exception objects: raising a plain string is itself a TypeError.
    if df.shape[0] == 0:
        raise ValueError('no data')
    columns = df.columns.tolist()
    if target not in columns:
        raise KeyError('not found %s' % target)
    if feature not in columns:
        raise KeyError('not found %s' % feature)
    tmp = df.copy()
    # Assign the filled column back; an inplace fillna on a column selection is not
    # guaranteed to write through to the frame.
    tmp[feature] = tmp[feature].fillna(-1)
    # == bucket split; the feature may be non-numeric
    try:
        tmp[feature] = tmp[feature].astype(float)
        # cal_feature_grid is defined elsewhere in this module; presumably it returns
        # the bucket edges for pd.cut - verify against its definition.
        feature_grid = cal_feature_grid(tmp, feature, bin)
        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
        tmp['grid'] = tmp['lbl'].cat.codes
    except ValueError:
        # Non-numeric feature: every distinct value is its own bucket.
        tmp['lbl'] = tmp[feature]
        tmp['grid'] = tmp[feature]
    # classes is only read, never mutated, so the shared default list stays empty.
    group_keys = list(classes) + ['grid', 'lbl']
    df_out = tmp.groupby(group_keys).agg({target: ['count', 'mean', 'sum']}).reset_index()
    df_out.columns = group_keys + ['count', 'mean', 'sum']
    return df_out
# def cal_univar(df,feature,target,bin=10,classes=[]):
# '''
# groupby(classes) 分组,对feature 进行bin 分位,对各个分位进行 count,mean ,sum计算
# :param df: dataframe
# :param feature: feature in df.columns
# :param target: in df.columns eg: count(target) mean(target)
# :param bins:default =10
# :param classes: 分组
# :return:
# '''
# if df.shape[0]==0:
# raise('no data')
# columns=df.columns.tolist()
# if target not in columns:
# raise('not found %s' % target)
# if feature not in columns:
# raise('not found %s' % feature)
#
# tmp=df.copy()
# tmp[feature].fillna(-1, inplace=True)
# # == bin 划分,feature 有可能 非数字
# try:
# tmp[feature] = tmp[feature].astype(float)
# feature_grid = cal_feature_grid(tmp, feature, bin)
# tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
# tmp['grid'] = tmp['lbl'].cat.codes
# except ValueError:
# tmp['lbl']=tmp[feature]
# tmp['grid']=tmp[feature]
#
# if len(classes) > 0:
# df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
# df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
# df_out=df_gp
# else:
# df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
# df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
# df_out = df_all
# return df_out
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment