Commit 9b10189a authored by 王家华's avatar 王家华

plot函数加表格出异常暂时没法全部解决,调用了model tools的方法画图

parent bd18c3b0
This diff is collapsed.
This diff is collapsed.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_auc_score
from models_kit import lightgbm
from models_kit import xgboost
def topN_feature_importance_plot(model, clf, title="untitled", save_path='./mvp/plots/', topN=20):
    '''
    Plot the top-N feature importances of a trained booster and save the figure.

    :param model: model library module (e.g. lightgbm / xgboost) exposing plot_importance()
    :param clf: trained booster/classifier handed to model.plot_importance()
    :param title: file-name prefix for the saved figure
    :param save_path: directory the figure is written to — assumed to exist; TODO confirm
    :param topN: maximum number of features to display
    :return: path of the saved PNG file
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']  # allow CJK glyphs in axis labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly with SimHei
    plt.rcParams['savefig.dpi'] = 226  # saved-image resolution
    plt.rcParams['figure.dpi'] = 200   # display resolution
    plt.figure(figsize=(10, 6))
    # Draw the importance chart once via the model library.
    # (The original diff left a stale duplicate call through an undefined
    # `classifier` name; only the `model` call is kept.)
    model.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    path = save_path + title + "_featureImportance.png"
    plt.savefig(path)
    plt.show()
    return path
def topN_feature_importance_list(features, clf, topN=3):
    '''
    Return the names of the topN most important features of a trained booster.

    :param features: list of feature names, aligned with clf.feature_importance()
    :param clf: trained booster exposing feature_importance()
    :param topN: number of feature names to return (was ignored: slice hard-coded to 3)
    :return: list of the topN feature names, most important first
    '''
    important_feat = pd.DataFrame({
        'column': features,
        'importance': clf.feature_importance(),
    }).sort_values(by='importance', ascending=False).column.tolist()[:topN]
    return important_feat
def model_selection(algorthm, clf, df_train, df_val, df_test, target, score, optimal_model, model_obj):
    '''
    Train one candidate model per parameter set, record train/validation AUC
    for each, and pick the parameter set with the best validation AUC.

    :param algorthm: algorithm name; currently only "lightGBM" is handled
    :param clf: existing classifier handle (unused in this function)
    :param df_train: training dataframe
    :param df_val: validation dataframe
    :param df_test: test dataframe (unused in this function)
    :param target: label column name
    :param score: column holding the online (baseline) model score
    :param optimal_model: iterable of candidate parameter overrides
                          (the original looped over the undefined name `optimal_para`)
    :param model_obj: model object exposing the feature list via .features
    :return: the Params entry with the highest validation AUC
    '''
    # Metric matrix: one column per model; first column is the online baseline
    # model, scored by the existing `score` field.
    model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
    model_matrix = pd.DataFrame(['NULL', 'NULL',
                                 roc_auc_score(df_train[target], df_train[score]),
                                 # baseline validation AUC measured on the validation
                                 # set (the original reused df_train here)
                                 roc_auc_score(df_val[target], df_val[score])],
                                index=model_matrix_index,
                                columns=['线上模型'])
    # column label for each candidate model
    pointer = 0
    # iterate over the candidate parameter combinations
    for param in optimal_model:
        if algorthm == "lightGBM":
            train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val,
                                                           model_obj.features,
                                                           adds_on=param, target=target)
            model_matrix = pd.concat([model_matrix,
                                      pd.DataFrame(['lightGBM', param, train_auc, val_auc],
                                                   index=model_matrix_index,
                                                   columns=[pointer])], axis=1)
            pointer += 1
    # simply pick the params whose validation-set AUC is highest
    best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
    return best_params
\ No newline at end of file
......@@ -210,7 +210,7 @@ class dhb:
and datediff(now(),deadline) > ''' + str(passdue_day) + '''
'''
def dhb_features_extract(self,df):
def dhb_features_prepocessing(self,dhb_loan):
try:
value_map = {
"近3天": 1,
......@@ -229,12 +229,12 @@ class dhb:
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period))
# print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
# self.end_time_period))
dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis)
# dhb_loan = pd.read_sql(
# self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
# mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
......@@ -267,9 +267,9 @@ class dhb:
dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
# print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
# time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# ignore exceptions such as "colmns doesn't exist"
except Exception as e:
print("data preprocessing ERR ",e)
......
import pandas as pd
import numpy as np
'''
@allocator V1.0
'''
#################################################### report settings ###################################################
from models_obj import dhb_obj
import datetime
from tools import datacal
import pandas as pd
import os
from mvp import refit
from mvp import rebuild
from models_obj import dhb_obj
###### global variables ######
# name of the label column used throughout this script
target = 'target'
#############################
# build the default dhb data object and pull the raw sample from it
dhb = dhb_obj.dhb()
df_sample = dhb.dhb_features_extract()
features = dhb.features
# cast feature columns to float and the label to int before modelling
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ',dhb.start_time_period,'-',dhb.end_time_period)
print('----no.',len(features),'of samples of dhb----')
# to save model performance
# NOTE(review): this __main__ block appears to be two script versions
# concatenated by a diff, with the original indentation stripped; it is
# documented as-is and should be re-indented / de-duplicated before running.
if __name__ == '__main__':
# data extraction
''' ## Old Edition here
# if total sample more than 30000, it would use train-validation-test
# else use CV to parameters tuning
# if len(df_sample) >= 30000:
# df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
# default sampling method
# NOTE(review): called with no arguments here, while other call sites pass
# the sample dataframe and split sizes — confirm the defaults exist.
df_train, df_val, df_test = datacal.train_test_split_general()
# model refit
#xgboost
# placeholder AUC records for the xgboost model
xgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
# NOTE(review): the two assignments below are redundant — the keys are
# already initialised to None in the literal above.
xgb_model_auc['training_auc'] = None
xgb_model_auc['val_auc'] = None
#xgbreport.report(df_train, df_test, df_val, features, target, '','dhb模型迭代报告.doc', kfold = 2)
## TODO: add per-dataset AUC for xgb, plus AUC by KA channel / customer group
#ligthtgbm
# placeholder AUC records for the lightgbm model (same redundancy as above)
lgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
lgb_model_auc['training_auc'] = None
lgb_model_auc['val_auc'] = None
#dftrain,dftest = datacal.split_train_val(df_sample,trainsplit = 'timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
#lgbreport.report(df_train, df_test, df_val, features, target,'','dhb模型迭代报告.doc', kfold = 2)
# merge as single dataframe full of models
#pd.DataFrame(xgb_model)
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
#== model name
model_name='dhb'
#== overdue target is 15 days
passdue_day=15
# fetch the last recorded run for this model to derive the sample window
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
# NOTE(review): the hard-coded dates below override the computed window —
# presumably left in for debugging; confirm before production use.
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
# label: 1 by default, 0 when passdue_day reaches the threshold
# NOTE(review): confirm the intended polarity (0 = bad/overdue here).
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# record sample information
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
from tools import datacal
# channel list: applied_from code(s) -> channel display name
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
# application-type list: applied_type code(s) -> display name
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# workspace path
# NOTE(review): variable name is misspelled ("worksapce") — it is referenced
# below, so renaming must be done at every use site at once.
worksapce = 'E:\\bla\\model_mvp\\'
# sample file path
sample_path = 'E:\\model\\model_mvp\\mvp\\sample.csv'
# N+ label column
target = 'target'
#################################################### report settings #############################################################################
# column holding the online model score
score = 'score'
# NOTE(review): the two assignments below duplicate the identical dicts
# defined above and can be removed.
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# column holding the prediction score
prediction = 'predict'
# refit / rebuild sequence
# report output path
report_path = worksapce
# report file name
report_name = "lgb_report.docx"
# chdir into the workspace so relative paths resolve correctly
os.chdir(worksapce)
#################################################### training settings #################################################
# create the dhb object (all default parameters)
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# to adjust the feature set, assign dhb.features = ... directly here
# extract the sample
df_sample = dhb.dhb_features_extract()
# back up df_sample
# NOTE(review): to_csv() writes CSV content despite the .xlsx file name.
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
#df_sample = dhb.dhb_features_extract()
# read the sample back in from csv here
# NOTE(review): this overwrites the dataframe just extracted above — the
# extraction seems kept only for the backup side effect; confirm.
df_sample = pd.read_csv(sample_path,engine='python')
# dhb data preprocessing
# report sequence
# custom method / default preprocessing method
df_sample = dhb.dhb_features_prepocessing(df_sample)
# back up df_sample
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# default train/validation/test split, stratified on the label column
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                             random_state=7, split_methods='random',
                                                             time_label='applied_at')
# model refit — df_sample is still needed here, so it is released only
# afterwards (the original executed `del df_sample` first, which makes the
# next line raise NameError).
model_matrix, lgbm = refit.model_fit(df_sample, dhb, target, score)
del df_sample
print(model_matrix)
# generate the report
status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target,
                            score, prediction, report_path, report_name, applied_from, applied_type, topN=3)
......
This diff is collapsed.
......@@ -22,9 +22,16 @@ def saveDocument(document,path,filename):
raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename))
def insert_table(document, cols, values):
# cols 为列名
# values 为值,list
def insert_table(document,df):
'''
instructions : plot table which insert into docx
:param document: document obj
:param df: dataframe
:return:
'''
cols = df.columns
values = df.values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells
for i in range(len(cols)):
......@@ -32,5 +39,8 @@ def insert_table(document, cols, values):
for value in values:
row_cells = table.add_row().cells
for i in range(len(cols)):
row_cells[i].text = str(value[i])
return document
\ No newline at end of file
if type(value[i])==str:
row_cells[i].text = value[i]
else:
row_cells[i].text = str(value[i])
return document
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment