Commit 15489661 authored by 王家华

Adjusted the code structure

parent da357ba7
import pandas as pd
def mysql_query(sql, engine_sql):
    '''
    Query a large result set in chunks.
    :param sql: SQL statement
    :param engine_sql: SQLAlchemy engine used to run the query
    :return: dataframe
    '''
    res = []
    # == palo allows at most 10000 rows per query, so read in chunks
    tmp = pd.read_sql(sql, engine_sql, chunksize=5001)
    for tt in tmp:
        res.append(tt)
    return pd.concat(res)
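
# A minimal usage sketch, assuming a placeholder DSN and a hypothetical table
# `some_table`; swap in a real engine (e.g. one built by the sql_engine helper
# elsewhere in this repo) before running.
if __name__ == '__main__':
    from sqlalchemy import create_engine
    demo_engine = create_engine('mysql+pymysql://user:passwd@host:3306/db')  # placeholder DSN
    demo_df = mysql_query('select order_id, applied_at from some_table limit 100', demo_engine)
    print(demo_df.shape)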
import pymongo
import pandas as pd
import numpy as np
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
query = "{'order_id':1,'@key':1}"
'''
instructions : query mongoDB; the query dict should name the fields to return
Params :
    limit - filter dict (as a string, with @start_date / @end_date placeholders)
    query - projection dict (as a string) listing the keys to return
'''
def querymongo(start_time_period, end_time_period, limit, query):
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    mydb = myclient["rc_dp_feature_pro"]
    mycol = mydb["rc_feature_analysis_timing_v2"]
    # all data
    # x = mycol.find()
    # approval data
    # x = mycol.find({"wf_audit_result":"1"})
    # $gte = greater than or equal, $lt = less than
    # materialize the cursor before closing the client
    x = list(mycol.find(eval(limit), eval(query)))
    myclient.close()
    return pd.DataFrame(x)
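
# A minimal usage sketch: fill the @start_date / @end_date / @key placeholders in
# the module-level templates via str.replace; 'dhb_overview' is a hypothetical
# field name used only for illustration.
if __name__ == '__main__':
    demo_limit = limit.replace('@start_date', '2019-01-01').replace('@end_date', '2019-02-01')
    demo_query = query.replace('@key', 'dhb_overview')
    demo_df = querymongo('2019-01-01', '2019-02-01', demo_limit, demo_query)
    print(demo_df.head())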
[analysis_new]
db=analysis
host=172.30.4.63
port=3306
user=analysis_model
passwd=BGzTPQjDQqJ6PVnK
[risk_info]
db=risk_info
host=172.30.5.106
port=3306
user=sys_read
passwd=quant12345
[xyqb_feature]
db=xyqb_feature
host=xyqb-rule-db.quantgroups.com
port=6606
user=xyqb_rule_read
passwd=1q2w3e4r
[risk_analysis]
db=risk_analysis
host=172.20.6.9
port=9030
user=linfang_wang
passwd=BHWZ3zcZ
import os
from sqlalchemy import create_engine
import datetime
class sql_engine():
    def __init__(self, db, db_name=None, echo=False):
        """
        Create a database connection for the given config section name.
        :param db: section name in mysql_config.ini
        :param db_name: database name; defaults to the `db` entry of the section
        :param echo: echo SQL statements
        """
        try:
            # Python 2
            import ConfigParser
            self.cf = ConfigParser.ConfigParser()
        except ImportError:
            # Python 3
            import configparser
            self.cf = configparser.ConfigParser()
        self.cf.read(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'mysql_config.ini'))
        host = self.cf.get(db, 'host')
        user = self.cf.get(db, 'user')
        passwd = self.cf.get(db, 'passwd')
        port = int(self.cf.get(db, 'port'))
        if not db_name:
            db_name = self.cf.get(db, 'db')
        try:
            # prefer mysqldb; fall back to pymysql if it is not installed
            self.__engine = create_engine(
                'mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})
        except:
            self.__engine = create_engine(
                'mysql+pymysql://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})

    def get_engine(self):
        return self.__engine
if 'echo' not in vars():
echo = False
engine_feature = sql_engine('xyqb_feature', 'xyqb_rule').get_engine()
engine_risk = sql_engine('risk_info', 'risk_info').get_engine()
engine_analysis_new = sql_engine('analysis_new').get_engine()
engine_risk_analysis = sql_engine('risk_analysis').get_engine()
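
# A minimal smoke-test sketch: any of the engines above can be handed straight to
# pandas; `select 1` only checks connectivity and assumes the database is reachable.
if __name__ == '__main__':
    import pandas as pd
    print(pd.read_sql('select 1 as ping', engine_analysis_new))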
from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt
from data.analyis import datacal
import seaborn as sns
import pandas as pd
plt.rc('figure',figsize=(8,6))
font_options={
'weight':'bold',
'size':'14'
}
plt.rc('font',**font_options)
def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
    '''
    Lift chart: x is the x-axis column (e.g. model score), y the y-axis target.
    :param df: dataframe
    :param x: feature / score column
    :param y: target column
    :param classes: grouping column name, str
    :param bin: number of bins
    :param title:
    :param xlabel:
    :param ylabel:
    :return:
    '''
    # #== single-feature TODO: output still pending
plt.close('all')
if classes !='':
df_out = datacal.cal_accume(df, x, y, bin, classes=[classes])
        #== show the sample counts
df_fig = pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
values=['count'], aggfunc=['mean'])
df_fig=df_fig['mean']['count']
        #== number of rows
rows=df_fig.index.tolist()
n_rows=len(rows)
        # number of columns
cols=df_fig.columns.levels[0].categories.to_tuples().tolist()
n_cols=len(cols)
cell_text=df_fig.values.tolist()
plt.subplot(2, 1,1)
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out,'grid','acmMean',hue=classes,title=title+'累计',xlabel=xlabel,ylabel=ylabel)
else :
df_out = datacal.cal_accume(df, x, y, bin)
plt.subplot(2, 1, 1)
draw_lineplot(df_out, 'grid','mean', title=title, xlabel=xlabel, ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out, 'grid','acmMean', title=title+'累计', xlabel=xlabel, ylabel=ylabel)
plt.tight_layout()
# plt.show()
return plt
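
# A minimal usage sketch on synthetic data; `score`, `target` and `channel` are
# illustrative column names and the grouped branch is used here.
if __name__ == '__main__':
    import numpy as np
    demo = pd.DataFrame({'score': np.random.rand(1000),
                         'target': np.random.randint(0, 2, 1000),
                         'channel': np.random.choice(['A', 'B'], 1000)})
    liftchart(demo, 'score', 'target', classes='channel', bin=10,
              title='demo lift chart', xlabel='score bin', ylabel='bad rate').show()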
def univarchart(df,x,y,bin=10,classes='',title='',xlabel='',ylabel=''):
    '''
    Plot the relationship between a feature and the label; y is the label.
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def pdpchart(df,x,y,bin=10,classes='',title='',xlabel='模型分',ylabel='逾期率'):
    '''
    Plot the relationship between a feature and the label; y is the label.
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def draw_barplot(df,x,y,hue='',title=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
fig = plt.figure()
plt.close('all')
sns.set(style="whitegrid")
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
if hue != '':
sns.barplot(x, y, hue=hue, data=df, ax=ax)
else:
sns.barplot(x, y, data=df, ax=ax)
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return fig
def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
# fig = plt.figure()
if hue != '':
for type in df[hue].unique().tolist():
            # == draw one dashed line per group
tmp=df[df[hue]==type]
plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o',label=type)
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
if xlabel !='':
pltz.xlabel(xlabel)
else:
pltz.xlabel(x)
if ylabel !='':
pltz.ylabel(ylabel)
else:
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return plt
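
# A minimal usage sketch: two groups of points drawn as dashed lines; `week`,
# `rate` and `group` are illustrative column names.
if __name__ == '__main__':
    demo = pd.DataFrame({'week': list(range(10)) * 2,
                         'rate': [0.10 + 0.01 * i for i in range(10)] + [0.20 + 0.01 * i for i in range(10)],
                         'group': ['A'] * 10 + ['B'] * 10})
    draw_lineplot(demo, 'week', 'rate', hue='group', title='demo', xlabel='week', ylabel='overdue rate').show()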
"""
Created on Thu Apr 18 11:32:06 2019
@author: wangjiahua
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
saved_path=None):
'''
instructions : visualization of pivot
'''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
    plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
# traverse each columns of dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + auc[i])
if plot_tab != False:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
fig.subplots_adjust(bottom=0.2)
plt.grid()
plt.ylabel(title)
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
plt.show()
return 1
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
    # rows * cols encoded in figure_arrangement (e.g. 11 -> 1 x 1)
    metric = (figure_arrangement // 10) * (figure_arrangement % 10)
    for i in range(int(np.ceil(len(col) / metric))):
cols = col[i * metric:]
for fig_ith in range(len(cols)):
axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
axs.plot(index,dataset.loc[cols[fig_ith]])
axs.set_title(cols[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.tight_layout()
plt.show()
return 1
#fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
#
#for fig_ith in range(len(df.columns)):
# axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
# axs.plot(df.index,df.iloc[fig_ith])
# axs.set_title(col[])
#plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
#metric = figure_arrangement // 10 * figure_arrangement % 10
#cols = col[i * metric:]
axs = plt.subplot(111)
for fig_ith in range(len(col)):
axs.plot(index,dataset.loc[col[fig_ith]],label=col[fig_ith])
axs.set_title(col[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.legend()
plt.tight_layout()
plt.show()
return 1
'''
'''
def plot_curve_mingle():
return 1
def density_chart(dataset,title):
for col in dataset.columns:
sns.kdeplot(dataset.loc[:,col],label = col)
plt.title(title)
plt.show()
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
# t1 = np.arange(0.0, 1.0, 0.01)
#
#
# for n in [1, 2, 3, 4]:
# plt.plot(t1, t1 ** n, label="n=%d" % n)
# leg = plt.legend(loc='best', ncol=4, mode="expand", shadow=True)
# leg.get_frame().set_alpha(alpha)
#
#
# # if this fig should be saved
# if fig_path != None:
# plt.savefig(fig_path + fig_name +'.png')
#
#
#
## for i in range(figure_arrangement%10):
## plt.subplots(,figsize=fig_size,linewidth=0.1)
#
# return 1
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,psutil
params_lgb = {
    'task': 'train',  # task type
    'application': 'binary',  # binary classification
    'boosting_type': 'gbdt',  # boosting type
    'num_boost_round': 150,  # number of boosting iterations
    'learning_rate': 0.01,  # learning rate
    'metric': {'logloss', 'auc'},  # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',  # objective function
    'max_depth': 4,
    'num_leaves': 20,  # number of leaves
    'feature_fraction': 0.9,  # fraction of features used per tree
    'bagging_fraction': 0.8,  # fraction of samples used per tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1  # <0 fatal only, =0 errors (warnings), >0 info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature list used for prediction
        target - label column name
    '''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params - default params
        df_train - training set
        df_val - validation set
        features - feature list of the dataset
        adds_on - parameter dict whose entries override the training parameters
        target - target column (label) of the samples
    '''
params = params.copy()
print(type(df_train), type(df_val))
# training params
if adds_on != None:
for i in adds_on.keys():
params[i] = adds_on[i]
# convert DataFrame to binary format
lgb_train = lgb.Dataset(df_train[features], df_train[target])
lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
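
# A minimal usage sketch on a synthetic binary target; `f0` / `f1` are
# illustrative feature names and the 1500/500 split is arbitrary.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_train, demo_val = demo.iloc[:1500], demo.iloc[1500:]
    tr_auc, va_auc, booster = train_lgbm(params_lgb, demo_train, demo_val, ['f0', 'f1'],
                                         adds_on={'num_boost_round': 50})
    print(tr_auc, va_auc)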
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal max_depth / num_leaves parameters for lgbm
    Parameters :
        params - default parameters (dict)
        features - feature list
        train - training set
        val - validation set
        target - target label
        topN - keep the top N parameter combinations
        cv_fold - k-fold CV
    '''
# reassign as a duplication
params = params.copy()
lgb_train = lgb.Dataset(train[features], train[target])
lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
# create a ndarray shapes 1*n
topn = np.zeros(topN)
# make sure that memory can afford
print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
optimal_para = list(topn)
for deepth in np.arange(2, 7, 1):
for leaves in np.arange(2, 2 ** deepth, 2):
params['max_depth'] = deepth
params['num_leaves'] = leaves
print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
# return max auc(best performance)
auc_score = pd.Series(cv_result['auc-mean']).max()
print('auc ', auc_score)
boost_round = pd.Series(cv_result['auc-mean']).argmax()
# if anyone greater than item in topn list(either of them)
if (auc_score > topn).any():
# find the worst one / lowest AUC
topn[topn.argmin()] = auc_score
para = {}
# replace the worst parameter with a greater combination
para['max_depth'] = deepth
para['num_leaves'] = leaves
optimal_para[topn.argmin()] = para
return optimal_para, lgb_train, lgb_val, topn
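
# A minimal usage sketch: grid-search depth / leaves on a synthetic frame and
# inspect the top combinations; `f0` / `f1` are illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    best_paras, _, _, best_aucs = lgb_params_tuning(params_lgb, ['f0', 'f1'],
                                                    demo.iloc[:1500], demo.iloc[1500:],
                                                    topN=3, cv_fold=3)
    print(best_paras, best_aucs)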
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve),pd.Series(validation_curve)],index=['trainingAUC','validationAUC'],axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve,label='training')
# plt.plot(candidate_list, validation_curve,label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]
# pending here: this function has not been tested yet
# def lightGBM_gridCV(param_validation, params=params_lgb):
# # make sure that memory can afford
# print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
#
# param_test = {
# 'max_depth': np.arange(2, 7, 1),
# 'num_leaves': np.arange(20, 200, 10),
# }
# estimator = LGBMRegressor(
# num_leaves=50,
# max_depth=13,
# learning_rate=0.1,
# n_estimators=1000,
# objective='binary',
# min_child_weight=1,
# param['metric'] = ['auc', 'binary_logloss'],
# subsample = 0.8,
# colsample_bytree = 0.8,
# nthread = 7
# )
# gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
# gsearch.fit(values, labels)
# gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
# return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
'''
plot feature importance squence
'''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
    plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
plt.show()
def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
        params - parameter dict passed to LGBMClassifier
    '''
    return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
# kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == train the model
    grid_search.fit(dftrain[features], dftrain[resp])
    # == get the optimal parameters
return grid_search
##############################################################################
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
target = 'target'
import xgboost as xgb
# default parameters
params_xgb = {
'learning_rate': 0.1,
'n_estimators': 200,
'max_depth': 3,
'min_child_weight': 1,
'gamma': 0,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'binary:logistic',
'nthread': 4,
'scale_pos_weight': 1,
'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature list used for prediction
        target - label column name
    '''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        params - parameter dict for XGBClassifier
        train / val - training and validation sets
        features - feature list of the dataset
        target - target column (label) of the samples
    '''
    dtrain = xgb.DMatrix(train[features], train[target])
    dval = xgb.DMatrix(val[features], val[target])
    # xgb_clf = xgb.XGBClassifier(params_xgb)
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    # xgbm = xgb.train(params,dtrain)
    returnAUC(xgb_clf, train, val, features)
    # auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
    return xgb_clf
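
# A minimal usage sketch on a synthetic binary target; `f0` / `f1` are
# illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_clf = xgb_train(params_xgb, demo.iloc[:1500], demo.iloc[1500:], ['f0', 'f1'])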
#############################################################################
def buildClf(max_depth=4,learning_rate=0.05, n_estimators=5000, gamma=0,
min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5):
    '''
    Create an XGBClassifier instance.
    :param max_depth: tree depth; larger values overfit more easily, tune via CV -- booster parameter
    :param learning_rate: learning rate, alias eta -- booster parameter
    :param n_estimators: number of trees
    :param verbosity: 0: silent; 3: debug (replaces silent); whether to print iteration info -- general parameter
    :param objective: learning objective -- learning-task parameter
        binary:logistic  logistic regression for binary classification, returns predicted probabilities (not classes)
        multi:softmax    softmax multi-class classifier, returns predicted classes (not probabilities); requires num_class
        multi:softprob   same as multi:softmax but returns per-class probabilities for each sample
    :param booster: gbtree, gblinear or dart -- general parameter
    :param n_jobs: replaces nthread, number of parallel jobs -- general parameter
    :param gamma: minimum loss reduction required to split a node -- booster parameter
    :param min_child_weight: minimum sum of instance weight in a leaf; larger values guard against overfitting (tune via CV) but may underfit -- booster parameter
    :param max_delta_step: maximum step size for each tree's weight update; 0: no constraint, >0: more conservative -- booster parameter
    :param subsample: row sampling ratio per tree
    :param colsample_bytree: column (feature) sampling ratio per tree
    :param reg_alpha: L1 regularization term
    :param reg_lambda: L2 regularization term
    :param scale_pos_weight: usually (number of negative samples) / (number of positive samples)
    :param base_score:
    :param random_state: replaces seed; fixed to 7 for reproducibility
    :return: XGBClassifier
    '''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=0,silent=0,objective='binary:logistic',
booster='gbtree',n_jobs=-1,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,seed=7
)
def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc'):
    '''
    Automatic hyper-parameter tuning.
    :param clf : XGBClassifier
    :param param_grid : dict, search ranges for the parameters being tuned
    :param scoring : tuning metric, default roc_auc
    :param dftrain:
    :param features:
    :param resp:
    :param kfold:
    :return:
    '''
    # kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
    grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
    #== train the model
    grid_search.fit(dftrain[features],dftrain[resp])
    #== get the optimal parameters
return grid_search
def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
    '''
    Train the model.
    :type useTrainCV: object
    :param clf: XGBClassifier
    :param dftrain: training set
    :param features: feature list
    :param resp: label column
    :param useTrainCV: if True, call xgb.cv first to tune n_estimators
    :param kfold: number of CV folds
    :param early_stopping_rounds: stop after this many rounds without improvement in the eval metric
    :param eval_metric: must match the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
    if dftrain[features].shape[0]==0:
        raise ValueError('no training data')
if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(dftrain[features], label=dftrain[resp])
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(dftrain[features], dftrain[resp],eval_metric=eval_metric)
return clf
def predict(clf,df,features):
    '''
    Compute predicted class and probability.
    :param clf:
    :param df:
    :param features:
    :return:
    '''
df['predict']=clf.predict(df[features])
df['predict_proba']=clf.predict_proba(df[features])[:,1]
return df
def auc(clf,df,features,label):
    #== compute accuracy, auc and related metrics
df=predict(clf,df,features)
accu=metrics.accuracy_score(df[label].values, df['predict'].values)
auc=metrics.roc_auc_score(df[label],df['predict_proba'])
return {'accuracy':accu,'auc':auc}
def featureImportance(clf,features):
    '''
    Get the feature weights of the model.
    :param clf:
    :param features:
    :return:
    '''
# Print Feature Importance:
feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
feat_imp = feat_imp[feat_imp > 0]
feat_imp=feat_imp.to_frame().reset_index()
feat_imp.columns=['feature','weight']
return feat_imp
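
# A minimal end-to-end sketch: build a classifier, let modelfit tune n_estimators
# via CV, then read AUC and feature weights. The data is synthetic and `f0` / `f1`
# are illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_feats = ['f0', 'f1']
    demo_clf = modelfit(buildClf(n_estimators=200), demo, demo_feats, 'target', kfold=5)
    print(auc(demo_clf, demo, demo_feats, 'target'))
    print(featureImportance(demo_clf, demo_feats))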
import pandas as pd
from datasource import mysqldb, mongodb
import time
from dateutil.relativedelta import relativedelta
import datetime
import pickle
'''
model instructions : establishes a dhb object which contains the attributes of the dhb model
...@@ -179,34 +181,34 @@ class dhb:
                'dhb_overview_ntdun_first_call_time',
                'dhb_overview_ntdun_last_call_time']
    sql = '''
        select ''' + str(features).replace('[', '').replace(']', '').replace('\'', '') + ''',applied_at,applied_from,applied_type,if(passdue_day>15,1,0) as target
        from risk_analysis
        where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
        and transacted = 1
        and dhb_flag =1
        and datediff(now(),deadline) > 15
        '''
    # default time interval
    start_time_period = (datetime.date.today() - relativedelta(months=+7)).strftime("%Y-%m-%d 00:00:00")
    end_time_period = (datetime.date.today() - relativedelta(days=+17)).strftime("%Y-%m-%d 00:00:00")

    def __init__(self, features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15):
        try:
            # if the para was not Series
            if features != None:
                self.features = features
        except Exception as e:
            print("'features' parameter type Error, it should be list or Series")
            raise
        if start_time_period != None:
            self.start_time_period = start_time_period
        if end_time_period != None:
            self.end_time_period = end_time_period
        if sql != None:
            self.sql = sql
        else:
            sql = '''
            select ''' + str(features).replace('[', '').replace(']', '').replace('\'','') + ''',applied_at,applied_from,applied_type,if(passdue_day>''' + str(passdue_day) + ''',1,0) as target
            from risk_analysis
            where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
            and transacted = 1
            and dhb_flag =1
            and datediff(now(),deadline) > ''' + str(passdue_day) + '''
            '''

    def dhb_features_extract(self):
...@@ -287,9 +289,31 @@ class dhb:
        liftchart plot
        '''

    def dhb_predict_with_pkl(self, test, pkl='./dhb_cuishou_jianzhi_v3.pkl', features=features):
        open_file = open(pkl, "rb")
        model = pickle.load(open_file)
        open_file.close()
        return model.predict(test[features])

    def dhb_dataSketch(self, df, given_dataset=None, start_time_period=start_time_period, end_time_period=end_time_period,
                       applied_type=None, applied_from=None):
        '''
        instructions : build a comparison
        Params :
            df - test dataset which was given
            score - score column
            target - label
            start_time_period -
            end_time_period -
            applied_type -
            applied_from -
        Returns :
            auc comparison
            liftchart plot
        '''
        limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
        query = "{'order_id':1,'@key':1}"
        df_mongo = mongodb.querymongo(start_time_period, end_time_period,
...@@ -324,14 +348,17 @@ class dhb:
        pivot.plot()
        return 1

    def data_merge(self):
        # merge data from mongodb & risk_analysis
        return 1

    def vlm(self, feature):
        return 1

    def psi(self, feature):
        return 1

    def liftchart(self, feature):
        return 1
......
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
# selected top-N features
...@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models_kit import lightgbm
from matplotlib import pyplot as plt
from graph import drawplot
def report(dftrain,dftest,features,label,path,filename,kfold=10):
......
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
split_methods='random', time_label='applied_at'):
    '''
    instructions - train/val/test split (splits only train & test when val_size is None)
    Params :
        dataset
        val_size - validation RATIO
        test_size - test set RATIO
        stratify - stratify LABEL
        random_state
        split_methods - random or timeSeries
        time_label - column that identifies date & time
    '''
    # split data as random
    if split_methods == 'random':
        # NOTE: currently delegates to the time-series split; the plain random split is kept below for reference
        df_train, df_test = train_test_split_general(dataset,val_size=None,stratify=None,split_methods='timeSeries')
        # df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=random_state)
if val_size != None:
size = val_size / (1 - test_size)
df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
# case when validation set not exists
return df_train, df_val, df_test
# split data with time sequence
elif split_methods == 'timeSeries':
data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
df_test = data_tmp[: int(len(dataset) * test_size)]
df_train = data_tmp[int(len(dataset) * test_size):]
return df_train, df_test
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    '''
    Split df into a training set and a validation set.
    :param df: dataframe
    :param trainsplit: how to split df into train/val, supports timeSeries or random, default random
    :param trainsplitRatio: for a random split, the fraction used as the training set (default 0.8)
    :param sort_col: for a time-series split, the time column to sort by
    :return:
    '''
dftrain=df.reset_index()
    #== split dftrain into a training set and a validation set
    if trainsplit=='random':
        # randomly assign train / val
        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
        val = dftrain[~dftrain.index.isin(train.index)]
    elif trainsplit=='timeSeries':
        # assign train / val by time order
train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
val = dftrain[~dftrain.index.isin(train.index)]
else:
train = df
val = None
return train,val
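
# A minimal usage sketch: both splitters on a synthetic frame; `applied_at` is
# the time column they sort by (the default time_label / an explicit sort_col).
if __name__ == '__main__':
    demo = pd.DataFrame({'x': np.arange(100),
                         'target': np.random.randint(0, 2, 100),
                         'applied_at': pd.date_range('2019-01-01', periods=100)})
    tr, te = train_test_split_general(demo, val_size=None, split_methods='timeSeries')
    tr2, val2 = split_train_val(demo, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    print(len(tr), len(te), len(tr2), len(val2))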
def cal_week(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: %y-%m-%d, the first day of each week
    '''
    columns = df.columns.tolist()
    if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name] = pd.to_datetime(df[date_name])
df[date_name_new] = df[date_name].dt.strftime('%w')
df[date_name_new] = df[date_name_new].astype(int)
df[date_name_new] = df.apply(lambda x: x[date_name] + datetime.timedelta(days=-x[date_name_new]), axis=1)
df[date_name_new] = pd.to_datetime(df[date_name_new]).dt.date
return df
def cal_month(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: %y-%m
    '''
    columns=df.columns.tolist()
    if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name]=pd.to_datetime(df[date_name])
df[date_name_new]=df[date_name].dt.strftime('%y-%m')
return df
def cal_feature_grid(df,feature,bin=10,method=2):
    '''
    Build the N-quantile cut points: negative values get their own bin, non-negative values are split into N bins.
    Discretization of the data, equal-frequency by default; 1 = equal-width, 2 = equal-frequency.
    :param df: dataframe
    :param feature:
    :param bin:
    :param method: 1: equal-width; 2: equal-frequency; 3: clustering; default 2
    :return:
    '''
    #== equal-width means each bin has the same width, (max - min) / bin
    #== so the bins may hold different numbers of samples
    tmp=df.copy()
    tmp[feature]=tmp[feature].astype(float)
    tmp[feature].fillna(-1,inplace=True)
    # by default, negative values form a separate bin
num = df[feature].nunique()
if method==1:
max=df[feature].max()
if max <0 :
max=0
if num < bin:
feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
else:
bin_index = [max*i / bin for i in range(0, bin + 1)]
feature_grid = sorted(set(bin_index) | set([-99999, -0.00001]))
else:
        # equal-frequency discretization: keep the bin counts as even as possible
if num < bin:
feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
else:
            # == negatives get one separate bin, non-negatives are split into n equal-frequency parts
bin_index = [i / bin for i in range(0, bin + 1)]
feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999, -0.00001]))
return feature_grid
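
# A minimal worked example: 100 distinct non-negative values with the default
# equal-frequency method yield the quantile cut points plus the two sentinel
# edges (-99999 for negatives/missing, -0.00001 as the zero boundary),
# e.g. [-99999, -1e-05, 0.0, 19.8, 39.6, 59.4, 79.2, 99.0].
if __name__ == '__main__':
    demo = pd.DataFrame({'v': np.arange(100, dtype=float)})
    print(cal_feature_grid(demo, 'v', bin=5))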
def cal_accume(df,feature,target,bin=10,classes=[]):
    '''
    Group by `classes`, bin `feature` into quantiles, then compute count/mean/sum of `target` per bin plus cumulative count/mean/sum.
    :param df:
    :param feature:
    :param target:
    :param bin:
    :param classes:
    :return: per-bin mean, count, sum of target and the cumulative count, mean, sum
    '''
df_out=cal_univar(df,feature,target,bin,classes=classes)
df_out['acmCnt']=df_out.groupby(classes)['count'].cumsum()
df_out['acmSum']=df_out.groupby(classes)['sum'].cumsum()
df_out['acmMean']=df_out['acmSum']/df_out['acmCnt']
return df_out
def cal_univar(df,feature,target,bin=10,classes=[]):
    '''
    Group by `classes`, bin `feature` into quantiles, then compute count/mean/sum of `target` per bin.
    :param df: dataframe
    :param feature: feature in df.columns
    :param target: in df.columns, e.g. count(target), mean(target)
    :param bin: default = 10
    :param classes: grouping columns
    :return:
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if target not in columns:
        raise ValueError('not found %s' % target)
    if feature not in columns:
        raise ValueError('not found %s' % feature)
    tmp=df.copy()
    tmp[feature].fillna(-1, inplace=True)
    # == bin splitting; feature may be non-numeric
try:
tmp[feature]=tmp[feature].astype(float)
feature_grid = cal_feature_grid(tmp,feature,bin)
tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
tmp['grid'] = tmp['lbl'].cat.codes
except ValueError:
tmp['lbl']=tmp[feature]
tmp['grid']=tmp[feature]
if len(classes) > 0:
df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
df_out=df_gp
else:
df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_all.columns = ['grid','lbl', 'count', 'mean','sum']
df_out = df_all
return df_out
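
# A minimal usage sketch: decile bins of a score per group, with per-bin and
# cumulative count / mean / sum of a 0-1 target; `score`, `target` and `channel`
# are illustrative column names.
if __name__ == '__main__':
    demo = pd.DataFrame({'score': np.random.rand(1000),
                         'target': np.random.randint(0, 2, 1000),
                         'channel': np.random.choice(['A', 'B'], 1000)})
    print(cal_univar(demo, 'score', 'target', bin=10, classes=['channel']).head())
    print(cal_accume(demo, 'score', 'target', bin=10, classes=['channel']).head())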
def cal_distribution(df,target,classes=[]):
    '''
    Group by `classes` and compute count and mean of `target`.
    :param df: dataframe
    :param target: cal mean(target), count(target)
    :param classes: grouping columns
    :return: dataframe
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if target not in columns:
        raise ValueError('not found target')
tmp=df.copy()
headers = classes + [ 'count', 'mean']
if len(classes) > 0:
df_gp=tmp.groupby(classes).agg({target:['count','mean']}).reset_index()
df_gp.columns=classes + ['count','mean']
df_out=df_gp
else:
all = [[tmp[target].count(),tmp[target].mean()]]
df_all = pd.DataFrame(all, columns=headers)
df_out=df_all
return df_out[headers]
def cal_miss(df,feature,classes=[]):
    '''
    target: compute the missing rate of a given feature
    :param df: dataframe
    :param feature: field name in df.columns
    :param classes: list of grouping columns; if empty, no grouping (default)
    :return df_out: dataframe containing feature, class_name [if given], cnt, miss_rate
    :argument warning: values are split into zero, non-zero and negative; missing values are treated as negative (filled with -1)
    '''
    if df.shape[0] <=0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if feature not in columns:
        raise ValueError('no feature')
tmp=df.copy()
try:
tmp[feature]=tmp[feature].astype(float)
tmp[feature].fillna(-1,inplace=True)
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'
except:
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature].notna(), 'flag'] = '未缺失'
tmp[feature].fillna('缺失', inplace=True)
headers = classes+['flag', 'cnt', 'match_rate']
if len(classes) > 0:
        # == grouped case
df_gp = pd.merge(
tmp.groupby(classes)[feature].count().reset_index().rename(columns={feature: "cnt"}),
tmp.groupby(classes+['flag'])[feature].count().reset_index().rename(columns={feature: "cnt1"}),
on=classes, how='left'
)
df_gp['match_rate'] = np.round(df_gp.cnt1 / df_gp.cnt, 3)
df_out = df_gp
else:
df_out=tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature:'cnt1'})
df_out['cnt']=tmp.shape[0]
df_out['match_rate']=np.round(df_out['cnt1']/df_out['cnt'],3)
return df_out[headers]
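
# A minimal usage sketch: missing / zero / non-zero breakdown of one feature,
# overall and per group; column names are illustrative.
if __name__ == '__main__':
    demo = pd.DataFrame({'v': [1.0, 0.0, None, 2.0, None, 3.0],
                         'channel': ['A', 'A', 'A', 'B', 'B', 'B']})
    print(cal_miss(demo, 'v'))
    print(cal_miss(demo, 'v', classes=['channel']))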
import os
from docx import Document
from docx.shared import Inches
def buildDocument(path,filename):
if filename[-3:]!='doc':
if filename[-4:] !='docx':
raise ValueError('{} is not a word file'.format(filename))
if os.path.exists(os.path.join(path,filename)):
return Document(os.path.join(path,filename))
return Document()
def saveDocument(document,path,filename):
if filename[-3:] != 'doc':
if filename[-4:] != 'docx':
raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename))
def insert_table(document, cols, values):
    # cols: list of column names
    # values: list of row values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells
for i in range(len(cols)):
hdr_cells[i].text = cols[i]
for value in values:
row_cells = table.add_row().cells
for i in range(len(cols)):
row_cells[i].text = str(value[i])
return document
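
# A minimal usage sketch: create or reopen a report, append a small table and
# save it. The path / file name are illustrative, and the 'Medium Grid 1 Accent 1'
# style used by insert_table is assumed to exist in the target document template.
if __name__ == '__main__':
    doc = buildDocument('.', 'demo_report.docx')
    doc = insert_table(doc, ['feature', 'weight'], [['f0', 0.6], ['f1', 0.4]])
    saveDocument(doc, '.', 'demo_report.docx')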