#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from datetime import datetime
import pandas as pd
pd.options.display.max_columns = 1000
import pymongo
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics

# Read the score mapping from Excel (sheet 'score_mongo'); rows containing any
# empty cell are dropped. The columns `description`, `score` and `appliedType`
# are read further down in this file.
mapping_score = pd.read_excel("../query_score.xlsx",sheet_name='score_mongo').dropna(axis=0)
#mapping_variable = pd.read_excel("./mongodb.xlsx",sheet_name='variable').dropna(axis=0)

# MongoDB filter template: '@start_date' / '@end_date' are substituted with
# str.replace() and the resulting text is eval()'ed inside querymongo().
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
# MongoDB projection template: '@key' is replaced by the score field name.
query = "{'order_id':1,'@key':1}"
# Overdue threshold substituted for '@passdue_day' in sql_passdueday.
passdue_day = 15

# Maps an applied_type id list (as written in the mapping sheet) to the label
# used in plot titles.
appliedType_type = {'1,2,3':'总体','1,2':'首贷','1':'首申','2':'复申','3':'复贷'}
# Output directory prefix for the saved PNG files (concatenated with the title).
path = "../plot/PSI_VAL/"
################################### plot PSI ##################################
def plotPSI(title,y_list,dateList,psi,missing,rows,cols,table_value,save_path):
    """Plot one distribution line per period with a count table underneath.

    Args:
        title: Figure title; also used as the PNG file name.
        y_list: One sequence of per-bin percentages for each period.
        dateList: Period labels, parallel to ``y_list`` (only the first seven
            characters of each label are shown in the legend).
        psi: PSI value per period, parallel to ``y_list``.
        missing: Missing rate (in percent) per period, consumed positionally;
            may be a list or a pandas Series.
        rows: Row labels of the table.
        cols: Column labels of the table (the bin labels).
        table_value: Cell texts of the table, one row per entry of ``rows``.
        save_path: Directory prefix the PNG is written to
            (``save_path + title + ".png"``).

    Returns:
        Always ``1``.
    """
    # SimHei is presumably chosen so the Chinese labels below render; confirm
    # the font is installed on the machine running the report.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # resolution of the saved image
    plt.rcParams['figure.dpi'] = 100  # on-screen resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)

    try:
        # Materialise positionally: indexing a label-indexed Series with an
        # integer relies on a deprecated pandas fallback.
        psi_values = list(psi)
        missing_values = list(missing)
        for idx, y in enumerate(y_list):
            label = (dateList[idx][0:7] + ' PSI:' + str(psi_values[idx])
                     + '\n缺失率:' + str(missing_values[idx]) + '%')
            axs.plot(range(len(y)), y, marker='o', label=label)

        # Guard the denominator so a single-bin table does not divide by zero.
        col_width = 0.91 / max(len(cols) - 1, 1)
        the_table = plt.table(cellText=table_value,
                              rowLabels=rows,
                              colLabels=cols,
                              colWidths=[col_width] * len(cols),
                              loc='bottom')

        the_table.auto_set_font_size(False)
        the_table.set_fontsize(8)
        fig.subplots_adjust(bottom=0.2)
        plt.grid()
        plt.ylabel('各分段样本占比'+' (%)')
        plt.legend()
        plt.xticks([])
        fig.suptitle(title)
        plt.savefig(save_path + title + ".png")
        plt.show()
    finally:
        # This function is called inside nested loops; release the figure so
        # open figures do not pile up, even when saving fails.
        plt.close(fig)
    return 1

# draw liftchart
def liftchart(df,keyword,interval):
    """Compute per-period, per-score-bin overdue rates for a lift chart.

    Args:
        df: DataFrame with the columns ``applied_at`` (period label),
            ``overdue`` (1 marks an overdue sample) and ``keyword``.
            It is not modified.
        keyword: Name of the score column to bin.
        interval: Ascending bin edges (array or list) passed to ``pd.cut``.

    Returns:
        A tuple ``(count, overdue, rate, rows, cols)``:
        ``count`` is the sample count per (applied_at, bin), ``overdue`` the
        overdue-sample count on the same index, ``rate`` is
        ``overdue / count * 100`` rounded to 3 decimals (0 for empty bins),
        ``rows`` the list of period labels and ``cols`` the bin labels as
        strings.
    """
    scores = df[keyword]
    bin_edges = list(interval)
    if (scores < 0).any():
        # Negative scores get their own catch-all bin below the lowest edge.
        bin_edges.append(-10000000)
        bin_edges.sort()

    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    binned = df.assign(
        bins=pd.cut(scores, bin_edges, precision=6),
        is_overdue=(df['overdue'] == 1).astype(int),
    )
    # observed=False keeps empty bins in the result (count 0) regardless of the
    # pandas default, so every period has one entry per bin.
    grouped = binned.groupby(['applied_at', 'bins'], observed=False)
    df_count = grouped['overdue'].count()
    # Summing the indicator yields integer counts on exactly the same index as
    # df_count, with 0 for groups that have no overdue sample.
    df_overdue = grouped['is_overdue'].sum().rename('overdue')

    # replace(0, 1) avoids dividing by zero for empty bins (their rate is 0).
    y = df_overdue / df_count.replace(0, 1) * 100
    rows = y.index.levels[0].tolist()
    cols = [str(category) for category in binned['bins'].cat.categories]

    return df_count, df_overdue, y.round(3), rows, cols
  
############################## validation liftchart###############################
def plotLiftChart(title,y_list,dateList,aucri,auc,rows,cols,table_value,save_path,overdue_days=15):
    """Plot one overdue-rate line per period with a count table underneath.

    Args:
        title: Figure title; also used as the PNG file name.
        y_list: One sequence of per-bin overdue rates (percent) per period.
        dateList: Period labels, parallel to ``y_list`` (only the first seven
            characters of each label are shown in the legend).
        aucri: Value shown as "AUCRI" in the legend, one per period.
        auc: Value shown as "AUC" in the legend, one per period.
        rows: Row labels of the table.
        cols: Column labels of the table (the bin labels).
        table_value: Cell texts of the table, one row per entry of ``rows``.
        save_path: Directory prefix the PNG is written to
            (``save_path + title + ".png"``).
        overdue_days: Overdue threshold shown in the y-axis label; defaults to
            the previously hard-coded 15.

    Returns:
        Always ``1``.
    """
    # SimHei is presumably chosen so the Chinese labels below render; confirm
    # the font is installed on the machine running the report.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # resolution of the saved image
    plt.rcParams['figure.dpi'] = 100  # on-screen resolution
    fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)

    try:
        aucri_values = list(aucri)
        auc_values = list(auc)
        for idx, y in enumerate(y_list):
            label = (dateList[idx][0:7] + ' (AUCRI:' + str(aucri_values[idx])
                     + ') AUC: ' + str(auc_values[idx]))
            axs.plot(range(len(y)), y, marker='o', label=label)

        # Guard the denominator so a single-bin table does not divide by zero.
        col_width = 0.91 / max(len(cols) - 1, 1)
        the_table = plt.table(cellText=table_value,
                              rowLabels=rows,
                              colLabels=cols,
                              colWidths=[col_width] * len(cols),
                              loc='bottom')
        the_table.auto_set_font_size(False)
        the_table.set_fontsize(8)
        fig.subplots_adjust(bottom=0.2)
        plt.legend()
        plt.grid()
        plt.ylabel('贷后首逾'+str(overdue_days)+'+ (%)')
        plt.xticks([])
        fig.suptitle(title)
        plt.savefig(save_path + title + ".png")
        plt.show()
    finally:
        # This function is called inside nested loops; release the figure so
        # open figures do not pile up, even when saving fails.
        plt.close(fig)
    return 1

def psi_bins(df,keyword,interval):
    """Return the percentage of samples falling into each score bin.

    Args:
        df: DataFrame holding the score column; it is not modified.
        keyword: Name of the score column.
        interval: Ascending bin edges passed to ``pd.cut``.

    Returns:
        Series indexed by bin (index name ``'bins'``) with each bin's share of
        the binned samples in percent. Values outside every bin are ignored.
    """
    # Bin on a standalone Series instead of writing a 'bins' column into the
    # caller's frame.
    bins = pd.cut(df[keyword], interval, precision=6).rename('bins')
    # observed=False keeps empty bins (count 0) regardless of the pandas default.
    BM = df[keyword].groupby(bins, observed=False).count()
    BM_count = BM / BM.values.sum() * 100
    return BM_count

def querymongo(limit,query):
    """Run a find() on the feature collection and return the result as a frame.

    Args:
        limit: Text of a Python dict literal used as the MongoDB filter.
        query: Text of a Python dict literal used as the MongoDB projection.

    Returns:
        DataFrame built from all matching documents (empty when none match).
    """
    # NOTE(review): credentials are embedded in the connection string; consider
    # loading them from configuration outside the source tree.
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    try:
        mydb = myclient["rc_dp_feature_pro"]
        mycol = mydb["rc_feature_analysis_timing_v2"]
        # NOTE(review): eval() executes arbitrary code. It is kept because the
        # callers pass templates defined in this file; never feed it external
        # input.
        cursor = mycol.find(eval(limit),eval(query))
        # find() returns a lazy cursor: fetch every document while the client
        # is still open (a closed client cannot be reused on PyMongo 4+).
        records = list(cursor)
    finally:
        myclient.close()
    return pd.DataFrame(records)

# Benchmark window: substituted into the Mongo filter template and into
# sql_bins when the baseline sample is loaded.
benchmark_start_date = "2018-12-21 00:00:00"
benchmark_end_date = "2019-01-22 00:00:00"

# Observation window for the PSI plots: substituted into the Mongo filter
# template and into sql_observation.
psi_start_date = "2019-03-01 00:00:00"
psi_end_date = "2019-06-01 00:00:00"

# Validation window: substituted into the Mongo filter template for the
# lift/AUC stage. The active sql_passdueday text contains no '@val_*'
# placeholders, so on the SQL side these replacements are no-ops.
val_start_date = "2019-02-03 00:00:00"
val_end_date = "2019-05-03 00:00:00"

# Connection settings consumed by connect2DB().
# NOTE(review): credentials are stored in source; consider moving them to
# configuration kept outside the repository.
risk_analysis_config = {'user' : 'fengkong_read_only',
                        'password' : 'mT2HFUgI',
                        'host' : '172.20.6.9',
                        'port' : 9030,
                        'database' : 'risk_analysis',
                        'encoding' : 'utf8'}

import pymysql
def connect2DB(db_config):
    """Open and return a pymysql connection described by ``db_config``.

    ``db_config`` must provide the keys ``host``, ``port``, ``user``,
    ``password``, ``database`` and ``encoding``; a missing key raises
    ``KeyError``.
    """
    # (pymysql.connect keyword, key in db_config)
    argument_sources = (
        ('host', 'host'),
        ('port', 'port'),
        ('user', 'user'),
        ('passwd', 'password'),
        ('db', 'database'),
        ('charset', 'encoding'),
    )
    connect_kwargs = {}
    for argument, config_key in argument_sources:
        connect_kwargs[argument] = db_config[config_key]
    return pymysql.connect(**connect_kwargs)

def query_sql(sql,db=risk_analysis_config):
    """Run ``sql`` and return the result as a DataFrame.

    Args:
        sql: SQL text to execute.
        db: Connection settings accepted by ``connect2DB``.

    Returns:
        The query result as a DataFrame, or ``0`` when connecting or querying
        fails (legacy contract kept for existing callers; the error is printed
        so the failure is no longer silent).
    """
    conn = None
    try:
        conn = connect2DB(db)
        return pd.read_sql(sql,conn)
    except Exception as e:
        print("query_sql exception",e)
        return 0
    finally:
        # Close the connection on failure too; previously it leaked whenever
        # read_sql raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                print("query_sql close exception",e)

    
def dataManipul(df,keyword,interval):
    """Summarise a score column per period for the PSI plot.

    Negative scores are treated as missing. ``df`` itself is not modified.

    Args:
        df: DataFrame with the columns ``applied_at`` (period label) and
            ``keyword`` (numeric score, NaN when missing).
        keyword: Name of the score column.
        interval: Ascending bin edges passed to ``pd.cut``.

    Returns:
        A tuple ``(zero_rate, missing_rate, rows, cols, y, count)``:
        ``zero_rate`` is the share (percent, 1 decimal) of exactly-zero scores
        per (applied_at, bin); ``missing_rate`` the share (percent, 1 decimal)
        of missing scores per applied_at; ``rows`` the period labels that have
        at least one non-missing score; ``cols`` the bin labels as a string
        Index; ``y`` each bin's share (percent, 1 decimal) of the period's
        non-missing scores; ``count`` the sample count per (applied_at, bin).
    """
    data = df.copy()
    # The original statement assigned into a temporary dropna() copy and so
    # never took effect; mask() really turns negative scores into NaN.
    data[keyword] = data[keyword].mask(data[keyword] < 0)
    valid = data.dropna(axis=0)

    total_per_period = data.groupby('applied_at')[keyword].size()
    valid_per_period = valid.groupby('applied_at')[keyword].count()
    # A period without any valid score is absent from valid_per_period; count
    # it as 0 valid rows so its missing rate is 100% rather than 0%.
    missing_per_period = total_per_period - valid_per_period.reindex(
        total_per_period.index, fill_value=0)
    missing_rate = (missing_per_period / total_per_period.replace(0, 1) * 100).rename(keyword)

    # NOTE(review): pd.cut bins are right-closed by default, so a score equal
    # to the lowest edge falls in no bin; confirm this is intended.
    valid = valid.assign(
        bins=pd.cut(valid[keyword], interval, precision=6),
        is_zero=(valid[keyword] == 0).astype(int),
    )
    cols = valid['bins'].value_counts().sort_index().index.astype('str')

    # observed=False keeps empty bins (count 0) regardless of the pandas
    # default, so every period has one entry per bin.
    grouped = valid.groupby(['applied_at', 'bins'], observed=False)
    df_count = grouped[keyword].count()
    df_zero = grouped['is_zero'].sum().rename(keyword)
    # replace(0, 1) avoids dividing by zero for empty bins (their rate is 0).
    zero_rate = df_zero / df_count.replace(0, 1) * 100

    # Share of each bin among the period's non-missing scores.
    y = df_count.div(valid_per_period, level='applied_at') * 100
    rows = y.index.levels[0].tolist()

    return zero_rate.round(1),missing_rate.round(1),rows,cols,y.round(1),df_count

# Benchmark sample: one row per order with its `transacted` value and an
# overdue indicator. The placeholders '@benchmark_start_date',
# '@benchmark_end_date', '@channelID' and '@appliedType' are filled in with
# str.replace() by the reporting loop.
# NOTE(review): the overdue threshold is hard-coded as 15 here instead of using
# the '@passdue_day' placeholder like sql_passdueday does.
sql_bins = '''
SELECT order_no,transacted,IF(passdue_day>15,1,0) as overdue 
FROM risk_analysis
WHERE applied_at >= '@benchmark_start_date' and applied_at <= '@benchmark_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND repayment_status != 4
'''

# Observation sample for PSI: order number plus the application month
# ('YYYY-MM') used as the period label. Placeholders: '@psi_start_date',
# '@psi_end_date', '@channelID', '@appliedType'.
sql_observation = '''
SELECT order_no,date_format(applied_at,'%Y-%m') as applied_at 
FROM risk_analysis
WHERE applied_at >= '@psi_start_date' and applied_at <= '@psi_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND repayment_status != 4
'''

# Disabled variant: validation sample grouped by calendar month of
# loan_start_date. It is kept inside a bare string literal, so it is never
# assigned or executed.
"""
sql_passdueday = '''
SELECT order_no,date_format(loan_start_date,'%Y-%m') as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE applied_at >= '@val_start_date' and applied_at <= '@val_end_date'
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4
'''
"""
# Active variant: validation sample split into three cohorts by `deadline`
# relative to the current date:
#   'T-1': deadline in [now-45d,  now-15d)
#   'T-2': deadline in [now-75d,  now-45d)
#   'T-3': deadline in [now-105d, now-75d)
# The cohort label is returned in the `applied_at` column. Placeholders:
# '@passdue_day', '@channelID', '@appliedType'.

sql_passdueday = '''
(SELECT order_no,'T-1' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -15 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
UNION ALL
(SELECT order_no,'T-2' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -45 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
UNION ALL
(SELECT order_no,'T-3' as applied_at,IF(passdue_day > @passdue_day,1,0) as overdue
FROM risk_analysis
WHERE DATE_FORMAT(deadline,'%Y-%m-%d') >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -105 DAY),'%Y-%m-%d') and DATE_FORMAT(deadline,'%Y-%m-%d') < DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -75 DAY),'%Y-%m-%d')
AND applied_from IN (@channelID)
AND applied_type IN (@appliedType)
AND transacted = 1
AND repayment_status != 4)
'''

# benchmark
#df = querymongo(limit.replace('@start_date',benchmark_start_date).replace('@end_date',benchmark_end_date),query.replace('@key',key))[['order_id',key]]

# Parallel lists taken from the mapping sheet: display name, score field name
# and the ';'-separated applied-type groups to report for each score.
modelType = mapping_score.description.tolist()
modelList = mapping_score.score.tolist()
appliedTypeList = mapping_score.appliedType.tolist()
#channelIDList = mapping_score.channel.tolist()

# Channels whose transacted loans started last calendar month and sum to more
# than 100000 real_loan_amount, excluding applied_from 159481, 159486 and
# 159528 (the original comment calls the excluded ids "recalling" channels).
sql_channel = '''
SELECT DISTINCT(applied_from),applied_channel
FROM risk_analysis
WHERE applied_from IN
(SELECT applied_from FROM risk_analysis
WHERE transacted = 1
AND loan_start_date >= DATE_FORMAT(DATE_ADD(NOW(),INTERVAL -1 MONTH),'%Y-%m-01') 
AND loan_start_date < DATE_FORMAT(NOW(),'%Y-%m-01')
and applied_from not in (159481,159486,159528)
GROUP BY 1
HAVING SUM(real_loan_amount) > 100000
ORDER BY sum(real_loan_amount) DESC)
'''

# Comma-separated applied_from ids -> channel label used in plot titles.
channel = {'1,214,217,198':'内部','159507':'浅橙','159537':'360金融','333':'融360','159384,159483':'平安','159561':'51公积金API'}

conn = connect2DB(risk_analysis_config)
try:
    channelId = pd.read_sql(sql_channel,conn).applied_from
finally:
    # The connection is only needed for this one query; close it instead of
    # keeping it open for the whole run.
    conn.close()

# Parse the ids of the named channels directly instead of eval()'ing a
# string-built list literal.
known_channel_ids = {int(part) for key in channel for part in key.split(',')}
all_channel_ids = channelId.tolist()
other_channel_ids = [cid for cid in all_channel_ids if cid not in known_channel_ids]
# Skip the "other channels" entry when it would be empty: an empty key would
# be substituted into "IN ()" and produce invalid SQL.
if other_channel_ids:
    channel[', '.join(str(cid) for cid in other_channel_ids)] = '其他渠道'
channel[', '.join(str(cid) for cid in all_channel_ids)] = '全部渠道'

# Main reporting loop: for every score field x applied-type group x channel,
# stage 1 draws the PSI plot and stage 2 draws the lift chart with AUC.
# Each stage has its own try/except, so a failure is printed and the loop
# moves on.
# NOTE(review): modelList.index(modelVar) returns the first match, so duplicate
# score names in the mapping sheet would all use the first row's settings.
for modelVar in modelList:
    for appliedType in str(appliedTypeList[modelList.index(modelVar)]).split(';'):
        print('appliedType',appliedType)
        print('appliedTypeList[model_index]',appliedTypeList[modelList.index(modelVar)])
        for channelID in channel.keys():
            # ---- Stage 1: PSI of the observation window vs. the benchmark ----
            try:                               

                # Benchmark scores from MongoDB; '' is treated as missing.
                # NOTE(review): this query depends only on modelVar but is
                # re-run for every applied type and channel.
                df_bins = querymongo(limit.replace('@start_date',benchmark_start_date).replace('@end_date',benchmark_end_date),query.replace('@key',modelVar))[['order_id',modelVar]]        
                df_bins = df_bins.applymap(lambda x : np.nan if x == '' else x)
                df_bins[modelVar] = df_bins[modelVar].astype('float')
                df_offline = query_sql(sql_bins.replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@benchmark_start_date',benchmark_start_date).replace('@benchmark_end_date',benchmark_end_date),risk_analysis_config)
                # Right join keeps every SQL order; rows without a score are
                # then removed by dropna.
                df_bins = pd.merge(df_bins,df_offline,how='right',left_on='order_id',right_on='order_no')[['transacted','overdue',modelVar]].dropna(axis=0)
                del df_offline

                # Observation-window scores, joined the same way; missing
                # scores are kept here so the missing rate can be computed.
                df_observation = querymongo(limit.replace('@start_date',psi_start_date).replace('@end_date',psi_end_date),query.replace('@key',modelVar))[['order_id',modelVar]]        
                df_observation = df_observation.applymap(lambda x : np.nan if x == '' else x)
                df_observation[modelVar] = df_observation[modelVar].astype('float')
                df_offline = query_sql(sql_observation.replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@psi_start_date',psi_start_date).replace('@psi_end_date',psi_end_date),risk_analysis_config)
                df_observation = pd.merge(df_observation,df_offline,how='right',left_on='order_id',right_on='order_no')[['applied_at',modelVar]]
                del df_offline

                #df_observation = query_sql(sql_observation.replace('@appliedType',appliedType).replace('@channelID',channelID),risk_analysis_config)   
                # Negative scores are treated as missing.
                df_observation.loc[:,modelVar] = df_observation.loc[:,modelVar].map(lambda x : np.nan if x < 0 else x)
                #df_bins = df_bins.apply(lambda x :np.nan if x < 0 else x)
                # Up to 10 quantile bins from the benchmark scores; the lowest
                # edge is then forced to 0.
                Nothing,interval = pd.qcut(df_bins.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
                interval[0] = 0
                del Nothing
                BM_count = psi_bins(df_bins,modelVar,interval)
                zero_rate,missing_rate,dateList,cols,y,count = dataManipul(df_observation,modelVar,np.array(interval).round(6))
                #df_observation_with_bin = pd.cut(df_observation.dropna(axis=0)[modelVar],interval)
                # del df_bins   
                del interval
                value_tab = []
                rows = []
                y_list = []
                psi = []
                # One plotted line and one table row per period.
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    #value_tab.append(y.loc[mon].astype('str')+'%')
                    # Table cell text: sample count plus the share of zero scores.
                    value_tab.append(count.loc[mon].astype('str')+'(zeroR:'+zero_rate.loc[mon].astype('str')+'%)')
                    #rows.append(str(mon)+' Value');
                    rows.append(str(mon)+' Count')
                    #(y-10).sum() / np.log10(y/10)
                    # PSI = sum((actual% - benchmark%) * log10(actual% / benchmark%)) / 100.
                    # NOTE(review): PSI is commonly defined with the natural
                    # log; confirm that log10 is intended here.
                    psi.append((((y.loc[mon]-BM_count) * np.log10(y.loc[mon]/BM_count)).sum()/100).round(3))
                plotPSI(modelType[modelList.index(modelVar)]+'-'+appliedType_type[appliedType]+'-' + channel[channelID] + ' PSI',y_list,dateList,psi,missing_rate,rows,cols,value_tab,path)

            except Exception as e:
                print("psi exception",e)
            # ---- Stage 2: lift chart and AUC on the validation cohorts ----
            try:
                # Overdue dataframe
                # Reuses df_bins from stage 1. If stage 1 failed before
                # assigning it, the NameError raised here is caught below and
                # this stage is skipped.
                df_bins_auc = df_bins[df_bins.transacted == 1]
                del df_bins
                # Benchmark AUC on transacted orders.
                auc_BM = sklearn.metrics.roc_auc_score(df_bins_auc.overdue, df_bins_auc.loc[:,modelVar])
                print('AUC_BM: ',auc_BM)
                # Bin edges are recomputed on transacted benchmark orders only.
                Nothing,interval = pd.qcut(df_bins_auc.loc[:,modelVar],10,retbins=True,precision=6,duplicates='drop')
                interval[0] = 0               
                del Nothing  

                df_passdueday = querymongo(limit.replace('@start_date',val_start_date).replace('@end_date',val_end_date),query.replace('@key',modelVar))[['order_id',modelVar]]        
                df_passdueday = df_passdueday.applymap(lambda x : np.nan if x == '' else x)
                df_passdueday[modelVar] = df_passdueday[modelVar].astype('float')
                # The '@val_*' replacements are no-ops for the active
                # sql_passdueday text, which has no such placeholders.
                df_offline = query_sql(sql_passdueday.replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@val_start_date',val_start_date).replace('@val_end_date',val_end_date).replace('@passdue_day',str(passdue_day)),risk_analysis_config)
                # Inner join: only orders present in both sources are kept.
                df_passdueday = pd.merge(df_passdueday,df_offline,how='inner',left_on='order_id',right_on='order_no')[['applied_at','overdue',modelVar]].dropna(axis=0)
                #print('df_passdueday count: ',df_passdueday.shape)
                del df_offline

                # df_passdueday = pd.read_sql(sql_passdueday.replace('@modelVar',modelVar).replace('@appliedType',appliedType).replace('@channelID',channelID).replace('@passdueday',str(passdueday)),conn)
                count,df_overdue,y,dateList,cols = liftchart(df_passdueday,modelVar,np.array(interval).round(6))

                value_tab = []
                rows = []
                y_list = []
                aucri = []
                auc = []
                for mon in dateList:
                    y_list.append(y.loc[mon].values)
                    #value_tab.append(y.loc[mon].astype('str')+'%')
                    # Table cell text: overdue count plus the total count.
                    value_tab.append(df_overdue.loc[mon].astype('str') + ' (总计 ' + count.loc[mon].astype('str') + ')' )
                    #rows.append(str(mon)+' OverdueRate');
                    rows.append(str(mon)+' Count')
                    # AUCRI = cohort AUC / benchmark AUC.
                    # NOTE(review): the cohort AUC is computed twice on the
                    # same data in the next two statements.
                    aucri.append(round((sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar])/auc_BM),3))
                    auc.append(round(sklearn.metrics.roc_auc_score(df_passdueday[df_passdueday.applied_at==mon].overdue, df_passdueday[df_passdueday.applied_at==mon].loc[:,modelVar]),3))
                # Append the benchmark AUC to the last legend entry.
                auc[-1] = str(auc[-1]) + '\n      AUC基准: ' + str(round(auc_BM,3)) 
                plotLiftChart(modelType[modelList.index(modelVar)] + '-' + appliedType_type[appliedType] + '-' + channel[channelID] + ' AUC WITH '+ str(15) + '+',y_list,dateList,aucri,auc,rows,cols,value_tab,path)

            except Exception as e:
                print("val exception",e)


# In[ ]:




