import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split


def _lift_pivot(frame, score, target, qcut, fill_missing):
    # Bucket one frame into score quantiles; return (mean/count per bucket, quantile edges).
    # Select-then-copy so the caller's frame is never modified and no
    # SettingWithCopyWarning is triggered by the column assignment below.
    data = frame[[score, target]].copy()
    if fill_missing:
        data = data.fillna(-1)
    # Always request the edges: qcut then returns a (bins, edges) tuple, which must
    # be unpacked rather than assigned to a column as a whole.
    bins, edges = pd.qcut(data[score], q=qcut, precision=6, retbins=True, duplicates='drop')
    data['bins'] = bins
    # observed=False keeps empty buckets, independent of the pandas default.
    stats = data.groupby('bins', observed=False)[target].agg(['mean', 'count'])
    return stats, edges


def cal_lift(df_list, score, target='target', qcut=10, retbin=False):
    '''
    instructions : build a lift chart - bucket rows into score quantiles and report,
                   per bucket, the mean of `target` (the overdue rate for a 0/1 label)
                   and the number of rows.
    Params :
        df_list - a single DataFrame, or a list/tuple of DataFrames (expected to be
                  the funded-loan set); each must contain `score` and `target`
        score - name of the model score column
        target - name of the label column
        qcut - number of quantiles passed to pd.qcut; duplicate edges are dropped,
               so fewer buckets than requested can come back
        retbin - when True, also return the quantile edges
    Notes :
        For a single DataFrame, missing values in score/target are replaced by -1
        before bucketing. Frames passed inside a list are NOT filled, so rows with a
        missing score fall outside every bucket. Callers may rely on this asymmetry,
        so it is kept as is.
    :return:
        DataFrame indexed by score bucket with columns `mean` and `count`
        (one mean/count pair per input frame, placed side by side).
        With retbin=True: a tuple (pivot, edges), where edges is an ndarray for a
        single DataFrame or a list of ndarrays (one per frame) for a list input.
    :raises TypeError: df_list is neither a DataFrame nor a list/tuple
    :raises ValueError: df_list is an empty list/tuple
    '''
    if isinstance(df_list, pd.DataFrame):
        pivot, edges = _lift_pivot(df_list, score, target, qcut, fill_missing=True)
        return (pivot, edges) if retbin else pivot

    if isinstance(df_list, (list, tuple)):
        if not df_list:
            raise ValueError('df_list is empty')
        results = [_lift_pivot(frame, score, target, qcut, fill_missing=False)
                   for frame in df_list]
        pivot = pd.concat([stats for stats, _ in results], axis=1)
        if retbin:
            return pivot, [edges for _, edges in results]
        return pivot

    raise TypeError('df_list must be a DataFrame or a list of DataFrames, got %s'
                    % type(df_list).__name__)


def cal_univar(df, feature, target, qcut=10):
    '''
    instructions : return the univariate pivot - mean of `target` per equal-frequency
                   bucket of `feature`
    Params:
        :param df: dataframe holding the feature column and the label column
        :param feature: name of the single feature to bucket
        :param target: name of the label column (e.g. overdue flag)
        :param qcut: number of equal-frequency buckets requested from pd.qcut;
                     duplicate edges are dropped, so fewer buckets can come back

    :return: Series named `target`, indexed by bucket ('bins'), holding
             sum(target) / count(target) for each bucket
    '''
    # Private two-column copy: the input is never modified, and filling/assigning on
    # it does not trigger SettingWithCopyWarning. Missing values (in both columns)
    # become -1, so missing features land in the lowest bucket.
    data = df[[feature, target]].copy().fillna(-1)
    data['bins'] = pd.qcut(data[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
    # One groupby reused for both aggregates; observed=False keeps empty buckets
    # (as NaN) regardless of the pandas default.
    grouped = data.groupby('bins', observed=False)[target]
    return grouped.sum() / grouped.count()



def cal_pdp(df, score, feature, qcut=10):
    '''
    instructions : return the pdp pivot - mean model score per equal-frequency
                   bucket of `feature`
    :param df: dataframe (test set) holding the feature column and the score column
    :param score: name of the column with the model's predicted score
    :param feature: name of the feature to bucket
    :param qcut: number of equal-frequency buckets requested from pd.qcut;
                 duplicate edges are dropped, so fewer buckets can come back
    :return: Series named `score`, indexed by bucket ('bins'), holding
             sum(score) / count(score) for each bucket
    '''
    # Private two-column copy so the input is untouched and the column assignment
    # below does not raise SettingWithCopyWarning. No missing-value fill here:
    # rows whose feature is NaN get no bucket and drop out of the pivot.
    data = df[[feature, score]].copy()
    data['bins'] = pd.qcut(data[feature], q=qcut, precision=6, retbins=False, duplicates='drop')
    # One groupby reused for both aggregates; observed=False keeps empty buckets
    # (as NaN) regardless of the pandas default.
    grouped = data.groupby('bins', observed=False)[score]
    return grouped.sum() / grouped.count()






def _split_by_time(dataset, test_size, time_label):
    # Hold out the most recent `test_size` share of rows (by `time_label`) as the test set.
    ordered = dataset.sort_values(by=[time_label], axis=0, ascending=False)
    n_test = int(len(dataset) * test_size)
    return ordered[n_test:], ordered[:n_test]


def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
                             split_methods='random', time_label='applied_at'):
    '''
    instructions - split a dataset into train / (validation) / test parts.

    In BOTH modes the test set is the most recent `test_size` share of rows, ordered
    by `time_label` (newest first). In 'random' mode the validation set is then
    drawn at random from the remaining rows.
    NOTE(review): a fully random test split may have been intended for 'random'
    mode; the time-based hold-out is kept because it is the current behaviour.

    Params :
        dataset - DataFrame containing the `time_label` column
        val_size - validation RATIO relative to the whole dataset; None skips the
                   validation split ('random' mode only; ignored in 'timeSeries')
        test_size - test set RATIO relative to the whole dataset
        stratify - stratify LABEL; currently not used by the implementation
        random_state - seed for the random validation split
        split_methods - 'random' or 'timeSeries'
        time_label - column that identifies date & time
    Returns :
        'timeSeries'                     -> (df_train, df_test)
        'random' with val_size is None   -> (df_train, df_test)
        'random' with val_size set       -> (df_train, df_val, df_test)
    Raises :
        ValueError - unknown split_methods
    '''
    if split_methods == 'timeSeries':
        return _split_by_time(dataset, test_size, time_label)

    if split_methods == 'random':
        # Pass the caller's test_size/time_label through instead of silently
        # falling back to the defaults.
        df_train, df_test = _split_by_time(dataset, test_size, time_label)
        if val_size is None:
            # No validation set requested: return train/test only.
            return df_train, df_test
        # Rescale so that val_size stays a share of the WHOLE dataset.
        size = val_size / (1 - test_size)
        df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
        return df_train, df_val, df_test

    raise ValueError("split_methods must be 'random' or 'timeSeries', got %r" % (split_methods,))


def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    '''
    Split df into a training set and a validation set.
    :param df: dataframe
    :param trainsplit: how to split - 'timeSeries' or 'random' (default 'random');
                       any other value performs no split
    :param trainsplitRatio: share of rows that goes to the training set (default 0.8)
    :param sort_col: column to sort by (ascending) for the 'timeSeries' split
    :return: (train, val). For the two split modes both frames come from
             df.reset_index(), so the previous index is carried as a column.
             For any other mode: (df, None), with df returned as passed in.
    '''
    indexed = df.reset_index()

    if trainsplit not in ('random', 'timeSeries'):
        # Unknown mode: hand the input back untouched, no validation set.
        return df, None

    if trainsplit == 'random':
        # Fixed seed keeps the random split reproducible.
        train_part = indexed.sample(frac=trainsplitRatio, random_state=7)
    else:
        # Earliest rows (by sort_col) form the training set.
        n_train = int(len(indexed) * trainsplitRatio)
        train_part = indexed.sort_values(by=sort_col).head(n_train)

    # Everything not picked for training becomes the validation set.
    in_train = indexed.index.isin(train_part.index)
    val_part = indexed[~in_train]
    return train_part, val_part


def cal_week(df,date_name,date_name_new):
    '''
    Add a column holding the first day of the week each row's date falls in.
    Weeks start on Sunday (the numbering of strftime('%w'): Sunday=0 ... Saturday=6).

    The frame is modified in place: `date_name` is converted to datetime and
    `date_name_new` is added/overwritten. The same frame is returned.

    :param df: dataframe
    :param date_name: name of the date column, e.g. applied_at
    :param date_name_new: name of the column that receives the week start
    :return: df, with `date_name_new` holding datetime.date objects
    :raises ValueError: `date_name` is not a column of df
    '''
    if date_name not in df.columns:
        raise ValueError('not found %s' % date_name)
    df[date_name] = pd.to_datetime(df[date_name])
    # pandas dayofweek counts Monday=0 ... Sunday=6; shift by one so that Sunday
    # maps to 0 days back, Monday to 1, ..., Saturday to 6.
    days_since_sunday = (df[date_name].dt.dayofweek + 1) % 7
    week_start = df[date_name] - pd.to_timedelta(days_since_sunday, unit='D')
    # Keep only the calendar date; any time-of-day part is dropped.
    df[date_name_new] = week_start.dt.date
    return df


def cal_month(df,date_name,date_name_new):
    '''
    Add a column holding each row's month formatted as '%y-%m'
    (two-digit year, e.g. '21-03').

    The frame is modified in place: `date_name` is converted to datetime and
    `date_name_new` is added/overwritten. The same frame is returned.

    :param df: dataframe
    :param date_name: name of the date column, e.g. applied_at
    :param date_name_new: name of the column that receives the month string
    :return: df, with `date_name_new` holding '%y-%m' strings
    :raises ValueError: `date_name` is not a column of df
    '''
    if date_name not in df.columns:
        raise ValueError('not found %s' % date_name)
    df[date_name] = pd.to_datetime(df[date_name])
    df[date_name_new] = df[date_name].dt.strftime('%y-%m')
    return df
















def cal_feature_grid(df,feature,bin=10,method=2):
    '''
    Build the sorted list of bin edges used to discretise `feature`.
    Negative values (missing values are treated as -1) get a bucket of their own;
    non-negative values are cut into `bin` buckets.

    :param df: dataframe
    :param feature: name of the feature column; must be convertible to float
    :param bin: number of buckets for the non-negative range
    :param method: 1 = equal width (range 0..max split into `bin` equal steps, so
                   bucket sizes may differ); any other value = equal frequency
                   (quantiles of the non-negative values). Default 2.
    :return: sorted list of edges.
             When the feature has fewer distinct (non-missing) values than `bin`,
             the edges are those distinct values (missing as -1) plus -0.00001.
             Otherwise the edges are the width/quantile cut points plus -99999
             and -0.00001, which delimit the negative bucket.
    '''
    raw = df[feature].astype(float)
    # Reassign instead of calling fillna(inplace=True) on a column selection: the
    # chained in-place form does not reliably write back under Copy-on-Write.
    values = raw.fillna(-1)
    distinct = df[feature].nunique()

    if distinct < bin:
        # Too few distinct values to cut: each value becomes its own edge.
        return sorted(set(values.unique().tolist()) | {-0.00001})

    if method == 1:
        # Equal width over [0, max]; a negative maximum collapses the range to 0.
        upper = raw.max()
        if upper < 0:
            upper = 0
        edges = [upper * i / bin for i in range(0, bin + 1)]
    else:
        # Equal frequency over the non-negative values only.
        fractions = [i / bin for i in range(0, bin + 1)]
        # quantile() yields NaN when there are no non-negative values; drop those
        # so NaN never ends up as an edge.
        edges = values[values >= 0].quantile(fractions).dropna()
    return sorted(set(edges) | {-99999, -0.00001})

def _univar_by_grid(df, feature, target, bin, classes):
    # Bucket `feature` and aggregate count/mean/sum of `target` per (classes, bucket).
    tmp = df.copy()
    tmp[feature] = tmp[feature].fillna(-1)
    try:
        numeric = tmp[feature].astype(float)
    except (ValueError, TypeError):
        # Non-numeric feature: every distinct value is its own bucket.
        tmp['lbl'] = tmp[feature]
        tmp['grid'] = tmp[feature]
    else:
        tmp[feature] = numeric
        feature_grid = cal_feature_grid(tmp, feature, bin)
        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
        # Integer code of the bucket, increasing with the bucket's position.
        tmp['grid'] = tmp['lbl'].cat.codes
    # observed=True: only buckets that actually occur, not every grid x lbl combination.
    grouped = tmp.groupby(classes + ['grid', 'lbl'], observed=True)[target]
    return grouped.agg(['count', 'mean', 'sum']).reset_index()


def cal_accume(df,feature,target,bin=10,classes=None):
    '''
    Bucket `feature` (optionally within each `classes` group), compute count / mean /
    sum of `target` per bucket, and add the running totals across buckets.

    :param df: dataframe
    :param feature: name of the feature column to bucket (edges from cal_feature_grid;
                    a feature that cannot be converted to float is grouped by its raw values)
    :param target: name of the column to aggregate
    :param bin: number of buckets, default 10
    :param classes: list of columns to group by before bucketing; None or empty
                    means no grouping
    :return: dataframe with columns classes..., grid, lbl, count, mean, sum,
             acmCnt (cumulative count), acmSum (cumulative sum) and
             acmMean (acmSum / acmCnt); rows are ordered by classes, then bucket
    :raises ValueError: df is empty, or feature/target is not a column of df
    '''
    classes = [] if classes is None else list(classes)
    if df.shape[0] == 0:
        raise ValueError('no data')
    for col in (feature, target):
        if col not in df.columns:
            raise ValueError('not found %s' % col)

    df_out = _univar_by_grid(df, feature, target, bin, classes)
    if classes:
        # Running totals restart for every classes group.
        per_class = df_out.groupby(classes)
        df_out['acmCnt'] = per_class['count'].cumsum()
        df_out['acmSum'] = per_class['sum'].cumsum()
    else:
        # groupby([]) is invalid, so accumulate over the whole frame directly.
        df_out['acmCnt'] = df_out['count'].cumsum()
        df_out['acmSum'] = df_out['sum'].cumsum()
    df_out['acmMean'] = df_out['acmSum'] / df_out['acmCnt']
    return df_out


# def cal_univar(df,feature,target,bin=10,classes=[]):
#     '''
#     groupby(classes) 分组,对feature 进行bin 分位，对各个分位进行 count,mean ,sum计算
#     :param df: dataframe
#     :param feature: feature in df.columns
#     :param target: in df.columns eg: count(target) mean(target)
#     :param bins:default =10
#     :param classes: 分组
#     :return:
#     '''
#     if df.shape[0]==0:
#         raise('no data')
#     columns=df.columns.tolist()
#     if target not in columns:
#         raise('not found %s' % target)
#     if feature not in columns:
#         raise('not found %s' % feature)
#
#     tmp=df.copy()
#     tmp[feature].fillna(-1, inplace=True)
#     # == bin 划分,feature 有可能 非数字
#     try:
#         tmp[feature] = tmp[feature].astype(float)
#         feature_grid = cal_feature_grid(tmp, feature, bin)
#         tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
#         tmp['grid'] = tmp['lbl'].cat.codes
#     except ValueError:
#         tmp['lbl']=tmp[feature]
#         tmp['grid']=tmp[feature]
#
#     if len(classes) > 0:
#         df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
#         df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
#         df_out=df_gp
#     else:
#         df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
#         df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
#         df_out = df_all
#     return df_out




def cal_distribution(df,target,classes=None):
    '''
    Compute count and mean of `target`, optionally per `classes` group.
    :param df: dataframe
    :param target: column whose count (non-null values) and mean are computed
    :param classes: list of columns to group by; None or empty means a single
                    overall row
    :return: dataframe with columns classes..., count, mean
    :raises ValueError: df is empty, or target is not a column of df
    '''
    classes = [] if classes is None else list(classes)
    if df.shape[0] == 0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found target')

    headers = classes + ['count', 'mean']
    if classes:
        df_out = df.groupby(classes)[target].agg(['count', 'mean']).reset_index()
    else:
        # No grouping: one row describing the whole frame.
        df_out = pd.DataFrame({'count': [df[target].count()], 'mean': [df[target].mean()]})
    return df_out[headers]



def cal_miss(df,feature,classes=None):
    '''
    Compute, for one feature, the share of rows in each value category
    (optionally per `classes` group).

    Numeric feature (convertible to float): rows are flagged as zero, positive, or
    missing; negative values are counted together with missing values.
    Non-numeric feature: rows are flagged as missing or not missing.

    :param df: dataframe
    :param feature: name of a column in df
    :param classes: list of columns to group by; None or empty means no grouping
    :return: dataframe with columns classes..., flag, cnt, match_rate, where
             cnt is the total number of rows (in the group, when grouped) and
             match_rate is the share of those rows carrying the flag, rounded
             to 3 decimals
    :raises ValueError: df is empty, or feature is not a column of df
    '''
    classes = [] if classes is None else list(classes)
    if df.shape[0] <= 0:
        raise ValueError('no data')
    if feature not in df.columns:
        raise ValueError('no feature')

    tmp = df.copy()
    tmp['flag'] = '缺失值'
    try:
        numeric = tmp[feature].astype(float)
    except (ValueError, TypeError):
        # Non-numeric feature: only distinguish missing from present.
        tmp.loc[tmp[feature].notna(), 'flag'] = '未缺失'
        # Reassign (no chained inplace fill) so the placeholder is really written
        # and missing rows are included in the counts below.
        tmp[feature] = tmp[feature].fillna('缺失')
    else:
        # Missing becomes -1 and therefore shares the default flag with negatives.
        tmp[feature] = numeric.fillna(-1)
        tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
        tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'

    headers = classes + ['flag', 'cnt', 'match_rate']
    if classes:
        # Total rows per group, joined with the per-flag counts inside each group.
        totals = tmp.groupby(classes)[feature].count().reset_index().rename(columns={feature: 'cnt'})
        per_flag = tmp.groupby(classes + ['flag'])[feature].count().reset_index().rename(columns={feature: 'cnt1'})
        df_out = pd.merge(totals, per_flag, on=classes, how='left')
    else:
        df_out = tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature: 'cnt1'})
        df_out['cnt'] = tmp.shape[0]
    df_out['match_rate'] = np.round(df_out['cnt1'] / df_out['cnt'], 3)
    return df_out[headers]

