import pandas as pd
import numpy as np
import datetime




def cal_week(df,date_name,date_name_new):
    '''
    Add a column holding the first day (Sunday) of the week of each date.

    Week numbering follows ``strftime('%w')``, where Sunday is 0, so every
    date is moved back to the Sunday on or before it.

    :param df: DataFrame; modified in place (``date_name`` is converted to
        datetime and ``date_name_new`` is added) and also returned
    :param date_name: name of the source date column, e.g. applied_at
    :param date_name_new: name of the column that receives the week start
    :return: df, with ``date_name_new`` holding ``datetime.date`` objects;
        missing dates stay missing
    :raises ValueError: if ``date_name`` is not a column of df
    '''
    if date_name not in df.columns:
        raise ValueError('not found %s' % date_name)
    dates = pd.to_datetime(df[date_name])
    df[date_name] = dates
    # dayofweek counts Monday as 0; shift it so Sunday is 0 like '%w'.
    days_since_sunday = (dates.dt.dayofweek + 1) % 7
    # Vectorised subtraction replaces the per-row apply and tolerates NaT.
    week_start = dates - pd.to_timedelta(days_since_sunday, unit='D')
    df[date_name_new] = week_start.dt.date
    return df


def cal_month(df,date_name,date_name_new):
    '''
    Add a column holding the year-month of each date, formatted as '%y-%m'.

    The year is two-digit, e.g. 2021-03-10 becomes '21-03'.

    :param df: DataFrame; modified in place (``date_name`` is converted to
        datetime and ``date_name_new`` is added) and also returned
    :param date_name: name of the source date column, e.g. applied_at
    :param date_name_new: name of the column that receives the month string
    :return: df, with ``date_name_new`` holding '%y-%m' strings
    :raises ValueError: if ``date_name`` is not a column of df
    '''
    if date_name not in df.columns:
        raise ValueError('not found %s' % date_name)
    df[date_name] = pd.to_datetime(df[date_name])
    df[date_name_new] = df[date_name].dt.strftime('%y-%m')
    return df


def cal_feature_grid(df,feature,bin=10):
    '''
    Build the sorted list of bin edges used to cut a numeric feature.

    Missing values are treated as -1. When the feature has fewer than ``bin``
    distinct values, every distinct value becomes an edge (plus -0.00001).
    Otherwise negative values share one bin, from -99999 to -0.00001, and the
    non-negative values are split at ``bin`` quantiles.

    :param df: DataFrame containing ``feature``; it is not modified
    :param feature: name of a numeric column in df
    :param bin: number of quantile bins for the non-negative values
    :return: ascending list of unique bin edges
    '''
    # Work on the filled Series only; a chained in-place fillna on a frame
    # copy is unreliable under pandas Copy-on-Write.
    values = df[feature].fillna(-1)

    if values.nunique() < bin:
        edges = set(values.unique().tolist()) | {-0.00001}
    else:
        # Negative values get one bin; non-negative values get ``bin`` quantile bins.
        quantile_points = [i / bin for i in range(bin + 1)]
        non_negative = values[values >= 0]
        if non_negative.empty:
            # Quantiles of an empty Series are NaN and would poison the edges.
            quantile_edges = []
        else:
            quantile_edges = non_negative.quantile(quantile_points).tolist()
        edges = set(quantile_edges) | {-99999, -0.00001}
    return sorted(edges)


def cal_univar(df,feature,target,bin=10,classes=None):
    '''
    Bin ``feature`` and compute count and mean of ``target`` per bin,
    optionally within the groups given by ``classes``.

    Missing feature values are treated as -1. A feature that converts to
    float is cut at the edges from ``cal_feature_grid``; ``lbl`` is then the
    interval and ``grid`` its ordinal code. If the conversion or the cut
    raises ValueError (e.g. a text feature), the raw value is used as both
    ``lbl`` and ``grid``.

    :param df: DataFrame; it is not modified
    :param feature: column of df to bin
    :param target: column of df to aggregate with count and mean
    :param bin: number of quantile bins, default 10
    :param classes: optional list of extra grouping columns
    :return: DataFrame with columns classes + ['grid', 'lbl', 'count', 'mean']
    :raises ValueError: if df is empty, or target / feature is not a column
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found %s' % target)
    if feature not in df.columns:
        raise ValueError('not found %s' % feature)

    group_keys = [] if classes is None else list(classes)

    tmp = df.copy()
    # Assign the result back; a chained in-place fillna is unreliable under
    # pandas Copy-on-Write.
    tmp[feature] = tmp[feature].fillna(-1)
    # The feature may be non-numeric, in which case the binning falls back
    # to the raw values.
    try:
        tmp[feature] = tmp[feature].astype(float)
        feature_grid = cal_feature_grid(tmp,feature,bin)
        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
        tmp['grid'] = tmp['lbl'].cat.codes
    except ValueError:
        tmp['lbl'] = tmp[feature]
        tmp['grid'] = tmp[feature]

    keys = group_keys + ['grid','lbl']
    # observed=True: 'lbl' may be categorical, and unobserved categories would
    # otherwise add a grid x lbl cross product of zero-count rows.
    df_out = tmp.groupby(keys, observed=True).agg({target: ['count', 'mean']}).reset_index()
    df_out.columns = keys + ['count', 'mean']
    return df_out




def cal_distribution(df,target,classes=None):
    '''
    Compute count and mean of ``target``, optionally per group.

    :param df: DataFrame; it is not modified
    :param target: column of df to aggregate with count and mean
    :param classes: optional list of grouping columns; when empty or omitted
        a single row covering the whole frame is returned
    :return: DataFrame with columns classes + ['count', 'mean']
    :raises ValueError: if df is empty or ``target`` is not a column
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found %s' % target)

    group_keys = [] if classes is None else list(classes)
    headers = group_keys + ['count', 'mean']
    if group_keys:
        df_out = df.groupby(group_keys).agg({target: ['count', 'mean']}).reset_index()
        # Flatten the two-level (target, statistic) column index.
        df_out.columns = headers
    else:
        overall = [[df[target].count(), df[target].mean()]]
        df_out = pd.DataFrame(overall, columns=headers)
    return df_out[headers]



def cal_miss(df,feature,classes=None):
    '''
    Compute the share of zero, positive and missing values of a feature.

    Every row is flagged as '0值' (value == 0), '非0值' (value > 0) or
    '缺失值' (anything else). Missing values are filled with -1 first, so
    negative and missing values are both counted as '缺失值'.

    :param df: DataFrame; it is not modified
    :param feature: name of a numeric column in df
    :param classes: optional list of grouping columns; when empty or omitted
        the whole frame is treated as one group
    :return: DataFrame with columns classes + ['flag', 'cnt', 'match_rate'],
        where ``cnt`` is the number of rows in the group (or frame) and
        ``match_rate`` is the share of those rows carrying ``flag``, rounded
        to 3 decimals. Without classes all three flags are always listed;
        with classes only the flags present in each group appear.
    :raises ValueError: if df is empty or ``feature`` is not a column
    '''
    if df.shape[0] <= 0:
        raise ValueError('no data')
    if feature not in df.columns:
        raise ValueError('no feature')

    group_keys = [] if classes is None else list(classes)

    tmp = df.copy()
    # Assign the result back; a chained in-place fillna is unreliable under
    # pandas Copy-on-Write and would make the grouped counts skip missing rows.
    tmp[feature] = tmp[feature].fillna(-1)
    tmp['flag'] = '缺失值'
    tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
    tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'

    headers = group_keys + ['flag', 'cnt', 'match_rate']
    if group_keys:
        # Grouped: group size joined with the per-flag size inside each group.
        totals = tmp.groupby(group_keys)[feature].count().reset_index().rename(columns={feature: "cnt"})
        per_flag = tmp.groupby(group_keys + ['flag'])[feature].count().reset_index().rename(columns={feature: "cnt1"})
        df_out = pd.merge(totals, per_flag, on=group_keys, how='left')
        df_out['match_rate'] = np.round(df_out.cnt1 / df_out.cnt, 3)
    else:
        total = tmp.shape[0]
        rows = [
            [label, total, round(int((tmp['flag'] == label).sum()) / total, 3)]
            for label in ('非0值', '0值', '缺失值')
        ]
        df_out = pd.DataFrame(rows, columns=headers)
    return df_out[headers]