Commit 9763c9e6 authored by linfang.wang

等宽等频离散

parent 4791a5e8
......@@ -2,7 +2,6 @@ import pandas as pd
import numpy as np
import datetime
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
'''
切换df 为训练集 和 验证集
......@@ -59,58 +58,62 @@ def cal_month(df,date_name,date_name_new):
return df
# def cal_isometric(df,feature,bin=10,method=2):
# '''
# 等分计算,默认等频;等宽 1 ,等频 2 ,聚类 3
# :param df:
# :param feature:
# :param bin:
# :param method: 1:等宽;2:等频;3:聚类;默认2
# :return:
# '''
# if method==1:
#
def cal_feature_grid(df, feature, bin=10, method=2):
    '''
    Build the ascending list of cut points used to discretise a numeric column.

    Missing values are replaced by -1, so they land in the negative bucket.
    Negative values always form one interval of their own (bounded by
    -99999 and -0.00001); only the non-negative range is split into ``bin``
    intervals.

    :param df: dataframe holding the column; it is not modified
    :param feature: column name; its values must be convertible to float
    :param bin: number of intervals for the non-negative range
    :param method: 1 = equal width (0..max split into ``bin`` equally wide
                   intervals, so interval counts may differ);
                   any other value = equal frequency (quantile edges, so
                   interval counts are as even as possible).
                   Clustering (3) is not implemented and behaves like 2.
    :return: sorted list of cut points
    '''
    # Work on a converted copy of the single column instead of copying the
    # whole dataframe; assignment (not chained inplace fillna) keeps this
    # correct under pandas Copy-on-Write.
    values = df[feature].astype(float).fillna(-1)

    # Distinct count is taken on the raw column, so missing values are not counted.
    num = df[feature].nunique()
    if num < bin:
        # Too few distinct values to split: every observed value is a cut point.
        return sorted(set(values.unique().tolist()) | {-0.00001})

    if method == 1:
        # Equal width: the non-negative range starts at 0 and ends at the maximum.
        upper = values.max()
        if upper < 0:
            upper = 0
        edges = [upper * i / bin for i in range(bin + 1)]
    else:
        # Equal frequency: quantiles of the non-negative values only.
        quantiles = [i / bin for i in range(bin + 1)]
        # dropna guards against NaN edges when there are no non-negative values.
        edges = values[values >= 0].quantile(quantiles).dropna().tolist()

    return sorted(set(edges) | {-99999, -0.00001})
def cal_accume(df, feature, target, bin=10, classes=None):
    '''
    Per-bin statistics of ``target`` plus their running (cumulative) totals.

    The per-bin table comes from ``cal_univar``; this function only appends
    cumulative columns, computed within each ``classes`` group when grouping
    columns are given and over the whole table otherwise.

    :param df: dataframe
    :param feature: column that is split into ``bin`` intervals
    :param target: column that is aggregated per interval
    :param bin: number of intervals
    :param classes: grouping column(s); None or empty means no grouping
    :return: the ``cal_univar`` table with extra columns
             acmCnt        cumulative count,
             acmSum        cumulative sum of target,
             acmMean       acmSum / acmCnt,
             acmEvent      same values as acmSum (kept under the older name),
             acmEventRate  same values as acmMean (kept under the older name)
    '''
    classes = [] if classes is None else classes
    # NOTE(review): cal_univar is defined elsewhere in this file; its result is
    # presumed to carry 'count' and 'sum' columns with rows ordered by bin.
    df_out = cal_univar(df, feature, target, bin, classes=classes)

    if len(classes) > 0:
        running = df_out.groupby(classes)[['count', 'sum']].cumsum()
    else:
        # groupby([]) raises "No group keys passed!", so accumulate directly.
        running = df_out[['count', 'sum']].cumsum()

    df_out['acmCnt'] = running['count']
    df_out['acmEvent'] = running['sum']
    df_out['acmEventRate'] = df_out['acmEvent'] / df_out['acmCnt']
    df_out['acmSum'] = running['sum']
    df_out['acmMean'] = df_out['acmSum'] / df_out['acmCnt']
    return df_out
def cal_univar(df,feature,target,bin=10,classes=[]):
'''
groupby(classes) 分组,对feature 进行bin 分位,对各个分位进行 count,mean 计算,累计count,mean
groupby(classes) 分组,对feature 进行bin 分位,对各个分位进行 count,mean ,sum计算
:param df: dataframe
:param feature: feature in df.columns
:param target: in df.columns eg: count(target) mean(target)
......@@ -193,10 +196,16 @@ def cal_miss(df,feature,classes=[]):
if feature not in columns:
raise('no feature')
tmp=df.copy()
tmp[feature].fillna(-1,inplace=True)
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'
try:
tmp[feature]=tmp[feature].astype(float)
tmp[feature].fillna(-1,inplace=True)
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'
except:
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature].notna(), 'flag'] = '未缺失'
tmp[feature].fillna('缺失', inplace=True)
headers = classes+['flag', 'cnt', 'match_rate']
if len(classes) > 0:
......@@ -209,9 +218,7 @@ def cal_miss(df,feature,classes=[]):
df_gp['match_rate'] = np.round(df_gp.cnt1 / df_gp.cnt, 3)
df_out = df_gp
else:
all = [[ '非0值', tmp.shape[0], round(tmp[tmp[feature] > 0].shape[0] / tmp.shape[0], 3)],
[ '0值', tmp.shape[0], round( tmp[tmp[feature] == 0].shape[0] / tmp.shape[0], 3)],
['缺失值', tmp.shape[0], round(tmp[(tmp[feature] < 0)].shape[0] / tmp.shape[0], 3)]]
df_all = pd.DataFrame(all, columns=headers)
df_out=df_all
df_out=tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature:'cnt1'})
df_out['cnt']=tmp.shape[0]
df_out['match_rate']=np.round(df_out['cnt1']/df_out['cnt'],3)
return df_out[headers]
\ No newline at end of file
......@@ -13,7 +13,18 @@ plt.rc('font',**font_options)
def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
'''
x:x轴;y:y轴
:param df:dataframe
:param x:
:param y:
:param classes:分组,str
:param bin:
:param title:
:param xlabel:
:param ylabel:
:return:
'''
# #== 单个TODO 待输出
# df_fig1=pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment