Commit 0c209f31 authored by linfang.wang's avatar linfang.wang

计算类,分组统计

parent ad260520
import pandas as pd
import numpy as np
import datetime
def cal_week(df, date_name, date_name_new):
    """Add a column holding the first day (Sunday) of each row's week.

    ``df`` is modified in place: ``date_name`` is converted to datetime and
    ``date_name_new`` is created or overwritten.

    :param df: DataFrame containing ``date_name``
    :param date_name: name of the date column, e.g. ``applied_at``
    :param date_name_new: name of the column that receives the week start
    :return: the same ``df``; ``date_name_new`` holds ``datetime.date``
        objects (``NaT`` where the source date is missing)
    """
    df[date_name] = pd.to_datetime(df[date_name])
    # strftime('%w') numbers days Sunday=0..Saturday=6, whereas dayofweek is
    # Monday=0..Sunday=6; shifting by one keeps the Sunday-based week start.
    days_since_sunday = (df[date_name].dt.dayofweek + 1) % 7
    # Vectorized subtraction replaces a row-wise apply and lets missing
    # dates propagate as NaT instead of failing on an int conversion.
    week_start = df[date_name] - pd.to_timedelta(days_since_sunday, unit='D')
    df[date_name_new] = week_start.dt.date
    return df
def cal_month(df, date_name, date_name_new):
    """Add a ``%y-%m`` (two-digit year, month) string column derived from a date column.

    ``df`` is modified in place: ``date_name`` is converted to datetime and
    ``date_name_new`` is created or overwritten.

    :param df: DataFrame containing ``date_name``
    :param date_name: name of the date column, e.g. ``applied_at``
    :param date_name_new: name of the column that receives the month label
    :return: the same ``df``
    :raises ValueError: if ``date_name`` is not a column of ``df``
    """
    if date_name not in df.columns:
        # The previous code raised a plain string built from a broken format
        # ('%' without a conversion), so the column name never reached the
        # caller; raise a real exception carrying the name instead.
        raise ValueError('not found %s' % date_name)
    df[date_name] = pd.to_datetime(df[date_name])
    df[date_name_new] = df[date_name].dt.strftime('%y-%m')
    return df
def cal_feature_grid(df, feature, bin=10):
    """Build sorted bin edges for ``feature``.

    Missing values are treated as ``-1``. Negative values share a single
    interval; non-negative values are cut at ``bin`` quantiles. When the
    feature has fewer than ``bin`` distinct values, every distinct value
    becomes an edge instead.

    :param df: DataFrame containing ``feature`` (not modified)
    :param feature: name of a numeric column
    :param bin: number of quantile intervals for the non-negative values
        (the name shadows the builtin but is kept for caller compatibility)
    :return: sorted list of unique bin edges
    """
    # Work on the filled Series directly: a chained
    # ``tmp[feature].fillna(..., inplace=True)`` does nothing under pandas
    # copy-on-write, and copying the whole frame is unnecessary here.
    values = df[feature].fillna(-1)
    if values.nunique() < bin:
        edges = set(values.unique().tolist()) | {-0.00001}
    else:
        # Negatives get their own interval (-99999, -0.00001]; the
        # non-negative part is split into ``bin`` quantile intervals.
        fractions = [i / bin for i in range(bin + 1)]
        non_negative = values[values >= 0]
        # dropna guards the case with no non-negative values, where every
        # quantile is NaN and would corrupt the sorted edge list.
        quantiles = non_negative.quantile(fractions).dropna()
        edges = set(quantiles.tolist()) | {-99999, -0.00001}
    return sorted(edges)
def cal_univar(df, feature, target, bin=10, classes=None):
    """Count and average ``target`` per bin of ``feature``, optionally per group.

    Missing feature values are treated as ``-1``. A numeric feature is cut
    into intervals using the edges from ``cal_feature_grid``; a feature that
    cannot be converted to float (or whose binning raises ``ValueError``) is
    grouped by its raw values instead.

    :param df: DataFrame containing ``feature`` and ``target`` (not modified)
    :param feature: name of the column to bin
    :param target: name of the column aggregated with count and mean
    :param bin: number of quantile intervals, forwarded to ``cal_feature_grid``
    :param classes: optional list of extra grouping columns
    :return: DataFrame with columns ``classes + ['grid', 'lbl', 'count', 'mean']``
    :raises ValueError: if ``df`` is empty or a required column is missing
    """
    # The original raised plain strings, which surfaces as an unrelated
    # TypeError; raise real exceptions with the intended message.
    if df.shape[0] == 0:
        raise ValueError('no data')
    for column in (target, feature):
        if column not in df.columns:
            raise ValueError('not found %s' % column)

    if classes is None:
        group_cols = []
    elif isinstance(classes, str):
        group_cols = [classes]
    else:
        group_cols = list(classes)

    tmp = df.copy()
    # Assign the result back: chained ``fillna(inplace=True)`` is a no-op
    # under pandas copy-on-write.
    tmp[feature] = tmp[feature].fillna(-1)

    # Bin the feature; it may be non-numeric, in which case raw values are used.
    try:
        tmp[feature] = tmp[feature].astype(float)
        feature_grid = cal_feature_grid(tmp, feature, bin)
        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
        tmp['grid'] = tmp['lbl'].cat.codes
    except ValueError:
        tmp['lbl'] = tmp[feature]
        tmp['grid'] = tmp[feature]

    keys = group_cols + ['grid', 'lbl']
    # observed=True keeps only bin/label combinations that occur; without it
    # older pandas emits every grid x lbl category pairing as empty rows.
    df_out = tmp.groupby(keys, observed=True).agg({target: ['count', 'mean']}).reset_index()
    df_out.columns = keys + ['count', 'mean']
    return df_out
def cal_distribution(df, target, classes=None):
    """Compute count and mean of ``target``, optionally per group.

    :param df: DataFrame containing ``target`` (not modified)
    :param target: name of the column aggregated with count and mean
    :param classes: optional list of grouping columns; when empty or omitted
        a single overall row is returned
    :return: DataFrame with columns ``classes + ['count', 'mean']``
    :raises ValueError: if ``df`` is empty or ``target`` is not a column
    """
    # The original raised plain strings, which surfaces as an unrelated
    # TypeError; raise real exceptions with the intended message.
    if df.shape[0] == 0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found target')

    if classes is None:
        group_cols = []
    elif isinstance(classes, str):
        group_cols = [classes]
    else:
        group_cols = list(classes)

    headers = group_cols + ['count', 'mean']
    if group_cols:
        df_out = df.groupby(group_cols).agg({target: ['count', 'mean']}).reset_index()
        # Flatten the (target, stat) MultiIndex produced by agg.
        df_out.columns = headers
    else:
        overall = [[df[target].count(), df[target].mean()]]
        df_out = pd.DataFrame(overall, columns=headers)
    return df_out[headers]
def cal_miss(df, feature, classes=None):
    """Break a feature down into positive / zero / missing shares.

    Missing values are filled with ``-1``, so negative and missing values are
    reported together under the '缺失值' flag; zeros get '0值' and positive
    values get '非0值'.

    NOTE(review): ``miss_rate`` is computed as ``1 - (rows with the flag /
    total rows)``, i.e. the complement of each flag's share — confirm that
    this is the intended meaning of the column.

    :param df: DataFrame containing ``feature`` (not modified)
    :param feature: name of a numeric column
    :param classes: optional list of grouping columns; when empty or omitted
        the whole frame is treated as one group and all three flags are listed
    :return: DataFrame with columns ``classes + ['flag', 'cnt', 'miss_rate']``
        where ``cnt`` is the total row count of the group
    :raises ValueError: if ``df`` is empty or ``feature`` is not a column
    """
    # The original raised plain strings, which surfaces as an unrelated
    # TypeError; raise real exceptions with the intended message.
    if df.shape[0] <= 0:
        raise ValueError('no data')
    if feature not in df.columns:
        raise ValueError('no feature')

    if classes is None:
        group_cols = []
    elif isinstance(classes, str):
        group_cols = [classes]
    else:
        group_cols = list(classes)

    tmp = df.copy()
    # Assign the result back: chained ``fillna(inplace=True)`` is a no-op
    # under pandas copy-on-write, which would drop NaN rows from the counts.
    tmp[feature] = tmp[feature].fillna(-1)
    tmp['flag'] = '缺失值'
    tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
    tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'

    headers = group_cols + ['flag', 'cnt', 'miss_rate']
    if group_cols:
        # Group totals joined with per-flag counts inside each group.
        totals = (
            tmp.groupby(group_cols)[feature].count()
            .reset_index().rename(columns={feature: 'cnt'})
        )
        per_flag = (
            tmp.groupby(group_cols + ['flag'])[feature].count()
            .reset_index().rename(columns={feature: 'cnt1'})
        )
        df_out = pd.merge(totals, per_flag, on=group_cols, how='left')
        df_out['miss_rate'] = np.round(1 - df_out['cnt1'] / df_out['cnt'], 3)
    else:
        total = tmp.shape[0]
        values = tmp[feature]
        masks = [
            ('非0值', values > 0),
            ('0值', values == 0),
            ('缺失值', values < 0),
        ]
        rows = [
            [flag, total, round(1 - int(mask.sum()) / total, 3)]
            for flag, mask in masks
        ]
        df_out = pd.DataFrame(rows, columns=headers)
    return df_out[headers]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment