Commit b6509799 authored by 王家华

Remove redundant code directory

parent 15489661
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
split_methods='random', time_label='applied_at'):
    '''
    instructions - train/test split (returns only train & test when val_size is None)
    Params :
        dataset       - input DataFrame
        val_size      - validation set RATIO (None to skip the validation split)
        test_size     - test set RATIO
        stratify      - stratify LABEL (column name), or None
        random_state  - random seed
        split_methods - 'random' or 'timeSeries'
        time_label    - column that identifies date & time (used by the time-series split)
    '''
    # random split
    if split_methods == 'random':
        stratify_col = dataset[stratify] if stratify else None
        df_train, df_test = train_test_split(dataset, test_size=test_size,
                                             random_state=random_state, stratify=stratify_col)
        # case when no validation set is requested
        if val_size is None:
            return df_train, df_test
        size = val_size / (1 - test_size)
        df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
        return df_train, df_val, df_test
    # split by time sequence (most recent records become the test set)
    elif split_methods == 'timeSeries':
        data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
        df_test = data_tmp[: int(len(dataset) * test_size)]
        df_train = data_tmp[int(len(dataset) * test_size):]
        return df_train, df_test
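# Illustrative usage sketch (assumes a DataFrame with 'target' and 'applied_at' columns;
# not part of the original module):
#
#   df_train, df_val, df_test = train_test_split_general(df, val_size=0.2, test_size=0.2,
#                                                         split_methods='random')
#   df_train, df_test = train_test_split_general(df, val_size=None, stratify=None,
#                                                 split_methods='timeSeries')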
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    '''
    Split df into a training set and a validation set
    :param df: dataframe
    :param trainsplit: split method, 'timeSeries' or 'random' (default 'random')
    :param trainsplitRatio: for a random split, fraction that goes to the training set (default 0.8)
    :param sort_col: column to sort by when splitting by time
    :return: train, val
    '''
dftrain=df.reset_index()
    #== split dftrain into training / validation sets
if trainsplit=='random':
        # random train / val assignment
train = dftrain.sample(frac=trainsplitRatio, random_state=7)
val = dftrain[~dftrain.index.isin(train.index)]
elif trainsplit=='timeSeries':
        # time-ordered train / val assignment
train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
val = dftrain[~dftrain.index.isin(train.index)]
else:
train = df
val = None
return train,val
def cal_week(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: df with a new column (%y-%m-%d) giving the first day of each week
    '''
columns = df.columns.tolist()
if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name] = pd.to_datetime(df[date_name])
df[date_name_new] = df[date_name].dt.strftime('%w')
df[date_name_new] = df[date_name_new].astype(int)
df[date_name_new] = df.apply(lambda x: x[date_name] + datetime.timedelta(days=-x[date_name_new]), axis=1)
df[date_name_new] = pd.to_datetime(df[date_name_new]).dt.date
return df
def cal_month(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: df with a new column formatted as %y-%m
    '''
columns=df.columns.tolist()
if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name]=pd.to_datetime(df[date_name])
df[date_name_new]=df[date_name].dt.strftime('%y-%m')
return df
def cal_feature_grid(df,feature,bin=10,method=2):
    '''
    Build an N-quantile cut grid: negative values get their own bucket, non-negative values are split into N buckets.
    Discretisation defaults to equal frequency; 1 = equal width, 2 = equal frequency.
    :param df: dataframe
    :param feature:
    :param bin: number of buckets
    :param method: 1: equal width; 2: equal frequency; 3: clustering (not implemented here); default 2
    :return: list of bucket edges
    '''
    #== equal width: (max - min) / bin, every bucket has the same width
    #== bucket counts may therefore be uneven
tmp=df.copy()
tmp[feature]=tmp[feature].astype(float)
tmp[feature].fillna(-1,inplace=True)
    # by default, negative values form a single separate bucket
num = df[feature].nunique()
    if method == 1:
        max_value = tmp[feature].max()
        if max_value < 0:
            max_value = 0
        if num < bin:
            feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
        else:
            bin_index = [max_value * i / bin for i in range(0, bin + 1)]
            feature_grid = sorted(set(bin_index) | set([-99999, -0.00001]))
else:
        # equal-frequency binning: keep bucket sizes as even as possible
if num < bin:
feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
else:
            # == negatives form one bucket, non-negatives are split into n quantiles
bin_index = [i / bin for i in range(0, bin + 1)]
feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999, -0.00001]))
return feature_grid
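# Illustrative sketch on assumed synthetic data (not part of the original module): with
#   s = pd.DataFrame({'x': [-3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
#   cal_feature_grid(s, 'x', bin=5)
# the equal-frequency grid contains -99999 and -0.00001 as the "negative" bucket edges plus the
# 0/20/40/60/80/100th percentiles of the non-negative values, ready to be passed to pd.cut.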
def cal_accume(df,feature,target,bin=10,classes=[]):
    '''
    groupby(classes), cut feature into bin quantile buckets; compute count, mean and sum of target
    per bucket, plus the cumulative count, mean and sum
    :param df:
    :param feature:
    :param target:
    :param bin:
    :param classes:
    :return: per-bucket mean, count, sum and cumulative count, mean, sum
    '''
df_out=cal_univar(df,feature,target,bin,classes=classes)
df_out['acmCnt']=df_out.groupby(classes)['count'].cumsum()
df_out['acmSum']=df_out.groupby(classes)['sum'].cumsum()
df_out['acmMean']=df_out['acmSum']/df_out['acmCnt']
return df_out
def cal_univar(df,feature,target,bin=10,classes=[]):
    '''
    Group by classes, cut feature into bin quantile buckets, and compute count, mean and sum of target per bucket
    :param df: dataframe
    :param feature: feature in df.columns
    :param target: in df.columns, e.g. count(target), mean(target)
    :param bin: default 10
    :param classes: grouping columns
    :return:
    '''
    if df.shape[0] == 0:
        raise ValueError('no data')
    columns = df.columns.tolist()
    if target not in columns:
        raise ValueError('not found %s' % target)
    if feature not in columns:
        raise ValueError('not found %s' % feature)
tmp=df.copy()
tmp[feature].fillna(-1, inplace=True)
    # == bucket assignment; feature may be non-numeric
try:
tmp[feature]=tmp[feature].astype(float)
feature_grid = cal_feature_grid(tmp,feature,bin)
tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
tmp['grid'] = tmp['lbl'].cat.codes
except ValueError:
tmp['lbl']=tmp[feature]
tmp['grid']=tmp[feature]
if len(classes) > 0:
df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
df_out=df_gp
else:
df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_all.columns = ['grid','lbl', 'count', 'mean','sum']
df_out = df_all
return df_out
def cal_distribution(df,target,classes=[]):
    '''
    Group by classes and compute count and mean of target
    :param df: dataframe
    :param target: compute mean(target), count(target)
    :param classes: grouping columns
    :return: dataframe
    '''
    if df.shape[0] == 0:
        raise ValueError('no data')
    columns = df.columns.tolist()
    if target not in columns:
        raise ValueError('not found %s' % target)
tmp=df.copy()
headers = classes + [ 'count', 'mean']
if len(classes) > 0:
df_gp=tmp.groupby(classes).agg({target:['count','mean']}).reset_index()
df_gp.columns=classes + ['count','mean']
df_out=df_gp
else:
all = [[tmp[target].count(),tmp[target].mean()]]
df_all = pd.DataFrame(all, columns=headers)
df_out=df_all
return df_out[headers]
def cal_miss(df,feature,classes=[]):
    '''
    Compute the missing rate of a given feature
    :param df: dataframe
    :param feature: field name in df.columns
    :param classes: list of grouping columns; if empty, no grouping is applied
    :return df_out: dataframe containing feature, class names (if any), cnt, miss_rate
    :argument warning: values are bucketed into zero, non-zero and negative; missing values are filled
        with -1 and therefore treated together with negatives
    '''
    if df.shape[0] <= 0:
        raise ValueError('no data')
    columns = df.columns.tolist()
    if feature not in columns:
        raise ValueError('not found %s' % feature)
tmp=df.copy()
try:
tmp[feature]=tmp[feature].astype(float)
tmp[feature].fillna(-1,inplace=True)
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'
except:
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature].notna(), 'flag'] = '未缺失'
tmp[feature].fillna('缺失', inplace=True)
headers = classes+['flag', 'cnt', 'match_rate']
if len(classes) > 0:
        # == grouped case
df_gp = pd.merge(
tmp.groupby(classes)[feature].count().reset_index().rename(columns={feature: "cnt"}),
tmp.groupby(classes+['flag'])[feature].count().reset_index().rename(columns={feature: "cnt1"}),
on=classes, how='left'
)
df_gp['match_rate'] = np.round(df_gp.cnt1 / df_gp.cnt, 3)
df_out = df_gp
else:
df_out=tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature:'cnt1'})
df_out['cnt']=tmp.shape[0]
df_out['match_rate']=np.round(df_out['cnt1']/df_out['cnt'],3)
return df_out[headers]
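if __name__ == '__main__':
    # Minimal self-check on synthetic data (illustrative only; the column names below are
    # assumptions, not part of the original module).
    demo = pd.DataFrame({
        'score': np.random.rand(1000),
        'target': np.random.randint(0, 2, 1000),
        'grp': np.random.choice(['A', 'B'], 1000),
    })
    print(cal_feature_grid(demo, 'score', bin=5))
    print(cal_univar(demo, 'score', 'target', bin=5).head())
    print(cal_accume(demo, 'score', 'target', bin=5, classes=['grp']).head())
    print(cal_miss(demo, 'score', classes=['grp']).head())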
import os
from docx import Document
from docx.shared import Inches
def buildDocument(path,filename):
    if not filename.endswith(('.doc', '.docx')):
        raise ValueError('{} is not a word file'.format(filename))
if os.path.exists(os.path.join(path,filename)):
return Document(os.path.join(path,filename))
return Document()
def saveDocument(document,path,filename):
    if not filename.endswith(('.doc', '.docx')):
        raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename))
def insert_table(document, cols, values):
    # cols: column names
    # values: row values, list of lists
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells
for i in range(len(cols)):
hdr_cells[i].text = cols[i]
for value in values:
row_cells = table.add_row().cells
for i in range(len(cols)):
row_cells[i].text = str(value[i])
return document
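# Illustrative usage sketch (assumed path, filename and table values; not part of the original module):
#
#   doc = buildDocument('/tmp', 'report.docx')
#   doc.add_heading('Model report', level=1)
#   insert_table(doc, ['feature', 'auc'], [['dhb_score', 0.71], ['bairong_score', 0.68]])
#   saveDocument(doc, '/tmp', 'report.docx')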
import pandas as pd
def mysql_query(sql,engine_sql):
    '''
    Query a large amount of data in chunks
    :param sql:
    :param engine_sql: SQLAlchemy engine used for the query
    :return: dataframe
    '''
    res = []
    #== Palo returns at most 10000 rows per query, so read in chunks
tmp=pd.read_sql(sql,engine_sql,chunksize=5001)
for tt in tmp:
res.append(tt)
return pd.concat(res)
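# Illustrative usage sketch (assumed engine and query; not part of the original module):
#
#   from data.datasource.mysqldb import engine_analysis_new
#   df = mysql_query('select loan_id, applied_at from loan_application limit 100', engine_analysis_new)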
import pymongo
import pandas as pd
import numpy as np
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
query = "{'order_id':1,'@key':1}"
'''
instructions : query MongoDB; `limit` and `query` are dict-like string templates whose
'@' placeholders must be filled in before calling querymongo
Params :
    limit - filter conditions (string form of a dict)
    query - projection (string form of a dict) listing the keys to return
'''
def querymongo(start_time_period, end_time_period, limit, query):
myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
mydb = myclient["rc_dp_feature_pro"]
mycol = mydb["rc_feature_analysis_timing_v2"]
# all data
#x = mycol.find()
# approval data
#x = mycol.find({"wf_audit_result":"1"})
# gt greater than, lt less than. e = equals
x = mycol.find(eval(limit),eval(query))
myclient.close()
return pd.DataFrame(list(x))
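# Illustrative usage sketch (assumed dates and feature key; not part of the original module).
# The '@' placeholders in the template strings must be substituted before querymongo() evaluates
# them; start_time_period / end_time_period are currently unused inside the function:
#
#   flt = limit.replace('@start_date', '2019-04-01').replace('@end_date', '2019-05-01')
#   prj = query.replace('@key', 'dhb_overview_dun_call_total_times')
#   df = querymongo('2019-04-01', '2019-05-01', flt, prj)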
[analysis_new]
db=analysis
host=172.30.4.63
port=3306
user=analysis_model
passwd=BGzTPQjDQqJ6PVnK
[risk_info]
db=risk_info
host=172.30.5.106
port=3306
user=sys_read
passwd=quant12345
[xyqb_feature]
db=xyqb_feature
host=xyqb-rule-db.quantgroups.com
port=6606
user=xyqb_rule_read
passwd=1q2w3e4r
[risk_analysis]
db=risk_analysis
host=172.20.6.9
port=9030
user=linfang_wang
passwd=BHWZ3zcZ
import os
from sqlalchemy import create_engine
import datetime
class sql_engine():
def __init__(self, db, db_name=None, echo=False):
"""
给出数据库名字,创建数据库连接
:param db:
:param db_name:
:param echo:
"""
        try:
            import ConfigParser as configparser  # Python 2
        except ImportError:
            import configparser  # Python 3
        self.cf = configparser.ConfigParser()
self.cf.read(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'mysql_config.ini'))
host = self.cf.get(db, 'host')
user = self.cf.get(db, 'user')
passwd = self.cf.get(db, 'passwd')
port = int(self.cf.get(db, 'port'))
if not db_name:
db_name = self.cf.get(db, 'db')
        try:
            # prefer mysqldb; fall back to pymysql if the driver is not installed
            self.__engine = create_engine(
                'mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})
        except ImportError:
            self.__engine = create_engine(
                'mysql+pymysql://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})
def get_engine(self):
return self.__engine
if 'echo' not in vars():
echo = False
engine_feature = sql_engine('xyqb_feature', 'xyqb_rule').get_engine()
engine_risk = sql_engine('risk_info', 'risk_info').get_engine()
engine_analysis_new = sql_engine('analysis_new').get_engine()
engine_risk_analysis = sql_engine('risk_analysis').get_engine()
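# Illustrative usage sketch (credentials are resolved from mysql_config.ini; not part of the
# original module). The module-level engines above can be used directly, or a new one created:
#
#   engine = sql_engine('risk_analysis').get_engine()
#   df = pd.read_sql('select 1', engine)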
from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt
from data.analyis import datacal
import seaborn as sns
import pandas as pd
plt.rc('figure',figsize=(8,6))
font_options={
'weight':'bold',
'size':'14'
}
plt.rc('font',**font_options)
def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
    '''
    Lift chart: x on the x-axis, mean(y) per bucket on the y-axis
    :param df: dataframe
    :param x:
    :param y:
    :param classes: grouping column, str
    :param bin:
    :param title:
    :param xlabel:
    :param ylabel:
    :return:
    '''
    # == TODO: single-series table output pending
plt.close('all')
if classes !='':
df_out = datacal.cal_accume(df, x, y, bin, classes=[classes])
        #== show the sample counts
df_fig = pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
values=['count'], aggfunc=['mean'])
df_fig=df_fig['mean']['count']
        #== number of rows
rows=df_fig.index.tolist()
n_rows=len(rows)
        # number of columns
cols=df_fig.columns.levels[0].categories.to_tuples().tolist()
n_cols=len(cols)
cell_text=df_fig.values.tolist()
plt.subplot(2, 1,1)
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out,'grid','acmMean',hue=classes,title=title+'累计',xlabel=xlabel,ylabel=ylabel)
else :
df_out = datacal.cal_accume(df, x, y, bin)
plt.subplot(2, 1, 1)
draw_lineplot(df_out, 'grid','mean', title=title, xlabel=xlabel, ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out, 'grid','acmMean', title=title+'累计', xlabel=xlabel, ylabel=ylabel)
plt.tight_layout()
# plt.show()
return plt
def univarchart(df,x,y,bin=10,classes='',title='',xlabel='',ylabel=''):
    '''
    Relationship between a feature and the label; y is the label
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def pdpchart(df,x,y,bin=10,classes='',title='',xlabel='模型分',ylabel='逾期率'):
    '''
    Relationship between a feature and the label; y is the label
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def draw_barplot(df,x,y,hue='',title=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
fig = plt.figure()
plt.close('all')
sns.set(style="whitegrid")
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
if hue != '':
sns.barplot(x, y, hue=hue, data=df, ax=ax)
else:
sns.barplot(x, y, data=df, ax=ax)
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return fig
def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
# fig = plt.figure()
if hue != '':
for type in df[hue].unique().tolist():
            # == draw one line per group
tmp=df[df[hue]==type]
plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o',label=type)
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
if xlabel !='':
pltz.xlabel(xlabel)
else:
pltz.xlabel(x)
if ylabel !='':
pltz.ylabel(ylabel)
else:
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return plt
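# Illustrative usage sketch (assumed DataFrame with a model score and an overdue label; not part
# of the original module):
#
#   fig = univarchart(df, x='model_score', y='target', bin=10, title='score vs. overdue rate')
#   fig.savefig('univar.png')
#   fig = liftchart(df, x='model_score', y='target', classes='applied_type', bin=10)
#   fig.savefig('lift.png')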
"""
Created on Thu Apr 18 11:32:06 2019
@author: wangjiahua
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # saved figure DPI
plt.rcParams['figure.dpi'] = 200  # display DPI
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
saved_path=None):
'''
instructions : visualization of pivot
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved figure DPI
    plt.rcParams['figure.dpi'] = 200  # display DPI
fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
# traverse each columns of dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + str(auc[i]))
if plot_tab != False:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
fig.subplots_adjust(bottom=0.2)
plt.grid()
plt.ylabel(title)
plt.legend()
    # plt.vlines(range(len(cols)), 0, y, color='lightgrey', linestyle='--')
plt.title(title)
plt.show()
return 1
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
    # figure_arrangement encodes rows*cols as a two-digit number, e.g. 11 -> 1x1, 23 -> 2x3
    metric = (figure_arrangement // 10) * (figure_arrangement % 10)
    for i in range(int(np.ceil(len(col) / metric))):
cols = col[i * metric:]
for fig_ith in range(len(cols)):
axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
axs.plot(index,dataset.loc[cols[fig_ith]])
axs.set_title(cols[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.tight_layout()
plt.show()
return 1
#fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
#
#for fig_ith in range(len(df.columns)):
# axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
# axs.plot(df.index,df.iloc[fig_ith])
# axs.set_title(col[])
#plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
#metric = figure_arrangement // 10 * figure_arrangement % 10
#cols = col[i * metric:]
axs = plt.subplot(111)
for fig_ith in range(len(col)):
axs.plot(index,dataset.loc[col[fig_ith]],label=col[fig_ith])
axs.set_title(col[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.legend()
plt.tight_layout()
plt.show()
return 1
'''
'''
def plot_curve_mingle():
return 1
def density_chart(dataset,title):
for col in dataset.columns:
sns.kdeplot(dataset.loc[:,col],label = col)
plt.title(title)
plt.show()
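# Illustrative usage sketch (assumed score columns; not part of the original module):
#
#   density_chart(df[['model_score_v1', 'model_score_v2']], title='score distributions')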
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
# t1 = np.arange(0.0, 1.0, 0.01)
#
#
# for n in [1, 2, 3, 4]:
# plt.plot(t1, t1 ** n, label="n=%d" % n)
# leg = plt.legend(loc='best', ncol=4, mode="expand", shadow=True)
# leg.get_frame().set_alpha(alpha)
#
#
# # if this fig should be saved
# if fig_path != None:
# plt.savefig(fig_path + fig_name +'.png')
#
#
#
## for i in range(figure_arrangement%10):
## plt.subplots(,figsize=fig_size,linewidth=0.1)
#
# return 1
'''
Data cleaning
1. Standard for handling missing values
2. Deduplication standard; text to 0-1 or other encodings
3. Parameter tuning: which parameters, and the tuning standard
'''
import sample
import yewudata
'''
Goal: query Bairong sample data and Bairong sample features
'''
import pandas as pd
from data.samples import sample
from data.datasource import dbquery
from data.datasource.mysqldb import engine_risk_analysis
from data.samples.yewudata import *
import os
'''
Purpose: fetch Dianhuabang (dhb) features and sample data; the data source is the risk-control analysis database
'''
feature_file_name='/Users/wlf/PycharmProjects/model_mvp/data/samples/features/dhb.csv'
def get_feature():
return sample.get_feature_by_version(feature_file_name)
def query_sample(start_date,end_date,is_loan=True):
    '''
    By default, extract the loan (funded) set
    :param start_date:
    :param end_date:
    :return: sample data
    '''
features=get_feature()
if is_loan:
sql='''
select loan_id,%s
from risk_analysis
where dhb_flag =1 and transacted=1 and applied=1
and applied_at >='%s' and applied_at<'%s'
''' % (','.join(features),start_date,end_date)
else:
sql='''
select loan_id,%s
from risk_analysis
where dhb_flag =1 and applied=1
and applied_at >='%s' and applied_at<'%s'
''' % (','.join(features),start_date,end_date)
df=dbquery.mysql_query(sql,engine_risk_analysis)
yewu=query_byloanid(df.loan_id.tolist())
df.loan_id=df.loan_id.astype(int)
yewu.loan_id=yewu.loan_id.astype(int)
df=pd.merge(df,yewu,on='loan_id',how='inner')
df.applied_at=pd.to_datetime(df.applied_at)
value_map = {
"近3天": 1,
"近4-5天": 2,
"近6-7天": 3,
"近8-15天": 4,
"近16-30天": 5,
"近31-60天": 6,
"近61-90天": 7,
"近91-120天": 8,
"近121-150天": 9,
"近151-180天": 10,
"180天前": 11,
"无": 0
}
cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", "dhb_overview_ntdun_first_call_time",
"dhb_overview_ntdun_last_call_time"]
    #== keep only the columns that actually exist in df.columns
cols=list(set(cols) & set(df.columns.tolist()))
if len(cols)>0:
df[cols] = df[cols].applymap(lambda x: value_map[x])
cols=df.columns.tolist()
if 'dhb_last_60_and_90_days_ntdun_call_avg_duration' in cols:
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
if 'dhb_overview_ntdun_call_duration_above60' in cols:
df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
if 'dhb_last_30_and_60_days_ntdun_call_total_duration' in cols:
df.loc[
df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
if 'dhb_last_30_and_60_days_dun_call_in_duration' in cols:
df.loc[
df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
if 'dhb_last_30_days_ntdun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
if 'dhb_last_30_days_ntdun_call_tel_total_nums' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
if 'dhb_last_30_days_dun_call_in_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
if 'dhb_overview_ntdun_call_total_duration' in cols:
df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
if 'dhb_overview_ntdun_call_in_times' in cols:
df.loc[df.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
if 'dhb_last_60_and_90_days_ntdun_call_in_duration' in cols:
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
if 'dhb_overview_dun_call_tel_total_nums' in cols:
df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
if 'dhb_last_30_days_dun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
if 'dhb_last_two_weeks_ntdun_call_in_duration' in cols:
df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
return df
# if __name__ == '__main__':
# features=sample.get_feature_by_version(feature_file_name)
# features=features[1:10]
# sample.save_features(features,feature_file_name)
feature,version
dhb_last_30_and_60_days_dun_call_avg_duration,1
dhb_last_30_and_60_days_dun_call_duration_above60,1
dhb_last_30_and_60_days_dun_call_duration_below15,1
dhb_last_30_and_60_days_dun_call_duration_between15_and_30,1
dhb_last_30_and_60_days_dun_call_in_duration,1
dhb_last_30_and_60_days_dun_call_in_times,1
dhb_last_30_and_60_days_dun_call_out_duration,1
dhb_last_30_and_60_days_dun_call_out_times,1
dhb_last_30_and_60_days_dun_call_tel_total_nums,1
dhb_last_30_and_60_days_dun_call_total_duration,1
dhb_last_30_and_60_days_dun_call_total_times,1
dhb_last_30_and_60_days_ntdun_call_avg_duration,1
dhb_last_30_and_60_days_ntdun_call_duration_above60,1
dhb_last_30_and_60_days_ntdun_call_duration_below15,1
dhb_last_30_and_60_days_ntdun_call_duration_between15_and_30,1
dhb_last_30_and_60_days_ntdun_call_in_duration,1
dhb_last_30_and_60_days_ntdun_call_in_times,1
dhb_last_30_and_60_days_ntdun_call_out_duration,1
dhb_last_30_and_60_days_ntdun_call_out_times,1
dhb_last_30_and_60_days_ntdun_call_tel_total_nums,1
dhb_last_30_and_60_days_ntdun_call_total_duration,1
dhb_last_30_and_60_days_ntdun_call_total_times,1
dhb_last_30_days_dun_call_avg_duration,1
dhb_last_30_days_dun_call_duration_above60,1
dhb_last_30_days_dun_call_duration_below15,1
dhb_last_30_days_dun_call_duration_between15_and_30,1
dhb_last_30_days_dun_call_in_duration,1
dhb_last_30_days_dun_call_in_times,1
dhb_last_30_days_dun_call_out_duration,1
dhb_last_30_days_dun_call_out_times,1
dhb_last_30_days_dun_call_tel_total_nums,1
dhb_last_30_days_dun_call_total_duration,1
dhb_last_30_days_dun_call_total_times,1
dhb_last_30_days_ntdun_call_avg_duration,1
dhb_last_30_days_ntdun_call_duration_above60,1
dhb_last_30_days_ntdun_call_duration_below15,1
dhb_last_30_days_ntdun_call_duration_between15_and_30,1
dhb_last_30_days_ntdun_call_in_duration,1
dhb_last_30_days_ntdun_call_in_times,1
dhb_last_30_days_ntdun_call_out_duration,1
dhb_last_30_days_ntdun_call_out_times,1
dhb_last_30_days_ntdun_call_tel_total_nums,1
dhb_last_30_days_ntdun_call_total_duration,1
dhb_last_30_days_ntdun_call_total_times,1
dhb_last_60_and_90_days_dun_call_avg_duration,1
dhb_last_60_and_90_days_dun_call_duration_above60,1
dhb_last_60_and_90_days_dun_call_duration_below15,1
dhb_last_60_and_90_days_dun_call_duration_between15_and_30,1
dhb_last_60_and_90_days_dun_call_in_duration,1
dhb_last_60_and_90_days_dun_call_in_times,1
dhb_last_60_and_90_days_dun_call_out_duration,1
dhb_last_60_and_90_days_dun_call_out_times,1
dhb_last_60_and_90_days_dun_call_tel_total_nums,1
dhb_last_60_and_90_days_dun_call_total_duration,1
dhb_last_60_and_90_days_dun_call_total_times,1
dhb_last_60_and_90_days_ntdun_call_avg_duration,1
dhb_last_60_and_90_days_ntdun_call_duration_above60,1
dhb_last_60_and_90_days_ntdun_call_duration_below15,1
dhb_last_60_and_90_days_ntdun_call_duration_between15_and_30,1
dhb_last_60_and_90_days_ntdun_call_in_duration,1
dhb_last_60_and_90_days_ntdun_call_in_times,1
dhb_last_60_and_90_days_ntdun_call_out_duration,1
dhb_last_60_and_90_days_ntdun_call_out_times,1
dhb_last_60_and_90_days_ntdun_call_tel_total_nums,1
dhb_last_60_and_90_days_ntdun_call_total_duration,1
dhb_last_60_and_90_days_ntdun_call_total_times,1
dhb_last_three_weeks_dun_call_avg_duration,1
dhb_last_three_weeks_dun_call_duration_above60,1
dhb_last_three_weeks_dun_call_duration_below15,1
dhb_last_three_weeks_dun_call_duration_between15_and_30,1
dhb_last_three_weeks_dun_call_in_duration,1
dhb_last_three_weeks_dun_call_in_times,1
dhb_last_three_weeks_dun_call_out_duration,1
dhb_last_three_weeks_dun_call_out_times,1
dhb_last_three_weeks_dun_call_tel_total_nums,1
dhb_last_three_weeks_dun_call_total_duration,1
dhb_last_three_weeks_dun_call_total_times,1
dhb_last_three_weeks_ntdun_call_avg_duration,1
dhb_last_three_weeks_ntdun_call_duration_above60,1
dhb_last_three_weeks_ntdun_call_duration_below15,1
dhb_last_three_weeks_ntdun_call_duration_between15_and_30,1
dhb_last_three_weeks_ntdun_call_in_duration,1
dhb_last_three_weeks_ntdun_call_in_times,1
dhb_last_three_weeks_ntdun_call_out_duration,1
dhb_last_three_weeks_ntdun_call_out_times,1
dhb_last_three_weeks_ntdun_call_tel_total_nums,1
dhb_last_three_weeks_ntdun_call_total_duration,1
dhb_last_three_weeks_ntdun_call_total_times,1
dhb_last_two_weeks_dun_call_avg_duration,1
dhb_last_two_weeks_dun_call_duration_above60,1
dhb_last_two_weeks_dun_call_duration_below15,1
dhb_last_two_weeks_dun_call_duration_between15_and_30,1
dhb_last_two_weeks_dun_call_in_duration,1
dhb_last_two_weeks_dun_call_in_times,1
dhb_last_two_weeks_dun_call_out_duration,1
dhb_last_two_weeks_dun_call_out_times,1
dhb_last_two_weeks_dun_call_tel_total_nums,1
dhb_last_two_weeks_dun_call_total_duration,1
dhb_last_two_weeks_dun_call_total_times,1
dhb_last_two_weeks_ntdun_call_avg_duration,1
dhb_last_two_weeks_ntdun_call_duration_above60,1
dhb_last_two_weeks_ntdun_call_duration_below15,1
dhb_last_two_weeks_ntdun_call_duration_between15_and_30,1
dhb_last_two_weeks_ntdun_call_in_duration,1
dhb_last_two_weeks_ntdun_call_in_times,1
dhb_last_two_weeks_ntdun_call_out_duration,1
dhb_last_two_weeks_ntdun_call_out_times,1
dhb_last_two_weeks_ntdun_call_tel_total_nums,1
dhb_last_two_weeks_ntdun_call_total_duration,1
dhb_last_two_weeks_ntdun_call_total_times,1
dhb_last_week_dun_call_avg_duration,1
dhb_last_week_dun_call_duration_above60,1
dhb_last_week_dun_call_duration_below15,1
dhb_last_week_dun_call_duration_between15_and_30,1
dhb_last_week_dun_call_in_duration,1
dhb_last_week_dun_call_in_times,1
dhb_last_week_dun_call_out_duration,1
dhb_last_week_dun_call_out_times,1
dhb_last_week_dun_call_tel_total_nums,1
dhb_last_week_dun_call_total_duration,1
dhb_last_week_dun_call_total_times,1
dhb_last_week_ntdun_call_avg_duration,1
dhb_last_week_ntdun_call_duration_above60,1
dhb_last_week_ntdun_call_duration_below15,1
dhb_last_week_ntdun_call_duration_between15_and_30,1
dhb_last_week_ntdun_call_in_duration,1
dhb_last_week_ntdun_call_in_times,1
dhb_last_week_ntdun_call_out_duration,1
dhb_last_week_ntdun_call_out_times,1
dhb_last_week_ntdun_call_tel_total_nums,1
dhb_last_week_ntdun_call_total_duration,1
dhb_last_week_ntdun_call_total_times,1
dhb_overview_dun_call_avg_duration,1
dhb_overview_dun_call_duration_above60,1
dhb_overview_dun_call_duration_below15,1
dhb_overview_dun_call_duration_between15_and_30,1
dhb_overview_dun_call_in_duration,1
dhb_overview_dun_call_in_times,1
dhb_overview_dun_call_out_duration,1
dhb_overview_dun_call_out_times,1
dhb_overview_dun_call_tel_total_nums,1
dhb_overview_dun_call_total_duration,1
dhb_overview_dun_call_total_times,1
dhb_overview_dun_first_call_time,1
dhb_overview_dun_last_call_time,1
dhb_overview_ntdun_call_avg_duration,1
dhb_overview_ntdun_call_duration_above60,1
dhb_overview_ntdun_call_duration_below15,1
dhb_overview_ntdun_call_duration_between15_and_30,1
dhb_overview_ntdun_call_in_duration,1
dhb_overview_ntdun_call_in_times,1
dhb_overview_ntdun_call_out_duration,1
dhb_overview_ntdun_call_out_times,1
dhb_overview_ntdun_call_tel_total_nums,1
dhb_overview_ntdun_call_total_duration,1
dhb_overview_ntdun_call_total_times,1
dhb_overview_ntdun_first_call_time,1
dhb_overview_ntdun_last_call_time,1
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
import os
'''
Read the feature file
'''
def get_features_from_file(feature_file_name):
    '''
    Read features from the feature file
    :return: df, columns=['feature','version']
    '''
    print('current directory:', os.path.abspath('.'))
df_feature=pd.read_csv(feature_file_name)
return df_feature
def get_feature_by_version(feature_file_name,version=None):
    '''
    Get the features for a given version number; if no version is specified, use the latest version
    :param version: int; None or a negative number means "not specified"
    :return: list
    '''
df_feature = get_features_from_file(feature_file_name)
if (version ==None) or (version<1):
version=df_feature.version.max()
return df_feature[df_feature.version == version].feature.tolist()
def save_features(features,feature_file_name):
    '''
    Compare the new features with the maintained feature file: if they match the latest version,
    there is no need to save; otherwise save them as a new feature version
    :param features: list
    :return:
    '''
f2=get_feature_by_version(feature_file_name)
if (set(f2) & set(features)) == (set(f2) | set(features)):
        print('features are already the newest, no need to save')
return True
else:
        #== save the features as a new version
tmp=pd.DataFrame(features,columns=['feature'])
df_feature=get_features_from_file(feature_file_name)
version=df_feature.version.max()+1
tmp['version']=version
columns=['feature','version']
df_feature=pd.concat([df_feature[columns],tmp[columns]])
df_feature[columns].to_csv(feature_file_name,index=None,encoding='utf8')
def cal_sample_date(last_sample_max_date=None,passdue_day=15):
    '''
    Extract sample data: based on last_sample_max_date of the previous sample and passdue_day,
    extract the data that has observable performance up to now.
    If last_sample_max_date is not given, use the current time as the baseline and extract roughly
    the last 3 months of samples that have passdue_day performance.
    :param last_sample_max_date: latest date of the previous sample, day precision, format '%Y-%m-%d'
    :param passdue_day: performance window, e.g. samples with 15 days of overdue performance
    :return: start_date, end_date - earliest and latest dates for which samples can be extracted
    '''
base_date=datetime.datetime.now().date()
    #== +5 because not every loan has an exact 30-day cycle; some may be 31 days etc.
    #== the extracted samples must not go beyond base_date
base_date=base_date+relativedelta(days=-(passdue_day+5),months=-1)
if last_sample_max_date is None:
start_date=base_date+relativedelta(months=-3)
end_date=base_date
else:
        # use last_sample_max_date as the baseline
        if type(last_sample_max_date) == str:
            last_sample_max_date = datetime.datetime.strptime(last_sample_max_date, '%Y-%m-%d %H:%M:%S').date()
if last_sample_max_date >=base_date:
last_sample_max_date=base_date
start_date=last_sample_max_date
end_date=base_date
return start_date,end_date
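# Worked example (illustrative, assuming today is 2019-06-20 and passdue_day=15): base_date is
# 2019-06-20 minus (15 + 5) days and one month, i.e. 2019-04-30; with no last_sample_max_date the
# function returns (2019-01-30, 2019-04-30), otherwise (last_sample_max_date, 2019-04-30).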
def read_record():
file_name = 'record.txt'
cols = ['model_name', 'min_date', 'max_date', 'sample_cnt',
'train_min_date', 'train_max_date', 'train_cnt','train_auc',
'test_min_date', 'test_max_date', 'test_cnt', 'test_auc', 'update_date']
    if os.path.exists(file_name):
        df = pd.read_csv(file_name)
else:
df = pd.DataFrame(columns=cols)
return df
def get_records(model_name):
    '''
    Get all iteration records for a given model
    :param model_name:
    :return:
    '''
df=read_record()
df_select = df[df.model_name == model_name]
df_select.sort_values(['update_date'], ascending=False, inplace=True)
return df_select
def get_last_record(model_name):
    '''
    Get the sample information of the previous iteration of the given model
    :param model_name:
    :return:
    '''
df_select=get_records(model_name)
    if df_select.shape[0] == 0:
return df_select
return df_select.head(1)
def save_model_record(model_name,min_date=None,max_date=None,sample_cnt=None,
train_min_date=None,train_max_date=None,train_cnt=None,train_auc=None,
test_min_date=None,test_max_date=None,test_cnt=None,test_auc=None):
'''
    model_name and update_date form the unique key; fields that receive a value are updated, the others are left unchanged
:param model_name:
:param min_date:
:param max_date:
:param sample_cnt:
:param train_min_date:
:param train_max_date:
:param train_cnt:
:param train_auc:
:param test_min_date:
:param test_max_date:
:param test_cnt:
:param test_auc:
:return:
'''
df_all=read_record()
df_all.reset_index(inplace=True)
    #== fetch today's record, if any
df_record=get_records(model_name)
    df_record = df_record[df_record.update_date == str(datetime.datetime.now().date())]
cols = ['model_name', 'min_date', 'max_date', 'sample_cnt',
'train_min_date', 'train_max_date', 'train_cnt', 'train_auc',
'test_min_date', 'test_max_date', 'test_cnt', 'test_auc', 'update_date']
if df_record.shape[0]==0:
df_record=pd.DataFrame(columns=cols)
df_record['model_name']=model_name
        df_record['update_date'] = str(datetime.datetime.now().date())
else:
df_all = df_all[~df_all.index.isin(df_record.index)]
df_record=__update__(df_record,'min_date',min_date)
df_record = __update__(df_record, 'max_date', max_date)
df_record = __update__(df_record, 'sample_cnt', sample_cnt)
df_record = __update__(df_record, 'train_min_date', train_min_date)
df_record = __update__(df_record, 'train_max_date', train_max_date)
df_record = __update__(df_record, 'train_cnt', train_cnt)
df_record = __update__(df_record, 'train_auc', train_auc)
df_record = __update__(df_record, 'test_min_date', test_min_date)
df_record = __update__(df_record, 'test_max_date', test_max_date)
df_record = __update__(df_record, 'test_cnt', test_cnt)
df_record = __update__(df_record, 'test_auc', test_auc)
pd.concat([df_all[cols],df_record[cols]]).to_csv('record.txt',index=None,encoding='utf8')
def __update__(df,name,value):
if value is not None:
df[name]=value
return df
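# Illustrative usage sketch (writes record.txt in the working directory; the values are assumptions,
# not part of the original module):
#
#   save_model_record('dhb_v1', min_date='2019-01-01', max_date='2019-04-30', sample_cnt=120000,
#                     train_cnt=96000, train_auc=0.74, test_cnt=24000, test_auc=0.71)
#   last = get_last_record('dhb_v1')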
import pandas as pd
from data.datasource.mysqldb import *
from data.datasource import dbquery
'''
Purpose: provide business data, including order_no, loan_id, user type (the one used by the strategy,
from risk_info), channel, application time, rejection reason, first overdue days, max overdue days,
number of loan instalments
'''
def query_risk_info(order_nos):
    '''
    Uses the online user type directly, rather than the one computed in the analysis database
    :param order_nos:
    :return: order_no, user_loan_type (the user type used by the strategy), audit rejection reason,
             whether rejected by blacklist or for other reasons
    '''
    # audit_result: 1 = approved, anything else = not approved
res = []
sql = '''
select biz_no as order_no,
ifnull(JSON_EXTRACT( audit_context_data, '$.user_loan_type_v3'),
JSON_EXTRACT(audit_context_data, '$.user_loan_type_v4')) as user_loan_type,
JSON_EXTRACT(audit_context_data, '$.filter_info_in_black') qg_black,
JSON_EXTRACT(audit_context_data, '$."third_data_source#xy_neg_code"') as xy_black,
audit_result
from biz_audit_log_data
where biz_no in %s
'''
for i in range(0, len(order_nos), 1000):
print('----exe sql %d---- ' % i)
res.append(pd.read_sql(sql % str(tuple(order_nos[i:i + 1000])), engine_risk))
df= pd.concat(res)
df.drop_duplicates(['order_no'],inplace=True)
df['reason']='未知'
df.loc[df.audit_result==1,'reason']='审核通过'
df.loc[(df.audit_result!=1) & (df.qg_black> 0),'reason']='命中QG黑名单'
df.loc[(df.audit_result != 1) & (df.xy_black=='0'), 'reason'] = '命中新颜黑名单'
    return df[['order_no', 'user_loan_type', 'reason']]
def query_byloanid(loan_ids):
    '''
    Data source: the analysis database
    :param loan_ids: list, funded loans
    :return: order_no, user_id, loan_id, user type, channel, application time, rejection reason,
             first overdue days, max overdue days, number of instalments
    '''
sql='''
select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
where t1.loan_id in %s
group by 1,2,3,4,5,6,7,8
'''
res=[]
for i in range(0, len(loan_ids), 1000):
print('----exe sql %d---- ' % i)
res.append(pd.read_sql(sql % str(tuple(loan_ids[i:i + 1000])), engine_analysis_new))
df = pd.concat(res)
df.order_no = df.order_no.apply(lambda x:x.decode('utf8'))
    #== drop duplicate records
df.sort_values(['loan_id'],ascending=True,inplace=True)
df.drop_duplicates(['loan_id'],keep='last',inplace=True)
return df
def query_by_orderno(order_nos):
    '''
    Data source: the analysis database
    :param order_nos: list, funded loans
    :return: order_no, user_id, loan_id, user type, channel, application time, rejection reason,
             first overdue days, max overdue days, number of instalments; reason in ['已放款','审核通过','审核未通过','黑名单']
    '''
sql = '''
select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
where t1.order_no in %s
group by 1,2,3,4,5,6,7,8
'''
res = []
for i in range(0, len(order_nos), 1000):
print('----exe sql %d---- ' % i)
res.append(pd.read_sql(sql % str(tuple(order_nos[i:i + 1000])), engine_analysis_new))
df = pd.concat(res)
df.order_no = df.order_no.apply(lambda x: x.decode('utf8'))
    # == drop duplicate records
df.sort_values(['loan_id'], ascending=True, inplace=True)
df.drop_duplicates(['loan_id'], keep='last', inplace=True)
return df
def query_bydate(start_date,end_date,is_loan=True):
    '''
    Fetch data for the given date range; if is_loan=True return the funded set, otherwise the application set
    :param start_date:
    :param end_date:
    :return: dataframe
    '''
if is_loan:
sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
where t1.applied=1
and t1.applied_at >= '%s' and t1.applied_at < '%s'
group by 1,2,3,4,5,6,7,8
''' % (start_date,end_date)
else:
sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
where t1.applied=1
and t1.applied_at >= '%s' and t1.applied_at < '%s'
group by 1,2,3,4,5,6,7,8
''' % (start_date,end_date)
df = dbquery.mysql_query(sql,engine_analysis_new)
df.sort_values(['loan_id'],ascending=True,inplace=True)
df.drop_duplicates(['loan_id'],keep='last',inplace=True)
tmp=query_risk_info(df.order_no.tolist())
return pd.merge(df,tmp,on='order_no',how='left')
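# Illustrative usage sketch (needs access to the analysis / risk databases; dates are assumptions,
# not part of the original module):
#
#   df_loans = query_bydate('2019-03-01', '2019-04-01', is_loan=True)
#   df_extra = query_byloanid(df_loans.loan_id.tolist())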
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,psutil
params_lgb = {
    'task': 'train',  # task type
    'application': 'binary',  # binary classification
    'boosting_type': 'gbdt',  # boosting type
    'num_boost_round': 150,  # number of boosting iterations
    'learning_rate': 0.01,  # learning rate
    'metric': {'logloss', 'auc'},  # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',  # objective function
    'max_depth': 4,
    'num_leaves': 20,  # number of leaves
    'feature_fraction': 0.9,  # fraction of features sampled per tree
    'bagging_fraction': 0.8,  # fraction of samples used per tree
    'bagging_freq': 5,  # k means perform bagging every k iterations
    'verbose': 1  # <0: fatal only, =0: errors (warnings), >0: info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
'''
instructions : return AUC of training set & test set
Parameters :
clf - classifier training object
training_set - training dataset
validation_set -
features - features of training set
target - X_test labels
'''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
'''
instructions : training lightgbm model with specified params
Parameters :
params - default params
df_train - training set
df_val - validation set
features - feature list of dataset
adds_on - parameters dict which would assign as training parameters
        target - target column or label list of samples
'''
params = params.copy()
print(type(df_train), type(df_val))
# training params
if adds_on != None:
for i in adds_on.keys():
params[i] = adds_on[i]
# convert DataFrame to binary format
lgb_train = lgb.Dataset(df_train[features], df_train[target])
lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
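# Illustrative usage sketch (assumes df_train / df_val with a 'target' column and a feature list;
# not part of the original module):
#
#   train_auc, val_auc, booster = train_lgbm(params_lgb, df_train, df_val, features,
#                                            adds_on={'learning_rate': 0.05, 'num_leaves': 31})
#   preds = booster.predict(df_val[features])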
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
'''
instructions : find optimal parameters with lgbm
Parameters :
params - default parameters (dict format)
target_params - parameter which would be tuning
features - features list
train - training set
val - validation set
target - target label
topN - top N optimal parameters
cv_fold - k folders CV
'''
# reassign as a duplication
params = params.copy()
lgb_train = lgb.Dataset(train[features], train[target])
lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
# create a ndarray shapes 1*n
topn = np.zeros(topN)
# make sure that memory can afford
print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
optimal_para = list(topn)
for deepth in np.arange(2, 7, 1):
for leaves in np.arange(2, 2 ** deepth, 2):
params['max_depth'] = deepth
params['num_leaves'] = leaves
print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
# return max auc(best performance)
auc_score = pd.Series(cv_result['auc-mean']).max()
print('auc ', auc_score)
boost_round = pd.Series(cv_result['auc-mean']).argmax()
# if anyone greater than item in topn list(either of them)
if (auc_score > topn).any():
# find the worst one / lowest AUC
topn[topn.argmin()] = auc_score
para = {}
# replace the worst parameter with a greater combination
para['max_depth'] = deepth
para['num_leaves'] = leaves
optimal_para[topn.argmin()] = para
return optimal_para, lgb_train, lgb_val, topn
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve),pd.Series(validation_curve)],index=['trainingAUC','validationAUC'],axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve,label='training')
# plt.plot(candidate_list, validation_curve,label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]
# pending: this function has not been tested yet
def lightGBM_gridCV(values, labels):
    # make sure that memory can afford
    print('Memory Occupancy Rate: ' + str(psutil.virtual_memory().percent) + '%')
    param_test = {
        'max_depth': np.arange(2, 7, 1),
        'num_leaves': np.arange(20, 200, 10),
    }
    estimator = lgb.LGBMClassifier(
        num_leaves=50,
        max_depth=13,
        learning_rate=0.1,
        n_estimators=1000,
        objective='binary',
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=7
    )
    gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
    gsearch.fit(values, labels)
    return gsearch.best_params_, gsearch.best_score_
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
'''
    plot feature importance sequence
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # saved figure DPI
    plt.rcParams['figure.dpi'] = 200  # display DPI
plt.figure(figsize=(10, 6))
classifier.plot_importance(clf, max_num_features=topN)
plt.title("Featurer Importances")
plt.show()
def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
        params - parameter dict passed to LGBMClassifier
    '''
    return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
# kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == fit the grid search
grid_search.fit(dftrain[features], dftrain[resp])
    # == the best parameters are available via grid_search.best_params_
return grid_search
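# Illustrative usage sketch (assumed parameter grid and data; not part of the original module):
#
#   clf = buildClf()
#   param_grid = {'max_depth': [3, 4, 5], 'num_leaves': [15, 31, 63]}
#   gs = automodelfit(clf, param_grid, df_train, features, 'target', kfold=5)
#   print(gs.best_params_, gs.best_score_)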
##############################################################################
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
target = 'target'
import xgboost as xgb
# default parameters
params_xgb = {
'learning_rate': 0.1,
'n_estimators': 200,
'max_depth': 3,
'min_child_weight': 1,
'gamma': 0,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'binary:logistic',
'nthread': 4,
'scale_pos_weight': 1,
'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
'''
instructions : return AUC of training set & test set
Parameters :
clf - classifier training object
training_set - training dataset
validation_set -
features - features of training set
target - X_test labels
'''
    # use the positive-class probability for AUC (XGBClassifier.predict returns hard labels)
    train_auc = roc_auc_score(training_set[target], clf.predict_proba(training_set[features])[:, 1])
    val_auc = roc_auc_score(validation_set[target], clf.predict_proba(validation_set[features])[:, 1])
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : training an xgboost model with specified params
    Parameters :
        params - parameter dict for XGBClassifier
        train - training set
        val - validation set
        features - feature list of dataset
        target - target column or label list of samples
    '''
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    returnAUC(xgb_clf, train, val, features, target=target)
    return xgb_clf
#############################################################################
def buildClf(max_depth=4,learning_rate=0.05, n_estimators=5000, gamma=0,
min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5):
    '''
    Create an XGBClassifier instance
    :param max_depth: tree depth; larger values overfit more easily, tune with CV -- booster parameter
    :param learning_rate: learning rate, alias eta -- booster parameter
    :param n_estimators: number of trees
    :param verbosity: 0: silent; 3: debug (replaces silent) -- general parameter
    :param objective: learning objective -- learning task parameter
        binary:logistic   logistic regression for binary classification, returns predicted probability (not the class)
        multi:softmax     multi-class classifier using softmax, returns the predicted class (not probabilities); requires num_class
        multi:softprob    same as multi:softmax but returns the probability of each class for each sample
    :param booster: gbtree, gblinear or dart -- general parameter
    :param n_jobs: replaces nthread, number of parallel threads -- general parameter
    :param gamma: minimum loss reduction required to make a split -- booster parameter
    :param min_child_weight: minimum sum of instance weight in a child; larger values guard against
        overfitting (but may underfit), tune with CV -- booster parameter
    :param max_delta_step: maximum delta step allowed for each tree's weight; 0: no constraint, >0: more conservative -- booster parameter
    :param subsample: fraction of samples used for each tree
    :param colsample_bytree: fraction of columns (features) sampled for each tree
    :param reg_alpha: L1 regularisation term
    :param reg_lambda: L2 regularisation term
    :param scale_pos_weight: usually (number of negative samples) / (number of positive samples)
    :param base_score:
    :param random_state: replaces seed; fixed to 7 for reproducibility only
    :return: XGBClassifier
    '''
    return xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                             verbosity=0, objective='binary:logistic',
                             booster='gbtree', n_jobs=-1, gamma=gamma, min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=7)
def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc'):
    '''
    Automatic hyper-parameter tuning via grid search
    :param clf: XGBClassifier
    :param param_grid: dict, search ranges for the parameters to tune
    :param scoring: evaluation metric for tuning, default 'roc_auc'
    :param dftrain:
    :param features:
    :param resp:
    :param kfold:
    :return:
    '''
# kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
    #== fit the grid search
    grid_search.fit(dftrain[features], dftrain[resp])
    #== the best parameters are available via grid_search.best_params_
return grid_search
def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
    '''
    Model training
    :type useTrainCV: object
    :param clf: XGBClassifier
    :param dftrain: training set
    :param features: feature list
    :param resp: label
    :param useTrainCV: if True, call xgb.cv to tune n_estimators
    :param kfold: number of cross-validation folds
    :param early_stopping_rounds: stop after this many rounds without improvement of the eval metric
    :param eval_metric: depends on the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
    if dftrain[features].shape[0] == 0:
        raise ValueError('no training data')
if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(dftrain[features], label=dftrain[resp])
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(dftrain[features], dftrain[resp],eval_metric=eval_metric)
return clf
def predict(clf,df,features):
    '''
    Compute predictions and predicted probabilities
    :param clf:
    :param df:
    :param features:
    :return:
    '''
df['predict']=clf.predict(df[features])
df['predict_proba']=clf.predict_proba(df[features])[:,1]
return df
def auc(clf,df,features,label):
    #== compute accuracy, auc and related metrics
df=predict(clf,df,features)
accu=metrics.accuracy_score(df[label].values, df['predict'].values)
auc=metrics.roc_auc_score(df[label],df['predict_proba'])
return {'accuracy':accu,'auc':auc}
def featureImportance(clf,features):
    '''
    Get the model's feature importances
    :param clf:
    :param features:
    :return:
    '''
# Print Feature Importance:
feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
feat_imp = feat_imp[feat_imp > 0]
feat_imp=feat_imp.to_frame().reset_index()
feat_imp.columns=['feature','weight']
return feat_imp
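# Illustrative end-to-end sketch (assumed df_train / df_test and feature list; not part of the
# original module):
#
#   clf = buildClf(max_depth=3, n_estimators=500)
#   clf = modelfit(clf, df_train, features, 'target', useTrainCV=True, kfold=5)
#   print(auc(clf, df_test, features, 'target'))
#   print(featureImportance(clf, features).head(20))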