Commit 15489661 authored by 王家华

Adjusted the code structure

parent da357ba7
import pandas as pd
def mysql_query(sql, engine_sql):
    '''
    Query a large result set in chunks.
    :param sql: SQL statement
    :param engine_sql: SQLAlchemy engine used to run the query
    :return: dataframe
    '''
    res = []
    # == palo allows at most 10000 rows per query, so read in chunks
    tmp = pd.read_sql(sql, engine_sql, chunksize=5001)
    for tt in tmp:
        res.append(tt)
    return pd.concat(res)
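
# A minimal usage sketch, assuming a placeholder DSN and a hypothetical table
# `some_table`; swap in a real engine (e.g. one built by the sql_engine helper
# elsewhere in this repo) before running.
if __name__ == '__main__':
    from sqlalchemy import create_engine
    demo_engine = create_engine('mysql+pymysql://user:passwd@host:3306/db')  # placeholder DSN
    demo_df = mysql_query('select order_id, applied_at from some_table limit 100', demo_engine)
    print(demo_df.shape)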
import pymongo
import pandas as pd
import numpy as np
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
query = "{'order_id':1,'@key':1}"
'''
instructions : query mongoDB; the query dict should name the fields to return
Params :
    limit - filter dict (as a string, with @start_date / @end_date placeholders)
    query - projection dict (as a string) listing the keys to return
'''
def querymongo(start_time_period, end_time_period, limit, query):
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    mydb = myclient["rc_dp_feature_pro"]
    mycol = mydb["rc_feature_analysis_timing_v2"]
    # all data
    # x = mycol.find()
    # approval data
    # x = mycol.find({"wf_audit_result":"1"})
    # $gte = greater than or equal, $lt = less than
    # materialize the cursor before closing the client
    x = list(mycol.find(eval(limit), eval(query)))
    myclient.close()
    return pd.DataFrame(x)
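
# A minimal usage sketch: fill the @start_date / @end_date / @key placeholders in
# the module-level templates via str.replace; 'dhb_overview' is a hypothetical
# field name used only for illustration.
if __name__ == '__main__':
    demo_limit = limit.replace('@start_date', '2019-01-01').replace('@end_date', '2019-02-01')
    demo_query = query.replace('@key', 'dhb_overview')
    demo_df = querymongo('2019-01-01', '2019-02-01', demo_limit, demo_query)
    print(demo_df.head())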
[analysis_new]
db=analysis
host=172.30.4.63
port=3306
user=analysis_model
passwd=BGzTPQjDQqJ6PVnK
[risk_info]
db=risk_info
host=172.30.5.106
port=3306
user=sys_read
passwd=quant12345
[xyqb_feature]
db=xyqb_feature
host=xyqb-rule-db.quantgroups.com
port=6606
user=xyqb_rule_read
passwd=1q2w3e4r
[risk_analysis]
db=risk_analysis
host=172.20.6.9
port=9030
user=linfang_wang
passwd=BHWZ3zcZ
import os
from sqlalchemy import create_engine
import datetime
class sql_engine():
    def __init__(self, db, db_name=None, echo=False):
        """
        Create a database connection for the given config section name.
        :param db: section name in mysql_config.ini
        :param db_name: database name; defaults to the `db` entry of the section
        :param echo: echo SQL statements
        """
        try:
            # Python 2
            import ConfigParser
            self.cf = ConfigParser.ConfigParser()
        except ImportError:
            # Python 3
            import configparser
            self.cf = configparser.ConfigParser()
        self.cf.read(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'mysql_config.ini'))
        host = self.cf.get(db, 'host')
        user = self.cf.get(db, 'user')
        passwd = self.cf.get(db, 'passwd')
        port = int(self.cf.get(db, 'port'))
        if not db_name:
            db_name = self.cf.get(db, 'db')
        try:
            # prefer mysqldb; fall back to pymysql if it is not installed
            self.__engine = create_engine(
                'mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})
        except:
            self.__engine = create_engine(
                'mysql+pymysql://%s:%s@%s:%s/%s?charset=utf8' % (user, passwd, host, port, db_name), echo=echo,
                connect_args={'connect_timeout': 3600})

    def get_engine(self):
        return self.__engine
if 'echo' not in vars():
echo = False
engine_feature = sql_engine('xyqb_feature', 'xyqb_rule').get_engine()
engine_risk = sql_engine('risk_info', 'risk_info').get_engine()
engine_analysis_new = sql_engine('analysis_new').get_engine()
engine_risk_analysis = sql_engine('risk_analysis').get_engine()
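
# A minimal smoke-test sketch: any of the engines above can be handed straight to
# pandas; `select 1` only checks connectivity and assumes the database is reachable.
if __name__ == '__main__':
    import pandas as pd
    print(pd.read_sql('select 1 as ping', engine_analysis_new))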
from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt
from data.analyis import datacal
import seaborn as sns
import pandas as pd
plt.rc('figure',figsize=(8,6))
font_options={
'weight':'bold',
'size':'14'
}
plt.rc('font',**font_options)
def liftchart(df,x,y,classes='',bin=10,title='',xlabel='',ylabel=''):
    '''
    Lift chart: x is the x-axis column (e.g. model score), y the y-axis target.
    :param df: dataframe
    :param x: feature / score column
    :param y: target column
    :param classes: grouping column name, str
    :param bin: number of bins
    :param title:
    :param xlabel:
    :param ylabel:
    :return:
    '''
    # #== single-feature TODO: output still pending
plt.close('all')
if classes !='':
df_out = datacal.cal_accume(df, x, y, bin, classes=[classes])
        #== show the sample counts
df_fig = pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
values=['count'], aggfunc=['mean'])
df_fig=df_fig['mean']['count']
        #== number of rows
rows=df_fig.index.tolist()
n_rows=len(rows)
        # number of columns
cols=df_fig.columns.levels[0].categories.to_tuples().tolist()
n_cols=len(cols)
cell_text=df_fig.values.tolist()
plt.subplot(2, 1,1)
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out,'grid','acmMean',hue=classes,title=title+'累计',xlabel=xlabel,ylabel=ylabel)
else :
df_out = datacal.cal_accume(df, x, y, bin)
plt.subplot(2, 1, 1)
draw_lineplot(df_out, 'grid','mean', title=title, xlabel=xlabel, ylabel=ylabel)
plt.subplot(2, 1, 2)
draw_lineplot(df_out, 'grid','acmMean', title=title+'累计', xlabel=xlabel, ylabel=ylabel)
plt.tight_layout()
# plt.show()
return plt
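
# A minimal usage sketch on synthetic data; `score`, `target` and `channel` are
# illustrative column names and the grouped branch is used here.
if __name__ == '__main__':
    import numpy as np
    demo = pd.DataFrame({'score': np.random.rand(1000),
                         'target': np.random.randint(0, 2, 1000),
                         'channel': np.random.choice(['A', 'B'], 1000)})
    liftchart(demo, 'score', 'target', classes='channel', bin=10,
              title='demo lift chart', xlabel='score bin', ylabel='bad rate').show()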
def univarchart(df,x,y,bin=10,classes='',title='',xlabel='',ylabel=''):
    '''
    Plot the relationship between a feature and the label; y is the label.
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def pdpchart(df,x,y,bin=10,classes='',title='',xlabel='模型分',ylabel='逾期率'):
    '''
    Plot the relationship between a feature and the label; y is the label.
    :param df:
    :return:
    '''
plt.close('all')
plt.subplot(1, 1, 1)
if classes !='':
df_out = datacal.cal_univar(df, x, y, bin, classes=[classes])
draw_lineplot(df_out,'grid','mean',hue=classes,title=title,xlabel=xlabel,ylabel=ylabel)
else:
df_out = datacal.cal_univar(df, x, y, bin)
draw_lineplot(df_out, 'grid', 'mean', title=title, xlabel=xlabel, ylabel=ylabel)
# plt.show()
return plt
def draw_barplot(df,x,y,hue='',title=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
fig = plt.figure()
plt.close('all')
sns.set(style="whitegrid")
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
if hue != '':
sns.barplot(x, y, hue=hue, data=df, ax=ax)
else:
sns.barplot(x, y, data=df, ax=ax)
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return fig
def draw_lineplot(df,x,y,hue='',title='',xlabel='',ylabel=''):
    '''
    :param df: dataframe
    :param x: x-axis column
    :param y: y-axis column
    :param hue: grouping column
    :param title:
    :return: fig
    '''
pltz = PyplotZ()
pltz.enable_chinese()
# fig = plt.figure()
if hue != '':
for type in df[hue].unique().tolist():
            # == draw one dashed line per group
tmp=df[df[hue]==type]
plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o',label=type)
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
if xlabel !='':
pltz.xlabel(xlabel)
else:
pltz.xlabel(x)
if ylabel !='':
pltz.ylabel(ylabel)
else:
pltz.ylabel(y)
pltz.title(title)
pltz.legend()
plt.grid()
# plt.show()
return plt
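
# A minimal usage sketch: two groups of points drawn as dashed lines; `week`,
# `rate` and `group` are illustrative column names.
if __name__ == '__main__':
    demo = pd.DataFrame({'week': list(range(10)) * 2,
                         'rate': [0.10 + 0.01 * i for i in range(10)] + [0.20 + 0.01 * i for i in range(10)],
                         'group': ['A'] * 10 + ['B'] * 10})
    draw_lineplot(demo, 'week', 'rate', hue='group', title='demo', xlabel='week', ylabel='overdue rate').show()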
"""
Created on Thu Apr 18 11:32:06 2019
@author: wangjiahua
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
saved_path=None):
'''
instructions : visualization of pivot
'''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
    plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
# traverse each columns of dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
        axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + auc[i])
if plot_tab != False:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
fig.subplots_adjust(bottom=0.2)
plt.grid()
plt.ylabel(title)
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
plt.show()
return 1
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
    # rows * cols encoded in figure_arrangement (e.g. 11 -> 1 x 1)
    metric = (figure_arrangement // 10) * (figure_arrangement % 10)
    for i in range(int(np.ceil(len(col) / metric))):
cols = col[i * metric:]
for fig_ith in range(len(cols)):
axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
axs.plot(index,dataset.loc[cols[fig_ith]])
axs.set_title(cols[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.tight_layout()
plt.show()
return 1
#fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
#
#for fig_ith in range(len(df.columns)):
# axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
# axs.plot(df.index,df.iloc[fig_ith])
# axs.set_title(col[])
#plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
fig_path = None):
col = dataset.columns
index = pd.Series(dataset.index.sort_values()).astype(str)
plt.figure(figsize=fig_size)
#metric = figure_arrangement // 10 * figure_arrangement % 10
#cols = col[i * metric:]
axs = plt.subplot(111)
for fig_ith in range(len(col)):
axs.plot(index,dataset.loc[col[fig_ith]],label=col[fig_ith])
axs.set_title(col[fig_ith],fontsize = 7)
plt.xticks(fontsize = 5)
plt.yticks(fontsize = 5)
plt.grid()
if x_label != None:
axs.set_xlabel(x_label, fontsize = 5)
if y_label != None:
axs.set_ylabel(y_label, fontsize = 5)
plt.legend()
plt.tight_layout()
plt.show()
return 1
'''
'''
def plot_curve_mingle():
return 1
def density_chart(dataset,title):
for col in dataset.columns:
sns.kdeplot(dataset.loc[:,col],label = col)
plt.title(title)
plt.show()
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
# t1 = np.arange(0.0, 1.0, 0.01)
#
#
# for n in [1, 2, 3, 4]:
# plt.plot(t1, t1 ** n, label="n=%d" % n)
# leg = plt.legend(loc='best', ncol=4, mode="expand", shadow=True)
# leg.get_frame().set_alpha(alpha)
#
#
# # if this fig should be saved
# if fig_path != None:
# plt.savefig(fig_path + fig_name +'.png')
#
#
#
## for i in range(figure_arrangement%10):
## plt.subplots(,figsize=fig_size,linewidth=0.1)
#
# return 1
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,psutil
params_lgb = {
    'task': 'train',  # task type
    'application': 'binary',  # binary classification
    'boosting_type': 'gbdt',  # boosting type
    'num_boost_round': 150,  # number of boosting iterations
    'learning_rate': 0.01,  # learning rate
    'metric': {'logloss', 'auc'},  # evaluation metrics
    'early_stopping_rounds': None,
    # 'objective': 'regression',  # objective function
    'max_depth': 4,
    'num_leaves': 20,  # number of leaves
    'feature_fraction': 0.9,  # fraction of features used per tree
    'bagging_fraction': 0.8,  # fraction of samples used per tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1  # <0 fatal only, =0 errors (warnings), >0 info
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature list used for prediction
        target - label column name
    '''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
    '''
    instructions : train a lightgbm model with the specified params
    Parameters :
        params - default params
        df_train - training set
        df_val - validation set
        features - feature list of the dataset
        adds_on - parameter dict whose entries override the training parameters
        target - target column (label) of the samples
    '''
params = params.copy()
print(type(df_train), type(df_val))
# training params
if adds_on != None:
for i in adds_on.keys():
params[i] = adds_on[i]
# convert DataFrame to binary format
lgb_train = lgb.Dataset(df_train[features], df_train[target])
lgb_val = lgb.Dataset(df_val[features], df_val[target], reference=lgb_train)
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
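
# A minimal usage sketch on a synthetic binary target; `f0` / `f1` are
# illustrative feature names and the 1500/500 split is arbitrary.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_train, demo_val = demo.iloc[:1500], demo.iloc[1500:]
    tr_auc, va_auc, booster = train_lgbm(params_lgb, demo_train, demo_val, ['f0', 'f1'],
                                         adds_on={'num_boost_round': 50})
    print(tr_auc, va_auc)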
def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_fold=5):
    '''
    instructions : find optimal max_depth / num_leaves parameters for lgbm
    Parameters :
        params - default parameters (dict)
        features - feature list
        train - training set
        val - validation set
        target - target label
        topN - keep the top N parameter combinations
        cv_fold - k-fold CV
    '''
# reassign as a duplication
params = params.copy()
lgb_train = lgb.Dataset(train[features], train[target])
lgb_val = lgb.Dataset(val[features], val[target], reference=lgb_train)
# create a ndarray shapes 1*n
topn = np.zeros(topN)
# make sure that memory can afford
print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
optimal_para = list(topn)
for deepth in np.arange(2, 7, 1):
for leaves in np.arange(2, 2 ** deepth, 2):
params['max_depth'] = deepth
params['num_leaves'] = leaves
print("parameter combination : ", 'max_depth ', deepth, 'num_leaves ', leaves)
cv_result = lgb.cv(params, lgb_train, seed=7, nfold=cv_fold, verbose_eval=False)
# return max auc(best performance)
auc_score = pd.Series(cv_result['auc-mean']).max()
print('auc ', auc_score)
boost_round = pd.Series(cv_result['auc-mean']).argmax()
# if anyone greater than item in topn list(either of them)
if (auc_score > topn).any():
# find the worst one / lowest AUC
topn[topn.argmin()] = auc_score
para = {}
# replace the worst parameter with a greater combination
para['max_depth'] = deepth
para['num_leaves'] = leaves
optimal_para[topn.argmin()] = para
return optimal_para, lgb_train, lgb_val, topn
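
# A minimal usage sketch: grid-search depth / leaves on a synthetic frame and
# inspect the top combinations; `f0` / `f1` are illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    best_paras, _, _, best_aucs = lgb_params_tuning(params_lgb, ['f0', 'f1'],
                                                    demo.iloc[:1500], demo.iloc[1500:],
                                                    topN=3, cv_fold=3)
    print(best_paras, best_aucs)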
# training_curve.append(train_auc)
# validation_curve.append(val_auc)
# auc_matrix = pd.concat([pd.Series(training_curve),pd.Series(validation_curve)],index=['trainingAUC','validationAUC'],axis=1)
# print(auc_matrix)
#
# plt.plot(candidate_list, training_curve,label='training')
# plt.plot(candidate_list, validation_curve,label='validation')
# plt.legend()
# plt.show()
#
# return validation_curve[:3]
# pending here: this function has not been tested yet
# def lightGBM_gridCV(param_validation, params=params_lgb):
# # make sure that memory can afford
# print('Memory Occupancy Rate: ' + (str)(psutil.virtual_memory().percent) + '%')
#
# param_test = {
# 'max_depth': np.arange(2, 7, 1),
# 'num_leaves': np.arange(20, 200, 10),
# }
# estimator = LGBMRegressor(
# num_leaves=50,
# max_depth=13,
# learning_rate=0.1,
# n_estimators=1000,
# objective='binary',
# min_child_weight=1,
# param['metric'] = ['auc', 'binary_logloss'],
# subsample = 0.8,
# colsample_bytree = 0.8,
# nthread = 7
# )
# gsearch = GridSearchCV(estimator, param_grid=param_test, scoring='roc_auc', cv=5)
# gsearch.fit(values, labels)
# gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_
# return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
'''
plot feature importance squence
'''
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['savefig.dpi'] = 226  # image pixels (savefig DPI)
    plt.rcParams['figure.dpi'] = 200  # resolution (figure DPI)
    plt.figure(figsize=(10, 6))
    classifier.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
plt.show()
def buildClf(params=params_lgb):
    '''
    instructions : build a lgb classifier
    Params :
        params - parameter dict passed to LGBMClassifier
    '''
    return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
# kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1, cv=kfold, verbose=2, iid=True, refit=True)
    # == train the model
    grid_search.fit(dftrain[features], dftrain[resp])
    # == get the optimal parameters
return grid_search
##############################################################################
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn import metrics
target = 'target'
import xgboost as xgb
# default parameters
params_xgb = {
'learning_rate': 0.1,
'n_estimators': 200,
'max_depth': 3,
'min_child_weight': 1,
'gamma': 0,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'binary:logistic',
'nthread': 4,
'scale_pos_weight': 1,
'seed': 27
}
def returnAUC(clf, training_set, validation_set, features, target='target'):
    '''
    instructions : return AUC on the training set & validation set
    Parameters :
        clf - trained classifier object
        training_set - training dataset
        validation_set - validation dataset
        features - feature list used for prediction
        target - label column name
    '''
train_auc = roc_auc_score(training_set[target], clf.predict(training_set[features]))
val_auc = roc_auc_score(validation_set[target], clf.predict(validation_set[features]))
print('training set AUC : ', train_auc)
print('validation set AUC : ', val_auc)
return train_auc, val_auc
def xgb_train(params, train, val, features, target='target'):
    '''
    instructions : train an xgboost model with the specified params
    Parameters :
        params - parameter dict for XGBClassifier
        train / val - training and validation sets
        features - feature list of the dataset
        target - target column (label) of the samples
    '''
    dtrain = xgb.DMatrix(train[features], train[target])
    dval = xgb.DMatrix(val[features], val[target])
    # xgb_clf = xgb.XGBClassifier(params_xgb)
    xgb_clf = xgb.XGBClassifier(**params)
    xgb_clf.fit(train[features], train[target])
    # xgbm = xgb.train(params,dtrain)
    returnAUC(xgb_clf, train, val, features)
    # auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
    return xgb_clf
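
# A minimal usage sketch on a synthetic binary target; `f0` / `f1` are
# illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_clf = xgb_train(params_xgb, demo.iloc[:1500], demo.iloc[1500:], ['f0', 'f1'])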
#############################################################################
def buildClf(max_depth=4,learning_rate=0.05, n_estimators=5000, gamma=0,
min_child_weight=1, max_delta_step=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5):
    '''
    Create an XGBClassifier instance.
    :param max_depth: tree depth; larger values overfit more easily, tune via CV -- booster parameter
    :param learning_rate: learning rate, alias eta -- booster parameter
    :param n_estimators: number of trees
    :param verbosity: 0: silent; 3: debug (replaces silent); whether to print iteration info -- general parameter
    :param objective: learning objective -- learning-task parameter
        binary:logistic  logistic regression for binary classification, returns predicted probabilities (not classes)
        multi:softmax    softmax multi-class classifier, returns predicted classes (not probabilities); requires num_class
        multi:softprob   same as multi:softmax but returns per-class probabilities for each sample
    :param booster: gbtree, gblinear or dart -- general parameter
    :param n_jobs: replaces nthread, number of parallel jobs -- general parameter
    :param gamma: minimum loss reduction required to split a node -- booster parameter
    :param min_child_weight: minimum sum of instance weight in a leaf; larger values guard against overfitting (tune via CV) but may underfit -- booster parameter
    :param max_delta_step: maximum step size for each tree's weight update; 0: no constraint, >0: more conservative -- booster parameter
    :param subsample: row sampling ratio per tree
    :param colsample_bytree: column (feature) sampling ratio per tree
    :param reg_alpha: L1 regularization term
    :param reg_lambda: L2 regularization term
    :param scale_pos_weight: usually (number of negative samples) / (number of positive samples)
    :param base_score:
    :param random_state: replaces seed; fixed to 7 for reproducibility
    :return: XGBClassifier
    '''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=0,silent=0,objective='binary:logistic',
booster='gbtree',n_jobs=-1,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,seed=7
)
def automodelfit(clf,param_grid,dftrain,features,resp, kfold=10,scoring='roc_auc'):
    '''
    Automatic hyper-parameter tuning.
    :param clf : XGBClassifier
    :param param_grid : dict, search ranges for the parameters being tuned
    :param scoring : tuning metric, default roc_auc
    :param dftrain:
    :param features:
    :param resp:
    :param kfold:
    :return:
    '''
    # kflod=StratifiedKFold(n_splits=kfold,shuffle=True,random_state=7)
    grid_search=GridSearchCV(clf,param_grid,scoring=scoring,n_jobs=2,cv=kfold,verbose=2,iid=True,refit=True)
    #== train the model
    grid_search.fit(dftrain[features],dftrain[resp])
    #== get the optimal parameters
return grid_search
def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metric='auc',early_stopping_rounds=20):
    '''
    Train the model.
    :type useTrainCV: object
    :param clf: XGBClassifier
    :param dftrain: training set
    :param features: feature list
    :param resp: label column
    :param useTrainCV: if True, call xgb.cv first to tune n_estimators
    :param kfold: number of CV folds
    :param early_stopping_rounds: stop after this many rounds without improvement in the eval metric
    :param eval_metric: must match the objective; see https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
    if dftrain[features].shape[0]==0:
        raise ValueError('no training data')
if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(dftrain[features], label=dftrain[resp])
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=clf.get_params()['n_estimators'], nfold=kfold,
metrics=eval_metric, early_stopping_rounds=early_stopping_rounds,verbose_eval=True)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(dftrain[features], dftrain[resp],eval_metric=eval_metric)
return clf
def predict(clf,df,features):
    '''
    Compute predicted class and probability.
    :param clf:
    :param df:
    :param features:
    :return:
    '''
df['predict']=clf.predict(df[features])
df['predict_proba']=clf.predict_proba(df[features])[:,1]
return df
def auc(clf,df,features,label):
    #== compute accuracy, auc and related metrics
df=predict(clf,df,features)
accu=metrics.accuracy_score(df[label].values, df['predict'].values)
auc=metrics.roc_auc_score(df[label],df['predict_proba'])
return {'accuracy':accu,'auc':auc}
def featureImportance(clf,features):
    '''
    Get the feature weights of the model.
    :param clf:
    :param features:
    :return:
    '''
# Print Feature Importance:
feat_imp = pd.Series(clf.get_booster().get_fscore(), features).sort_values(ascending=False, na_position='last')
feat_imp = feat_imp[feat_imp > 0]
feat_imp=feat_imp.to_frame().reset_index()
feat_imp.columns=['feature','weight']
return feat_imp
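
# A minimal end-to-end sketch: build a classifier, let modelfit tune n_estimators
# via CV, then read AUC and feature weights. The data is synthetic and `f0` / `f1`
# are illustrative feature names.
if __name__ == '__main__':
    rng = np.random.RandomState(7)
    demo = pd.DataFrame({'f0': rng.rand(2000), 'f1': rng.rand(2000)})
    demo['target'] = (demo['f0'] + rng.rand(2000) > 1.0).astype(int)
    demo_feats = ['f0', 'f1']
    demo_clf = modelfit(buildClf(n_estimators=200), demo, demo_feats, 'target', kfold=5)
    print(auc(demo_clf, demo, demo_feats, 'target'))
    print(featureImportance(demo_clf, demo_feats))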
import pandas as pd
from datasource import mysqldb, mongodb
import time
from dateutil.relativedelta import relativedelta
import datetime
import pickle
'''
model instructions : establishes a dhb object which contains the attributes of the dhb model
...@@ -179,34 +181,34 @@ class dhb:
                'dhb_overview_ntdun_first_call_time',
                'dhb_overview_ntdun_last_call_time']
    sql = '''
        select ''' + str(features).replace('[', '').replace(']', '').replace('\'', '') + ''',applied_at,applied_from,applied_type,if(passdue_day>15,1,0) as target
        from risk_analysis
        where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
        and transacted = 1
        and dhb_flag =1
        and datediff(now(),deadline) > 15
        '''
    # default time interval
    start_time_period = (datetime.date.today() - relativedelta(months=+7)).strftime("%Y-%m-%d 00:00:00")
    end_time_period = (datetime.date.today() - relativedelta(days=+17)).strftime("%Y-%m-%d 00:00:00")

    def __init__(self, features=None, sql=None, start_time_period=None, end_time_period=None, passdue_day=15):
        try:
            # if the para was not Series
            if features != None:
                self.features = features
        except Exception as e:
            print("'features' parameter type Error, it should be list or Series")
            raise
        if start_time_period != None:
            self.start_time_period = start_time_period
        if end_time_period != None:
            self.end_time_period = end_time_period
        if sql != None:
            self.sql = sql
        else:
            sql = '''
            select ''' + str(features).replace('[', '').replace(']', '').replace('\'','') + ''',applied_at,applied_from,applied_type,if(passdue_day>''' + str(passdue_day) + ''',1,0) as target
            from risk_analysis
            where applied_at >= '@start_time_period' and applied_at < '@end_time_period'
            and transacted = 1
            and dhb_flag =1
            and datediff(now(),deadline) > ''' + str(passdue_day) + '''
            '''

    def dhb_features_extract(self):
...@@ -287,9 +289,31 @@ class dhb:
        liftchart plot
        '''

    def dhb_predict_with_pkl(self, test, pkl='./dhb_cuishou_jianzhi_v3.pkl', features=features):
        open_file = open(pkl, "rb")
        model = pickle.load(open_file)
        open_file.close()
        return model.predict(test[features])

    def dhb_dataSketch(self, df, given_dataset=None, start_time_period=start_time_period, end_time_period=end_time_period,
                       applied_type=None, applied_from=None):
        '''
        instructions : build a comparison
        Params :
            df - test dataset which was given
            score - score column
            target - label
            start_time_period -
            end_time_period -
            applied_type -
            applied_from -
        Returns :
            auc comparison
            liftchart plot
        '''
        limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
        query = "{'order_id':1,'@key':1}"
        df_mongo = mongodb.querymongo(start_time_period, end_time_period,
...@@ -324,14 +348,17 @@ class dhb:
        pivot.plot()
        return 1

    def data_merge(self):
        # merge data from mongodb & risk_analysis
        return 1

    def vlm(self, feature):
        return 1

    def psi(self, feature):
        return 1

    def liftchart(self, feature):
        return 1
......
# Author : Jason Wang
# latest update : May 6 2019
# version control :
#
#######################################################################################################################
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
# selected top-N features
...@@ -3,9 +3,9 @@ import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models_kit import lightgbm
from matplotlib import pyplot as plt
from graph import drawplot
def report(dftrain,dftest,features,label,path,filename,kfold=10):
......
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
split_methods='random', time_label='applied_at'):
    '''
    instructions - train/val/test split (splits only train & test when val_size is None)
    Params :
        dataset
        val_size - validation RATIO
        test_size - test set RATIO
        stratify - stratify LABEL
        random_state
        split_methods - random or timeSeries
        time_label - column that identifies date & time
    '''
    # split data as random
    if split_methods == 'random':
        # NOTE: currently delegates to the time-series split; the plain random split is kept below for reference
        df_train, df_test = train_test_split_general(dataset,val_size=None,stratify=None,split_methods='timeSeries')
        # df_train, df_test = train_test_split(dataset, test_size=test_size, random_state=random_state)
if val_size != None:
size = val_size / (1 - test_size)
df_train, df_val = train_test_split(df_train, test_size=size, random_state=random_state)
# case when validation set not exists
return df_train, df_val, df_test
# split data with time sequence
elif split_methods == 'timeSeries':
data_tmp = dataset.sort_values(by=[time_label], axis=0, ascending=False)
df_test = data_tmp[: int(len(dataset) * test_size)]
df_train = data_tmp[int(len(dataset) * test_size):]
return df_train, df_test
def split_train_val(df, trainsplit = 'random', trainsplitRatio = 0.8, sort_col=None):
    '''
    Split df into a training set and a validation set.
    :param df: dataframe
    :param trainsplit: how to split df into train/val, supports timeSeries or random, default random
    :param trainsplitRatio: for a random split, the fraction used as the training set (default 0.8)
    :param sort_col: for a time-series split, the time column to sort by
    :return:
    '''
dftrain=df.reset_index()
    #== split dftrain into a training set and a validation set
    if trainsplit=='random':
        # randomly assign train / val
        train = dftrain.sample(frac=trainsplitRatio, random_state=7)
        val = dftrain[~dftrain.index.isin(train.index)]
    elif trainsplit=='timeSeries':
        # assign train / val by time order
train = dftrain.sort_values(by=sort_col).head(int(len(dftrain) * trainsplitRatio))
val = dftrain[~dftrain.index.isin(train.index)]
else:
train = df
val = None
return train,val
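
# A minimal usage sketch: both splitters on a synthetic frame; `applied_at` is
# the time column they sort by (the default time_label / an explicit sort_col).
if __name__ == '__main__':
    demo = pd.DataFrame({'x': np.arange(100),
                         'target': np.random.randint(0, 2, 100),
                         'applied_at': pd.date_range('2019-01-01', periods=100)})
    tr, te = train_test_split_general(demo, val_size=None, split_methods='timeSeries')
    tr2, val2 = split_train_val(demo, trainsplit='timeSeries', trainsplitRatio=0.8, sort_col='applied_at')
    print(len(tr), len(te), len(tr2), len(val2))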
def cal_week(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: %y-%m-%d, the first day of each week
    '''
    columns = df.columns.tolist()
    if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name] = pd.to_datetime(df[date_name])
df[date_name_new] = df[date_name].dt.strftime('%w')
df[date_name_new] = df[date_name_new].astype(int)
df[date_name_new] = df.apply(lambda x: x[date_name] + datetime.timedelta(days=-x[date_name_new]), axis=1)
df[date_name_new] = pd.to_datetime(df[date_name_new]).dt.date
return df
def cal_month(df,date_name,date_name_new):
    '''
    :param df: dataframe
    :param date_name: e.g. applied_at
    :return: %y-%m
    '''
    columns=df.columns.tolist()
    if date_name not in columns:
        raise ValueError('not found %s' % date_name)
df[date_name]=pd.to_datetime(df[date_name])
df[date_name_new]=df[date_name].dt.strftime('%y-%m')
return df
def cal_feature_grid(df,feature,bin=10,method=2):
    '''
    Build the N-quantile cut points: negative values get their own bin, non-negative values are split into N bins.
    Discretization of the data, equal-frequency by default; 1 = equal-width, 2 = equal-frequency.
    :param df: dataframe
    :param feature:
    :param bin:
    :param method: 1: equal-width; 2: equal-frequency; 3: clustering; default 2
    :return:
    '''
    #== equal-width means each bin has the same width, (max - min) / bin
    #== so the bins may hold different numbers of samples
    tmp=df.copy()
    tmp[feature]=tmp[feature].astype(float)
    tmp[feature].fillna(-1,inplace=True)
    # by default, negative values form a separate bin
num = df[feature].nunique()
if method==1:
max=df[feature].max()
if max <0 :
max=0
if num < bin:
feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
else:
bin_index = [max*i / bin for i in range(0, bin + 1)]
feature_grid = sorted(set(bin_index) | set([-99999, -0.00001]))
else:
        # equal-frequency discretization: keep the bin counts as even as possible
if num < bin:
feature_grid = sorted(set(tmp[feature].unique().tolist()) | set([-0.00001]))
else:
            # == negatives get one separate bin, non-negatives are split into n equal-frequency parts
bin_index = [i / bin for i in range(0, bin + 1)]
feature_grid = sorted(set(tmp[tmp[feature] >= 0][feature].quantile(bin_index)) | set([-99999, -0.00001]))
return feature_grid
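
# A minimal worked example: 100 distinct non-negative values with the default
# equal-frequency method yield the quantile cut points plus the two sentinel
# edges (-99999 for negatives/missing, -0.00001 as the zero boundary),
# e.g. [-99999, -1e-05, 0.0, 19.8, 39.6, 59.4, 79.2, 99.0].
if __name__ == '__main__':
    demo = pd.DataFrame({'v': np.arange(100, dtype=float)})
    print(cal_feature_grid(demo, 'v', bin=5))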
def cal_accume(df,feature,target,bin=10,classes=[]):
    '''
    Group by `classes`, bin `feature` into quantiles, then compute count/mean/sum of `target` per bin plus cumulative count/mean/sum.
    :param df:
    :param feature:
    :param target:
    :param bin:
    :param classes:
    :return: per-bin mean, count, sum of target and the cumulative count, mean, sum
    '''
df_out=cal_univar(df,feature,target,bin,classes=classes)
df_out['acmCnt']=df_out.groupby(classes)['count'].cumsum()
df_out['acmSum']=df_out.groupby(classes)['sum'].cumsum()
df_out['acmMean']=df_out['acmSum']/df_out['acmCnt']
return df_out
def cal_univar(df,feature,target,bin=10,classes=[]):
    '''
    Group by `classes`, bin `feature` into quantiles, then compute count/mean/sum of `target` per bin.
    :param df: dataframe
    :param feature: feature in df.columns
    :param target: in df.columns, e.g. count(target), mean(target)
    :param bin: default = 10
    :param classes: grouping columns
    :return:
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if target not in columns:
        raise ValueError('not found %s' % target)
    if feature not in columns:
        raise ValueError('not found %s' % feature)
    tmp=df.copy()
    tmp[feature].fillna(-1, inplace=True)
    # == bin splitting; feature may be non-numeric
try:
tmp[feature]=tmp[feature].astype(float)
feature_grid = cal_feature_grid(tmp,feature,bin)
tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
tmp['grid'] = tmp['lbl'].cat.codes
except ValueError:
tmp['lbl']=tmp[feature]
tmp['grid']=tmp[feature]
if len(classes) > 0:
df_gp = tmp.groupby(classes+['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_gp.columns = classes+['grid','lbl', 'count', 'mean','sum']
df_out=df_gp
else:
df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_all.columns = ['grid','lbl', 'count', 'mean','sum']
df_out = df_all
return df_out
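
# A minimal usage sketch: decile bins of a score per group, with per-bin and
# cumulative count / mean / sum of a 0-1 target; `score`, `target` and `channel`
# are illustrative column names.
if __name__ == '__main__':
    demo = pd.DataFrame({'score': np.random.rand(1000),
                         'target': np.random.randint(0, 2, 1000),
                         'channel': np.random.choice(['A', 'B'], 1000)})
    print(cal_univar(demo, 'score', 'target', bin=10, classes=['channel']).head())
    print(cal_accume(demo, 'score', 'target', bin=10, classes=['channel']).head())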
def cal_distribution(df,target,classes=[]):
    '''
    Group by `classes` and compute count and mean of `target`.
    :param df: dataframe
    :param target: cal mean(target), count(target)
    :param classes: grouping columns
    :return: dataframe
    '''
    if df.shape[0]==0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if target not in columns:
        raise ValueError('not found target')
tmp=df.copy()
headers = classes + [ 'count', 'mean']
if len(classes) > 0:
df_gp=tmp.groupby(classes).agg({target:['count','mean']}).reset_index()
df_gp.columns=classes + ['count','mean']
df_out=df_gp
else:
all = [[tmp[target].count(),tmp[target].mean()]]
df_all = pd.DataFrame(all, columns=headers)
df_out=df_all
return df_out[headers]
def cal_miss(df,feature,classes=[]):
    '''
    target: compute the missing rate of a given feature
    :param df: dataframe
    :param feature: field name in df.columns
    :param classes: list of grouping columns; if empty, no grouping (default)
    :return df_out: dataframe containing feature, class_name [if given], cnt, miss_rate
    :argument warning: values are split into zero, non-zero and negative; missing values are treated as negative (filled with -1)
    '''
    if df.shape[0] <=0:
        raise ValueError('no data')
    columns=df.columns.tolist()
    if feature not in columns:
        raise ValueError('no feature')
tmp=df.copy()
try:
tmp[feature]=tmp[feature].astype(float)
tmp[feature].fillna(-1,inplace=True)
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'
except:
tmp['flag'] = '缺失值'
tmp.loc[tmp[feature].notna(), 'flag'] = '未缺失'
tmp[feature].fillna('缺失', inplace=True)
headers = classes+['flag', 'cnt', 'match_rate']
if len(classes) > 0:
        # == grouped case
df_gp = pd.merge(
tmp.groupby(classes)[feature].count().reset_index().rename(columns={feature: "cnt"}),
tmp.groupby(classes+['flag'])[feature].count().reset_index().rename(columns={feature: "cnt1"}),
on=classes, how='left'
)
df_gp['match_rate'] = np.round(df_gp.cnt1 / df_gp.cnt, 3)
df_out = df_gp
else:
df_out=tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature:'cnt1'})
df_out['cnt']=tmp.shape[0]
df_out['match_rate']=np.round(df_out['cnt1']/df_out['cnt'],3)
return df_out[headers]
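
# A minimal usage sketch: missing / zero / non-zero breakdown of one feature,
# overall and per group; column names are illustrative.
if __name__ == '__main__':
    demo = pd.DataFrame({'v': [1.0, 0.0, None, 2.0, None, 3.0],
                         'channel': ['A', 'A', 'A', 'B', 'B', 'B']})
    print(cal_miss(demo, 'v'))
    print(cal_miss(demo, 'v', classes=['channel']))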
import os
from docx import Document
from docx.shared import Inches
def buildDocument(path,filename):
if filename[-3:]!='doc':
if filename[-4:] !='docx':
raise ValueError('{} is not a word file'.format(filename))
if os.path.exists(os.path.join(path,filename)):
return Document(os.path.join(path,filename))
return Document()
def saveDocument(document,path,filename):
if filename[-3:] != 'doc':
if filename[-4:] != 'docx':
raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename))
def insert_table(document, cols, values):
    # cols: list of column names
    # values: list of row values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells
for i in range(len(cols)):
hdr_cells[i].text = cols[i]
for value in values:
row_cells = table.add_row().cells
for i in range(len(cols)):
row_cells[i].text = str(value[i])
return document
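
# A minimal usage sketch: create or reopen a report, append a small table and
# save it. The path / file name are illustrative, and the 'Medium Grid 1 Accent 1'
# style used by insert_table is assumed to exist in the target document template.
if __name__ == '__main__':
    doc = buildDocument('.', 'demo_report.docx')
    doc = insert_table(doc, ['feature', 'weight'], [['f0', 0.6], ['f1', 0.4]])
    saveDocument(doc, '.', 'demo_report.docx')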