Commit 9b10189a authored by 王家华's avatar 王家华

plot函数加表格出异常暂时没法全部解决,调用了model tools的方法画图

parent bd18c3b0
This diff is collapsed.
This diff is collapsed.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import roc_auc_score
from models_kit import lightgbm
from models_kit import xgboost
def topN_feature_importance_plot(model, clf, title="untitled", save_path='./mvp/plots/', topN=20):
    '''
    Plot the top-N feature importances of a trained booster and save the figure.

    :param model: model library module (e.g. lightgbm / xgboost) exposing plot_importance()
    :param clf: trained booster/classifier handed to model.plot_importance()
    :param title: file-name prefix for the saved figure
    :param save_path: directory the figure is written to — assumed to exist; TODO confirm
    :param topN: maximum number of features to display
    :return: path of the saved PNG file
    '''
    plt.rcParams['font.sans-serif'] = ['SimHei']  # allow CJK glyphs in axis labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly with SimHei
    plt.rcParams['savefig.dpi'] = 226  # saved-image resolution
    plt.rcParams['figure.dpi'] = 200   # display resolution
    plt.figure(figsize=(10, 6))
    # Draw the importance chart once via the model library.
    # (The original diff left a stale duplicate call through an undefined
    # `classifier` name; only the `model` call is kept.)
    model.plot_importance(clf, max_num_features=topN)
    plt.title("Feature Importances")
    path = save_path + title + "_featureImportance.png"
    plt.savefig(path)
    plt.show()
    return path
def topN_feature_importance_list(features, clf, topN=3):
    '''
    Return the names of the topN most important features of a trained booster.

    :param features: list of feature names, aligned with clf.feature_importance()
    :param clf: trained booster exposing feature_importance()
    :param topN: number of feature names to return (was ignored: slice hard-coded to 3)
    :return: list of the topN feature names, most important first
    '''
    important_feat = pd.DataFrame({
        'column': features,
        'importance': clf.feature_importance(),
    }).sort_values(by='importance', ascending=False).column.tolist()[:topN]
    return important_feat
def model_selection(algorthm, clf, df_train, df_val, df_test, target, score, optimal_model, model_obj):
    '''
    Train one candidate model per parameter set, record train/validation AUC
    for each, and pick the parameter set with the best validation AUC.

    :param algorthm: algorithm name; currently only "lightGBM" is handled
    :param clf: existing classifier handle (unused in this function)
    :param df_train: training dataframe
    :param df_val: validation dataframe
    :param df_test: test dataframe (unused in this function)
    :param target: label column name
    :param score: column holding the online (baseline) model score
    :param optimal_model: iterable of candidate parameter overrides
                          (the original looped over the undefined name `optimal_para`)
    :param model_obj: model object exposing the feature list via .features
    :return: the Params entry with the highest validation AUC
    '''
    # Metric matrix: one column per model; first column is the online baseline
    # model, scored by the existing `score` field.
    model_matrix_index = ['name', 'Params', 'trainAUC', 'validationAUC']
    model_matrix = pd.DataFrame(['NULL', 'NULL',
                                 roc_auc_score(df_train[target], df_train[score]),
                                 # baseline validation AUC measured on the validation
                                 # set (the original reused df_train here)
                                 roc_auc_score(df_val[target], df_val[score])],
                                index=model_matrix_index,
                                columns=['线上模型'])
    # column label for each candidate model
    pointer = 0
    # iterate over the candidate parameter combinations
    for param in optimal_model:
        if algorthm == "lightGBM":
            train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val,
                                                           model_obj.features,
                                                           adds_on=param, target=target)
            model_matrix = pd.concat([model_matrix,
                                      pd.DataFrame(['lightGBM', param, train_auc, val_auc],
                                                   index=model_matrix_index,
                                                   columns=[pointer])], axis=1)
            pointer += 1
    # simply pick the params whose validation-set AUC is highest
    best_params = model_matrix.T.sort_values(by='validationAUC', ascending=False).iloc[0, :].loc['Params']
    return best_params
\ No newline at end of file
......@@ -210,7 +210,7 @@ class dhb:
and datediff(now(),deadline) > ''' + str(passdue_day) + '''
'''
def dhb_features_extract(self,df):
def dhb_features_prepocessing(self,dhb_loan):
try:
value_map = {
"近3天": 1,
......@@ -229,12 +229,12 @@ class dhb:
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period))
# print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
# self.end_time_period))
dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis)
# dhb_loan = pd.read_sql(
# self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
# mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
......@@ -267,9 +267,9 @@ class dhb:
dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
# print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
# time.time())) + "提取了dhb " + self.start_time_period + "to" + self.end_time_period + "时段样本")
# ignore exceptions such as "colmns doesn't exist"
except Exception as e:
print("data preprocessing ERR ",e)
......
import pandas as pd
import numpy as np
'''
@allocator V1.0
'''
#################################################### report settings ###################################################
from models_obj import dhb_obj
import datetime
from tools import datacal
import pandas as pd
import os
from mvp import refit
from mvp import rebuild
from models_obj import dhb_obj
###### global variables ######
# name of the label column used throughout this script
target = 'target'
#############################
# build the default dhb data object and pull the raw sample from it
dhb = dhb_obj.dhb()
df_sample = dhb.dhb_features_extract()
features = dhb.features
# cast feature columns to float and the label to int before modelling
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
print('period of time: ',dhb.start_time_period,'-',dhb.end_time_period)
print('----no.',len(features),'of samples of dhb----')
# to save model performance
# NOTE(review): this __main__ block appears to be two script versions
# concatenated by a diff, with the original indentation stripped; it is
# documented as-is and should be re-indented / de-duplicated before running.
if __name__ == '__main__':
# data extraction
''' ## Old Edition here
# if total sample more than 30000, it would use train-validation-test
# else use CV to parameters tuning
# if len(df_sample) >= 30000:
# df_train,df_val,df_test = datacal.train_test_split_general(df_sample, val_size=0.25, test_size=0.25, stratify='target', random_state=7)
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
# default sampling method
# NOTE(review): called with no arguments here, while other call sites pass
# the sample dataframe and split sizes — confirm the defaults exist.
df_train, df_val, df_test = datacal.train_test_split_general()
# model refit
#xgboost
# placeholder AUC records for the xgboost model
xgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
# NOTE(review): the two assignments below are redundant — the keys are
# already initialised to None in the literal above.
xgb_model_auc['training_auc'] = None
xgb_model_auc['val_auc'] = None
#xgbreport.report(df_train, df_test, df_val, features, target, '','dhb模型迭代报告.doc', kfold = 2)
## TODO: add per-dataset AUC for xgb, plus AUC by KA channel / customer group
#ligthtgbm
# placeholder AUC records for the lightgbm model (same redundancy as above)
lgb_model_auc = {'training_auc' : None, 'val_auc' : None, 'test_auc' : None}
lgb_model_auc['training_auc'] = None
lgb_model_auc['val_auc'] = None
#dftrain,dftest = datacal.split_train_val(df_sample,trainsplit = 'timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
#lgbreport.report(df_train, df_test, df_val, features, target,'','dhb模型迭代报告.doc', kfold = 2)
# merge as single dataframe full of models
#pd.DataFrame(xgb_model)
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
#== model name
model_name='dhb'
#== overdue target is 15 days
passdue_day=15
# fetch the last recorded run for this model to derive the sample window
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
# NOTE(review): the hard-coded dates below override the computed window —
# presumably left in for debugging; confirm before production use.
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
# label: 1 by default, 0 when passdue_day reaches the threshold
# NOTE(review): confirm the intended polarity (0 = bad/overdue here).
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# record sample information
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
from tools import datacal
# channel list: applied_from code(s) -> channel display name
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
# application-type list: applied_type code(s) -> display name
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# workspace path
# NOTE(review): variable name is misspelled ("worksapce") — it is referenced
# below, so renaming must be done at every use site at once.
worksapce = 'E:\\bla\\model_mvp\\'
# sample file path
sample_path = 'E:\\model\\model_mvp\\mvp\\sample.csv'
# N+ label column
target = 'target'
#################################################### report settings #############################################################################
# column holding the online model score
score = 'score'
# NOTE(review): the two assignments below duplicate the identical dicts
# defined above and can be removed.
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# column holding the prediction score
prediction = 'predict'
# refit / rebuild sequence
# report output path
report_path = worksapce
# report file name
report_name = "lgb_report.docx"
# chdir into the workspace so relative paths resolve correctly
os.chdir(worksapce)
#################################################### training settings #################################################
# create the dhb object (all default parameters)
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# to adjust the feature set, assign dhb.features = ... directly here
# extract the sample
df_sample = dhb.dhb_features_extract()
# back up df_sample
# NOTE(review): to_csv() writes CSV content despite the .xlsx file name.
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
#df_sample = dhb.dhb_features_extract()
# read the sample back in from csv here
# NOTE(review): this overwrites the dataframe just extracted above — the
# extraction seems kept only for the backup side effect; confirm.
df_sample = pd.read_csv(sample_path,engine='python')
# dhb data preprocessing
# report sequence
# custom method / default preprocessing method
df_sample = dhb.dhb_features_prepocessing(df_sample)
# back up df_sample
#df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# default train/validation/test split, stratified on the label column
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify=target,
                                                             random_state=7, split_methods='random',
                                                             time_label='applied_at')
# model refit — df_sample is still needed here, so it is released only
# afterwards (the original executed `del df_sample` first, which makes the
# next line raise NameError).
model_matrix, lgbm = refit.model_fit(df_sample, dhb, target, score)
del df_sample
print(model_matrix)
# generate the report
status = refit.model_report(lgbm, df_train, df_val, df_test, dhb, target,
                            score, prediction, report_path, report_name, applied_from, applied_type, topN=3)
......
This diff is collapsed.
......@@ -22,9 +22,16 @@ def saveDocument(document,path,filename):
raise ValueError('{} is not a word file'.format(filename))
return document.save(os.path.join(path,filename))
def insert_table(document, cols, values):
# cols 为列名
# values 为值,list
def insert_table(document,df):
'''
instructions : plot table which insert into docx
:param document: document obj
:param df: dataframe
:return:
'''
cols = df.columns
values = df.values
table = document.add_table(rows=1, cols=len(cols),style='Medium Grid 1 Accent 1')
hdr_cells = table.rows[0].cells
for i in range(len(cols)):
......@@ -32,5 +39,8 @@ def insert_table(document, cols, values):
for value in values:
row_cells = table.add_row().cells
for i in range(len(cols)):
row_cells[i].text = str(value[i])
return document
\ No newline at end of file
if type(value[i])==str:
row_cells[i].text = value[i]
else:
row_cells[i].text = str(value[i])
return document
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment