Commit a132f117 authored by 王家华's avatar 王家华

update report nane

parent f1f45079
......@@ -221,4 +221,5 @@ def cal_miss(df,feature,classes=[]):
df_out=tmp.groupby('flag')[feature].count().reset_index().rename(columns={feature:'cnt1'})
df_out['cnt']=tmp.shape[0]
df_out['match_rate']=np.round(df_out['cnt1']/df_out['cnt'],3)
return df_out[headers]
\ No newline at end of file
return df_out[headers]
import pymongo
import pandas as pd
import numpy as np
# Default MongoDB filter, stored as the text of a dict literal (querymongo
# eval()s its `limit` argument, which has this same string-encoded form).
# '@start_date' / '@end_date' look like placeholders for the time window --
# presumably they must be substituted before the filter is used; TODO confirm.
limit = "{'wf_created_at': {'$gte': '@start_date', '$lt': '@end_date'}}"
# Default projection, also the text of a dict literal: order_id plus one more
# key. '@key' is presumably a placeholder for that key name -- TODO confirm.
query = "{'order_id':1,'@key':1}"
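'''
instructions : query MongoDB and return only the requested keys of the matching documents
Params :
limit - filter dict (given as a string) that restricts which documents are matched
query - projection dict (given as a string) listing the keys that should be returned
'''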
def querymongo(start_time_period, end_time_period, limit, query):
    """Run a `find` on rc_feature_analysis_timing_v2 and return a DataFrame.

    Params :
        start_time_period - value substituted for '@start_date' in a string
                            `limit` / `query`; None leaves the text untouched
        end_time_period   - value substituted for '@end_date', same rules
        limit - filter passed to `find`: a dict, or a string holding a dict literal
        query - projection passed to `find`: a dict, or a string holding a dict literal
    Returns :
        pandas.DataFrame built from the list of matching documents
    """
    def _as_dict(spec):
        # Already-built dicts are passed through unchanged.
        if not isinstance(spec, str):
            return spec
        if start_time_period is not None:
            spec = spec.replace('@start_date', str(start_time_period))
        if end_time_period is not None:
            spec = spec.replace('@end_date', str(end_time_period))
        # SECURITY: eval() executes arbitrary code. Only pass trusted,
        # program-defined strings here, or pass real dicts instead.
        return eval(spec)

    # NOTE(review): credentials are hard-coded in the connection string;
    # they should be moved to configuration / environment and rotated.
    myclient = pymongo.MongoClient("mongodb://rc_dp_feature_user:qgrcdpfeature_2019@172.20.1.150:20000/?authSource=rc_dp_feature_pro")
    try:
        mycol = myclient["rc_dp_feature_pro"]["rc_feature_analysis_timing_v2"]
        # The cursor is lazy: it must be drained while the client is still
        # open, otherwise the fetch happens after close().
        documents = list(mycol.find(_as_dict(limit), _as_dict(query)))
    finally:
        myclient.close()
    return pd.DataFrame(documents)
"""
Created on Thu Apr 18 11:32:06 2019
@author: wangjiahua
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Global matplotlib defaults applied to every chart drawn after this point.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # sans-serif font family used for text
    'axes.unicode_minus': False,    # do not use the Unicode minus glyph on axes
    'savefig.dpi': 226,             # DPI of saved image files
    'figure.dpi': 200,              # DPI of on-screen figures
})
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
                           save_path = None, figure_arrangement = 11, fig_size = (4,3),
                           fig_title='General Plot', fig_name = 'untitled',
                           fig_path = None):
    """Draw every column of `dataset` as its own curve, one subplot per column.

    Params :
        dataset - DataFrame; the index supplies the x axis, each column one curve
        x_label, y_label - optional axis labels applied to every subplot
        figure_arrangement - two-digit grid code: tens digit = rows,
                             units digit = columns (11 -> 1x1, 23 -> 2x3)
        fig_size - size of each figure, forwarded to plt.figure
        table_tab, save_path, fig_title, fig_name, fig_path - accepted for
            interface compatibility; not used by this function
    Returns :
        1
    Raises :
        ValueError if `figure_arrangement` describes an empty grid
    """
    # divmod avoids the precedence trap of `a // 10 * a % 10`, which is
    # evaluated as `((a // 10) * a) % 10`.
    n_rows, n_cols = divmod(figure_arrangement, 10)
    per_figure = n_rows * n_cols
    if per_figure <= 0:
        raise ValueError('figure_arrangement must encode a non-empty grid, e.g. 11 or 23')

    # Sort the rows once so that x labels and y values stay aligned.
    ordered = dataset.sort_index()
    x_values = [str(label) for label in ordered.index]
    columns = list(ordered.columns)

    # One figure per page of `per_figure` columns, so the subplot slot never
    # exceeds the grid size.
    for start in range(0, len(columns), per_figure):
        plt.figure(figsize=fig_size)
        page = columns[start:start + per_figure]
        for slot, name in enumerate(page, start=1):
            axs = plt.subplot(n_rows, n_cols, slot)
            # Select the column itself; `.loc[name]` would look the column
            # label up among the row labels.
            axs.plot(x_values, ordered[name])
            axs.set_title(name, fontsize = 7)
            plt.xticks(fontsize = 5)
            plt.yticks(fontsize = 5)
            plt.grid()
            if x_label is not None:
                axs.set_xlabel(x_label, fontsize = 5)
            if y_label is not None:
                axs.set_ylabel(y_label, fontsize = 5)
        plt.tight_layout()
        plt.show()
    return 1
#fig,axs = plt.subplots(1,1,figsize=(16,9),linewidth=0.1)
#
#for fig_ith in range(len(df.columns)):
# axs = plt.subplot(figure_arrangement * 10 + 1 + fig_ith)
# axs.plot(df.index,df.iloc[fig_ith])
# axs.set_title(col[])
#plt.tight_layout()
def plot_curve_multiCurve(dataset, x_label = None, y_label = None,table_tab = None,
                          save_path = None, figure_arrangement = 11, fig_size = (4,3),
                          fig_title='General Plot', fig_name = 'untitled',
                          fig_path = None):
    """Draw every column of `dataset` as a labelled curve on one shared axes.

    Params :
        dataset - DataFrame; the index supplies the x axis, each column one curve
        x_label, y_label - optional axis labels
        fig_size - figure size, forwarded to plt.figure
        fig_title - title of the chart
        table_tab, save_path, figure_arrangement, fig_name, fig_path - accepted
            for interface compatibility; not used by this function
    Returns :
        1
    """
    # Sort the rows once so that x labels and y values stay aligned.
    ordered = dataset.sort_index()
    x_values = [str(label) for label in ordered.index]

    plt.figure(figsize=fig_size)
    axs = plt.subplot(111)
    # Iterate the columns themselves; `.loc[name]` would look the column
    # label up among the row labels.
    for name, series in ordered.items():
        axs.plot(x_values, series, label=name)
    # A shared chart gets the chart title rather than the name of whichever
    # column happened to be drawn last.
    axs.set_title(fig_title, fontsize = 7)
    plt.xticks(fontsize = 5)
    plt.yticks(fontsize = 5)
    plt.grid()
    if x_label is not None:
        axs.set_xlabel(x_label, fontsize = 5)
    if y_label is not None:
        axs.set_ylabel(y_label, fontsize = 5)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return 1
'''
'''
def plot_curve_mingle():
    """Placeholder: draws nothing yet and always returns 1."""
    status = 1
    return status
def density_chart(dataset,title):
    """Overlay one kernel-density curve per column of `dataset`, then show the chart.

    Params :
        dataset - DataFrame whose columns are plotted one by one
        title - title of the chart
    """
    for name in dataset.columns:
        values = dataset.loc[:, name]
        sns.kdeplot(values, label = name)
    plt.title(title)
    plt.show()
def learning_curve():
    """Placeholder: not implemented yet; returns 1 like the other chart stubs."""
    # The header had no body, which is a syntax error; give it the same
    # placeholder result as its sibling stubs.
    return 1
def pdp_chart():
    """Placeholder: draws nothing yet and always returns 1."""
    status = 1
    return status
def uniVarChart():
    """Placeholder: draws nothing yet and always returns 1."""
    status = 1
    return status
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
# t1 = np.arange(0.0, 1.0, 0.01)
#
#
# for n in [1, 2, 3, 4]:
# plt.plot(t1, t1 ** n, label="n=%d" % n)
# leg = plt.legend(loc='best', ncol=4, mode="expand", shadow=True)
# leg.get_frame().set_alpha(alpha)
#
#
# # if this fig should be saved
# if fig_path != None:
# plt.savefig(fig_path + fig_name +'.png')
#
#
#
## for i in range(figure_arrangement%10):
## plt.subplots(,figsize=fig_size,linewidth=0.1)
#
# return 1
\ No newline at end of file
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy
import pandas
# Default LightGBM settings used by this module.
params = dict(
    task='train',                 # purpose of the run
    application='binary',         # binary classification
    boosting_type='gbdt',         # boosting type
    num_boost_round=100,          # number of boosting iterations
    learning_rate=0.01,           # learning rate
    metric={'logloss', 'auc'},    # evaluation metrics
    early_stopping_rounds=None,
    # 'objective': 'regression',  # objective function
    max_depth=4,
    num_leaves=20,                # number of leaves
    feature_fraction=0.9,         # fraction of features used when building a tree
    bagging_fraction=0.8,         # fraction of samples used when building a tree
    bagging_freq=5,               # k means bagging is performed every k iterations
    verbose=1,                    # <0 fatal only, =0 errors (warnings), >0 info
)
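'''
instructions : train a lightgbm model with the specified params
Parameters :
params - lightgbm parameter dict
training_set - dataset that contains the feature columns and the target column
features - feature list of the dataset
target - target column or label list of the samples
'''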
def lgb_train(params,training_set,features,target):
    """Train a LightGBM model on `training_set` and return it.

    Params :
        params - LightGBM parameter dict, forwarded to lgb.train
        training_set - table indexable by `features` and by `target`
        features - feature column list of `training_set`
        target - target column (label) of `training_set`
    Returns :
        the model returned by lgb.train
    """
    # Use a distinct local name; the original reused the function's own name.
    train_data = lgb.Dataset(training_set[features],training_set[target])
    # The original built the Dataset and then dropped it without training.
    return lgb.train(params, train_data)
import pandas as pd
import mysqldb
'''
model instructions : build a dhb object which contains the attributes of the dhb model
Params Constructor:
__init__(self,overdue_days=15,features=None,sql=None,start_time_period=None,end_time_period=None)
API :
.features - default features
.start_time_period
.end_time_period
.dhb_features_extract() extract dhb features
.dhb_comparasion(start_time_period = self.start_time_period, end_time_period = self.end_time_period, applied_type = None,applied_from = None)
'''
class dhb():
    """Sample definition and feature extraction for the dhb model.

    Class attributes `features`, `start_time_period` and `end_time_period`
    hold the defaults; the constructor can override each of them per instance.
    """

    # Default feature columns selected from risk_analysis.
    # NOTE(review): dhb_features_extract also handles
    # 'dhb_overview_ntdun_last_call_time', which is absent from this list --
    # confirm whether the omission is intentional.
    features = ['dhb_last_30_and_60_days_dun_call_avg_duration',
                'dhb_last_30_and_60_days_dun_call_duration_above60',
                'dhb_last_30_and_60_days_dun_call_duration_below15',
                'dhb_last_30_and_60_days_dun_call_duration_between15_and_30',
                'dhb_last_30_and_60_days_dun_call_in_duration',
                'dhb_last_30_and_60_days_dun_call_in_times',
                'dhb_last_30_and_60_days_dun_call_out_duration',
                'dhb_last_30_and_60_days_dun_call_out_times',
                'dhb_last_30_and_60_days_dun_call_tel_total_nums',
                'dhb_last_30_and_60_days_dun_call_total_duration',
                'dhb_last_30_and_60_days_dun_call_total_times',
                'dhb_last_30_and_60_days_ntdun_call_avg_duration',
                'dhb_last_30_and_60_days_ntdun_call_duration_above60',
                'dhb_last_30_and_60_days_ntdun_call_duration_below15',
                'dhb_last_30_and_60_days_ntdun_call_duration_between15_and_30',
                'dhb_last_30_and_60_days_ntdun_call_in_duration',
                'dhb_last_30_and_60_days_ntdun_call_in_times',
                'dhb_last_30_and_60_days_ntdun_call_out_duration',
                'dhb_last_30_and_60_days_ntdun_call_out_times',
                'dhb_last_30_and_60_days_ntdun_call_tel_total_nums',
                'dhb_last_30_and_60_days_ntdun_call_total_duration',
                'dhb_last_30_and_60_days_ntdun_call_total_times',
                'dhb_last_30_days_dun_call_avg_duration',
                'dhb_last_30_days_dun_call_duration_above60',
                'dhb_last_30_days_dun_call_duration_below15',
                'dhb_last_30_days_dun_call_duration_between15_and_30',
                'dhb_last_30_days_dun_call_in_duration',
                'dhb_last_30_days_dun_call_in_times',
                'dhb_last_30_days_dun_call_out_duration',
                'dhb_last_30_days_dun_call_out_times',
                'dhb_last_30_days_dun_call_tel_total_nums',
                'dhb_last_30_days_dun_call_total_duration',
                'dhb_last_30_days_dun_call_total_times',
                'dhb_last_30_days_ntdun_call_avg_duration',
                'dhb_last_30_days_ntdun_call_duration_above60',
                'dhb_last_30_days_ntdun_call_duration_below15',
                'dhb_last_30_days_ntdun_call_duration_between15_and_30',
                'dhb_last_30_days_ntdun_call_in_duration',
                'dhb_last_30_days_ntdun_call_in_times',
                'dhb_last_30_days_ntdun_call_out_duration',
                'dhb_last_30_days_ntdun_call_out_times',
                'dhb_last_30_days_ntdun_call_tel_total_nums',
                'dhb_last_30_days_ntdun_call_total_duration',
                'dhb_last_30_days_ntdun_call_total_times',
                'dhb_last_60_and_90_days_dun_call_avg_duration',
                'dhb_last_60_and_90_days_dun_call_duration_above60',
                'dhb_last_60_and_90_days_dun_call_duration_below15',
                'dhb_last_60_and_90_days_dun_call_duration_between15_and_30',
                'dhb_last_60_and_90_days_dun_call_in_duration',
                'dhb_last_60_and_90_days_dun_call_in_times',
                'dhb_last_60_and_90_days_dun_call_out_duration',
                'dhb_last_60_and_90_days_dun_call_out_times',
                'dhb_last_60_and_90_days_dun_call_tel_total_nums',
                'dhb_last_60_and_90_days_dun_call_total_duration',
                'dhb_last_60_and_90_days_dun_call_total_times',
                'dhb_last_60_and_90_days_ntdun_call_avg_duration',
                'dhb_last_60_and_90_days_ntdun_call_duration_above60',
                'dhb_last_60_and_90_days_ntdun_call_duration_below15',
                'dhb_last_60_and_90_days_ntdun_call_duration_between15_and_30',
                'dhb_last_60_and_90_days_ntdun_call_in_duration',
                'dhb_last_60_and_90_days_ntdun_call_in_times',
                'dhb_last_60_and_90_days_ntdun_call_out_duration',
                'dhb_last_60_and_90_days_ntdun_call_out_times',
                'dhb_last_60_and_90_days_ntdun_call_tel_total_nums',
                'dhb_last_60_and_90_days_ntdun_call_total_duration',
                'dhb_last_60_and_90_days_ntdun_call_total_times',
                'dhb_last_three_weeks_dun_call_avg_duration',
                'dhb_last_three_weeks_dun_call_duration_above60',
                'dhb_last_three_weeks_dun_call_duration_below15',
                'dhb_last_three_weeks_dun_call_duration_between15_and_30',
                'dhb_last_three_weeks_dun_call_in_duration',
                'dhb_last_three_weeks_dun_call_in_times',
                'dhb_last_three_weeks_dun_call_out_duration',
                'dhb_last_three_weeks_dun_call_out_times',
                'dhb_last_three_weeks_dun_call_tel_total_nums',
                'dhb_last_three_weeks_dun_call_total_duration',
                'dhb_last_three_weeks_dun_call_total_times',
                'dhb_last_three_weeks_ntdun_call_avg_duration',
                'dhb_last_three_weeks_ntdun_call_duration_above60',
                'dhb_last_three_weeks_ntdun_call_duration_below15',
                'dhb_last_three_weeks_ntdun_call_duration_between15_and_30',
                'dhb_last_three_weeks_ntdun_call_in_duration',
                'dhb_last_three_weeks_ntdun_call_in_times',
                'dhb_last_three_weeks_ntdun_call_out_duration',
                'dhb_last_three_weeks_ntdun_call_out_times',
                'dhb_last_three_weeks_ntdun_call_tel_total_nums',
                'dhb_last_three_weeks_ntdun_call_total_duration',
                'dhb_last_three_weeks_ntdun_call_total_times',
                'dhb_last_two_weeks_dun_call_avg_duration',
                'dhb_last_two_weeks_dun_call_duration_above60',
                'dhb_last_two_weeks_dun_call_duration_below15',
                'dhb_last_two_weeks_dun_call_duration_between15_and_30',
                'dhb_last_two_weeks_dun_call_in_duration',
                'dhb_last_two_weeks_dun_call_in_times',
                'dhb_last_two_weeks_dun_call_out_duration',
                'dhb_last_two_weeks_dun_call_out_times',
                'dhb_last_two_weeks_dun_call_tel_total_nums',
                'dhb_last_two_weeks_dun_call_total_duration',
                'dhb_last_two_weeks_dun_call_total_times',
                'dhb_last_two_weeks_ntdun_call_avg_duration',
                'dhb_last_two_weeks_ntdun_call_duration_above60',
                'dhb_last_two_weeks_ntdun_call_duration_below15',
                'dhb_last_two_weeks_ntdun_call_duration_between15_and_30',
                'dhb_last_two_weeks_ntdun_call_in_duration',
                'dhb_last_two_weeks_ntdun_call_in_times',
                'dhb_last_two_weeks_ntdun_call_out_duration',
                'dhb_last_two_weeks_ntdun_call_out_times',
                'dhb_last_two_weeks_ntdun_call_tel_total_nums',
                'dhb_last_two_weeks_ntdun_call_total_duration',
                'dhb_last_two_weeks_ntdun_call_total_times',
                'dhb_last_week_dun_call_avg_duration',
                'dhb_last_week_dun_call_duration_above60',
                'dhb_last_week_dun_call_duration_below15',
                'dhb_last_week_dun_call_duration_between15_and_30',
                'dhb_last_week_dun_call_in_duration',
                'dhb_last_week_dun_call_in_times',
                'dhb_last_week_dun_call_out_duration',
                'dhb_last_week_dun_call_out_times',
                'dhb_last_week_dun_call_tel_total_nums',
                'dhb_last_week_dun_call_total_duration',
                'dhb_last_week_dun_call_total_times',
                'dhb_last_week_ntdun_call_avg_duration',
                'dhb_last_week_ntdun_call_duration_above60',
                'dhb_last_week_ntdun_call_duration_below15',
                'dhb_last_week_ntdun_call_duration_between15_and_30',
                'dhb_last_week_ntdun_call_in_duration',
                'dhb_last_week_ntdun_call_in_times',
                'dhb_last_week_ntdun_call_out_duration',
                'dhb_last_week_ntdun_call_out_times',
                'dhb_last_week_ntdun_call_tel_total_nums',
                'dhb_last_week_ntdun_call_total_duration',
                'dhb_last_week_ntdun_call_total_times',
                'dhb_overview_dun_call_avg_duration',
                'dhb_overview_dun_call_duration_above60',
                'dhb_overview_dun_call_duration_below15',
                'dhb_overview_dun_call_duration_between15_and_30',
                'dhb_overview_dun_call_in_duration',
                'dhb_overview_dun_call_in_times',
                'dhb_overview_dun_call_out_duration',
                'dhb_overview_dun_call_out_times',
                'dhb_overview_dun_call_tel_total_nums',
                'dhb_overview_dun_call_total_duration',
                'dhb_overview_dun_call_total_times',
                'dhb_overview_dun_first_call_time',
                'dhb_overview_dun_last_call_time',
                'dhb_overview_ntdun_call_avg_duration',
                'dhb_overview_ntdun_call_duration_above60',
                'dhb_overview_ntdun_call_duration_below15',
                'dhb_overview_ntdun_call_duration_between15_and_30',
                'dhb_overview_ntdun_call_in_duration',
                'dhb_overview_ntdun_call_in_times',
                'dhb_overview_ntdun_call_out_duration',
                'dhb_overview_ntdun_call_out_times',
                'dhb_overview_ntdun_call_tel_total_nums',
                'dhb_overview_ntdun_call_total_duration',
                'dhb_overview_ntdun_call_total_times',
                'dhb_overview_ntdun_first_call_time']

    # Default sampling window, evaluated once when the class is defined:
    # from 7 months ago up to 16 days ago. pandas (already imported by this
    # file) is used because datetime / relativedelta are never imported here.
    start_time_period = (pd.Timestamp.today().normalize() - pd.DateOffset(months=7)).strftime("%Y-%m-%d 00:00:00")
    end_time_period = (pd.Timestamp.today().normalize() - pd.DateOffset(days=16)).strftime("%Y-%m-%d 00:00:00")

    # Text-valued "call time" columns that are converted to ordinal codes.
    _CALL_TIME_COLUMNS = ("dhb_overview_dun_first_call_time",
                          "dhb_overview_dun_last_call_time",
                          "dhb_overview_ntdun_first_call_time",
                          "dhb_overview_ntdun_last_call_time")

    # Ordinal code of each recency bucket; the code grows as the bucket
    # moves further into the past, and "无" ("none") maps to 0.
    _CALL_TIME_ORDINAL = {
        "近3天":1,
        "近4-5天":2,
        "近6-7天":3,
        "近8-15天":4,
        "近16-30天":5,
        "近31-60天":6,
        "近61-90天":7,
        "近91-120天":8,
        "近121-150天":9,
        "近151-180天":10,
        "180天前":11,
        "无":0
    }

    # Upper bound applied to each listed column after extraction.
    _UPPER_BOUNDS = {
        "dhb_last_60_and_90_days_ntdun_call_avg_duration": 42,
        "dhb_overview_ntdun_call_duration_above60": 25,
        "dhb_last_30_and_60_days_ntdun_call_total_duration": 800,
        "dhb_last_30_and_60_days_dun_call_in_duration": 1600,
        "dhb_last_30_days_ntdun_call_total_duration": 2500,
        "dhb_last_30_days_ntdun_call_tel_total_nums": 25,
        "dhb_last_30_days_dun_call_in_duration": 1000,
        "dhb_overview_ntdun_call_total_duration": 3000,
        "dhb_overview_ntdun_call_in_times": 25,
        "dhb_last_60_and_90_days_ntdun_call_in_duration": 1000,
        "dhb_overview_dun_call_tel_total_nums": 22,
        "dhb_last_30_days_dun_call_total_duration": 1100,
        "dhb_last_two_weeks_ntdun_call_in_duration": 300,
    }

    def __init__(self,overdue_days=15,features=None,sql=None,start_time_period=None,end_time_period=None):
        """Configure one extraction run.

        Params :
            overdue_days - overdue threshold in days; used for the `target`
                           flag and the maturity filter of the generated SQL
            features - feature columns to select; None keeps the class default
            sql - complete SQL statement to run instead of the generated one;
                  it may contain '@start_time_period' / '@end_time_period'
            start_time_period - window start; None keeps the class default
            end_time_period - window end; None keeps the class default
        """
        # Stored because dhb_features_extract reports it.
        self.overdue_days = overdue_days
        if features is not None:
            self.features = list(features)
        if start_time_period is not None:
            self.start_time_period = start_time_period
        if end_time_period is not None:
            self.end_time_period = end_time_period
        # Always leave self.sql defined: the original only kept a
        # caller-supplied statement and dropped the generated one.
        self.sql = sql if sql is not None else self._build_sql()

    def _build_sql(self):
        """Return the default sample query for the current features and overdue_days."""
        # Column names are joined bare: str(list) would wrap each name in
        # quotes, which SQL reads as string literals instead of columns.
        return (
            "select {columns},if(passdue_day>{days},1,0) as target, "
            "applied_at, applied_from, applied_type\n"
            "from risk_analysis\n"
            "where applied_at >= '@start_time_period' and applied_at < '@end_time_period'\n"
            "and transacted = 1\n"
            "and dhb_flag =1\n"
            "and datediff(now(),deadline) > {days}\n"
        ).format(columns=", ".join(self.features), days=self.overdue_days)

    def dhb_features_extract(self):
        """Extract the dhb sample from risk_analysis and clean it.

        Runs self.sql for the configured window, converts the call-time
        bucket columns to ordinal codes, caps the columns listed in
        _UPPER_BOUNDS, writes the result to a dated CSV file in the working
        directory and prints a one-line summary.

        Returns :
            the cleaned sample returned by query_sql
        """
        statement = (self.sql
                     .replace('@start_time_period', self.start_time_period)
                     .replace('@end_time_period', self.end_time_period))
        # use risk_analysis to extract data
        # NOTE(review): query_sql is not defined or imported in the visible
        # file -- presumably provided by the mysqldb module; confirm.
        dhb_loan = query_sql(statement)

        # Encode the ordered recency buckets as integers. The original
        # assigned a pd.get_dummies() result (many columns, built from an
        # undefined `df`) into these four columns, which cannot work; an
        # ordinal code keeps one column per feature. Values missing from the
        # table become NaN instead of raising.
        for column in self._CALL_TIME_COLUMNS:
            if column in dhb_loan.columns:
                dhb_loan[column] = dhb_loan[column].map(self._CALL_TIME_ORDINAL)

        # limit the upper boundary (columns not selected are skipped)
        for column, cap in self._UPPER_BOUNDS.items():
            if column in dhb_loan.columns:
                dhb_loan.loc[dhb_loan[column] >= cap, column] = cap

        today = pd.Timestamp.today().strftime("%Y-%m-%d")
        dhb_loan.to_csv("./dhb_loan_sample——"+today+".csv")
        print(pd.Timestamp.now().strftime('%Y.%m.%d %H:%M:%S') + "提取了dhb {}+ ".format(str(self.overdue_days)) + self.start_time_period + "to" + self.end_time_period + "时段样本")
        return dhb_loan

    def dhb_comparasion(self, df=None, start_time_period=None, end_time_period=None, applied_type=None, applied_from=None):
        """Fetch the dhb data-source records for a window (unfinished).

        Params :
            df - not used yet
            start_time_period - window start; None uses self.start_time_period
            end_time_period - window end; None uses self.end_time_period
            applied_type, applied_from - not used yet
        """
        # Defaults cannot reference `self` in the signature, so they are
        # resolved here instead.
        if start_time_period is None:
            start_time_period = self.start_time_period
        if end_time_period is None:
            end_time_period = self.end_time_period
        # NOTE(review): `pymongodb` and `limit` are not defined in the visible
        # file -- presumably the mongo query helper and its default filter;
        # confirm. TODO: the fetched frame is not used or returned yet.
        df_mongo = pymongodb(start_time_period, end_time_period, limit, "{'order_id':1,'model_exec_data_source#dhb':1}")
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import xgboost
from matplotlib import pyplot as plt
from data.graph import drawplot
import dhb
# `import dhb` binds the module, and calling a module raises TypeError, so
# the class has to be reached as an attribute of it.
# NOTE(review): assumes the module exposes a class named `dhb` -- confirm.
dhb = dhb.dhb()
# Pull the dhb sample with the default features and time window.
df_dhb = dhb.dhb_features_extract()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment