Commit fe8f7148 authored by linfang.wang

加入MySQL 连接

parent b980367e
...@@ -8,7 +8,8 @@ def mysql_query(sql,engine_sql): ...@@ -8,7 +8,8 @@ def mysql_query(sql,engine_sql):
:return:dataframe :return:dataframe
''' '''
res=[] res=[]
tmp=pd.read_sql(sql,engine_sql,chunksize=10000) #== palo 每次查询不超过10000
tmp=pd.read_sql(sql,engine_sql,chunksize=5000)
for tt in tmp: for tt in tmp:
res.append(tt) res.append(tt)
return pd.concat(res) return pd.concat(res)
\ No newline at end of file
...@@ -4,3 +4,5 @@ ...@@ -4,3 +4,5 @@
2、去重标准,文本转0-1 or 其他 2、去重标准,文本转0-1 or 其他
3、调参,哪些参数,参数标准 3、调参,哪些参数,参数标准
''' '''
import sample
import yewudata
\ No newline at end of file
...@@ -3,11 +3,12 @@ from data.samples import sample ...@@ -3,11 +3,12 @@ from data.samples import sample
from data.datasource import dbquery from data.datasource import dbquery
from data.datasource.mysqldb import engine_risk_analysis from data.datasource.mysqldb import engine_risk_analysis
from data.samples.yewudata import * from data.samples.yewudata import *
import os
''' '''
目的:获取电话邦特征,样本数据,数据源为风控分析库 目的:获取电话邦特征,样本数据,数据源为风控分析库
''' '''
feature_file_name='features/dhb.csv' feature_file_name='/Users/wlf/PycharmProjects/model_mvp/data/samples/features/dhb.csv'
def get_feature(): def get_feature():
return sample.get_feature_by_version(feature_file_name) return sample.get_feature_by_version(feature_file_name)
...@@ -53,24 +54,40 @@ def query_sample(start_date,end_date,is_loan=True): ...@@ -53,24 +54,40 @@ def query_sample(start_date,end_date,is_loan=True):
} }
cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", "dhb_overview_ntdun_first_call_time", cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", "dhb_overview_ntdun_first_call_time",
"dhb_overview_ntdun_last_call_time"] "dhb_overview_ntdun_last_call_time"]
#== df.columns 中必须有特征的
cols=list(set(cols) & set(df.columns.tolist()))
if len(cols)>0:
df[cols] = df[cols].applymap(lambda x: value_map[x]) df[cols] = df[cols].applymap(lambda x: value_map[x])
cols=df.columns.tolist()
if 'dhb_last_60_and_90_days_ntdun_call_avg_duration' in cols:
df.loc[ df.loc[
df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42 df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
if 'dhb_overview_ntdun_call_duration_above60' in cols:
df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25 df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
if 'dhb_last_30_and_60_days_ntdun_call_total_duration' in cols:
df.loc[ df.loc[
df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800 df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
if 'dhb_last_30_and_60_days_dun_call_in_duration' in cols:
df.loc[ df.loc[
df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600 df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
if 'dhb_last_30_days_ntdun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500 df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
if 'dhb_last_30_days_ntdun_call_tel_total_nums' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25 df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
if 'dhb_last_30_days_dun_call_in_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000 df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
if 'dhb_overview_ntdun_call_total_duration' in cols:
df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000 df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
if 'dhb_overview_ntdun_call_in_times' in cols:
df.loc[df.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25 df.loc[df.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
if 'dhb_last_60_and_90_days_ntdun_call_in_duration' in cols:
df.loc[ df.loc[
df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000 df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
if 'dhb_overview_dun_call_tel_total_nums' in cols:
df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22 df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
if 'dhb_last_30_days_dun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100 df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
if 'dhb_last_two_weeks_ntdun_call_in_duration' in cols:
df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300 df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
return df return df
......
...@@ -157,12 +157,3 @@ dhb_overview_ntdun_call_total_duration,1 ...@@ -157,12 +157,3 @@ dhb_overview_ntdun_call_total_duration,1
dhb_overview_ntdun_call_total_times,1 dhb_overview_ntdun_call_total_times,1
dhb_overview_ntdun_first_call_time,1 dhb_overview_ntdun_first_call_time,1
dhb_overview_ntdun_last_call_time,1 dhb_overview_ntdun_last_call_time,1
dhb_last_30_and_60_days_dun_call_duration_above60,2
dhb_last_30_and_60_days_dun_call_duration_below15,2
dhb_last_30_and_60_days_dun_call_duration_between15_and_30,2
dhb_last_30_and_60_days_dun_call_in_duration,2
dhb_last_30_and_60_days_dun_call_in_times,2
dhb_last_30_and_60_days_dun_call_out_duration,2
dhb_last_30_and_60_days_dun_call_out_times,2
dhb_last_30_and_60_days_dun_call_tel_total_nums,2
dhb_last_30_and_60_days_dun_call_total_duration,2
...@@ -11,6 +11,7 @@ def get_features_from_file(feature_file_name): ...@@ -11,6 +11,7 @@ def get_features_from_file(feature_file_name):
从feature 文件中读取feature 从feature 文件中读取feature
:return: df,columns=['feature','version'] :return: df,columns=['feature','version']
''' '''
print('当前目录:',os.path.abspath('.'))
df_feature=pd.read_csv(feature_file_name) df_feature=pd.read_csv(feature_file_name)
return df_feature return df_feature
...@@ -53,7 +54,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15): ...@@ -53,7 +54,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
:param passdue_day:查看表现的,比如逾期15天表现的样本 :param passdue_day:查看表现的,比如逾期15天表现的样本
:return:start_date,end_date,可提取样本的最早时间,最晚时间 :return:start_date,end_date,可提取样本的最早时间,最晚时间
''' '''
base_date=datetime.date() base_date=datetime.datetime.now().date()
#== +5 是因为不是每个用户的放款都是30天周期,有的可能是31天等 #== +5 是因为不是每个用户的放款都是30天周期,有的可能是31天等
#== 提取的样本数据不得超过base_date #== 提取的样本数据不得超过base_date
base_date=base_date+relativedelta(days=-(passdue_day+5),months=-1) base_date=base_date+relativedelta(days=-(passdue_day+5),months=-1)
...@@ -63,7 +64,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15): ...@@ -63,7 +64,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
else: else:
#last_sample_max_date 为基准,计算 #last_sample_max_date 为基准,计算
if type(last_sample_max_date)==str: if type(last_sample_max_date)==str:
last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d').date() last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d %H:%M:%S').date()
if last_sample_max_date >=base_date: if last_sample_max_date >=base_date:
last_sample_max_date=base_date last_sample_max_date=base_date
start_date=last_sample_max_date start_date=last_sample_max_date
...@@ -81,7 +82,7 @@ def read_record(): ...@@ -81,7 +82,7 @@ def read_record():
df = pd.DataFrame(columns=cols) df = pd.DataFrame(columns=cols)
return df return df
def get_record(model_name): def get_records(model_name):
''' '''
获取某一个模型下的所有的迭代的记录 获取某一个模型下的所有的迭代的记录
:param model_name: :param model_name:
...@@ -98,7 +99,7 @@ def get_last_record(model_name): ...@@ -98,7 +99,7 @@ def get_last_record(model_name):
:param model_name: :param model_name:
:return: :return:
''' '''
df_select=get_record(model_name) df_select=get_records(model_name)
if df_select.shape == 0: if df_select.shape == 0:
return df_select return df_select
return df_select.head(1) return df_select.head(1)
...@@ -125,7 +126,7 @@ def save_model_record(model_name,min_date=None,max_date=None,sample_cnt=None, ...@@ -125,7 +126,7 @@ def save_model_record(model_name,min_date=None,max_date=None,sample_cnt=None,
df_all=read_record() df_all=read_record()
df_all.reset_index(inplace=True) df_all.reset_index(inplace=True)
#== 获取当下的记录 #== 获取当下的记录
df_record=get_record(model_name) df_record=get_records(model_name)
df_record=df_record[df_record.update_date==datetime.date()] df_record=df_record[df_record.update_date==datetime.date()]
cols = ['model_name', 'min_date', 'max_date', 'sample_cnt', cols = ['model_name', 'min_date', 'max_date', 'sample_cnt',
'train_min_date', 'train_max_date', 'train_cnt', 'train_auc', 'train_min_date', 'train_max_date', 'train_cnt', 'train_auc',
......
...@@ -43,8 +43,8 @@ def query_byloanid(loan_ids): ...@@ -43,8 +43,8 @@ def query_byloanid(loan_ids):
:return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数 :return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数
''' '''
sql=''' sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day, select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse, t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day max(t2.passdue_day) as max_passdue_day
from loan_application t1 from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4 left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
...@@ -69,8 +69,8 @@ def query_by_orderno(order_nos): ...@@ -69,8 +69,8 @@ def query_by_orderno(order_nos):
:return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数 reason['已放款','审核通过','审核未通过','黑名单'] :return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数 reason['已放款','审核通过','审核未通过','黑名单']
''' '''
sql = ''' sql = '''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day, select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse, t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day max(t2.passdue_day) as max_passdue_day
from loan_application t1 from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4 left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
...@@ -98,8 +98,8 @@ def query_bydate(start_date,end_date,is_loan=True): ...@@ -98,8 +98,8 @@ def query_bydate(start_date,end_date,is_loan=True):
''' '''
if is_loan: if is_loan:
sql=''' sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day, select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse, t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day max(t2.passdue_day) as max_passdue_day
from loan_application t1 from loan_application t1
join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4 join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
...@@ -109,8 +109,8 @@ def query_bydate(start_date,end_date,is_loan=True): ...@@ -109,8 +109,8 @@ def query_bydate(start_date,end_date,is_loan=True):
''' % (start_date,end_date) ''' % (start_date,end_date)
else: else:
sql=''' sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day, select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse, t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day max(t2.passdue_day) as max_passdue_day
from loan_application t1 from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4 left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment