Commit fe8f7148 authored by linfang.wang

加入MySQL 连接

parent b980367e
......@@ -8,7 +8,8 @@ def mysql_query(sql,engine_sql):
:return:dataframe
'''
res=[]
tmp=pd.read_sql(sql,engine_sql,chunksize=10000)
#== palo 每次查询不超过10000
tmp=pd.read_sql(sql,engine_sql,chunksize=5000)
for tt in tmp:
res.append(tt)
return pd.concat(res)
\ No newline at end of file
......@@ -4,3 +4,5 @@
2、去重标准,文本转0-1 or 其他
3、调参,哪些参数,参数标准
'''
import sample
import yewudata
\ No newline at end of file
......@@ -3,11 +3,12 @@ from data.samples import sample
from data.datasource import dbquery
from data.datasource.mysqldb import engine_risk_analysis
from data.samples.yewudata import *
import os
'''
目的:获取电话邦特征,样本数据,数据源为风控分析库
'''
feature_file_name='features/dhb.csv'
feature_file_name='/Users/wlf/PycharmProjects/model_mvp/data/samples/features/dhb.csv'
def get_feature():
return sample.get_feature_by_version(feature_file_name)
......@@ -53,24 +54,40 @@ def query_sample(start_date,end_date,is_loan=True):
}
cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", "dhb_overview_ntdun_first_call_time",
"dhb_overview_ntdun_last_call_time"]
#== df.columns 中必须有特征的
cols=list(set(cols) & set(df.columns.tolist()))
if len(cols)>0:
df[cols] = df[cols].applymap(lambda x: value_map[x])
cols=df.columns.tolist()
if 'dhb_last_60_and_90_days_ntdun_call_avg_duration' in cols:
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
if 'dhb_overview_ntdun_call_duration_above60' in cols:
df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
if 'dhb_last_30_and_60_days_ntdun_call_total_duration' in cols:
df.loc[
df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
if 'dhb_last_30_and_60_days_dun_call_in_duration' in cols:
df.loc[
df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
if 'dhb_last_30_days_ntdun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
if 'dhb_last_30_days_ntdun_call_tel_total_nums' in cols:
df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
if 'dhb_last_30_days_dun_call_in_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
if 'dhb_overview_ntdun_call_total_duration' in cols:
df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
if 'dhb_overview_ntdun_call_in_times' in cols:
df.loc[df.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
if 'dhb_last_60_and_90_days_ntdun_call_in_duration' in cols:
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
if 'dhb_overview_dun_call_tel_total_nums' in cols:
df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
if 'dhb_last_30_days_dun_call_total_duration' in cols:
df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
if 'dhb_last_two_weeks_ntdun_call_in_duration' in cols:
df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
return df
......
......@@ -157,12 +157,3 @@ dhb_overview_ntdun_call_total_duration,1
dhb_overview_ntdun_call_total_times,1
dhb_overview_ntdun_first_call_time,1
dhb_overview_ntdun_last_call_time,1
dhb_last_30_and_60_days_dun_call_duration_above60,2
dhb_last_30_and_60_days_dun_call_duration_below15,2
dhb_last_30_and_60_days_dun_call_duration_between15_and_30,2
dhb_last_30_and_60_days_dun_call_in_duration,2
dhb_last_30_and_60_days_dun_call_in_times,2
dhb_last_30_and_60_days_dun_call_out_duration,2
dhb_last_30_and_60_days_dun_call_out_times,2
dhb_last_30_and_60_days_dun_call_tel_total_nums,2
dhb_last_30_and_60_days_dun_call_total_duration,2
......@@ -11,6 +11,7 @@ def get_features_from_file(feature_file_name):
从feature 文件中读取feature
:return: df,columns=['feature','version']
'''
print('当前目录:',os.path.abspath('.'))
df_feature=pd.read_csv(feature_file_name)
return df_feature
......@@ -53,7 +54,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
:param passdue_day:查看表现的,比如逾期15天表现的样本
:return:start_date,end_date,可提取样本的最早时间,最晚时间
'''
base_date=datetime.date()
base_date=datetime.datetime.now().date()
#== +5 是因为不是每个用户的放款都是30天周期,有的可能是31天等
#== 提取的样本数据不得超过base_date
base_date=base_date+relativedelta(days=-(passdue_day+5),months=-1)
......@@ -63,7 +64,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
else:
#last_sample_max_date 为基准,计算
if type(last_sample_max_date)==str:
last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d').date()
last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d %H:%M:%S').date()
if last_sample_max_date >=base_date:
last_sample_max_date=base_date
start_date=last_sample_max_date
......@@ -81,7 +82,7 @@ def read_record():
df = pd.DataFrame(columns=cols)
return df
def get_record(model_name):
def get_records(model_name):
'''
获取某一个模型下的所有的迭代的记录
:param model_name:
......@@ -98,7 +99,7 @@ def get_last_record(model_name):
:param model_name:
:return:
'''
df_select=get_record(model_name)
df_select=get_records(model_name)
if df_select.shape == 0:
return df_select
return df_select.head(1)
......@@ -125,7 +126,7 @@ def save_model_record(model_name,min_date=None,max_date=None,sample_cnt=None,
df_all=read_record()
df_all.reset_index(inplace=True)
#== 获取当下的记录
df_record=get_record(model_name)
df_record=get_records(model_name)
df_record=df_record[df_record.update_date==datetime.date()]
cols = ['model_name', 'min_date', 'max_date', 'sample_cnt',
'train_min_date', 'train_max_date', 'train_cnt', 'train_auc',
......
......@@ -43,8 +43,8 @@ def query_byloanid(loan_ids):
:return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数
'''
sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
......@@ -69,8 +69,8 @@ def query_by_orderno(order_nos):
:return:order_no,user_id,loan_id,用户类型,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数 reason['已放款','审核通过','审核未通过','黑名单']
'''
sql = '''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
......@@ -98,8 +98,8 @@ def query_bydate(start_date,end_date,is_loan=True):
'''
if is_loan:
sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
......@@ -109,8 +109,8 @@ def query_bydate(start_date,end_date,is_loan=True):
''' % (start_date,end_date)
else:
sql='''
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
max(t2.passdue_day) as max_passdue_day
from loan_application t1
left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment