Commit a3823e67 authored by linfang.wang's avatar linfang.wang

Update the previous model

parent fe8f7148
......@@ -9,7 +9,7 @@ def mysql_query(sql,engine_sql):
'''
res=[]
#== palo: each query must not exceed 10000 (presumably rows), so the result is read in chunks
tmp=pd.read_sql(sql,engine_sql,chunksize=5000)
tmp=pd.read_sql(sql,engine_sql,chunksize=5001)
for tt in tmp:
res.append(tt)
return pd.concat(res)
\ No newline at end of file
......@@ -37,7 +37,10 @@ def query_sample(start_date,end_date,is_loan=True):
''' % (','.join(features),start_date,end_date)
df=dbquery.mysql_query(sql,engine_risk_analysis)
yewu=query_byloanid(df.loan_id.tolist())
df.loan_id=df.loan_id.astype(int)
yewu.loan_id=yewu.loan_id.astype(int)
df=pd.merge(df,yewu,on='loan_id',how='inner')
df.applied_at=pd.to_datetime(df.applied_at)
value_map = {
"近3天": 1,
"近4-5天": 2,
......
......@@ -3,7 +3,8 @@ import numpy as np
import datetime
from mvp import xgbreport
from data.analyis import datacal
from mvp import dhb
# from mvp import dhb
from data.samples import dhb,sample
if __name__ == '__main__':
# features=[
......@@ -47,16 +48,36 @@ if __name__ == '__main__':
# 'third_data_source#xy_pan_newqueryAorgAcount',
# 'third_data_source#xy_pan_newqueryAsumAcount'
# ]
dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
df=dhb.dhb_features_extract()
print(df.columns.tolist())
print(df.target.unique())
label='target'
features=dhb.get_feature()
df[features]=df[features].astype(float)
df['target']=df['target'].astype(int)
print('----feature---',len(features))
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
xgbreport.report(dftrain,dftest,features,label,'','tmp.doc',kfold=2)
#== model name
model_name='dhb'
#== the target is 15 days (used below as the passdue_day threshold for the label)
passdue_day=15
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# record sample information
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment