Commit da357ba7 authored by 王家华

新增readme,lgb调参

parents f2e2d5cf a3823e67
...@@ -9,7 +9,7 @@ def mysql_query(sql,engine_sql): ...@@ -9,7 +9,7 @@ def mysql_query(sql,engine_sql):
''' '''
res=[] res=[]
#== palo 每次查询不超过10000 #== palo 每次查询不超过10000
tmp=pd.read_sql(sql,engine_sql,chunksize=5000) tmp=pd.read_sql(sql,engine_sql,chunksize=5001)
for tt in tmp: for tt in tmp:
res.append(tt) res.append(tt)
return pd.concat(res) return pd.concat(res)
\ No newline at end of file
...@@ -37,7 +37,10 @@ def query_sample(start_date,end_date,is_loan=True): ...@@ -37,7 +37,10 @@ def query_sample(start_date,end_date,is_loan=True):
''' % (','.join(features),start_date,end_date) ''' % (','.join(features),start_date,end_date)
df=dbquery.mysql_query(sql,engine_risk_analysis) df=dbquery.mysql_query(sql,engine_risk_analysis)
yewu=query_byloanid(df.loan_id.tolist()) yewu=query_byloanid(df.loan_id.tolist())
df.loan_id=df.loan_id.astype(int)
yewu.loan_id=yewu.loan_id.astype(int)
df=pd.merge(df,yewu,on='loan_id',how='inner') df=pd.merge(df,yewu,on='loan_id',how='inner')
df.applied_at=pd.to_datetime(df.applied_at)
value_map = { value_map = {
"近3天": 1, "近3天": 1,
"近4-5天": 2, "近4-5天": 2,
......
...@@ -4,12 +4,17 @@ import datetime ...@@ -4,12 +4,17 @@ import datetime
from mvp import xgbreport from mvp import xgbreport
from mvp import lgbreport from mvp import lgbreport
from data.analyis import datacal from data.analyis import datacal
from models import xgboost from models import xgboost
from models import lightgbm from models import lightgbm
from mvp import dhb from mvp import dhb
# from mvp import dhb
from data.samples import dhb,sample
dhb = dhb.dhb() dhb = dhb.dhb()
df_sample = dhb.dhb_features_extract() df_sample = dhb.dhb_features_extract()
target = 'target' target = 'target'
...@@ -23,6 +28,7 @@ print('----no.',len(features),'of samples of dhb----') ...@@ -23,6 +28,7 @@ print('----no.',len(features),'of samples of dhb----')
if __name__ == '__main__': if __name__ == '__main__':
# data extraction # data extraction
''' ## Old Edition here ''' ## Old Edition here
...@@ -69,3 +75,38 @@ if __name__ == '__main__': ...@@ -69,3 +75,38 @@ if __name__ == '__main__':
# dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
# df=dhb.dhb_features_extract()
# print(df.columns.tolist())
# print(df.target.unique())
# label='target'
# features=dhb.get_feature()
# df[features]=df[features].astype(float)
# df['target']=df['target'].astype(int)
# print('----feature---',len(features))
# df=pd.read_csv('test.csv')
#== 模型名称
model_name='dhb'
#== 目标是15天
passdue_day=15
df_log=sample.get_last_record(model_name)
if df_log.shape[0]==1:
start_date,end_date=sample.cal_sample_date(df_log.max_date[0],passdue_day)
else:
start_date, end_date = sample.cal_sample_date(passdue_day=passdue_day)
start_date='2019-01-01'
end_date='2019-01-10'
print(start_date,end_date)
df_sample=dhb.query_sample(start_date,end_date)
df_sample['applied_at'] = pd.to_datetime(df_sample['applied_at'])
df_sample['label']=1
df_sample.loc[df_sample.passdue_day >= passdue_day,'label']=0
dftrain,dftest=datacal.split_train_val(df_sample,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
# 记录样本信息
# sample.save_model_record(model_name,min_date=df_sample.applied_at.min(),max_date=df_sample.applied_at.max(),sample_cnt=df_sample.shape[0],
# train_min_date=dftrain.applied_at.min(),train_max_date=dftrain.applied_at.max(),train_cnt=dftrain.shape[0],
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment