Commit a1429476 authored by linfang.wang

电话邦跑

parent 03588f52
...@@ -77,6 +77,8 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metr ...@@ -77,6 +77,8 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metr
:param eval_metric 同 目标函数 objective 有关,取值https://xgboost.readthedocs.io/en/latest/python/python_api.html# :param eval_metric 同 目标函数 objective 有关,取值https://xgboost.readthedocs.io/en/latest/python/python_api.html#
:return: :return:
''' '''
if dftrain[features].shape[0]==0:
    # Fail fast when the selected training frame has no rows.
    # A bare string cannot be raised in Python 3 (it triggers
    # "TypeError: exceptions must derive from BaseException"), so raise a
    # real exception type that carries the same message.
    raise ValueError(' NO train data !!!! ')
if useTrainCV: if useTrainCV:
# kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7) # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
xgb_param = clf.get_xgb_params() xgb_param = clf.get_xgb_params()
......
...@@ -49,8 +49,14 @@ if __name__ == '__main__': ...@@ -49,8 +49,14 @@ if __name__ == '__main__':
# ] # ]
dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00') dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
df=dhb.dhb_features_extract() df=dhb.dhb_features_extract()
print(df.columns.tolist())
print(df.target.unique())
label='target' label='target'
features=dhb.get_feature()
df[features]=df[features].astype(float)
df['target']=df['target'].astype(int)
print('----feature---',len(features))
# df=pd.read_csv('test.csv') # df=pd.read_csv('test.csv')
dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at') dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')
xgbreport.report(dftrain,dftest,dhb.features,label,'','tmp.doc') xgbreport.report(dftrain,dftest,features,label,'','tmp.doc',kfold=2)
...@@ -7,7 +7,7 @@ from models import xgboost ...@@ -7,7 +7,7 @@ from models import xgboost
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
from data.graph import drawplot from data.graph import drawplot
def report(dftrain,dftest,features,label,path,filename): def report(dftrain,dftest,features,label,path,filename,kfold=10):
''' '''
dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
:param dftrain: :param dftrain:
...@@ -22,34 +22,34 @@ def report(dftrain,dftest,features,label,path,filename): ...@@ -22,34 +22,34 @@ def report(dftrain,dftest,features,label,path,filename):
document.add_heading('xgboost 算法运行报告') document.add_heading('xgboost 算法运行报告')
clf=xgboost.buildClf() clf=xgboost.buildClf()
document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params())) document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params()))
clf=xgboost.modelfit(clf,dftrain,features,label) clf=xgboost.modelfit(clf,dftrain,features,label,kfold=kfold)
document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf,dftrain,features,label))) document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf,dftrain,features,label)))
document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label))) document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label)))
document.add_heading('调整参数') document.add_heading('调整参数')
max_depth=[2,3] max_depth=[2,3]
min_child_weight=range(1,4,1) min_child_weight=range(1,4,1)
document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label) document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label,kfold=kfold)
# gamma # gamma
gamma=[i/10 for i in range(0,5)] gamma=[i/10 for i in range(0,5)]
document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label) document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label,kfold=kfold)
# subsample colsample_bytree # subsample colsample_bytree
subsample=[0.8,0.9,1] subsample=[0.8,0.9,1]
colsample_bytree=[0.8,0.9,1] colsample_bytree=[0.8,0.9,1]
document, clf = tun_params(document, clf, dftrain, dftest, document, clf = tun_params(document, clf, dftrain, dftest,
{'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label) {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label,kfold=kfold)
# reg_alpha # reg_alpha
reg_alpha=[0.001,0.01,0.1,1,10] reg_alpha=[0.001,0.01,0.1,1,10]
document, clf = tun_params(document, clf, dftrain, dftest, document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_alpha': reg_alpha}, features, label) {'reg_alpha': reg_alpha}, features, label,kfold=kfold)
# reg_lambda # reg_lambda
reg_lambda = [0.001, 0.01, 0.1, 1, 10] reg_lambda = [0.001, 0.01, 0.1, 1, 10]
document, clf = tun_params(document, clf, dftrain, dftest, document, clf = tun_params(document, clf, dftrain, dftest,
{'reg_lambda': reg_lambda}, features, label) {'reg_lambda': reg_lambda}, features, label,kfold=kfold)
#==生成模型最后的报告,各个特征的单变量图,PDP,liftchart #==生成模型最后的报告,各个特征的单变量图,PDP,liftchart
dftrain=xgboost.predict(clf,dftrain,features) dftrain=xgboost.predict(clf,dftrain,features)
...@@ -103,10 +103,10 @@ def report(dftrain,dftest,features,label,path,filename): ...@@ -103,10 +103,10 @@ def report(dftrain,dftest,features,label,path,filename):
def tun_params(document,clf,dftrain,dftest,params,features,label): def tun_params(document,clf,dftrain,dftest,params,features,label,kfold=10):
for i in dict(params).keys(): for i in dict(params).keys():
document.add_paragraph('调参{},取值{}'.format(i,params[i])) document.add_paragraph('调参{},取值{}'.format(i,params[i]))
grid_search = xgboost.automodelfit(clf, params,dftrain, features, label) grid_search = xgboost.automodelfit(clf, params,dftrain, features, label,kfold=kfold)
clf = grid_search.best_estimator_ clf = grid_search.best_estimator_
document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params())) document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
#== #==
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment