电话邦跑

a1429476 · linfang.wang · 03588f52 · a1429476 · a1429476 · a1429476
Commit a1429476 authored Apr 22, 2019 by linfang.wang
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 10 deletions

xgboost.py models/xgboost.py +2 -0

allocator.py mvp/allocator.py +7 -1

xgbreport.py mvp/xgbreport.py +9 -9

No files found.
--- a/models/xgboost.py
+++ b/models/xgboost.py
@@ -77,6 +77,8 @@ def modelfit(clf, dftrain, features, resp,useTrainCV = True, kfold=10, eval_metr
    :param eval_metric 同 目标函数 objective 有关，取值https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :return:
    '''
+    if dftrain[features].shape[0]==0:  # fail fast: nothing to train on
+        raise ValueError(' NO train data !!!! ')  # a bare str cannot be raised in Python 3 (it would surface as TypeError)
    if useTrainCV:
        # kflod = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
        xgb_param = clf.get_xgb_params()

--- a/mvp/allocator.py
+++ b/mvp/allocator.py
@@ -49,8 +49,14 @@ if __name__ == '__main__':
    # ]
    dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
    df=dhb.dhb_features_extract()
+    print(df.columns.tolist())
+    print(df.target.unique())
    label='target'
+    features=dhb.get_feature()
+    df[features]=df[features].astype(float)
+    df['target']=df['target'].astype(int)
+    print('----feature---',len(features))
    # df=pd.read_csv('test.csv')
    dftrain,dftest=datacal.split_train_val(df,trainsplit='timeSeries',trainsplitRatio=0.8,sort_col='applied_at')

-    xgbreport.report(dftrain,dftest,dhb.features,label,'','tmp.doc')
+    xgbreport.report(dftrain,dftest,features,label,'','tmp.doc',kfold=2)
--- a/mvp/xgbreport.py
+++ b/mvp/xgbreport.py
@@ -7,7 +7,7 @@ from models import xgboost
 from matplotlib import pyplot as plt
 from data.graph import drawplot

-def report(dftrain,dftest,features,label,path,filename):
+def report(dftrain,dftest,features,label,path,filename,kfold=10):
    '''
    dftrain,dftest 中必然有 字段 applied_at,applied_channel,applied_type
    :param dftrain:
@@ -22,34 +22,34 @@ def report(dftrain,dftest,features,label,path,filename):
    document.add_heading('xgboost 算法运行报告')
    clf=xgboost.buildClf()
    document.add_paragraph('初始化参数运行{}'.format(clf.get_xgb_params()))
-    clf=xgboost.modelfit(clf,dftrain,features,label)
+    clf=xgboost.modelfit(clf,dftrain,features,label,kfold=kfold)
    document.add_paragraph('模型训练集{}'.format(xgboost.auc(clf,dftrain,features,label)))
    document.add_paragraph('模型测试集{}'.format(xgboost.auc(clf, dftest, features, label)))

    document.add_heading('调整参数')
    max_depth=[2,3]
    min_child_weight=range(1,4,1)
-    document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label)
+    document, clf = tun_params(document, clf, dftrain, dftest, {'max_depth': max_depth,'min_child_weight':min_child_weight}, features, label,kfold=kfold)

    # gamma
    gamma=[i/10 for i in range(0,5)]
-    document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label)
+    document,clf=tun_params(document,clf,dftrain,dftest,{'gamma':gamma},features,label,kfold=kfold)

    # subsample colsample_bytree
    subsample=[0.8,0.9,1]
    colsample_bytree=[0.8,0.9,1]
    document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label)
+                               {'subsample': subsample, 'colsample_bytree': colsample_bytree}, features, label,kfold=kfold)

    # reg_alpha
    reg_alpha=[0.001,0.01,0.1,1,10]
    document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'reg_alpha': reg_alpha}, features, label)
+                               {'reg_alpha': reg_alpha}, features, label,kfold=kfold)

    # reg_lambda
    reg_lambda = [0.001, 0.01, 0.1, 1, 10]
    document, clf = tun_params(document, clf, dftrain, dftest,
-                               {'reg_lambda': reg_lambda}, features, label)
+                               {'reg_lambda': reg_lambda}, features, label,kfold=kfold)

    #==生成模型最后的报告，各个特征的单变量图，PDP，liftchart
    dftrain=xgboost.predict(clf,dftrain,features)
@@ -103,10 +103,10 @@ def report(dftrain,dftest,features,label,path,filename):



-def tun_params(document,clf,dftrain,dftest,params,features,label):
+def tun_params(document,clf,dftrain,dftest,params,features,label,kfold=10):
    for i in dict(params).keys():
        document.add_paragraph('调参{},取值{}'.format(i,params[i]))
-    grid_search = xgboost.automodelfit(clf, params,dftrain, features, label)
+    grid_search = xgboost.automodelfit(clf, params,dftrain, features, label,kfold=kfold)
    clf = grid_search.best_estimator_
    document.add_paragraph('模型训练参数{}'.format(clf.get_xgb_params()))
    #==