Commit 313190e7 authored by linfang.wang's avatar linfang.wang

加入MySQL 连接

parent 981da436
......@@ -48,7 +48,7 @@ def draw_barplot(df,x,y,hue='',title=''):
sns.barplot(x, y, hue=hue, data=df, ax=ax)
else:
sns.barplot(x, y, data=df, ax=ax)
pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
......@@ -78,7 +78,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o',label=type)
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
......
......@@ -65,63 +65,52 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:param reg_lambda:L2 正则项参数
:param scale_pos_weight:一般为负样本数/正样本数
:param base_score:
:param random_state: replace seed
:param missing:缺失值
:param random_state: replace seed,统一设置为7,仅为随机可复现
:return:XGBClassifier
'''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=2,objective='binary:logistic',
booster='gbtree',n_jobs=2,gamma=gamma,min_child_weight=min_child_weight,
verbosity=True,objective='binary:logistic',
booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,missing=-9999999
base_score=base_score,random_state=7,seed=7
)
def buildParamGrid(learning_rate=None, gamma=None, max_depth=None,
                   min_child_weight=None, subsample=None, colsample_bytree=None,
                   reg_alpha=None, reg_lambda=None):
    '''
    Build the hyper-parameter grid consumed by GridSearchCV in automodelfit.

    Any argument left as None falls back to the default search range below.
    Defaults are materialized inside the function so every call returns fresh
    lists (list defaults in the signature would be shared across calls and
    could be mutated by one caller for all others).

    :param learning_rate: candidate learning rates
    :param gamma: candidate minimum-loss-reduction values
    :param max_depth: candidate tree depths
    :param min_child_weight: candidate minimum child weights
    :param subsample: candidate row-subsample ratios
    :param colsample_bytree: candidate column-subsample ratios
    :param reg_alpha: candidate L1 regularization strengths
    :param reg_lambda: candidate L2 regularization strengths
    :return: dict mapping XGBClassifier parameter name -> list of candidates
    '''
    if learning_rate is None:
        learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
    if gamma is None:
        gamma = [i / 10 for i in range(0, 5)]
    if max_depth is None:
        max_depth = [2, 3]
    if min_child_weight is None:
        min_child_weight = [1, 2, 3, 4, 5, 6]
    if subsample is None:
        subsample = [i / 10 for i in range(6, 10)]
    if colsample_bytree is None:
        colsample_bytree = [i / 10 for i in range(6, 10)]
    if reg_alpha is None:
        reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
    if reg_lambda is None:
        reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
    return dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
                min_child_weight=min_child_weight, subsample=subsample,
                colsample_bytree=colsample_bytree, reg_alpha=reg_alpha,
                reg_lambda=reg_lambda)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    '''
    Automatic hyper-parameter tuning via exhaustive grid search.

    :param clf: XGBClassifier (or any sklearn-compatible estimator)
    :param param_grid: dict of parameter name -> candidate values, see buildParamGrid
    :param dftrain: training DataFrame
    :param features: feature column names
    :param resp: label column name
    :param kfold: number of stratified CV folds
    :param scoring: tuning metric, default 'roc_auc'
    :return: the fitted GridSearchCV object (best_estimator_, best_params_, ...)
    '''
    # Stratified folds keep the class ratio stable in each split;
    # random_state=7 matches the project-wide seed for reproducibility.
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    # NOTE(review): the deprecated `iid=True` kwarg was dropped here — it was
    # removed in scikit-learn 0.24 and would raise TypeError on current versions.
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1,
                               cv=skf, verbose=2, refit=True)
    # Run the search over all parameter combinations.
    grid_search.fit(dftrain[features].values, dftrain[resp].values)
    # refit=True means the returned object is already refit on the full
    # training data with the best parameters.
    return grid_search
def modelfit(clf, dftrain, features, resp, dfval=None, useTrainCV=True, cv_folds=10,
             eval_metric='auc', early_stopping_rounds=20):
    '''
    Train an XGBoost classifier, optionally tuning n_estimators via xgb.cv
    and/or early-stopping on a validation set.

    :param clf: XGBClassifier
    :param dftrain: training DataFrame
    :param features: feature column names
    :param resp: label column name
    :param dfval: optional validation DataFrame; when given, early stopping is
                  driven by the validation metric instead of cross-validation
    :param useTrainCV: if True, run xgb.cv first to pick n_estimators
    :param cv_folds: number of CV folds for xgb.cv
    :param eval_metric: evaluation metric tied to the objective, see
                        https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :param early_stopping_rounds: stop when the metric has not improved this many rounds
    :return: the fitted classifier
    '''
    if useTrainCV:
        xgb_param = clf.get_xgb_params()
        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=clf.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=eval_metric,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True)
        # xgb.cv stops early; the number of rows it returns is the optimal
        # boosting-round count, so adopt it as n_estimators.
        clf.set_params(n_estimators=cvresult.shape[0])
    # Identity check is required here: `dfval == None` on a DataFrame is an
    # elementwise comparison, and truth-testing its result raises ValueError.
    if dfval is not None:
        # With a validation set, early stopping monitors the validation metric,
        # so a separate cv pass is unnecessary.
        clf.fit(dftrain[features], dftrain[resp],
                eval_set=[(dftrain[features], dftrain[resp]),
                          (dfval[features], dfval[resp])],
                eval_metric=eval_metric,
                early_stopping_rounds=early_stopping_rounds)
    else:
        clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
def predict(clf, df, features):
    '''
    Score a DataFrame with a fitted classifier.

    Adds two columns to df (mutated in place and also returned):
      - 'predict': the predicted class label
      - 'predict_proba': the positive-class probability

    :param clf: fitted classifier exposing predict / predict_proba
    :param df: DataFrame holding the feature columns
    :param features: feature column names
    :return: df with 'predict' and 'predict_proba' columns appended
    '''
    df['predict'] = clf.predict(df[features])
    # predict_proba returns shape (n_samples, 2); [:, 1] selects the
    # positive-class probability column (the earlier `[:1]` row-slice was a bug).
    df['predict_proba'] = clf.predict_proba(df[features])[:, 1]
    return df
def featureImportance(clf,features):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment