Commit 313190e7 authored by linfang.wang's avatar linfang.wang

加入MySQL 连接

parent 981da436
......@@ -48,7 +48,7 @@ def draw_barplot(df,x,y,hue='',title=''):
sns.barplot(x, y, hue=hue, data=df, ax=ax)
else:
sns.barplot(x, y, data=df, ax=ax)
pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
......@@ -78,7 +78,7 @@ def draw_lineplot(df,x,y,hue='',title=''):
plt.plot(tmp[x], tmp[y], linestyle='dashed', marker='o',label=type)
else:
plt.plot(df[x], df[y], linestyle='dashed', marker='o')
pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
# pltz.xticks(range(len(df[x].unique().tolist())), df[x].unique().tolist())
pltz.xlabel(x)
pltz.ylabel(y)
pltz.title(title)
......
......@@ -65,63 +65,52 @@ def buildClf(max_depth=2,learning_rate=0.1, n_estimators=5000, gamma=0,
:param reg_lambda:L2 正则项参数
:param scale_pos_weight:一般为负样本数/正样本数
:param base_score:
:param random_state: replace seed
:param missing:缺失值
:param random_state: replace seed,统一设置为7,仅为随机可复现
:return:XGBClassifier
'''
return xgb.XGBClassifier(max_depth=max_depth,learning_rate=learning_rate,n_estimators=n_estimators,
verbosity=2,objective='binary:logistic',
booster='gbtree',n_jobs=2,gamma=gamma,min_child_weight=min_child_weight,
verbosity=True,objective='binary:logistic',
booster='gbtree',n_jobs=2,nthread=2,gamma=gamma,min_child_weight=min_child_weight,
max_delta_step=max_delta_step,subsample=subsample,colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,reg_lambda=reg_lambda,scale_pos_weight=scale_pos_weight,
base_score=base_score,random_state=7,missing=-9999999
base_score=base_score,random_state=7,seed=7
)
def buildParamGrid(learning_rate=None, gamma=None, max_depth=None,
                   min_child_weight=None, subsample=None, colsample_bytree=None,
                   reg_alpha=None, reg_lambda=None):
    '''
    Build the hyper-parameter grid consumed by GridSearchCV in automodelfit.

    Any argument left as None falls back to the default search range below.
    Defaults are materialized inside the function so every call returns fresh
    lists (list defaults in the signature would be shared across calls and
    could be mutated by one caller for all others).

    :param learning_rate: candidate learning rates
    :param gamma: candidate minimum-loss-reduction values
    :param max_depth: candidate tree depths
    :param min_child_weight: candidate minimum child weights
    :param subsample: candidate row-subsample ratios
    :param colsample_bytree: candidate column-subsample ratios
    :param reg_alpha: candidate L1 regularization strengths
    :param reg_lambda: candidate L2 regularization strengths
    :return: dict mapping XGBClassifier parameter name -> list of candidates
    '''
    if learning_rate is None:
        learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
    if gamma is None:
        gamma = [i / 10 for i in range(0, 5)]
    if max_depth is None:
        max_depth = [2, 3]
    if min_child_weight is None:
        min_child_weight = [1, 2, 3, 4, 5, 6]
    if subsample is None:
        subsample = [i / 10 for i in range(6, 10)]
    if colsample_bytree is None:
        colsample_bytree = [i / 10 for i in range(6, 10)]
    if reg_alpha is None:
        reg_alpha = [0.001, 0.01, 0.05, 0.1, 1, 10]
    if reg_lambda is None:
        reg_lambda = [0.001, 0.01, 0.05, 0.1, 1, 10]
    return dict(learning_rate=learning_rate, gamma=gamma, max_depth=max_depth,
                min_child_weight=min_child_weight, subsample=subsample,
                colsample_bytree=colsample_bytree, reg_alpha=reg_alpha,
                reg_lambda=reg_lambda)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
    '''
    Automatic hyper-parameter tuning via exhaustive grid search.

    :param clf: XGBClassifier (or any sklearn-compatible estimator)
    :param param_grid: dict of parameter name -> candidate values, see buildParamGrid
    :param dftrain: training DataFrame
    :param features: feature column names
    :param resp: label column name
    :param kfold: number of stratified CV folds
    :param scoring: tuning metric, default 'roc_auc'
    :return: the fitted GridSearchCV object (best_estimator_, best_params_, ...)
    '''
    # Stratified folds keep the class ratio stable in each split;
    # random_state=7 matches the project-wide seed for reproducibility.
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=7)
    # NOTE(review): the deprecated `iid=True` kwarg was dropped here — it was
    # removed in scikit-learn 0.24 and would raise TypeError on current versions.
    grid_search = GridSearchCV(clf, param_grid, scoring=scoring, n_jobs=-1,
                               cv=skf, verbose=2, refit=True)
    # Run the search over all parameter combinations.
    grid_search.fit(dftrain[features].values, dftrain[resp].values)
    # refit=True means the returned object is already refit on the full
    # training data with the best parameters.
    return grid_search
def modelfit(clf, dftrain, features, resp, dfval=None, useTrainCV=True, cv_folds=10,
             eval_metric='auc', early_stopping_rounds=20):
    '''
    Train an XGBoost classifier, optionally tuning n_estimators via xgb.cv
    and/or early-stopping on a validation set.

    :param clf: XGBClassifier
    :param dftrain: training DataFrame
    :param features: feature column names
    :param resp: label column name
    :param dfval: optional validation DataFrame; when given, early stopping is
                  driven by the validation metric instead of cross-validation
    :param useTrainCV: if True, run xgb.cv first to pick n_estimators
    :param cv_folds: number of CV folds for xgb.cv
    :param eval_metric: evaluation metric tied to the objective, see
                        https://xgboost.readthedocs.io/en/latest/python/python_api.html#
    :param early_stopping_rounds: stop when the metric has not improved this many rounds
    :return: the fitted classifier
    '''
    if useTrainCV:
        xgb_param = clf.get_xgb_params()
        xgtrain = xgb.DMatrix(dftrain[features].values, label=dftrain[resp].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=clf.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=eval_metric,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True)
        # xgb.cv stops early; the number of rows it returns is the optimal
        # boosting-round count, so adopt it as n_estimators.
        clf.set_params(n_estimators=cvresult.shape[0])
    # Identity check is required here: `dfval == None` on a DataFrame is an
    # elementwise comparison, and truth-testing its result raises ValueError.
    if dfval is not None:
        # With a validation set, early stopping monitors the validation metric,
        # so a separate cv pass is unnecessary.
        clf.fit(dftrain[features], dftrain[resp],
                eval_set=[(dftrain[features], dftrain[resp]),
                          (dfval[features], dfval[resp])],
                eval_metric=eval_metric,
                early_stopping_rounds=early_stopping_rounds)
    else:
        clf.fit(dftrain[features], dftrain[resp], eval_metric=eval_metric)
    return clf
def predict(clf, df, features):
    '''
    Score a DataFrame with a fitted classifier.

    Adds two columns to df (mutated in place and also returned):
      - 'predict': the predicted class label
      - 'predict_proba': the positive-class probability

    :param clf: fitted classifier exposing predict / predict_proba
    :param df: DataFrame holding the feature columns
    :param features: feature column names
    :return: df with 'predict' and 'predict_proba' columns appended
    '''
    df['predict'] = clf.predict(df[features])
    # predict_proba returns shape (n_samples, 2); [:, 1] selects the
    # positive-class probability column (the earlier `[:1]` row-slice was a bug).
    df['predict_proba'] = clf.predict_proba(df[features])[:, 1]
    return df
def featureImportance(clf,features):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment