LightGBM in practice: binary classification (parameter tuning with Bayesian optimization)


```python
# use bayes_opt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from bayes_opt import BayesianOptimization
import numpy as np

# Generate a random classification dataset: 10 features, 2 classes
x, y = make_classification(n_samples=1000, n_features=10, n_classes=2)

# First try cross-validation with an untuned random forest
rf = RandomForestClassifier()
# cross_val_score returns 5 scores; take their mean
cv_score = cross_val_score(rf, x, y, scoring="f1", cv=5).mean()
cv_score

# Define a function whose inputs are the hyperparameters to tune
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    cv_score = cross_val_score(
        RandomForestClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               max_features=float(max_features),
                               max_depth=int(max_depth),
                               random_state=2),
        x, y, scoring="f1", cv=5).mean()
    # Must return a single value; for an error metric (regression) a minus sign is needed here
    return cv_score

rf_bo = BayesianOptimization(
    rf_cv,
    {'n_estimators': (10, 250),
     'min_samples_split': (2, 25),
     'max_features': (0.1, 0.999),
     'max_depth': (5, 15)}
)
# Print the score for each parameter combination tried during the iterations
rf_bo.maximize()
# Print the best score and its parameter combination
rf_bo.max
```
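The comment about the minus sign deserves a concrete illustration. Below is a minimal sketch, assuming a hypothetical regression setup (make_regression and RandomForestRegressor are not part of the original post): BayesianOptimization always maximizes its objective, so an error metric has to be returned with its sign flipped, or you can rely on sklearn's already-negated "neg_mean_squared_error" scorer.

```python
# Minimal sketch (assumed example): the sign convention for a regression objective.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

xr, yr = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=2)

def rf_reg_cv(n_estimators, max_depth):
    # "neg_mean_squared_error" is already negated by sklearn, so it can be
    # returned directly; if you computed a raw MSE yourself you would return -mse.
    return cross_val_score(
        RandomForestRegressor(n_estimators=int(n_estimators),
                              max_depth=int(max_depth),
                              random_state=2),
        xr, yr, scoring="neg_mean_squared_error", cv=5).mean()
```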

```python
# Plug the best parameters back in and evaluate
rf = RandomForestClassifier(random_state=2, max_depth=12, max_features=0.2694,
                            min_samples_split=6, n_estimators=103)
np.mean(cross_val_score(rf, x, y, cv=4, scoring='f1'))
```
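Rather than copying the numbers by hand, the best parameters can also be read off programmatically. A small sketch (not in the original post), keeping in mind that bayes_opt reports every parameter as a float, so the integer-valued ones need casting:

```python
# Sketch: rebuild the model from rf_bo.max["params"] instead of hard-coding the values.
best = rf_bo.max["params"]
rf_best = RandomForestClassifier(
    n_estimators=int(best["n_estimators"]),
    min_samples_split=int(best["min_samples_split"]),
    max_features=float(best["max_features"]),
    max_depth=int(best["max_depth"]),
    random_state=2,
)
np.mean(cross_val_score(rf_best, x, y, cv=5, scoring="f1"))
```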

Now give LightGBM a try.

```python
import lightgbm as lgb

# Set up the parameters to tune
def lgb_cv(colsample_bytree, min_child_samples, num_leaves, subsample, max_depth):
    model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                               colsample_bytree=float(colsample_bytree),
                               learning_rate=0.01,
                               min_child_samples=int(min_child_samples),
                               min_child_weight=0.001,
                               n_estimators=800, n_jobs=-1,
                               num_leaves=int(num_leaves),
                               random_state=None,
                               reg_alpha=0.0, reg_lambda=0.0,
                               max_depth=int(max_depth),
                               subsample=float(subsample))
    cv_score = cross_val_score(model, x, y, scoring="f1", cv=5).mean()
    return cv_score

# Run Bayesian optimization
lgb_bo = BayesianOptimization(
    lgb_cv,
    {'colsample_bytree': (0.7, 1),
     'min_child_samples': (2, 25),
     'num_leaves': (5, 250),
     'subsample': (0.7, 1),
     'max_depth': (2, 10)}
)
lgb_bo.maximize()
lgb_bo.max

# Plug the optimized parameters back in
model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                           colsample_bytree=0.7111, learning_rate=0.01,
                           min_child_samples=9, min_child_weight=0.001,
                           n_estimators=800, n_jobs=-1, num_leaves=188,
                           random_state=None, reg_alpha=0.0, reg_lambda=0.0,
                           max_depth=2, subsample=0.91)
cv_score = cross_val_score(model, x, y, scoring="f1", cv=5).mean()
cv_score
# The score itself is omitted here; it is just a single number, feel free to run the code yourself.
```

Next, use LightGBM's native train method together with cross-validation.

```python
from sklearn import metrics

def LGB_CV(max_depth,
           num_leaves,
           min_data_in_leaf,
           feature_fraction,
           bagging_fraction,
           lambda_l1):
    # Instead of cross_val_score, run the cross-validation manually with KFold
    kf = KFold(n_splits=5, shuffle=True)
    # f stores the out-of-fold predictions: each fold's validation slice is written back into f
    f = np.zeros(x.shape[0])
    for index, (train_index, val_index) in enumerate(kf.split(x)):
        print("fold--{}".format(index))
        train_data = lgb.Dataset(x[train_index], label=y[train_index])
        val_data = lgb.Dataset(x[val_index], label=y[val_index])
        params = {'num_leaves': int(num_leaves),
                  'min_data_in_leaf': int(min_data_in_leaf),
                  'objective': 'binary',
                  'max_depth': int(max_depth),
                  'learning_rate': 0.01,
                  "boosting": "gbdt",
                  "feature_fraction": feature_fraction,
                  "bagging_fraction": bagging_fraction,
                  "metric": 'auc',
                  "lambda_l1": lambda_l1,
                  }
        # Cross-validation is handled manually, so call lgb.train directly;
        # valid_sets are the datasets evaluated during training
        model = lgb.train(params, train_data,
                          valid_sets=[train_data, val_data],
                          num_boost_round=700,
                          verbose_eval=500,
                          early_stopping_rounds=20)
        # Predict at the best iteration; unlike xgboost, predict takes the raw array
        # (no Dataset/DMatrix wrapping needed, and no categorical features here),
        # which is a common source of mistakes when switching between the two libraries
        f[val_index] = model.predict(x[val_index], num_iteration=model.best_iteration)
        del model, train_index, val_index
    # The predictions are probabilities, so threshold them into 0/1 labels
    f = np.array([1 if i > 0.5 else 0 for i in f])
    return metrics.f1_score(f, y)

# Try the function once with a fixed set of parameters
LGB_CV(max_depth=5,
       num_leaves=32,
       min_data_in_leaf=15,
       feature_fraction=0.8,
       bagging_fraction=0.8,
       lambda_l1=0)

# Now run the Bayesian optimization algorithm over it
lgb_ba = BayesianOptimization(LGB_CV,
                              {"max_depth": (2, 12),
                               "num_leaves": (5, 130),
                               "min_data_in_leaf": (5, 30),
                               "feature_fraction": (0.7, 1),
                               "bagging_fraction": (0.7, 1),
                               "lambda_l1": (0, 6)})
lgb_ba.maximize()
lgb_ba.max["params"]
```
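The hard-coded values in the next block (num_leaves=44, min_data_in_leaf=19, and so on) correspond to one run of the optimizer. As a sketch (assumed, not from the original post), the params dict for lgb.train can also be built straight from lgb_ba.max["params"], casting the tree-structure parameters back to integers:

```python
# Sketch: turn lgb_ba.max["params"] into the params dict used by lgb.train below;
# num_leaves, min_data_in_leaf and max_depth must be cast back to int.
best = lgb_ba.max["params"]
best_params = {
    "num_leaves": int(best["num_leaves"]),
    "min_data_in_leaf": int(best["min_data_in_leaf"]),
    "max_depth": int(best["max_depth"]),
    "feature_fraction": best["feature_fraction"],
    "bagging_fraction": best["bagging_fraction"],
    "lambda_l1": best["lambda_l1"],
    "objective": "binary",
    "learning_rate": 0.01,
    "boosting": "gbdt",
    "metric": "auc",
}
```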

```python
# Plug the tuned parameters in
kf = KFold(n_splits=5, shuffle=True)
f = np.zeros(x.shape[0])
# Set aside test data (here, the first 200 rows)
x_test = x[:200]
y_test = y[:200]
prediction = np.zeros(x_test.shape[0])

for index, (train_index, val_index) in enumerate(kf.split(x)):
    print("fold--{}".format(index))
    train_data = lgb.Dataset(x[train_index], label=y[train_index])
    val_data = lgb.Dataset(x[val_index], label=y[val_index])
    params = {'num_leaves': 44,
              'min_data_in_leaf': 19,
              'objective': 'binary',
              'max_depth': 11,
              'learning_rate': 0.01,
              "boosting": "gbdt",
              "feature_fraction": 0.81,
              "bagging_fraction": 0.84,
              "metric": 'auc',
              "lambda_l1": 1.8,
              }
    # Cross-validation is handled manually, so call lgb.train directly;
    # valid_sets are the datasets evaluated during training
    model = lgb.train(params, train_data,
                      valid_sets=[train_data, val_data],
                      num_boost_round=700,
                      verbose_eval=500,
                      early_stopping_rounds=20)
    # Out-of-fold predictions for this fold (predict takes the raw array, no Dataset needed)
    f[val_index] = model.predict(x[val_index], num_iteration=model.best_iteration)
    # Average each fold model's test-set predictions
    prediction += model.predict(x_test, num_iteration=model.best_iteration) / kf.n_splits

metrics.f1_score(np.array([1 if i > 0.5 else 0 for i in prediction]), y_test)
```

A side note

This pattern of keeping out-of-fold predictions and averaging each fold model's predictions on the test set is a common approach in Kaggle competitions, and it is quite efficient: one pass of cross-validation gives both a validation score and an ensembled test prediction.
