import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
# Build a synthetic binary-classification dataset and record a baseline
# F1 score for an untuned RandomForest via 5-fold cross-validation.
x, y = make_classification(n_samples=1000, n_features=10, n_classes=2)

rf = RandomForestClassifier()
cv_score = cross_val_score(rf, x, y, scoring="f1", cv=5).mean()
# BUG FIX: the original left `cv_score` as a bare expression (a notebook
# echo); as a script that discards the value, so print it explicitly.
print(cv_score)
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    """Objective for Bayesian optimization of a RandomForest.

    Casts the continuous search-space values to the types sklearn
    expects, then returns the mean 5-fold cross-validated F1 score on
    the module-level dataset (x, y).
    """
    estimator = RandomForestClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=float(max_features),
        max_depth=int(max_depth),
        random_state=2,
    )
    scores = cross_val_score(estimator, x, y, scoring="f1", cv=5)
    return scores.mean()
# Bayesian optimization of the RandomForest hyper-parameters over the
# given bounds; each bound pair is (low, high) in the search space.
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (5, 15),
    },
)
rf_bo.maximize()
# BUG FIX: `rf_bo.max` was a bare expression (notebook echo) — print the
# best parameter set / target found.
print(rf_bo.max)
# Re-evaluate a RandomForest with the (rounded) best parameters found by
# the Bayesian optimization above.
rf = RandomForestClassifier(
    random_state=2,
    max_depth=12,
    max_features=0.2694,
    min_samples_split=6,
    n_estimators=103,
)
# BUG FIX: the original discarded this mean score (bare expression);
# print it so the script reports the tuned result.
# NOTE(review): cv=4 here vs cv=5 during tuning — confirm intentional.
print(np.mean(cross_val_score(rf, x, y, cv=4, scoring='f1')))
# Now try the same tuning with LightGBM.
def lgb_cv(colsample_bytree, min_child_samples, num_leaves, subsample, max_depth):
    """Objective for Bayesian optimization of a LightGBM classifier.

    Converts the continuous search-space values into the integer/float
    arguments LGBMClassifier expects and returns the mean 5-fold
    cross-validated F1 score on the module-level dataset (x, y).
    """
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        colsample_bytree=float(colsample_bytree),
        learning_rate=0.01,
        min_child_samples=int(min_child_samples),
        min_child_weight=0.001,
        n_estimators=800,
        n_jobs=-1,
        num_leaves=int(num_leaves),
        random_state=None,
        reg_alpha=0.0,
        reg_lambda=0.0,
        max_depth=int(max_depth),
        subsample=float(subsample),
    )
    fold_scores = cross_val_score(clf, x, y, scoring="f1", cv=5)
    return fold_scores.mean()
# Bayesian optimization of the LightGBM hyper-parameters.
lgb_bo = BayesianOptimization(
    lgb_cv,
    {
        'colsample_bytree': (0.7, 1),
        'min_child_samples': (2, 25),
        'num_leaves': (5, 250),
        'subsample': (0.7, 1),
        'max_depth': (2, 10),
    },
)
lgb_bo.maximize()
# BUG FIX: `lgb_bo.max` was a bare expression (notebook echo) — print it.
print(lgb_bo.max)
# Re-evaluate LightGBM with the (rounded) best parameters found above.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    colsample_bytree=0.7111,
    learning_rate=0.01,
    min_child_samples=9,
    min_child_weight=0.001,
    n_estimators=800,
    n_jobs=-1,
    num_leaves=188,
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    max_depth=2,
    subsample=0.91,
)
cv_score = cross_val_score(model, x, y, scoring="f1", cv=5).mean()
# BUG FIX: the original left `cv_score` as a bare, discarded expression.
print(cv_score)
# Next: use LightGBM's native train() API combined with cross-validation.
def LGB_CV(
    max_depth,
    num_leaves,
    min_data_in_leaf,
    feature_fraction,
    bagging_fraction,
    lambda_l1,
):
    """Out-of-fold F1 score for a LightGBM model via 5-fold CV.

    Trains one booster per fold with early stopping, collects the raw
    out-of-fold probability predictions, thresholds them at 0.5, and
    returns the F1 score against the module-level labels ``y``.

    Parameters mirror the LightGBM parameter names; integer-valued ones
    are cast from the continuous values the optimizer proposes.
    """
    kf = KFold(n_splits=5, shuffle=True)
    # Out-of-fold raw probabilities, one slot per sample in x.
    oof = np.zeros(x.shape[0])
    for fold, (train_index, val_index) in enumerate(kf.split(x)):
        print("fold--{}".format(fold))
        train_data = lgb.Dataset(x[train_index], label=y[train_index])
        val_data = lgb.Dataset(x[val_index], label=y[val_index])
        params = {
            'num_leaves': int(num_leaves),
            'min_data_in_leaf': int(min_data_in_leaf),
            'objective': 'binary',
            'max_depth': int(max_depth),
            'learning_rate': 0.01,
            "boosting": "gbdt",
            "feature_fraction": feature_fraction,
            "bagging_fraction": bagging_fraction,
            "metric": 'auc',
            "lambda_l1": lambda_l1,
        }
        model = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            num_boost_round=700,
            verbose_eval=500,
            early_stopping_rounds=20,
        )
        oof[val_index] = model.predict(
            x[val_index], num_iteration=model.best_iteration
        )
        # Free the booster before the next fold.
        del model, train_index, val_index
    # BUG FIX: the original iterated an undefined name `oof` after storing
    # the fold predictions in `f`, which raised NameError at runtime.
    preds = np.array([1 if p > 0.5 else 0 for p in oof])
    # BUG FIX: sklearn's f1_score signature is (y_true, y_pred); the
    # original passed the predictions first.
    return metrics.f1_score(y, preds)
# One sanity-check run of LGB_CV with hand-picked hyper-parameters.
# BUG FIX: lambda_l1 was None, which is not a valid value for LightGBM's
# lambda_l1 parameter (it must be a float) — use 0.0 (no L1 penalty).
# Also print the returned F1 instead of discarding it.
print(LGB_CV(
    max_depth=5,
    num_leaves=32,
    min_data_in_leaf=15,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    lambda_l1=0.0,
))
# Bayesian optimization over the native-API LightGBM objective.
lgb_ba = BayesianOptimization(
    LGB_CV,
    {
        "max_depth": (2, 12),
        "num_leaves": (5, 130),
        "min_data_in_leaf": (5, 30),
        "feature_fraction": (0.7, 1),
        "bagging_fraction": (0.7, 1),
        "lambda_l1": (0, 6),
    },
)
lgb_ba.maximize()
# BUG FIX: bare expression (notebook echo) — print the best parameters.
print(lgb_ba.max["params"])
# Final run with the tuned parameters: 5-fold CV producing out-of-fold
# predictions plus an averaged prediction over a held-out slice.
kf = KFold(n_splits=5, shuffle=True)
f = np.zeros(x.shape[0])  # out-of-fold raw probabilities

# NOTE(review): the first 200 rows used as the "test" set here also
# appear inside the training folds, so this evaluation leaks — confirm
# whether that is intended (it mirrors the original code).
x_test = x[:200]
y_test = y[:200]
prediction = np.zeros(x_test.shape[0])

for fold, (train_index, val_index) in enumerate(kf.split(x)):
    print("fold--{}".format(fold))
    train_data = lgb.Dataset(x[train_index], label=y[train_index])
    val_data = lgb.Dataset(x[val_index], label=y[val_index])
    params = {
        'num_leaves': 44,
        'min_data_in_leaf': 19,
        'objective': 'binary',
        'max_depth': 11,
        'learning_rate': 0.01,
        "boosting": "gbdt",
        "feature_fraction": 0.81,
        "bagging_fraction": 0.84,
        "metric": 'auc',
        "lambda_l1": 1.8,
    }
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=700,
        verbose_eval=500,
        early_stopping_rounds=20,
    )
    f[val_index] = model.predict(
        x[val_index], num_iteration=model.best_iteration
    )
    # Average each fold's test prediction into the final estimate.
    # BUG FIX: the original was missing the comma before num_iteration
    # (`model.predict(x_test)num_iteration=...`), a syntax error.
    prediction += model.predict(
        x_test, num_iteration=model.best_iteration
    ) / kf.n_splits

# BUG FIX: f1_score takes (y_true, y_pred) — the original passed the
# thresholded predictions first — and the score was discarded.
print(metrics.f1_score(y_test, np.array([1 if p > 0.5 else 0 for p in prediction])))
# Aside: this (per-fold test-set prediction averaging) is a common and
# very effective technique on Kaggle.
# Original source (please credit when reposting):
# https://tech.qufami.com/read-27670.html