Files
Electrocardiogram/models/catboost_model.py

224 lines
7.3 KiB
Python

from itertools import product
from operator import sub
from tabnanny import verbose
import pandas
from catboost import CatBoostClassifier
from imblearn.over_sampling import KMeansSMOTE
from model_utils import average_fold_results, get_metrics, scaling_handler
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm
class CAT_BOOST:
    """CatBoost classifier wrapper with optional scaling, balancing, CV and grid search.

    The wrapped frame must contain a ``label`` column; every other column is
    used as a feature. The class-weight logic sums the labels to count
    positives, so it presumably expects 0/1 labels -- verify against the data.

    Typical use is ``preprocess()`` -> ``fit()`` -> ``eval()``, or ``tune()``
    to sweep a fixed hyper-parameter grid and dump the results to CSV.
    """

    def __init__(
        self,
        data_frame,
        params=None,
        n_split_kfold=5,
        test_size=0.15,
        seed=42,
        output_file_tuning="cat_boost_tuning_results.csv",
    ):
        """Store the data and read hyper-parameters out of ``params``.

        Args:
            data_frame: Frame holding the features plus a ``label`` column.
            params: Optional dict of settings. Recognised keys (with defaults):
                ``scaling_method`` (None), ``sampling_method`` (None),
                ``iterations`` (100), ``learning_rate`` (0.1), ``depth`` (6),
                ``l2_leaf_reg`` (3), ``subsample`` (0.6), ``k_neighbors`` (10),
                ``kmeans_estimator`` (5). ``None`` means "no overrides".
            n_split_kfold: Number of stratified CV folds used by ``fit``.
            test_size: Fraction of rows held out as the test split.
            seed: Random seed for the split, the folds, SMOTE and CatBoost.
            output_file_tuning: CSV path written by ``tune``.
        """
        self.data_frame = data_frame
        # Build a fresh dict per instance; a literal ``{}`` default would be
        # one object shared by every instance created without ``params``.
        self.params = {} if params is None else params
        self.n_split_kfold = n_split_kfold
        self.test_size = test_size
        # Honour the caller's seed (it used to be hard-coded to 42).
        self.seed = seed
        self.x_test = None
        self.y_test = None
        self.scaling_method = self.params.get("scaling_method", None)
        self.sampling_method = self.params.get("sampling_method", None)
        self.class_weights = {0: 1.0, 1: 1.0}
        self.model = None
        self.iterations = self.params.get("iterations", 100)
        self.learning_rate = self.params.get("learning_rate", 0.1)
        self.depth = self.params.get("depth", 6)
        self.l2_leaf_reg = self.params.get("l2_leaf_reg", 3)
        self.subsample = self.params.get("subsample", 0.6)
        self.k_neighbors = self.params.get("k_neighbors", 10)
        self.kmeans_estimator = self.params.get("kmeans_estimator", 5)
        self.tuning_results = None
        self.output_file_tuning = output_file_tuning

    def preprocess(self):
        """Replace ``self.data_frame`` with its scaled version, if scaling is enabled.

        Uses ``self.scaling_method`` (initialised from ``params`` and
        overridden by ``tune``). It is deliberately NOT re-read from
        ``self.params`` here: doing so silently discarded the method chosen
        by ``tune``.

        NOTE(review): the whole frame is scaled before ``fit`` splits off the
        test set; if ``scaling_handler`` fits a scaler on its input, test rows
        influence the scaling -- confirm whether that is acceptable.
        """
        if self.scaling_method:
            self.data_frame = scaling_handler(self.data_frame, self.scaling_method)

    def _balance_training_fold(self, x_train_fold, y_train_fold):
        """Apply the configured imbalance strategy to one training fold.

        Returns:
            ``(x_train_fold, y_train_fold, class_weights)``. The weights are
            rebuilt for every call so that a previous ``"class_weight"`` run
            cannot leak its weights into a later run using another strategy.

        Raises:
            ValueError: ``"class_weight"`` is selected and the fold's labels
                sum to zero (no positive samples to weight).
        """
        class_weights = {0: 1.0, 1: 1.0}
        if self.sampling_method == "KMeansSMOTE":
            smote = KMeansSMOTE(
                sampling_strategy="minority",
                k_neighbors=self.k_neighbors,
                kmeans_estimator=self.kmeans_estimator,
                cluster_balance_threshold=0.001,
                random_state=self.seed,
                n_jobs=-1,
            )
            x_train_fold, y_train_fold = smote.fit_resample(
                x_train_fold, y_train_fold
            )
            y_train_fold = y_train_fold.astype(int)
        elif self.sampling_method == "class_weight":
            n_positive = y_train_fold.sum()
            if n_positive == 0:
                raise ValueError(
                    "sampling_method='class_weight' needs at least one positive "
                    "sample in every training fold."
                )
            # Keep the exact negative/positive ratio. Truncating it to int
            # produced a weight of 0 whenever positives outnumber negatives.
            self.class_1_weight = float(
                (y_train_fold.shape[0] - n_positive) / n_positive
            )
            class_weights = {0: 1.0, 1: self.class_1_weight}
        return x_train_fold, y_train_fold, class_weights

    def fit(self):
        """Hold out a test split, then cross-validate CatBoost on the remainder.

        The stratified test split is stored on ``self.x_test`` /
        ``self.y_test``. Balancing (SMOTE or class weights) is applied to the
        training part of each fold only. ``self.model`` ends up being the
        model trained on the LAST fold; no final refit on all training data
        is performed.

        Returns:
            Whatever ``average_fold_results`` produces from the per-fold
            ``get_metrics`` results (both are helpers from ``model_utils``).
        """
        target = self.data_frame["label"]
        features = self.data_frame.drop(columns=["label"])
        x_train_val, self.x_test, y_train_val, self.y_test = train_test_split(
            features,
            target,
            test_size=self.test_size,
            stratify=target,
            random_state=self.seed,
        )
        skf = StratifiedKFold(
            n_splits=self.n_split_kfold, shuffle=True, random_state=self.seed
        )
        fold_results = []
        for train_index, val_index in tqdm(
            skf.split(x_train_val, y_train_val),
            total=self.n_split_kfold,
            desc=" >> CatBoost Fitting: ",
        ):
            x_val = x_train_val.iloc[val_index]
            y_val = y_train_val.iloc[val_index]
            x_train_fold, y_train_fold, class_weights = self._balance_training_fold(
                x_train_val.iloc[train_index],
                y_train_val.iloc[train_index],
            )
            # Exposed for inspection; always reflects the most recent fold.
            self.class_weights = class_weights
            self.model = CatBoostClassifier(
                iterations=self.iterations,
                learning_rate=self.learning_rate,
                depth=self.depth,
                l2_leaf_reg=self.l2_leaf_reg,
                subsample=self.subsample,
                verbose=False,
                random_seed=self.seed,
                class_weights=class_weights,
            )
            self.model.fit(x_train_fold, y_train_fold)
            y_pred_val = self.model.predict(x_val)
            fold_results.append(get_metrics(y_val, y_pred_val))
        return average_fold_results(fold_results)

    def eval(self, x_test=None, y_test=None):
        """Score the current model on a test set.

        Args:
            x_test: Optional features replacing the stored test split.
            y_test: Optional labels replacing the stored test split.
                The stored split is replaced only when BOTH arguments are
                given; a lone argument is ignored.

        Returns:
            The result of ``get_metrics`` on the test labels and predictions.
            Predictions are also kept on ``self.y_pred_test``.

        Raises:
            RuntimeError: No model has been trained yet.
        """
        if x_test is not None and y_test is not None:
            self.x_test = x_test
            self.y_test = y_test
        if self.model is None:
            raise RuntimeError(
                "CAT_BOOST.eval() called before fit(): no trained model available."
            )
        self.y_pred_test = self.model.predict(self.x_test)
        return get_metrics(self.y_test, self.y_pred_test)

    def tune(self):
        """Grid-search scaling, balancing and CatBoost settings; save results to CSV.

        Every combination is cross-validated through ``preprocess`` + ``fit``.
        The raw results are stored on ``self.tuning_results`` and a flattened
        table (one column per metric) is written to
        ``self.output_file_tuning``. Hyper-parameter attributes are left at
        the last combination tried; ``self.data_frame`` is restored to the
        frame it held when ``tune`` was called.
        """
        scaling_methods = [
            "standard_scaling",
            "robust_scaling",
            "minmax_scaling",
            "yeo_johnson",
        ]
        sampling_methods = [
            "KMeansSMOTE",
            "class_weight",
        ]
        learning_rate_list = [0.03, 0.05, 0.1]
        depth_list = [6, 8]
        l2_leaf_reg_list = [1, 3]
        subsample_list = [0.8, 1.0]
        k_neighbors_list = [10]
        kmeans_estimator_list = [5]
        tuning_results = []
        param_product = list(
            product(
                scaling_methods,
                sampling_methods,
                learning_rate_list,
                depth_list,
                l2_leaf_reg_list,
                subsample_list,
                k_neighbors_list,
                kmeans_estimator_list,
            )
        )
        # Remember the untouched frame: preprocess() overwrites
        # self.data_frame, so without a reset each grid point would scale
        # the output of the previous one.
        raw_frame = self.data_frame
        try:
            for (
                scaling_method,
                sampling_method,
                learning_rate,
                depth,
                l2_leaf_reg,
                subsample,
                k_neighbors,
                kmeans_estimator,
            ) in tqdm(
                param_product, total=len(param_product), desc=" > CatBoost Tuning: "
            ):
                self.scaling_method = scaling_method
                self.sampling_method = sampling_method
                self.learning_rate = learning_rate
                self.depth = depth
                self.l2_leaf_reg = l2_leaf_reg
                self.subsample = subsample
                self.k_neighbors = k_neighbors
                self.kmeans_estimator = kmeans_estimator
                print(
                    " >> Fitting Params: ",
                    scaling_method,
                    sampling_method,
                    learning_rate,
                    depth,
                    l2_leaf_reg,
                    subsample,
                    k_neighbors,
                    kmeans_estimator,
                )
                # Work on a copy in case scaling_handler mutates its input
                # (its behaviour is not visible from this file).
                self.data_frame = raw_frame.copy()
                self.preprocess()
                fold_result = self.fit()
                tuning_results.append(
                    {
                        "model": "cat_boost",
                        "scaling_method": scaling_method,
                        "sampling_method": sampling_method,
                        "learning_rate": learning_rate,
                        "depth": depth,
                        "l2_leaf_reg": l2_leaf_reg,
                        "subsample": subsample,
                        "k_neighbors": k_neighbors,
                        "kmeans_estimator": kmeans_estimator,
                        "metrics": fold_result,
                    }
                )
        finally:
            self.data_frame = raw_frame
        self.tuning_results = tuning_results
        # Save tuning results to CSV
        df_tuning = pandas.DataFrame(tuning_results)
        # Expand the per-combination metrics mapping into one column per metric.
        metrics_df = df_tuning["metrics"].apply(pandas.Series)
        df_tuning = pandas.concat(
            [df_tuning.drop(columns=["metrics"]), metrics_df], axis=1
        )
        df_tuning.to_csv(self.output_file_tuning, index=False)
        return