from itertools import product
from operator import sub
from tabnanny import verbose

import pandas
from catboost import CatBoostClassifier
from imblearn.over_sampling import KMeansSMOTE
from model_utils import average_fold_results, get_metrics, scaling_handler
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm


class CAT_BOOST:
    """CatBoost classifier wrapper with scaling, resampling, CV and grid tuning.

    The wrapped data frame must contain a ``"label"`` column; every other
    column is used as a feature.
    NOTE(review): the class-weight logic (``{0: ..., 1: ...}`` and
    ``y.sum()`` as the positive count) presumes binary 0/1 labels - confirm.
    """

    def __init__(
        self,
        data_frame,
        params=None,
        n_split_kfold=5,
        test_size=0.15,
        seed=42,
        output_file_tuning="cat_boost_tuning_results.csv",
    ):
        """Store the data and read hyper-parameters from ``params``.

        Args:
            data_frame: pandas DataFrame holding the features and ``"label"``.
            params: optional dict of settings. Recognised keys:
                ``scaling_method``, ``sampling_method``, ``iterations``,
                ``learning_rate``, ``depth``, ``l2_leaf_reg``, ``subsample``,
                ``k_neighbors``, ``kmeans_estimator``. Missing keys fall back
                to the defaults below.
            n_split_kfold: number of stratified CV folds used by ``fit``.
            test_size: fraction of the data held out as the test split.
            seed: random seed used for the split, the folds, the resampler
                and the model.
            output_file_tuning: CSV path written by ``tune``.
        """
        self.data_frame = data_frame
        # ``None`` instead of a shared mutable ``{}`` default.
        self.params = {} if params is None else params
        self.n_split_kfold = n_split_kfold
        self.test_size = test_size
        # Honour the caller's seed (it used to be hard-coded to 42).
        self.seed = seed

        # Hold-out split, populated by ``fit`` (or supplied to ``eval``).
        self.x_test = None
        self.y_test = None

        self.scaling_method = self.params.get("scaling_method", None)
        self.sampling_method = self.params.get("sampling_method", None)
        self.class_weights = {0: 1.0, 1: 1.0}
        self.model = None

        # CatBoost hyper-parameters.
        self.iterations = self.params.get("iterations", 100)
        self.learning_rate = self.params.get("learning_rate", 0.1)
        self.depth = self.params.get("depth", 6)
        self.l2_leaf_reg = self.params.get("l2_leaf_reg", 3)
        self.subsample = self.params.get("subsample", 0.6)

        # KMeansSMOTE hyper-parameters.
        self.k_neighbors = self.params.get("k_neighbors", 10)
        self.kmeans_estimator = self.params.get("kmeans_estimator", 5)

        self.tuning_results = None
        self.output_file_tuning = output_file_tuning

    def preprocess(self):
        """Scale ``self.data_frame`` with the method named in ``self.params``.

        Does nothing when no ``scaling_method`` is configured. The scaled
        frame replaces ``self.data_frame``, so calling this repeatedly on the
        same instance scales already-scaled data.
        """
        self.scaling_method = self.params.get("scaling_method", None)
        if self.scaling_method:
            self.data_frame = scaling_handler(self.data_frame, self.scaling_method)

    def fit(self):
        """Run stratified k-fold cross-validation and return averaged metrics.

        The data is first split into train/validation and a held-out test
        part (stored on ``self.x_test`` / ``self.y_test``). One model is
        trained per fold; ``self.model`` is left pointing at the model of
        the last fold.

        Returns:
            Whatever ``average_fold_results`` produces from the per-fold
            ``get_metrics`` results.
        """
        y = self.data_frame["label"]
        X = self.data_frame.drop(columns=["label"])

        x_train_val, self.x_test, y_train_val, self.y_test = train_test_split(
            X, y, test_size=self.test_size, stratify=y, random_state=self.seed
        )

        skf = StratifiedKFold(
            n_splits=self.n_split_kfold, shuffle=True, random_state=self.seed
        )

        # Loop-invariant, so read it once rather than once per fold.
        self.sampling_method = self.params.get("sampling_method", None)

        fold_results = []
        folds = tqdm(
            skf.split(x_train_val, y_train_val),
            total=self.n_split_kfold,
            desc=" >> CatBoost Fitting: ",
        )
        for train_index, val_index in folds:
            x_train_fold = x_train_val.iloc[train_index]
            x_val = x_train_val.iloc[val_index]
            y_train_fold = y_train_val.iloc[train_index]
            y_val = y_train_val.iloc[val_index]

            if self.sampling_method == "KMeansSMOTE":
                # Oversample only the training part of the fold.
                smote = KMeansSMOTE(
                    sampling_strategy="minority",
                    k_neighbors=self.k_neighbors,
                    kmeans_estimator=self.kmeans_estimator,
                    cluster_balance_threshold=0.001,
                    random_state=self.seed,
                    n_jobs=-1,
                )
                x_train_fold, y_train_fold = smote.fit_resample(
                    x_train_fold, y_train_fold
                )
                y_train_fold = y_train_fold.astype(int)
            elif self.sampling_method == "class_weight":
                # Weight class 1 by the (truncated) negative/positive ratio.
                self.class_1_weight = int(
                    (y_train_fold.shape[0] - y_train_fold.sum())
                    / y_train_fold.sum()
                )
                self.class_weights = {0: 1, 1: self.class_1_weight}

            self.model = CatBoostClassifier(
                iterations=self.iterations,
                learning_rate=self.learning_rate,
                depth=self.depth,
                l2_leaf_reg=self.l2_leaf_reg,
                subsample=self.subsample,
                verbose=False,
                random_seed=self.seed,
                class_weights=self.class_weights,
            )
            self.model.fit(x_train_fold, y_train_fold)

            y_pred_val = self.model.predict(x_val)
            fold_results.append(get_metrics(y_val, y_pred_val))

        return average_fold_results(fold_results)

    def eval(self, x_test=None, y_test=None):
        """Evaluate the fitted model on a test set.

        Args:
            x_test: optional test features. Used only when ``y_test`` is
                given as well; otherwise the split stored by ``fit`` is used.
            y_test: optional test labels, see ``x_test``.

        Returns:
            The result of ``get_metrics`` on the test labels and predictions.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model available; call fit() before eval().")

        if x_test is not None and y_test is not None:
            self.x_test = x_test
            self.y_test = y_test

        self.y_pred_test = self.model.predict(self.x_test)
        return get_metrics(self.y_test, self.y_pred_test)

    def tune(self):
        """Grid-search scaling, sampling and model hyper-parameters.

        Every combination is cross-validated with ``fit``. The collected
        results are kept on ``self.tuning_results`` and written to
        ``self.output_file_tuning`` as CSV. The instance is left configured
        with the last combination tried, and ``self.data_frame`` holds the
        frame scaled for that combination.
        """
        scaling_methods = [
            "standard_scaling",
            "robust_scaling",
            "minmax_scaling",
            "yeo_johnson",
        ]
        sampling_methods = [
            "KMeansSMOTE",
            "class_weight",
        ]
        learning_rate_list = [0.03, 0.05, 0.1]
        depth_list = [6, 8]
        l2_leaf_reg_list = [1, 3]
        subsample_list = [0.8, 1.0]
        k_neighbors_list = [10]
        kmeans_estimator_list = [5]

        param_product = list(
            product(
                scaling_methods,
                sampling_methods,
                learning_rate_list,
                depth_list,
                l2_leaf_reg_list,
                subsample_list,
                k_neighbors_list,
                kmeans_estimator_list,
            )
        )

        # Keep the unscaled frame so each combination starts from the same
        # data instead of re-scaling the previous iteration's output.
        raw_frame = self.data_frame

        tuning_results = []
        for (
            scaling_method,
            sampling_method,
            learning_rate,
            depth,
            l2_leaf_reg,
            subsample,
            k_neighbors,
            kmeans_estimator,
        ) in tqdm(param_product, total=len(param_product), desc=" > CatBoost Tuning: "):
            # preprocess() and fit() read these two settings from self.params,
            # so the grid values must be put there to take effect. Rebinding
            # to a new dict avoids mutating the dict the caller passed in.
            self.params = {
                **self.params,
                "scaling_method": scaling_method,
                "sampling_method": sampling_method,
            }
            self.scaling_method = scaling_method
            self.sampling_method = sampling_method
            self.learning_rate = learning_rate
            self.depth = depth
            self.l2_leaf_reg = l2_leaf_reg
            self.subsample = subsample
            self.k_neighbors = k_neighbors
            self.kmeans_estimator = kmeans_estimator

            # Weights from an earlier "class_weight" run must not leak into
            # a run that uses a different sampling method.
            self.class_weights = {0: 1.0, 1: 1.0}

            print(
                " >> Fitting Params: ",
                scaling_method,
                sampling_method,
                learning_rate,
                depth,
                l2_leaf_reg,
                subsample,
                k_neighbors,
                kmeans_estimator,
            )

            # Copy in case scaling_handler modifies its input in place.
            self.data_frame = raw_frame.copy()
            self.preprocess()
            fold_result = self.fit()

            tuning_results.append(
                {
                    "model": "cat_boost",
                    "scaling_method": scaling_method,
                    "sampling_method": sampling_method,
                    "learning_rate": learning_rate,
                    "depth": depth,
                    "l2_leaf_reg": l2_leaf_reg,
                    "subsample": subsample,
                    "k_neighbors": k_neighbors,
                    "kmeans_estimator": kmeans_estimator,
                    "metrics": fold_result,
                }
            )

        self.tuning_results = tuning_results

        # Save tuning results to CSV, expanding the metrics of each run into
        # their own columns.
        df_tuning = pandas.DataFrame(tuning_results)
        metrics_df = df_tuning["metrics"].apply(pandas.Series)
        df_tuning = pandas.concat(
            [df_tuning.drop(columns=["metrics"]), metrics_df], axis=1
        )
        df_tuning.to_csv(self.output_file_tuning, index=False)
        return