compare two models
models/lightgbm_model.py (new file, 237 lines)
@@ -0,0 +1,237 @@
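"""LightGBM binary classifier with stratified k-fold training, optional
feature scaling, imbalance handling (KMeansSMOTE or class weights), and
grid-search hyperparameter tuning over a fixed parameter grid."""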
from itertools import product

import lightgbm as lgb
import pandas
from imblearn.over_sampling import KMeansSMOTE
from model_utils import average_fold_results, get_metrics, scaling_handler
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm


class LIGHT_GBM:
    def __init__(self, data_frame, params=None, n_split_kfold=5, test_size=0.15, seed=42):
        self.data_frame = data_frame
        self.params = params if params is not None else {}  # avoid a shared mutable default
        self.n_split_kfold = n_split_kfold
        self.test_size = test_size
        self.seed = seed  # use the argument instead of a hard-coded 42

        self.x_test = None
        self.y_test = None

        self.scaling_method = None
        self.sampling_method = None
        self.class_weights = {0: 1.0, 1: 1.0}
        self.model = None

        # Hyperparameters, overridable via the params dict.
        self.learning_rate = self.params.get("learning_rate", 0.1)
        self.num_leaves = self.params.get("num_leaves", 100)
        self.boosting_type = self.params.get("boosting_type", "gbdt")
        self.l1_reg = self.params.get("l1_reg", 0.1)
        self.l2_reg = self.params.get("l2_reg", 0.1)
        self.subsample = self.params.get("subsample", 1.0)
        self.tree_subsample = self.params.get("tree_subsample", 1.0)
        self.k_neighbors = self.params.get("k_neighbors", 10)
        self.kmeans_estimator = self.params.get("kmeans_estimator", 5)
        self.tuning_results = None

    def preprocess(self):
        self.scaling_method = self.params.get("scaling_method", None)
        if self.scaling_method:
            self.data_frame = scaling_handler(self.data_frame, self.scaling_method)

    def fit(self):
        y = self.data_frame["label"]
        X = self.data_frame.drop(columns=["label"])

        # Hold out a stratified test set; the remainder is used for k-fold CV.
        x_train_val, self.x_test, y_train_val, self.y_test = train_test_split(
            X, y, test_size=self.test_size, stratify=y, random_state=self.seed
        )

        skf = StratifiedKFold(
            n_splits=self.n_split_kfold, shuffle=True, random_state=self.seed
        )

        fold_results = []

        for train_index, val_index in tqdm(
            skf.split(x_train_val, y_train_val),
            total=self.n_split_kfold,
            desc=" >> LightGBM Fitting: ",
        ):
            x_train_fold, x_val = (
                x_train_val.iloc[train_index],
                x_train_val.iloc[val_index],
            )
            y_train_fold, y_val = (
                y_train_val.iloc[train_index],
                y_train_val.iloc[val_index],
            )

            self.sampling_method = self.params.get("sampling_method", None)
            if self.sampling_method == "KMeansSMOTE":
                # Oversample the minority class on the training fold only,
                # so the validation fold stays untouched.
                smote = KMeansSMOTE(
                    sampling_strategy="minority",
                    k_neighbors=self.k_neighbors,
                    kmeans_estimator=self.kmeans_estimator,
                    cluster_balance_threshold=0.001,
                    random_state=self.seed,
                    n_jobs=-1,
                )
                x_train_fold, y_train_fold = smote.fit_resample(
                    x_train_fold, y_train_fold
                )
                y_train_fold = y_train_fold.astype(int)

            elif self.sampling_method == "class_weight":
                # Weight the positive class by the negative/positive ratio.
                self.class_1_weight = int(
                    (y_train_fold.shape[0] - y_train_fold.sum()) / y_train_fold.sum()
                )
                self.class_weights = {0: 1, 1: self.class_1_weight}

            self.model = lgb.LGBMClassifier(
                boosting_type=self.boosting_type,
                learning_rate=self.learning_rate,
                num_leaves=self.num_leaves,
                reg_alpha=self.l1_reg,
                reg_lambda=self.l2_reg,
                subsample=self.subsample,
                subsample_freq=1,
                colsample_bytree=self.tree_subsample,
                class_weight=self.class_weights
                if self.sampling_method == "class_weight"
                else None,
                n_estimators=100,
                random_state=self.seed,
                verbose=-1,
            )

            self.model.fit(x_train_fold, y_train_fold)
            y_pred_val = self.model.predict(x_val)
            val_metrics = get_metrics(y_val, y_pred_val)
            fold_results.append(val_metrics)

        return average_fold_results(fold_results)

    def eval(self, x_test=None, y_test=None):
        # Evaluate on the held-out split from fit() unless an explicit
        # test set is provided.
        if x_test is not None and y_test is not None:
            self.x_test = x_test
            self.y_test = y_test
        self.y_pred_test = self.model.predict(self.x_test)
        test_metrics = get_metrics(self.y_test, self.y_pred_test)
        return test_metrics

    def tune(self):
        scaling_methods = [
            "standard_scaling",
            "robust_scaling",
            "minmax_scaling",
            "yeo_johnson",
        ]
        sampling_methods = [
            "KMeansSMOTE",
            "class_weight",
        ]
        boosting_type_list = ["gbdt", "dart"]
        learning_rate_list = [0.03, 0.05, 0.1]
        number_of_leaves_list = [100]
        l2_regularization_lambda_list = [0.1]
        l1_regularization_alpha_list = [0.1]
        tree_subsample_tree_list = [0.8, 1.0]
        subsample_list = [0.8, 1.0]
        kmeans_smote_k_neighbors_list = [10]
        kmeans_smote_n_clusters_list = [5]
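        # Full grid: 4 scaling x 2 sampling x 2 boosting x 3 learning rates
        # x 2 tree_subsample x 2 subsample = 192 combinations, each one
        # cross-validated by fit() below.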

        tuning_results = []

        param_product = list(
            product(
                scaling_methods,
                sampling_methods,
                boosting_type_list,
                learning_rate_list,
                number_of_leaves_list,
                l2_regularization_lambda_list,
                l1_regularization_alpha_list,
                tree_subsample_tree_list,
                subsample_list,
                kmeans_smote_k_neighbors_list,
                kmeans_smote_n_clusters_list,
            )
        )

        # preprocess() replaces self.data_frame in place, so keep a pristine
        # copy and restore it before each grid point; otherwise every run
        # would re-scale data the previous run had already scaled.
        original_frame = self.data_frame.copy()

        for (
            scaling_method,
            sampling_method,
            boosting_type,
            learning_rate,
            num_leaves,
            l2_reg,
            l1_reg,
            tree_subsample,
            subsample,
            k_neighbors,
            kmeans_estimator,
        ) in tqdm(param_product, total=len(param_product), desc=" > LightGBM Tuning: "):
            self.scaling_method = scaling_method
            self.sampling_method = sampling_method
            self.boosting_type = boosting_type
            self.learning_rate = learning_rate
            self.num_leaves = num_leaves
            self.l2_reg = l2_reg
            self.l1_reg = l1_reg
            self.tree_subsample = tree_subsample
            self.subsample = subsample
            self.k_neighbors = k_neighbors
            self.kmeans_estimator = kmeans_estimator

            print(
                " >> Fitting Params: ",
                scaling_method,
                sampling_method,
                boosting_type,
                learning_rate,
                num_leaves,
                l2_reg,
                l1_reg,
                tree_subsample,
                subsample,
                k_neighbors,
                kmeans_estimator,
            )
            self.data_frame = original_frame.copy()
            self.preprocess()

            fold_result = self.fit()

            tuning_results.append(
                {
                    "model": "lightgbm",
                    "scaling_method": scaling_method,
                    "sampling_method": sampling_method,
                    "boosting_type": boosting_type,
                    "learning_rate": learning_rate,
                    "num_leaves": num_leaves,
                    "l2_reg": l2_reg,
                    "l1_reg": l1_reg,
                    "tree_subsample": tree_subsample,
                    "subsample": subsample,
                    "k_neighbors": k_neighbors,
                    "kmeans_estimator": kmeans_estimator,
                    "metrics": fold_result,
                }
            )

        self.tuning_results = tuning_results

        # Flatten the per-run metrics dict into columns and persist the grid.
        df_tuning = pandas.DataFrame(tuning_results)
        metrics_df = df_tuning["metrics"].apply(pandas.Series)
        df_tuning = pandas.concat(
            [df_tuning.drop(columns=["metrics"]), metrics_df], axis=1
        )
        df_tuning.to_csv("lightgbm_tuning_results.csv", index=False)

        return df_tuning
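

# Minimal usage sketch (illustrative only): assumes a DataFrame whose binary
# target column is named "label", as fit() expects, and that model_utils is
# importable. The input path below is hypothetical.
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_csv("dataset.csv")  # hypothetical input file

    model = LIGHT_GBM(df, params={"scaling_method": "standard_scaling"})
    model.preprocess()
    cv_metrics = model.fit()     # averaged k-fold validation metrics
    test_metrics = model.eval()  # held-out test split created in fit()
    print(cv_metrics, test_metrics)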