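"""Benchmark of class-imbalance handling strategies on the ketamine ICP dataset.

Trains LightGBM variants (focal loss, SMOTE-family oversampling, class
weighting, dart/goss/rf boosting) plus XGBoost, CatBoost, random-forest, and
logistic-regression baselines with 10-fold cross-validation, then appends
train/test metrics and confusion-matrix counts to a CSV file.
"""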
import os

import pandas

from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold

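# Project-local helpers (not shown in this file): LGBMFocalWrapper wraps
# LGBMClassifier with a focal-loss objective, and train.py provides k-fold
# training with optional oversampling plus held-out evaluation.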

# Load the cleaned dataset; "label" is the binary target column.
data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])

# Stratified 15% hold-out so the class ratio is preserved in the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Negative/positive ratio on the training split, used as the positive-class
# weight by the cost-sensitive models below.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos = neg / pos if pos > 0 else 1.0

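# Illustration with hypothetical counts: 900 negatives and 100 positives in
# the training split give scale_pos = 900 / 100 = 9.0, i.e. each positive
# example counts nine times as much in the cost-sensitive models below.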
# Candidate models. Each entry names a strategy, the estimator, and whether
# the k-fold helper should oversample the minority class (and how).
models = [
    # LightGBM with a focal-loss objective (via the custom wrapper),
    # combined with k-means SMOTE oversampling.
    {
        "name": "LGBM_FOCAL_LOSS",
        "model": LGBMFocalWrapper(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    # Plain LightGBM paired with each oversampling variant.
    {
        "name": "LGBM_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "smote",
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_SVM_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "svm",
    },
    {
        "name": "LGBM_BORDERLINE_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "borderline",
    },
    {
        "name": "LGBM_ADASYN_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "adasyn",
    },
    # Cost-sensitive LightGBM: reweight classes instead of resampling.
    {
        "name": "LGBM_Balanced",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            class_weight="balanced",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # Alternative boosting modes, each with k-means SMOTE.
    {
        "name": "LGBM_DART",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            boosting_type="dart",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_GOSS",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            boosting_type="goss",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_RF",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            boosting_type="rf",
            subsample=0.8,
            # LightGBM's rf mode requires bagging to be active, i.e.
            # subsample < 1 together with subsample_freq > 0.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    # Cost-sensitive LightGBM via the raw booster parameters.
    {
        "name": "LGBM_scale_pos_weight",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            scale_pos_weight=scale_pos,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "LGBM_is_unbalance",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            is_unbalance=True,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # Non-LightGBM baselines, all cost-sensitive rather than resampled.
    {
        "name": "XGB_scale_pos_weight",
        "model": XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=scale_pos,
            random_state=42,
            n_jobs=-1,
            # use_label_encoder is deprecated (removed in xgboost >= 2.0)
            # and is omitted here; the labels are already 0/1 integers.
            eval_metric="logloss",
        ),
        "smote": False,
    },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            # Weight the positive class by the negative/positive ratio.
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
        ),
        "smote": False,
    },
    {
        "name": "RandomForest_balanced",
        "model": RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # imblearn's forest, which undersamples the majority class per tree.
    {
        "name": "BalancedRandomForest",
        "model": BalancedRandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "LogisticRegression_balanced",
        "model": LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            solver="liblinear",
            random_state=42,
        ),
        "smote": False,
    },
]
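# Assumption: the "smote_method" strings are resolved inside
# train_model_with_kfold (train.py, not shown) to the corresponding imblearn
# oversamplers, along the lines of:
#   "smote"      -> imblearn.over_sampling.SMOTE
#   "kmeans"     -> imblearn.over_sampling.KMeansSMOTE
#   "svm"        -> imblearn.over_sampling.SVMSMOTE
#   "borderline" -> imblearn.over_sampling.BorderlineSMOTE
#   "adasyn"     -> imblearn.over_sampling.ADASYN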


def compute_confusion(y_true, y_pred):
    # For binary 0/1 labels, confusion_matrix returns [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn}

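# Example with illustrative values:
#   compute_confusion([0, 1, 1, 0], [0, 1, 0, 0])
#   -> {"TP": 1, "TN": 2, "FP": 0, "FN": 1}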

results_to_save = []

for m in models:
    print(f"\n===== Training model: {m['name']} =====")

    # 10-fold cross-validated training, with oversampling handled inside the
    # helper when smote=True. Passing smote_method through assumes
    # train_model_with_kfold accepts that keyword; the "smote_method" keys
    # above are otherwise never read.
    train_results = train_model_with_kfold(
        m["model"],
        x_train,
        y_train,
        n_splits=10,
        smote=m["smote"],
        smote_method=m.get("smote_method"),
    )

    # Training-split predictions are saved alongside the held-out test
    # results below so that overfitting is visible in the metrics.
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)

    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)

    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )

results_df = pandas.DataFrame(results_to_save)
csv_file = "lightgbm_results.csv"

# Append to the results file; pandas creates it on the first run, so the
# header row is written only when the file does not exist yet.
results_df.to_csv(csv_file, mode="a", index=False, header=not os.path.exists(csv_file))

print(f"\nAll results saved to {csv_file}")
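# Each run appends two rows per model (stage "train" and stage "test"), so
# repeated runs accumulate in lightgbm_results.csv; delete the file to start
# a fresh comparison.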