# Electrocardiogram/runner.py
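"""Train and compare imbalance-aware classifiers on an ECG dataset.

Each entry in `models` pairs an estimator with an imbalance strategy
(class weights, scale_pos_weight, or a SMOTE variant applied during
k-fold cross-validation); train/test metrics and confusion counts are
appended to a results CSV.
"""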
import os

import pandas
from catboost import CatBoostClassifier
# from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
# from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold
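# Inferred from their use below: `train_model_with_kfold` fits the given model
# with k-fold CV (optionally oversampling with SMOTE inside each fold) and
# returns a dict of metrics; `test_model` returns a dict of test metrics.
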
data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])
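
# Stratified 15% hold-out so both splits keep the original class ratio.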
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
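
# Class counts on the training split; neg/pos is the standard positive-class
# weight for imbalanced binary classification, used for CatBoost's
# class_weights and the commented-out scale_pos_weight configs below.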
neg = sum(y_train == 0)
pos = sum(y_train == 1)
scale_pos = neg / pos if pos > 0 else 1.0
models = [
    # {
    #     "name": "LGBM_FOCAL_LOSS",
    #     "model": LGBMFocalWrapper(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "smote",
    # },
    # {
    #     "name": "LGBM_KMEANS_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SVM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "svm",
    # },
    # {
    #     "name": "LGBM_BORDERLINE_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "borderline",
    # },
    # {
    #     "name": "LGBM_ADASYN_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "adasyn",
    # },
    # {
    #     "name": "LGBM_Balanced",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         class_weight="balanced",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_DART",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         boosting_type="dart",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_GOSS",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="goss",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_RF",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="rf",
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_scale_pos_weight",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_is_unbalance",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         is_unbalance=True,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "XGB_scale_pos_weight",
    #     "model": XGBClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=6,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         n_jobs=-1,
    #         use_label_encoder=False,
    #         eval_metric="logloss",
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "CatBoost_balanced",
    #     "model": CatBoostClassifier(
    #         iterations=500,
    #         learning_rate=0.05,
    #         depth=6,
    #         class_weights=[1, scale_pos],
    #         random_state=42,
    #         verbose=0,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "RandomForest_balanced",
    #     "model": RandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         class_weight="balanced",
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "BalancedRandomForest",
    #     "model": BalancedRandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LogisticRegression_balanced",
    #     "model": LogisticRegression(
    #         max_iter=1000,
    #         class_weight="balanced",
    #         solver="liblinear",
    #         random_state=42,
    #     ),
    #     "smote": False,
    # },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
            # CatBoost does not accept n_jobs; thread_count=-1 uses all cores.
            thread_count=-1,
        ),
        "smote": False,
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
]
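
# For reference, a minimal sketch (assuming imbalanced-learn) of what the
# "kmeans" SMOTE step inside each CV fold presumably looks like; x_fold and
# y_fold are hypothetical names for one fold's training data:
#
#     from imblearn.over_sampling import KMeansSMOTE
#     x_res, y_res = KMeansSMOTE(random_state=42).fit_resample(x_fold, y_fold)
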
def compute_confusion(y_true, y_pred):
    # Unpack the 2x2 confusion matrix; assumes binary 0/1 labels with both
    # classes present, otherwise ravel() yields fewer than four values.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn}
results_to_save = []
for m in models:
    print(f"\n===== Training model: {m['name']} =====")
    # Note: "smote_method" from the configs is not forwarded here; the SMOTE
    # variant is presumably selected inside train_model_with_kfold.
    train_results = train_model_with_kfold(
        m["model"], x_train, y_train, n_splits=10, smote=m["smote"]
    )
    # Confusion counts on the full training split after k-fold training.
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)
    # Metrics and confusion counts on the held-out test split.
    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)
    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )
results_df = pandas.DataFrame(results_to_save)
csv_file = "lgbm_vs_cat_kmeans_smote_k10_results.csv"
# Append results; to_csv(mode="a") creates the file if it is missing (it never
# raises FileNotFoundError), so write the header only on first creation.
write_header = not os.path.exists(csv_file)
results_df.to_csv(csv_file, mode="a", index=False, header=write_header)
print(f"\nAll results saved to {csv_file}")