import os

import pandas
from catboost import CatBoostClassifier
# from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier

# from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold

data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])

# Stratified hold-out split: 15% of the data is reserved for final testing.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Class-imbalance ratio (negatives per positive), used by the
# scale_pos_weight / class_weights settings in the configs below.
neg = sum(y_train == 0)
pos = sum(y_train == 1)
scale_pos = neg / pos if pos > 0 else 1.0

models = [
    # {
    #     "name": "LGBM_FOCAL_LOSS",
    #     "model": LGBMFocalWrapper(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "smote",
    # },
    # {
    #     "name": "LGBM_KMEANS_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SVM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "svm",
    # },
    # {
    #     "name": "LGBM_BORDERLINE_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "borderline",
    # },
    # {
    #     "name": "LGBM_ADASYN_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "adasyn",
    # },
    # {
    #     "name": "LGBM_Balanced",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         class_weight="balanced",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_DART",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         boosting_type="dart",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_GOSS",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="goss",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_RF",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="rf",
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_scale_pos_weight",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_is_unbalance",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         is_unbalance=True,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_DART",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         boosting_type="dart",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "XGB_scale_pos_weight",
    #     "model": XGBClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=6,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         n_jobs=-1,
    #         use_label_encoder=False,
    #         eval_metric="logloss",
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "CatBoost_balanced",
    #     "model": CatBoostClassifier(
    #         iterations=500,
    #         learning_rate=0.05,
    #         depth=6,
    #         class_weights=[1, scale_pos],
    #         random_state=42,
    #         verbose=0,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "RandomForest_balanced",
    #     "model": RandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         class_weight="balanced",
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "BalancedRandomForest",
    #     "model": BalancedRandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LogisticRegression_balanced",
    #     "model": LogisticRegression(
    #         max_iter=1000,
    #         class_weight="balanced",
    #         solver="liblinear",
    #         random_state=42,
    #     ),
    #     "smote": False,
    # },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
            thread_count=-1,  # use all cores; CatBoost's canonical name for this setting
        ),
        "smote": False,
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
]


def compute_confusion(y_true, y_pred):
    """Return the confusion-matrix cells as a flat dict for easy CSV export."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn}


results_to_save = []

for m in models:
    print(f"\n===== Training model: {m['name']} =====")

    # Forward "smote_method" only when a config defines one (assumes
    # train_model_with_kfold accepts that keyword for the oversampling variant).
    kfold_kwargs = {"n_splits": 10, "smote": m["smote"]}
    if "smote_method" in m:
        kfold_kwargs["smote_method"] = m["smote_method"]

    train_results = train_model_with_kfold(m["model"], x_train, y_train, **kfold_kwargs)
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)

    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)

    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )

results_df = pandas.DataFrame(results_to_save)
csv_file = "lgbm_vs_cat_kmeans_smote_k10_results.csv"

# to_csv(mode="a") creates a missing file itself, so catching FileNotFoundError
# never triggers and the header would never be written. Instead, write the
# header only on the first run, when the file does not exist yet.
results_df.to_csv(csv_file, mode="a", index=False, header=not os.path.exists(csv_file))

print(f"\nAll results saved to {csv_file}")