import os

import pandas
from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold

# Load the cleaned dataset and hold out a stratified 15% test set.
data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Negative-to-positive ratio of the training set, used by the cost-sensitive
# models below (scale_pos_weight / class_weights).
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos = neg / pos if pos > 0 else 1.0
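# For reference only: a minimal sketch of how the "smote_method" strings used
# in the model configs below could map onto imbalanced-learn's oversamplers.
# The actual resolution happens inside train.train_model_with_kfold; the
# helper name resolve_sampler is hypothetical and is never called in this
# script.
def resolve_sampler(method, random_state=42):
    """Hypothetical mapping from a smote_method string to an oversampler."""
    from imblearn.over_sampling import (
        ADASYN,
        SMOTE,
        BorderlineSMOTE,
        KMeansSMOTE,
        SVMSMOTE,
    )

    samplers = {
        "smote": SMOTE(random_state=random_state),
        "kmeans": KMeansSMOTE(random_state=random_state),
        "svm": SVMSMOTE(random_state=random_state),
        "borderline": BorderlineSMOTE(random_state=random_state),
        "adasyn": ADASYN(random_state=random_state),
    }
    return samplers[method]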
# Shared LightGBM settings; individual entries extend or override these.
LGBM_BASE = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42,
    verbose=-1,
    n_jobs=-1,
)
LGBM_BAGGING = dict(subsample=0.8, colsample_bytree=0.8)

models = [
    {
        "name": "LGBM_FOCAL_LOSS",
        "model": LGBMFocalWrapper(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_SMOTE",
        "model": LGBMClassifier(**LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "smote",
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(**LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_SVM_SMOTE",
        "model": LGBMClassifier(**LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "svm",
    },
    {
        "name": "LGBM_BORDERLINE_SMOTE",
        "model": LGBMClassifier(**LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "borderline",
    },
    {
        "name": "LGBM_ADASYN_SMOTE",
        "model": LGBMClassifier(**LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "adasyn",
    },
    {
        "name": "LGBM_Balanced",
        "model": LGBMClassifier(class_weight="balanced", **LGBM_BASE, **LGBM_BAGGING),
        "smote": False,
    },
    {
        "name": "LGBM_DART",
        "model": LGBMClassifier(boosting_type="dart", **LGBM_BASE, **LGBM_BAGGING),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        # GOSS replaces bagging with gradient-based sampling, so no
        # subsample / colsample_bytree here.
        "name": "LGBM_GOSS",
        "model": LGBMClassifier(boosting_type="goss", **LGBM_BASE),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        # LightGBM's rf mode requires active bagging: subsample < 1.0
        # together with subsample_freq > 0, otherwise training fails.
        "name": "LGBM_RF",
        "model": LGBMClassifier(
            boosting_type="rf", subsample_freq=1, **LGBM_BASE, **LGBM_BAGGING
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_scale_pos_weight",
        "model": LGBMClassifier(scale_pos_weight=scale_pos, **LGBM_BASE),
        "smote": False,
    },
    {
        "name": "LGBM_is_unbalance",
        "model": LGBMClassifier(is_unbalance=True, **LGBM_BASE),
        "smote": False,
    },
    {
        "name": "XGB_scale_pos_weight",
        "model": XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=scale_pos,
            random_state=42,
            n_jobs=-1,
            eval_metric="logloss",
        ),
        "smote": False,
    },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
        ),
        "smote": False,
    },
    {
        "name": "RandomForest_balanced",
        "model": RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "BalancedRandomForest",
        "model": BalancedRandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "LogisticRegression_balanced",
        "model": LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            solver="liblinear",
            random_state=42,
        ),
        "smote": False,
    },
]


def compute_confusion(y_true, y_pred):
    # Pin the label order so ravel() always yields (tn, fp, fn, tp),
    # even if one class is absent from a prediction vector.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {"TP": int(tp), "TN": int(tn), "FP": int(fp), "FN": int(fn)}


results_to_save = []

for m in models:
    print(f"\n===== Training model: {m['name']} =====")

    # Forward the configured oversampling variant (if any) to the trainer,
    # so the SMOTE-variant configs actually take effect.
    smote_kwargs = {"smote_method": m["smote_method"]} if "smote_method" in m else {}
    train_results = train_model_with_kfold(
        m["model"], x_train, y_train, n_splits=10, smote=m["smote"], **smote_kwargs
    )
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)

    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)

    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )

results_df = pandas.DataFrame(results_to_save)
csv_file = "lightgbm_results.csv"

# mode="a" creates the file when it is missing, so FileNotFoundError is never
# raised here; instead, write the header only when the file does not exist yet.
results_df.to_csv(csv_file, mode="a", index=False, header=not os.path.exists(csv_file))

print(f"\nAll results saved to {csv_file}")
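# For reference only: a minimal sketch (never called here) of leakage-free
# k-fold training with oversampling, assuming train.train_model_with_kfold
# works roughly like this. The sampler is fit inside each training fold via
# an imblearn Pipeline, so synthetic samples never leak into the validation
# fold. The function name and the f1 scoring choice are illustrative
# assumptions, not the repo's actual API.
def kfold_with_oversampling_sketch(model, X, y, n_splits=10, random_state=42):
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # The pipeline is cloned per fold by cross_val_score, so SMOTE only ever
    # sees the training portion of each split.
    pipeline = ImbPipeline(
        [("sampler", SMOTE(random_state=random_state)), ("model", model)]
    )
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return cross_val_score(pipeline, X, y, cv=cv, scoring="f1")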