# Electrocardiogram/runner.py
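"""Train and compare imbalance-aware classifiers on an ECG dataset.

Each entry in `models` pairs an estimator with an imbalance strategy
(class weights, scale_pos_weight, or a SMOTE variant applied during
k-fold cross-validation); train/test metrics and confusion counts are
appended to a results CSV.
"""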
import os

import pandas
from catboost import CatBoostClassifier
# from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
# from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold
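# Inferred from their use below: `train_model_with_kfold` fits the given model
# with k-fold CV (optionally oversampling with SMOTE inside each fold) and
# returns a dict of metrics; `test_model` returns a dict of test metrics.
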
data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])
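
# Stratified 15% hold-out so both splits keep the original class ratio.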
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
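
# Class counts on the training split; neg/pos is the standard positive-class
# weight for imbalanced binary classification, used for CatBoost's
# class_weights and the commented-out scale_pos_weight configs below.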
neg = sum(y_train == 0)
pos = sum(y_train == 1)
scale_pos = neg / pos if pos > 0 else 1.0
models = [
    # {
    #     "name": "LGBM_FOCAL_LOSS",
    #     "model": LGBMFocalWrapper(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "smote",
    # },
    # {
    #     "name": "LGBM_KMEANS_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_SVM_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "svm",
    # },
    # {
    #     "name": "LGBM_BORDERLINE_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "borderline",
    # },
    # {
    #     "name": "LGBM_ADASYN_SMOTE",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "adasyn",
    # },
    # {
    #     "name": "LGBM_Balanced",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         class_weight="balanced",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_DART",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         boosting_type="dart",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_GOSS",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="goss",
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_RF",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         boosting_type="rf",
    #         subsample=0.8,
    #         colsample_bytree=0.8,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": True,
    #     "smote_method": "kmeans",
    # },
    # {
    #     "name": "LGBM_scale_pos_weight",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LGBM_is_unbalance",
    #     "model": LGBMClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=-1,
    #         is_unbalance=True,
    #         random_state=42,
    #         verbose=-1,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "XGB_scale_pos_weight",
    #     "model": XGBClassifier(
    #         n_estimators=500,
    #         learning_rate=0.05,
    #         max_depth=6,
    #         scale_pos_weight=scale_pos,
    #         random_state=42,
    #         n_jobs=-1,
    #         use_label_encoder=False,
    #         eval_metric="logloss",
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "CatBoost_balanced",
    #     "model": CatBoostClassifier(
    #         iterations=500,
    #         learning_rate=0.05,
    #         depth=6,
    #         class_weights=[1, scale_pos],
    #         random_state=42,
    #         verbose=0,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "RandomForest_balanced",
    #     "model": RandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         class_weight="balanced",
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "BalancedRandomForest",
    #     "model": BalancedRandomForestClassifier(
    #         n_estimators=500,
    #         max_depth=None,
    #         random_state=42,
    #         n_jobs=-1,
    #     ),
    #     "smote": False,
    # },
    # {
    #     "name": "LogisticRegression_balanced",
    #     "model": LogisticRegression(
    #         max_iter=1000,
    #         class_weight="balanced",
    #         solver="liblinear",
    #         random_state=42,
    #     ),
    #     "smote": False,
    # },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
            # CatBoost does not accept n_jobs; thread_count=-1 uses all cores.
            thread_count=-1,
        ),
        "smote": False,
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
]
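
# For reference, a minimal sketch (assuming imbalanced-learn) of what the
# "kmeans" SMOTE step inside each CV fold presumably looks like; x_fold and
# y_fold are hypothetical names for one fold's training data:
#
#     from imblearn.over_sampling import KMeansSMOTE
#     x_res, y_res = KMeansSMOTE(random_state=42).fit_resample(x_fold, y_fold)
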
def compute_confusion(y_true, y_pred):
    # Unpack the 2x2 confusion matrix; assumes binary 0/1 labels with both
    # classes present, otherwise ravel() yields fewer than four values.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn}
results_to_save = []
for m in models:
    print(f"\n===== Training model: {m['name']} =====")
    # Note: "smote_method" from the configs is not forwarded here; the SMOTE
    # variant is presumably selected inside train_model_with_kfold.
    train_results = train_model_with_kfold(
        m["model"], x_train, y_train, n_splits=10, smote=m["smote"]
    )
    # Confusion counts on the full training split after k-fold training.
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)
    # Metrics and confusion counts on the held-out test split.
    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)
    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )
results_df = pandas.DataFrame(results_to_save)
csv_file = "lgbm_vs_cat_kmeans_smote_k10_results.csv"
# Append results; to_csv(mode="a") creates the file if it is missing (it never
# raises FileNotFoundError), so write the header only on first creation.
write_header = not os.path.exists(csv_file)
results_df.to_csv(csv_file, mode="a", index=False, header=write_header)
print(f"\nAll results saved to {csv_file}")