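"""Benchmark of class-imbalance handling strategies on the ketamine ICP dataset.

Trains LightGBM variants (focal loss, SMOTE-family oversampling, class
weighting, dart/goss/rf boosting) plus XGBoost, CatBoost, random-forest, and
logistic-regression baselines with 10-fold cross-validation, then appends
train/test metrics and confusion-matrix counts to a CSV file.
"""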
import os

import pandas

from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from custom_models.LGBMFocalWrapper import LGBMFocalWrapper
from train import test_model, train_model_with_kfold

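# Project-local helpers (not shown in this file): LGBMFocalWrapper wraps
# LGBMClassifier with a focal-loss objective, and train.py provides k-fold
# training with optional oversampling plus held-out evaluation.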

# Load the cleaned dataset; "label" is the binary target column.
data_frame = pandas.read_csv("./data/Ketamin_icp_cleaned.csv")
y = data_frame["label"]
X = data_frame.drop(columns=["label"])

# Stratified 15% hold-out so the class ratio is preserved in the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Negative/positive ratio on the training split, used as the positive-class
# weight by the cost-sensitive models below.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos = neg / pos if pos > 0 else 1.0

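# Illustration with hypothetical counts: 900 negatives and 100 positives in
# the training split give scale_pos = 900 / 100 = 9.0, i.e. each positive
# example counts nine times as much in the cost-sensitive models below.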
# Candidate models. Each entry names a strategy, the estimator, and whether
# the k-fold helper should oversample the minority class (and how).
models = [
    # LightGBM with a focal-loss objective (via the custom wrapper),
    # combined with k-means SMOTE oversampling.
    {
        "name": "LGBM_FOCAL_LOSS",
        "model": LGBMFocalWrapper(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    # Plain LightGBM paired with each oversampling variant.
    {
        "name": "LGBM_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "smote",
    },
    {
        "name": "LGBM_KMEANS_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_SVM_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "svm",
    },
    {
        "name": "LGBM_BORDERLINE_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "borderline",
    },
    {
        "name": "LGBM_ADASYN_SMOTE",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "adasyn",
    },
    # Cost-sensitive LightGBM: reweight classes instead of resampling.
    {
        "name": "LGBM_Balanced",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            class_weight="balanced",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # Alternative boosting modes, each with k-means SMOTE.
    {
        "name": "LGBM_DART",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            boosting_type="dart",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_GOSS",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            boosting_type="goss",
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    {
        "name": "LGBM_RF",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            boosting_type="rf",
            subsample=0.8,
            # LightGBM's rf mode requires bagging to be active, i.e.
            # subsample < 1 together with subsample_freq > 0.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": True,
        "smote_method": "kmeans",
    },
    # Cost-sensitive LightGBM via the raw booster parameters.
    {
        "name": "LGBM_scale_pos_weight",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            scale_pos_weight=scale_pos,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "LGBM_is_unbalance",
        "model": LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            is_unbalance=True,
            random_state=42,
            verbose=-1,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # Non-LightGBM baselines, all cost-sensitive rather than resampled.
    {
        "name": "XGB_scale_pos_weight",
        "model": XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=scale_pos,
            random_state=42,
            n_jobs=-1,
            # use_label_encoder is deprecated (removed in xgboost >= 2.0)
            # and is omitted here; the labels are already 0/1 integers.
            eval_metric="logloss",
        ),
        "smote": False,
    },
    {
        "name": "CatBoost_balanced",
        "model": CatBoostClassifier(
            iterations=500,
            learning_rate=0.05,
            depth=6,
            # Weight the positive class by the negative/positive ratio.
            class_weights=[1, scale_pos],
            random_state=42,
            verbose=0,
        ),
        "smote": False,
    },
    {
        "name": "RandomForest_balanced",
        "model": RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    # imblearn's forest, which undersamples the majority class per tree.
    {
        "name": "BalancedRandomForest",
        "model": BalancedRandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            random_state=42,
            n_jobs=-1,
        ),
        "smote": False,
    },
    {
        "name": "LogisticRegression_balanced",
        "model": LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            solver="liblinear",
            random_state=42,
        ),
        "smote": False,
    },
]
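# Assumption: the "smote_method" strings are resolved inside
# train_model_with_kfold (train.py, not shown) to the corresponding imblearn
# oversamplers, along the lines of:
#   "smote"      -> imblearn.over_sampling.SMOTE
#   "kmeans"     -> imblearn.over_sampling.KMeansSMOTE
#   "svm"        -> imblearn.over_sampling.SVMSMOTE
#   "borderline" -> imblearn.over_sampling.BorderlineSMOTE
#   "adasyn"     -> imblearn.over_sampling.ADASYN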


def compute_confusion(y_true, y_pred):
    # For binary 0/1 labels, confusion_matrix returns [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn}

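# Example with illustrative values:
#   compute_confusion([0, 1, 1, 0], [0, 1, 0, 0])
#   -> {"TP": 1, "TN": 2, "FP": 0, "FN": 1}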

results_to_save = []

for m in models:
    print(f"\n===== Training model: {m['name']} =====")

    # 10-fold cross-validated training, with oversampling handled inside the
    # helper when smote=True. Passing smote_method through assumes
    # train_model_with_kfold accepts that keyword; the "smote_method" keys
    # above are otherwise never read.
    train_results = train_model_with_kfold(
        m["model"],
        x_train,
        y_train,
        n_splits=10,
        smote=m["smote"],
        smote_method=m.get("smote_method"),
    )

    # Training-split predictions are saved alongside the held-out test
    # results below so that overfitting is visible in the metrics.
    y_train_pred = m["model"].predict(x_train)
    train_confusion = compute_confusion(y_train, y_train_pred)

    test_results = test_model(m["model"], x_test, y_test)
    y_test_pred = m["model"].predict(x_test)
    test_confusion = compute_confusion(y_test, y_test_pred)

    results_to_save.append(
        {"model": m["name"], "stage": "train", **train_results, **train_confusion}
    )
    results_to_save.append(
        {"model": m["name"], "stage": "test", **test_results, **test_confusion}
    )

results_df = pandas.DataFrame(results_to_save)
csv_file = "lightgbm_results.csv"

# Append to the results file; pandas creates it on the first run, so the
# header row is written only when the file does not exist yet.
results_df.to_csv(csv_file, mode="a", index=False, header=not os.path.exists(csv_file))

print(f"\nAll results saved to {csv_file}")
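# Each run appends two rows per model (stage "train" and stage "test"), so
# repeated runs accumulate in lightgbm_results.csv; delete the file to start
# a fresh comparison.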