From ccd94812bcb9255a2bccf6c995e3f50250835eba Mon Sep 17 00:00:00 2001
From: saeedkhosravi94
Date: Wed, 10 Dec 2025 22:03:58 +0100
Subject: [PATCH] main.R

---
 main.R | 65 ++++++++++++++++++++++------------------------------------
 1 file changed, 25 insertions(+), 40 deletions(-)

diff --git a/main.R b/main.R
index 083670e..ecded79 100644
--- a/main.R
+++ b/main.R
@@ -4,14 +4,11 @@
 library(lightgbm)
 library(MLmetrics)

-# 1. Load your data
 df <- read.csv("./data/Ketamine_icp.csv")

-# --- 2. Data Preparation ---
-target_name <- "label" 
+target_name <- "label"
 target_index <- which(names(df) == target_name)

-# Prepare target variable
 if (is.factor(df[, target_index])) {
   y <- as.numeric(df[, target_index]) - 1
 } else {
@@ -19,18 +16,16 @@ if (is.factor(df[, target_index])) {
 }

 # Create the data matrix for features
-X <- as.matrix(df[, -target_index]) 
+X <- as.matrix(df[, -target_index])

-# --- 3. Split Data into Training and Testing Sets ---
-set.seed(42) 
-train_index <- sample(nrow(X), size = 0.8 * nrow(X)) 
+set.seed(42)
+train_index <- sample(nrow(X), size = 0.8 * nrow(X))

 X_train <- X[train_index, ]
 X_test <- X[-train_index, ]
 y_train <- y[train_index]
 y_test <- y[-train_index]

-# --- 4. Get Feature Importance from Full Model ---
 lgb_train_full <- lgb.Dataset(data = X_train, label = y_train)

 params <- list(
@@ -64,23 +59,21 @@ results_df <- data.frame(
   Recall_class1 = numeric()
 )

-# --- 5. Loop through different numbers of top features ---
 cat("Training models with different numbers of top features...\n")
-cat("=====================================================\n")

 for (i in 1:num_features) {
   cat(paste("Training model with top", i, "features...\n"))
-  
+
   # Select top i features
   top_features <- importance$Feature[1:i]
-  
+
   # Subset training and test data
   X_train_sub <- X_train[, top_features, drop = FALSE]
   X_test_sub <- X_test[, top_features, drop = FALSE]
-  
+
   # Create LightGBM dataset
   lgb_train_sub <- lgb.Dataset(data = X_train_sub, label = y_train)
-  
+
   # Train model with subset of features
   bst_sub <- lgb.train(
     params = params,
@@ -88,27 +81,27 @@ for (i in 1:num_features) {
     nrounds = 100,
     verbose = -1
   )
-  
+
   # Make predictions
   pred_prob_sub <- predict(bst_sub, X_test_sub)
   pred_class_sub <- as.numeric(pred_prob_sub > 0.5)
-  
+
   # Calculate metrics
   accuracy <- mean(pred_class_sub == y_test)
-  
+
   # For binary classification
   if (length(unique(y_test)) == 2) {
     # F1 score for class 1
     f1 <- F1_Score(y_true = y_test, y_pred = pred_class_sub, positive = 1)
-    
+
     # Precision and Recall for class 1
     precision <- Precision(y_true = y_test, y_pred = pred_class_sub, positive = 1)
     recall <- Recall(y_true = y_test, y_pred = pred_class_sub, positive = 1)
-    
+
     # F2-score (beta = 2)
     beta <- 2
     f2 <- (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)
-    
+
     # Handle cases where precision or recall might be NaN
     if (is.na(f2)) {
       f2 <- 0
@@ -120,7 +113,7 @@ for (i in 1:num_features) {
     recall <- NA
     f2 <- NA
   }
-  
+
   # Store results
   results_df <- rbind(results_df, data.frame(
     Num_Features = i,
@@ -131,49 +124,44 @@ for (i in 1:num_features) {
     Precision_class1 = round(precision, 4),
     Recall_class1 = round(recall, 4)
   ))
-  
+
   # Print progress
-  cat(paste("  Accuracy:", round(accuracy, 4), 
+  cat(paste("  Accuracy:", round(accuracy, 4),
             "| F1:", round(f1, 4),
             "| F2:", round(f2, 4),
             "| Precision:", round(precision, 4),
             "| Recall:", round(recall, 4), "\n"))
 }

-cat("=====================================================\n")
-# --- 6. Display Results ---
-cat("\nSummary of Results:\n")
-cat("===================\n")
+cat("Summary of Results:\n")
 print(results_df)

 # Find best performing models based on different metrics
 cat("\nBest Performing Models:\n")
-cat("=======================\n")

 # Best by F1 score
 if (!all(is.na(results_df$F1_class1))) {
   best_f1_idx <- which.max(results_df$F1_class1)
-  cat(paste("Best F1-score (", results_df$F1_class1[best_f1_idx], 
+  cat(paste("Best F1-score (", results_df$F1_class1[best_f1_idx],
             ") with", results_df$Num_Features[best_f1_idx], "features\n"))
 }

 # Best by F2 score
 if (!all(is.na(results_df$F2_class1))) {
   best_f2_idx <- which.max(results_df$F2_class1)
-  cat(paste("Best F2-score (", results_df$F2_class1[best_f2_idx], 
+  cat(paste("Best F2-score (", results_df$F2_class1[best_f2_idx],
             ") with", results_df$Num_Features[best_f2_idx], "features\n"))
 }

 # Best by Accuracy
 best_acc_idx <- which.max(results_df$Accuracy)
-cat(paste("Best Accuracy (", results_df$Accuracy[best_acc_idx], 
+cat(paste("Best Accuracy (", results_df$Accuracy[best_acc_idx],
           ") with", results_df$Num_Features[best_acc_idx], "features\n"))


-# --- 7. Optional: Plot metrics vs number of features ---
 if (require(ggplot2)) {
   library(ggplot2)
-  
+
   # Plot F1 and F2 scores
   p1 <- ggplot(results_df, aes(x = Num_Features)) +
     geom_line(aes(y = F1_class1, color = "F1 Score"), size = 1) +
@@ -185,7 +173,7 @@ if (require(ggplot2)) {
          y = "Score Value") +
     theme_minimal() +
     scale_color_manual(values = c("F1 Score" = "blue", "F2 Score" = "red"))
-  
+
   # Plot Accuracy
   p2 <- ggplot(results_df, aes(x = Num_Features, y = Accuracy)) +
     geom_line(color = "darkgreen", size = 1) +
@@ -194,7 +182,7 @@ if (require(ggplot2)) {
          x = "Number of Top Features",
          y = "Accuracy") +
     theme_minimal()
-  
+
   # Plot Precision and Recall
   p3 <- ggplot(results_df, aes(x = Num_Features)) +
     geom_line(aes(y = Precision_class1, color = "Precision"), size = 1) +
@@ -206,17 +194,14 @@ if (require(ggplot2)) {
          y = "Score Value") +
     theme_minimal() +
     scale_color_manual(values = c("Precision" = "purple", "Recall" = "orange"))
-  
+
   # Display plots
   print(p1)
   print(p2)
   print(p3)
 }

-# --- 8. Save results to CSV ---
 write.csv(results_df, "feature_selection_results.csv", row.names = FALSE)
 cat("\nResults saved to 'feature_selection_results.csv'\n")

-# --- 9. Display top 20 feature importance plot ---
 lgb.plot.importance(importance, top_n = min(20, num_features))
-
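
Note on the F2 logic the patch leaves in place: it is the standard F-beta score,
F_beta = (1 + beta^2) * P * R / (beta^2 * P + R) with beta = 2, which weights recall
more heavily than precision. A minimal standalone R sketch of the same computation,
using hypothetical precision/recall values (not taken from the dataset) and the same
NaN guard the script applies for the 0/0 case:

  # F-beta with beta = 2: recall counts four times as much as precision
  precision <- 0.75   # hypothetical value, for illustration only
  recall <- 0.60      # hypothetical value, for illustration only
  beta <- 2
  f2 <- (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)
  # Guard against NaN: the ratio is 0/0 when precision and recall are both 0
  if (is.na(f2)) f2 <- 0
  cat(round(f2, 4), "\n")  # prints 0.625 = 5 * 0.45 / 3.6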