#install.packages("lightgbm", repos = "https://cran.r-project.org")
#install.packages("MLmetrics")

library(lightgbm)
library(MLmetrics)

df <- read.csv("./data/Ketamine_icp.csv")

target_name <- "label"
target_index <- which(names(df) == target_name)

if (is.factor(df[, target_index])) {
  y <- as.numeric(df[, target_index]) - 1
} else {
  y <- df[, target_index]
}

# Create the data matrix for features
X <- as.matrix(df[, -target_index])

set.seed(42)
train_index <- sample(nrow(X), size = 0.8 * nrow(X))

X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]

lgb_train_full <- lgb.Dataset(data = X_train, label = y_train)

params <- list(
  objective = "binary",
  metric = "binary_logloss",
  boosting_type = "gbdt",
  num_leaves = 20,
  learning_rate = 0.05,
  feature_fraction = 0.8
)

bst_full <- lgb.train(
  params = params,
  data = lgb_train_full,
  nrounds = 100,
  verbose = -1
)

# Get feature importance
importance <- lgb.importance(bst_full)
num_features <- nrow(importance)

# Create a data frame to store results
results_df <- data.frame(
  Num_Features = integer(),
  Features = character(),
  Accuracy = numeric(),
  F1_class1 = numeric(),
  F2_class1 = numeric(),
  Precision_class1 = numeric(),
  Recall_class1 = numeric()
)

cat("Training models with different numbers of top features...\n")

for (i in 1:num_features) {
  cat(paste("Training model with top", i, "features...\n"))

  # Select top i features
  top_features <- importance$Feature[1:i]

  # Subset training and test data
  X_train_sub <- X_train[, top_features, drop = FALSE]
  X_test_sub <- X_test[, top_features, drop = FALSE]

  # Create LightGBM dataset
  lgb_train_sub <- lgb.Dataset(data = X_train_sub, label = y_train)

  # Train model with subset of features
  bst_sub <- lgb.train(
    params = params,
    data = lgb_train_sub,
    nrounds = 100,
    verbose = -1
  )

  # Make predictions
  pred_prob_sub <- predict(bst_sub, X_test_sub)
  pred_class_sub <- as.numeric(pred_prob_sub > 0.5)

  # Calculate metrics
  accuracy <- mean(pred_class_sub == y_test)

  # For binary classification
  if (length(unique(y_test)) == 2) {
    # F1 score for class 1
    f1 <- F1_Score(y_true = y_test, y_pred = pred_class_sub, positive = 1)

    # Precision and Recall for class 1
    precision <- Precision(y_true = y_test, y_pred = pred_class_sub, positive = 1)
    recall <- Recall(y_true = y_test, y_pred = pred_class_sub, positive = 1)

    # F2-score (beta = 2)
    beta <- 2
    f2 <- (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)

    # Handle cases where precision or recall might be NaN
    if (is.na(f2)) {
      f2 <- 0
    }
  } else {
    # For multi-class classification
    f1 <- NA
    precision <- NA
    recall <- NA
    f2 <- NA
  }

  # Store results
  results_df <- rbind(results_df, data.frame(
    Num_Features = i,
    Features = paste(top_features, collapse = ", "),
    Accuracy = round(accuracy, 4),
    F1_class1 = round(f1, 4),
    F2_class1 = round(f2, 4),
    Precision_class1 = round(precision, 4),
    Recall_class1 = round(recall, 4)
  ))

  # Print progress
  cat(paste("  Accuracy:", round(accuracy, 4),
            "| F1:", round(f1, 4),
            "| F2:", round(f2, 4),
            "| Precision:", round(precision, 4),
            "| Recall:", round(recall, 4), "\n"))
}


cat("Summary of Results:\n")
print(results_df)

# Find best performing models based on different metrics
cat("\nBest Performing Models:\n")

# Best by F1 score
if (!all(is.na(results_df$F1_class1))) {
  best_f1_idx <- which.max(results_df$F1_class1)
  cat(paste("Best F1-score (", results_df$F1_class1[best_f1_idx],
            ") with", results_df$Num_Features[best_f1_idx], "features\n"))
}

# Best by F2 score
if (!all(is.na(results_df$F2_class1))) {
  best_f2_idx <- which.max(results_df$F2_class1)
  cat(paste("Best F2-score (", results_df$F2_class1[best_f2_idx],
            ") with", results_df$Num_Features[best_f2_idx], "features\n"))
}

# Best by Accuracy
best_acc_idx <- which.max(results_df$Accuracy)
cat(paste("Best Accuracy (", results_df$Accuracy[best_acc_idx],
          ") with", results_df$Num_Features[best_acc_idx], "features\n"))

if (require(ggplot2)) {
  library(ggplot2)

  # Plot F1 and F2 scores
  p1 <- ggplot(results_df, aes(x = Num_Features)) +
    geom_line(aes(y = F1_class1, color = "F1 Score"), size = 1) +
    geom_line(aes(y = F2_class1, color = "F2 Score"), size = 1) +
    geom_point(aes(y = F1_class1, color = "F1 Score"), size = 2) +
    geom_point(aes(y = F2_class1, color = "F2 Score"), size = 2) +
    labs(title = "F1 and F2 Scores vs Number of Features",
         x = "Number of Top Features",
         y = "Score Value") +
    theme_minimal() +
    scale_color_manual(values = c("F1 Score" = "blue", "F2 Score" = "red"))

  # Plot Accuracy
  p2 <- ggplot(results_df, aes(x = Num_Features, y = Accuracy)) +
    geom_line(color = "darkgreen", size = 1) +
    geom_point(color = "darkgreen", size = 2) +
    labs(title = "Accuracy vs Number of Features",
         x = "Number of Top Features",
         y = "Accuracy") +
    theme_minimal()

  # Plot Precision and Recall
  p3 <- ggplot(results_df, aes(x = Num_Features)) +
    geom_line(aes(y = Precision_class1, color = "Precision"), size = 1) +
    geom_line(aes(y = Recall_class1, color = "Recall"), size = 1) +
    geom_point(aes(y = Precision_class1, color = "Precision"), size = 2) +
    geom_point(aes(y = Recall_class1, color = "Recall"), size = 2) +
    labs(title = "Precision and Recall (Class 1) vs Number of Features",
         x = "Number of Top Features",
         y = "Score Value") +
    theme_minimal() +
    scale_color_manual(values = c("Precision" = "purple", "Recall" = "orange"))

  # Display plots
  print(p1)
  print(p2)
  print(p3)
}

write.csv(results_df, "feature_selection_results.csv", row.names = FALSE)
cat("\nResults saved to 'feature_selection_results.csv'\n")

lgb.plot.importance(importance, top_n = min(20, num_features))