#install.packages("lightgbm", repos = "https://cran.r-project.org") #install.packages("MLmetrics") library(lightgbm) library(MLmetrics) df <- read.csv("./data/Ketamine_icp.csv") target_name <- "label" target_index <- which(names(df) == target_name) if (is.factor(df[, target_index])) { y <- as.numeric(df[, target_index]) - 1 } else { y <- df[, target_index] } # Create the data matrix for features X <- as.matrix(df[, -target_index]) set.seed(42) train_index <- sample(nrow(X), size = 0.8 * nrow(X)) X_train <- X[train_index, ] X_test <- X[-train_index, ] y_train <- y[train_index] y_test <- y[-train_index] lgb_train_full <- lgb.Dataset(data = X_train, label = y_train) params <- list( objective = "binary", metric = "binary_logloss", boosting_type = "gbdt", num_leaves = 20, learning_rate = 0.05, feature_fraction = 0.8 ) bst_full <- lgb.train( params = params, data = lgb_train_full, nrounds = 100, verbose = -1 ) # Get feature importance importance <- lgb.importance(bst_full) num_features <- nrow(importance) # Create a data frame to store results results_df <- data.frame( Num_Features = integer(), Features = character(), Accuracy = numeric(), F1_class1 = numeric(), F2_class1 = numeric(), Precision_class1 = numeric(), Recall_class1 = numeric() ) cat("Training models with different numbers of top features...\n") for (i in 1:num_features) { cat(paste("Training model with top", i, "features...\n")) # Select top i features top_features <- importance$Feature[1:i] # Subset training and test data X_train_sub <- X_train[, top_features, drop = FALSE] X_test_sub <- X_test[, top_features, drop = FALSE] # Create LightGBM dataset lgb_train_sub <- lgb.Dataset(data = X_train_sub, label = y_train) # Train model with subset of features bst_sub <- lgb.train( params = params, data = lgb_train_sub, nrounds = 100, verbose = -1 ) # Make predictions pred_prob_sub <- predict(bst_sub, X_test_sub) pred_class_sub <- as.numeric(pred_prob_sub > 0.5) # Calculate metrics accuracy <- mean(pred_class_sub == y_test) # For binary classification if (length(unique(y_test)) == 2) { # F1 score for class 1 f1 <- F1_Score(y_true = y_test, y_pred = pred_class_sub, positive = 1) # Precision and Recall for class 1 precision <- Precision(y_true = y_test, y_pred = pred_class_sub, positive = 1) recall <- Recall(y_true = y_test, y_pred = pred_class_sub, positive = 1) # F2-score (beta = 2) beta <- 2 f2 <- (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall) # Handle cases where precision or recall might be NaN if (is.na(f2)) { f2 <- 0 } } else { # For multi-class classification f1 <- NA precision <- NA recall <- NA f2 <- NA } # Store results results_df <- rbind(results_df, data.frame( Num_Features = i, Features = paste(top_features, collapse = ", "), Accuracy = round(accuracy, 4), F1_class1 = round(f1, 4), F2_class1 = round(f2, 4), Precision_class1 = round(precision, 4), Recall_class1 = round(recall, 4) )) # Print progress cat(paste(" Accuracy:", round(accuracy, 4), "| F1:", round(f1, 4), "| F2:", round(f2, 4), "| Precision:", round(precision, 4), "| Recall:", round(recall, 4), "\n")) } cat("Summary of Results:\n") print(results_df) # Find best performing models based on different metrics cat("\nBest Performing Models:\n") # Best by F1 score if (!all(is.na(results_df$F1_class1))) { best_f1_idx <- which.max(results_df$F1_class1) cat(paste("Best F1-score (", results_df$F1_class1[best_f1_idx], ") with", results_df$Num_Features[best_f1_idx], "features\n")) } # Best by F2 score if (!all(is.na(results_df$F2_class1))) { best_f2_idx <- 
# Find best performing models based on different metrics
cat("\nBest Performing Models:\n")

# Best by F1 score
if (!all(is.na(results_df$F1_class1))) {
  best_f1_idx <- which.max(results_df$F1_class1)
  cat(paste("Best F1-score (", results_df$F1_class1[best_f1_idx], ") with",
            results_df$Num_Features[best_f1_idx], "features\n"))
}

# Best by F2 score
if (!all(is.na(results_df$F2_class1))) {
  best_f2_idx <- which.max(results_df$F2_class1)
  cat(paste("Best F2-score (", results_df$F2_class1[best_f2_idx], ") with",
            results_df$Num_Features[best_f2_idx], "features\n"))
}

# Best by Accuracy
best_acc_idx <- which.max(results_df$Accuracy)
cat(paste("Best Accuracy (", results_df$Accuracy[best_acc_idx], ") with",
          results_df$Num_Features[best_acc_idx], "features\n"))

if (requireNamespace("ggplot2", quietly = TRUE)) {
  library(ggplot2)

  # Plot F1 and F2 scores
  p1 <- ggplot(results_df, aes(x = Num_Features)) +
    geom_line(aes(y = F1_class1, color = "F1 Score"), linewidth = 1) +
    geom_line(aes(y = F2_class1, color = "F2 Score"), linewidth = 1) +
    geom_point(aes(y = F1_class1, color = "F1 Score"), size = 2) +
    geom_point(aes(y = F2_class1, color = "F2 Score"), size = 2) +
    labs(title = "F1 and F2 Scores vs Number of Features",
         x = "Number of Top Features",
         y = "Score Value") +
    theme_minimal() +
    scale_color_manual(values = c("F1 Score" = "blue", "F2 Score" = "red"))

  # Plot Accuracy
  p2 <- ggplot(results_df, aes(x = Num_Features, y = Accuracy)) +
    geom_line(color = "darkgreen", linewidth = 1) +
    geom_point(color = "darkgreen", size = 2) +
    labs(title = "Accuracy vs Number of Features",
         x = "Number of Top Features",
         y = "Accuracy") +
    theme_minimal()

  # Plot Precision and Recall
  p3 <- ggplot(results_df, aes(x = Num_Features)) +
    geom_line(aes(y = Precision_class1, color = "Precision"), linewidth = 1) +
    geom_line(aes(y = Recall_class1, color = "Recall"), linewidth = 1) +
    geom_point(aes(y = Precision_class1, color = "Precision"), size = 2) +
    geom_point(aes(y = Recall_class1, color = "Recall"), size = 2) +
    labs(title = "Precision and Recall (Class 1) vs Number of Features",
         x = "Number of Top Features",
         y = "Score Value") +
    theme_minimal() +
    scale_color_manual(values = c("Precision" = "purple", "Recall" = "orange"))

  # Display plots
  print(p1)
  print(p2)
  print(p3)
}

write.csv(results_df, "feature_selection_results.csv", row.names = FALSE)
cat("\nResults saved to 'feature_selection_results.csv'\n")

# Plot the top features by importance
lgb.plot.importance(importance, top_n = min(20, num_features))
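# Optional robustness check (a sketch, not part of the original pipeline):
# every metric above comes from a single 80/20 split, so it can be noisy on
# small datasets. lightgbm's lgb.cv gives a cross-validated estimate of
# binary_logloss for the full feature set. nfold = 5 is an arbitrary choice,
# and the record_evals indexing below assumes the structure returned by
# recent lightgbm versions.
cv_full <- lgb.cv(
  params = params,
  data = lgb_train_full,
  nrounds = 100,
  nfold = 5,
  verbose = -1
)
cv_logloss <- unlist(cv_full$record_evals$valid$binary_logloss$eval)
cat(paste("Mean CV binary_logloss at the final round:",
          round(tail(cv_logloss, 1), 4), "\n"))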