library(tidyverse) # Set seed for reproducibility set.seed(510) # Simulate predictor variables; we will cover this design in the second half of the class n <- 100 # Number of observations X1 <- rnorm(n) # Continuous predictor X2 <- rbinom(n, 1, 0.5) # Binary predictor X3 <- runif(n, min = 0, max = 10) # Continuous predictor # Generate the probability of success using a logistic model beta_0 <- -1 # Intercept beta_1 <- 1.5 # Coefficient for X1 beta_2 <- -0.8 # Coefficient for X2 beta_3 <- 0.4 # Coefficient for X3 logit_p <- beta_0 + beta_1 * X1 + beta_2 * X2 + beta_3 * X3 p <- exp(logit_p) / (1 + exp(logit_p)) # Convert to probability # Simulate binary outcome Y Y <- rbinom(n, 1, p) # Create tibble simulated_data <- tibble(Y = Y, X1 = X1, X2 = X2, X3 = X3) # Fit logistic regression model logit_model <- glm(Y ~ X1 + X2 + X3, data = simulated_data, family = binomial) # Predict probabilities predicted_probs <- predict(logit_model, type = "response") # Append predicted probabilities d = simulated_data |> bind_cols(predicted_probs) names(d)[5] = "Predicted Probability" # Look at the data d |> view() # Create two small functions that take logical vectors of prediction and truth sensitivity = function(prediction, truth) { true_positive = sum(prediction == 1 & truth == 1) false_negative = sum(prediction == 0 & truth == 1) sensitivity = true_positive / (true_positive + false_negative) return(sensitivity) } specificity = function(prediction, truth) { true_negative = sum(prediction == 0 & truth == 0) false_positive = sum(prediction == 1 & truth == 0) specificity = true_negative / (true_negative + false_positive) return(specificity) } # Now create an ROC curve. Start with threshold 0, and go to threshold 1. Use 1000 steps, say n_thresh = 50 # thresholds = seq(min(d$`Predicted Probability`),max(d$`Predicted Probability`),length.out = n_thresh) # or try: thresholds = d$`Predicted Probability` |> unique() # For each value of "thresh", calculate sensitivity and specificity # Start with an empty tibble results = tibble(threshold = numeric(), sensitivity = numeric(), specificity = numeric()) for (threshold in thresholds){ threshold prediction = d["Predicted Probability"]>threshold sens = sensitivity(prediction,d$Y) spec = specificity(prediction,d$Y) results = bind_rows(results, tibble(threshold = threshold, sensitivity = sens, specificity = spec)) } # Look a the ROC results results |> view() # Plot the ROC curve results |> arrange(desc(sensitivity)) |> ggplot(aes(1-specificity,sensitivity,color = threshold)) + geom_point() + theme_minimal() # Debugging # Only works in code! i = sample.int(12,1) add_one_to_square = function(x){ sq = x^2 out = sq+1 return(out) } add_one_to_square(sample.int(12,1))