---
title: "Lab - Logistic Regression"
author: "J. Lucas McKay"
date: "2023-04-12"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F, collapse = T, warning = F, message = F)
```

***

#### 1. Logistic regression and individual parameter tests

```{r}
library(tidyverse)
library(ISLR)

# load the Default data set included with ISLR
d = tibble(Default) |>
  mutate(student_status = case_when(student == "Yes" ~ "Student", T ~ "Non-Student")) |>
  mutate(default_status = case_when(default == "Yes" ~ "Default", T ~ "Good Standing"))

# replicate the boxplots from the book
d |> ggplot(aes(default, balance)) +
  geom_boxplot() +
  facet_wrap(~student_status) +
  theme_minimal()

# the same comparison shown as jittered points
d |> ggplot(aes(default, balance, color = student_status)) +
  geom_jitter(width = 0.2) +
  facet_wrap(~student_status) +
  theme_minimal()
```

```{r}
d = tibble(Default)

m1 = glm(default ~ student, data = d, family = binomial)
# what are we modeling here?
levels(d$default)
# R treats the second factor level ("Yes") as 1 by default
summary(m1)
summary(m1)$coef

m2 = glm(default ~ balance, data = d, family = binomial)
summary(m2)
summary(m2)$coef

m3 = glm(default ~ income, data = d, family = binomial)
summary(m3)
summary(m3)$coef

m4 = glm(default ~ ., data = d, family = binomial)
summary(m4)
summary(m4)$coef["studentYes",]

# what happens to the association between student status and default probability
# when we remove income?
m5 = glm(default ~ balance + student, data = d, family = binomial)
summary(m5)
summary(m5)$coef["studentYes",]

d |> ggplot(aes(student, income, color = student)) + geom_boxplot()

# add a student-by-balance interaction
m6 = glm(default ~ . + student*balance, data = d, family = binomial)
summary(m6)$coef
```

***

#### 2. Model predictions

```{r}
# simulate some data
library(tidyverse)
set.seed(510)
n = 100
beta = c(-6, 10)
x = runif(n)

# calculate the linear predictor (the logit)
logity = beta[1] + beta[2]*x

# transform the logit to a probability
py = exp(logity)/(1 + exp(logity))

# this inverse-logit transform is also built in as plogis
max(abs(py - plogis(logity)))

# simulate a coin flip for each observation with success probability p_i
y = rbinom(n, 1, py)

# collect the response and predictor in a data frame
simdat = tibble(y = y, x = x)

# fit a model
m1 = glm(y ~ x, data = simdat, family = binomial)
m1
```
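Because the data are simulated, we can check the fit against the truth. The sketch below is not part of the original lab; it assumes `beta`, `simdat`, and `m1` from the chunk above are still in scope, and overlays the estimated probability curve on the true one.

```{r}
# a sketch: compare the estimated coefficients and fitted curve to the truth
# (assumes beta, simdat, and m1 from the previous chunk are in scope)
rbind(estimate = coef(m1), truth = beta)

grid = tibble(x = seq(0, 1, length.out = 200))
grid$p_true = plogis(beta[1] + beta[2]*grid$x)
grid$p_hat = predict(m1, newdata = grid, type = "response")

simdat |>
  ggplot(aes(x, y)) +
  geom_jitter(width = 0, height = 0.02) +
  geom_line(data = grid, aes(x, p_true), linetype = "dashed") +
  geom_line(data = grid, aes(x, p_hat)) +
  theme_minimal()
```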
***

#### 3. Deviance

We can write the deviance either in terms of log likelihoods:

$$
D = -2\log\text{lik}(\hat{\boldsymbol{\beta}}) + 2\log\text{lik}(\text{saturated model})
$$

or, for a binary response, in the equivalent numerical form:

$$
D = -2 \sum_{i=1}^{n} \left[ y_i \log(\hat{p}_i) + (1 - y_i) \log(1 - \hat{p}_i) \right]
$$

(The two agree because the saturated model fits each binary observation perfectly, so its log likelihood is zero and the second term drops out.)

```{r}
# simulate some data (same setup as in the previous section)
library(tidyverse)
set.seed(510)
n = 100
beta = c(-6, 10)
x = runif(n)

# calculate the linear predictor (the logit)
logity = beta[1] + beta[2]*x

# transform the logit to a probability
py = exp(logity)/(1 + exp(logity))

# this inverse-logit transform is also built in as plogis
max(abs(py - plogis(logity)))

# simulate a coin flip for each observation with success probability p_i
y = rbinom(n, 1, py)

# collect the response and predictor in a data frame
simdat = tibble(y = y, x = x)

# fit a model; extract the residual deviance and the null deviance
m1 = glm(y ~ x, data = simdat, family = binomial)
m1$deviance
m1$null.deviance
# not a perfect fit, but far better than the null model

# by default, predict() returns the linear predictor, i.e., the log odds
predict(m1)

# ask for predictions on the probability scale instead
predict(m1, type = "response")

# or extract the fitted probabilities directly with fitted()
fitted(m1)

# calculate the deviance manually
y = simdat$y
p_hat = fitted(m1)
manual_deviance = -2 * sum(y * log(p_hat) + (1 - y) * log(1 - p_hat))
manual_deviance
m1$deviance

# the null model predicts the overall probability of success for everyone
p_null = mean(y)

# manual calculation of the null deviance
manual_null_dev = -2 * sum(y * log(p_null) + (1 - y) * log(1 - p_null))
manual_null_dev
m1$null.deviance
```
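The drop from the null deviance to the residual deviance is itself a test statistic: under $H_0: \beta_1 = 0$ it is approximately $\chi^2$ with one degree of freedom. Below is a short sketch, not part of the original lab, assuming `m1` from the chunk above is still in scope; `anova()` reports the same comparison.

```{r}
# a sketch: likelihood ratio test from the deviance drop
# (assumes m1 from the previous chunk is in scope)
dev_drop = m1$null.deviance - m1$deviance

# p-value from the chi-squared distribution with 1 df
pchisq(dev_drop, df = 1, lower.tail = FALSE)

# anova() performs the same sequential deviance comparison
anova(m1, test = "Chisq")
```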