--- title: "Lab - Logistic Regression" author: "J. Lucas McKay" date: "2023-04-12" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = F, collapse = T, warning = F, message = F) ``` *** #### 1. Logistic regression and individual parameter tests ```{r} library(tidyverse) library(ISLR) # load the default dataset included with ISLR d = tibble(Default) |> mutate(student_status = case_when(student=="Yes"~"Student",T~"Non-Student")) |> mutate(default_status = case_when(default=="Yes"~"Default",T~"Good Standing")) # replicate the charts from the book d |> ggplot(aes(default,balance)) + geom_boxplot() + facet_wrap(~student_status) + theme_minimal() # replicate the charts from the book d |> ggplot(aes(default,balance,color = student_status)) + geom_jitter(width = 0.2) + facet_wrap(~student_status) + theme_minimal() ``` ```{r} d = tibble(Default) m1 = glm(default~student,data = d, family = binomial) # what are we modeling here? levels(d$default) # r treats the second level as a 1 by default summary(m1) summary(m1)$coef m2 = glm(default~balance,data = d, family = "binomial") summary(m2) summary(m2)$coef m3 = glm(default~income,data = d, family = "binomial") summary(m3) summary(m3)$coef m4 = glm(default~.,data = d, family = "binomial") summary(m4) summary(m4)$coef["studentYes",] # what happens to the association between student status and default probability when we remove income? m5 = glm(default~balance+student,data = d, family = "binomial") summary(m5) summary(m5)$coef["studentYes",] d |> ggplot(aes(student,income,color = student)) + geom_boxplot() # what happens when we create an appropriate interaction term? m6 = glm(default~.+student*balance,data = d, family = "binomial") summary(m6) summary(m6)$coef["studentYes",] ``` *** #### 2. Simulating ```{r} # create some data library(tidyverse) set.seed(510) n = 100 beta = c(-6,10) x = runif(n) # calculate the left side of the equation logity = beta[1]+beta[2]*x # transform the logit to probability py = exp(logity)/(1+exp(logity)) # this is also built in as plogis max(abs(py-plogis(logity))) # we need to simulate a coin flip for every observation given the probability p_i y = rbinom(n,1,py) # make a design matrix simdat = tibble(y = y, x = x) # fit a model # note that you can also lose the quotes on binomial m1 = glm(y~x,data=simdat,family=binomial) m1 ``` *** #### 3. Deviance $$ D=-2\log\text{lik}(\hat{\boldsymbol{\beta}})+2\log\text{lik}(\text{saturated model}) $$ ```{r} library(tidyverse) # Here's an example adapted from the internet. Weights of cats. male <- c(1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0) weight <- c(2.1, 2.5, 1.2, 1, 3, 2.1, 1.5, 2.2, 1.9, 2.7, 1.1, 2.9, 1.2, 2.1, 2.2, 2.5, 1.9, 1.2, 2, 2.9, 2.2, 1.5, 3, 2.4, 1.2, 1.6, 2.3, 2.1, 2.6, 2.4, 2.5, 2, 1, 1.4, 2.9, 1.5, 3, 2.9, 2.9, 2.1, 2.8, 2.7, 1, 2.9, 1.1, 2.2, 1.3, 1.7, 1.5, 1.7) d = tibble(male,weight) |> mutate(obs = factor(row_number())) # Plot d |> ggplot(aes(weight,male)) + geom_point() # Null and saturated model s mdl.sat = glm(male ~ obs, data = d, family=binomial) mdl.null = glm(male ~ 1, data = d, family=binomial) # Logistic model mdl.log = glm(male ~ weight, data = d, family = binomial) # We can get the log likelihood with the logLik function logLik(mdl.sat) logLik(mdl.log) logLik(mdl.null) # You can get the deviance from summary summary(mdl.null) summary(mdl.sat) summary(mdl.log) summary(mdl.log)$null.deviance summary(mdl.log)$deviance # Or from the function deviance(mdl.log) # Or manually (residual deviance) 2*(logLik(mdl.sat) - logLik(mdl.log)) # So, we can actually compare deviances here dev = tibble(Model = c(" Saturated"," Proposed"," Null"), Deviance = c( logLik(mdl.sat), logLik(mdl.log), logLik(mdl.null) )) dev |> ggplot(aes(x = 0, y = Deviance, label = Model)) + geom_point() + geom_text(hjust = 0) # We can also compare the predicted probabilities of each training data pred.p = tibble(Model = "Saturated", X = weight, P = predict(mdl.sat, type = "response")) |> bind_rows( tibble(Model = "Proposed", X = weight, P = predict(mdl.log, type = "response")) ) |> bind_rows( tibble(Model = "Null", X = weight, P = predict(mdl.null, type = "response")) ) |> arrange(Model,X) pred.p |> ggplot(aes(X,P,color = Model)) + geom_point() + geom_line() + theme_minimal() ```