---
title: "Lab - Logistic Regression"
author: "J. Lucas McKay"
date: "2023-04-12"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F, collapse = T, warning = F, message = F)
```


***
#### 1. Logistic regression and individual parameter tests

```{r}
library(tidyverse)
library(ISLR)

# load the default dataset included with ISLR
d = tibble(Default) |> 
  mutate(student_status = case_when(student=="Yes"~"Student",T~"Non-Student")) |> 
  mutate(default_status = case_when(default=="Yes"~"Default",T~"Good Standing"))

# replicate the charts from the book
d |> ggplot(aes(default,balance)) +
  geom_boxplot() +
  facet_wrap(~student_status) +
  theme_minimal()

# replicate the charts from the book
d |> ggplot(aes(default,balance,color = student_status)) +
  geom_jitter(width = 0.2) +
  facet_wrap(~student_status) +
  theme_minimal()
```


```{r}
d = tibble(Default)

m1 = glm(default~student,data = d, family = binomial)
# what are we modeling here?
levels(d$default)
# r treats the second level as a 1 by default

summary(m1)
summary(m1)$coef

m2 = glm(default~balance,data = d, family = "binomial")
summary(m2)
summary(m2)$coef

m3 = glm(default~income,data = d, family = "binomial")
summary(m3)
summary(m3)$coef

m4 = glm(default~.,data = d, family = "binomial")
summary(m4)
summary(m4)$coef["studentYes",]

# what happens to the association between student status and default probability when we remove income?
m5 = glm(default~balance+student,data = d, family = "binomial")
summary(m5)
summary(m5)$coef["studentYes",]

d |> ggplot(aes(student,income,color = student)) + geom_boxplot()

# what happens when we create an appropriate interaction term?
m6 = glm(default~.+student*balance,data = d, family = "binomial")
summary(m6)
summary(m6)$coef["studentYes",]

```

***
#### 2. Simulating 

```{r}

# create some data
library(tidyverse)
set.seed(510)
n = 100
beta = c(-6,10)
x = runif(n)
# calculate the left side of the equation
logity = beta[1]+beta[2]*x
# transform the logit to probability
py = exp(logity)/(1+exp(logity))
# this is also built in as plogis
max(abs(py-plogis(logity)))

# we need to simulate a coin flip for every observation given the probability p_i
y = rbinom(n,1,py)

# make a design matrix
simdat = tibble(y = y, x = x)

# fit a model
# note that you can also lose the quotes on binomial
m1 = glm(y~x,data=simdat,family=binomial)
m1


```


*** 
#### 3. Deviance

$$
D=-2\log\text{lik}(\hat{\boldsymbol{\beta}})+2\log\text{lik}(\text{saturated model})
$$


```{r}
library(tidyverse)

# Here's an example adapted from the internet. Weights of cats.
male <- c(1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0)
weight <- c(2.1, 2.5, 1.2, 1, 3, 2.1, 1.5, 2.2, 1.9, 2.7, 1.1, 2.9, 1.2, 2.1, 
          2.2, 2.5, 1.9, 1.2, 2, 2.9, 2.2, 1.5, 3, 2.4, 1.2, 1.6, 2.3, 2.1, 
          2.6, 2.4, 2.5, 2, 1, 1.4, 2.9, 1.5, 3, 2.9, 2.9, 2.1, 2.8, 2.7, 1, 
          2.9, 1.1, 2.2, 1.3, 1.7, 1.5, 1.7)

d = tibble(male,weight) |> mutate(obs = factor(row_number()))

# Plot
d |> ggplot(aes(weight,male)) + geom_point()

# Null and saturated model s
mdl.sat = glm(male ~ obs, data = d, family=binomial)
mdl.null = glm(male ~ 1, data = d, family=binomial)

# Logistic model
mdl.log = glm(male ~ weight, data = d, family = binomial)

# We can get the log likelihood with the logLik function
logLik(mdl.sat)
logLik(mdl.log)
logLik(mdl.null)

# You can get the deviance from summary
summary(mdl.null)
summary(mdl.sat)
summary(mdl.log)
summary(mdl.log)$null.deviance
summary(mdl.log)$deviance

# Or from the function
deviance(mdl.log)

# Or manually (residual deviance)
2*(logLik(mdl.sat) - logLik(mdl.log))

# So, we can actually compare deviances here
dev = tibble(Model = c("  Saturated","  Proposed","  Null"), Deviance = c(
logLik(mdl.sat),
logLik(mdl.log),
logLik(mdl.null)
))

dev |> ggplot(aes(x = 0, y = Deviance, label = Model)) + geom_point() + geom_text(hjust = 0)

# We can also compare the predicted probabilities of each training data
pred.p = tibble(Model = "Saturated", X = weight, P = predict(mdl.sat, type = "response")) |> 
  bind_rows(
    tibble(Model = "Proposed", X = weight, P = predict(mdl.log, type = "response"))
  ) |>   bind_rows(
    tibble(Model = "Null", X = weight, P = predict(mdl.null, type = "response"))
  ) |> arrange(Model,X)

pred.p |> ggplot(aes(X,P,color = Model)) + geom_point() + geom_line() + theme_minimal()


```