---
title: "Lab - Logistic Regression"
author: "J. Lucas McKay"
date: "2023-04-12"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F, collapse = T, warning = F, message = F)
```

***

#### 1. Logistic regression and individual parameter tests

```{r}
library(tidyverse)
library(ISLR)

# load the Default data set included with ISLR
d = tibble(Default) |>
  mutate(student_status = case_when(student == "Yes" ~ "Student", T ~ "Non-Student")) |>
  mutate(default_status = case_when(default == "Yes" ~ "Default", T ~ "Good Standing"))

# replicate the boxplots from the book
d |> ggplot(aes(default, balance)) +
  geom_boxplot() +
  facet_wrap(~student_status) +
  theme_minimal()

# the same comparison shown as jittered points
d |> ggplot(aes(default, balance, color = student_status)) +
  geom_jitter(width = 0.2) +
  facet_wrap(~student_status) +
  theme_minimal()
```

```{r}
d = tibble(Default)

m1 = glm(default ~ student, data = d, family = binomial)
# what are we modeling here?
levels(d$default)
# R treats the second factor level ("Yes") as 1 by default
summary(m1)
summary(m1)$coef

m2 = glm(default ~ balance, data = d, family = binomial)
summary(m2)
summary(m2)$coef

m3 = glm(default ~ income, data = d, family = binomial)
summary(m3)
summary(m3)$coef

m4 = glm(default ~ ., data = d, family = binomial)
summary(m4)
summary(m4)$coef["studentYes",]

# what happens to the association between student status and default probability
# when we remove income?
m5 = glm(default ~ balance + student, data = d, family = binomial)
summary(m5)
summary(m5)$coef["studentYes",]

d |> ggplot(aes(student, income, color = student)) + geom_boxplot()

# add a student-by-balance interaction
m6 = glm(default ~ . + student*balance, data = d, family = binomial)
summary(m6)$coef
```

***

#### 2. Model predictions

```{r}
# simulate some data
library(tidyverse)
set.seed(510)
n = 100
beta = c(-6, 10)
x = runif(n)

# calculate the linear predictor (the logit)
logity = beta[1] + beta[2]*x

# transform the logit to a probability
py = exp(logity)/(1 + exp(logity))

# this inverse-logit transform is also built in as plogis
max(abs(py - plogis(logity)))

# simulate a coin flip for each observation with success probability p_i
y = rbinom(n, 1, py)

# collect the response and predictor in a data frame
simdat = tibble(y = y, x = x)

# fit a model
m1 = glm(y ~ x, data = simdat, family = binomial)
m1
```
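Because the data are simulated, we can check the fit against the truth. The sketch below is not part of the original lab; it assumes `beta`, `simdat`, and `m1` from the chunk above are still in scope, and overlays the estimated probability curve on the true one.

```{r}
# a sketch: compare the estimated coefficients and fitted curve to the truth
# (assumes beta, simdat, and m1 from the previous chunk are in scope)
rbind(estimate = coef(m1), truth = beta)

grid = tibble(x = seq(0, 1, length.out = 200))
grid$p_true = plogis(beta[1] + beta[2]*grid$x)
grid$p_hat = predict(m1, newdata = grid, type = "response")

simdat |>
  ggplot(aes(x, y)) +
  geom_jitter(width = 0, height = 0.02) +
  geom_line(data = grid, aes(x, p_true), linetype = "dashed") +
  geom_line(data = grid, aes(x, p_hat)) +
  theme_minimal()
```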
***

#### 3. Deviance

We can write the deviance either in terms of log likelihoods:

$$
D = -2\log\text{lik}(\hat{\boldsymbol{\beta}}) + 2\log\text{lik}(\text{saturated model})
$$

or, for a binary response, in the equivalent numerical form:

$$
D = -2 \sum_{i=1}^{n} \left[ y_i \log(\hat{p}_i) + (1 - y_i) \log(1 - \hat{p}_i) \right]
$$

(The two agree because the saturated model fits each binary observation perfectly, so its log likelihood is zero and the second term drops out.)

```{r}
# simulate some data (same setup as in the previous section)
library(tidyverse)
set.seed(510)
n = 100
beta = c(-6, 10)
x = runif(n)

# calculate the linear predictor (the logit)
logity = beta[1] + beta[2]*x

# transform the logit to a probability
py = exp(logity)/(1 + exp(logity))

# this inverse-logit transform is also built in as plogis
max(abs(py - plogis(logity)))

# simulate a coin flip for each observation with success probability p_i
y = rbinom(n, 1, py)

# collect the response and predictor in a data frame
simdat = tibble(y = y, x = x)

# fit a model; extract the residual deviance and the null deviance
m1 = glm(y ~ x, data = simdat, family = binomial)
m1$deviance
m1$null.deviance
# not a perfect fit, but far better than the null model

# by default, predict() returns the linear predictor, i.e., the log odds
predict(m1)

# ask for predictions on the probability scale instead
predict(m1, type = "response")

# or extract the fitted probabilities directly with fitted()
fitted(m1)

# calculate the deviance manually
y = simdat$y
p_hat = fitted(m1)
manual_deviance = -2 * sum(y * log(p_hat) + (1 - y) * log(1 - p_hat))
manual_deviance
m1$deviance

# the null model predicts the overall probability of success for everyone
p_null = mean(y)

# manual calculation of the null deviance
manual_null_dev = -2 * sum(y * log(p_null) + (1 - y) * log(1 - p_null))
manual_null_dev
m1$null.deviance
```
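The drop from the null deviance to the residual deviance is itself a test statistic: under $H_0: \beta_1 = 0$ it is approximately $\chi^2$ with one degree of freedom. Below is a short sketch, not part of the original lab, assuming `m1` from the chunk above is still in scope; `anova()` reports the same comparison.

```{r}
# a sketch: likelihood ratio test from the deviance drop
# (assumes m1 from the previous chunk is in scope)
dev_drop = m1$null.deviance - m1$deviance

# p-value from the chi-squared distribution with 1 df
pchisq(dev_drop, df = 1, lower.tail = FALSE)

# anova() performs the same sequential deviance comparison
anova(m1, test = "Chisq")
```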