---
title: "Multiple Comparisons"
author: "J. Lucas McKay"
date: "2023-03-02"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F, collapse = T, warning = F, message = F)
```

***
#### 1. Using `p.adjust`

```{r}
library(tidyverse)
# Let's say we have performed a series of statistical tests and obtained the following p-values:
p.vals = seq(0.0025,0.025,0.0025)
p.vals

# We could give them names, to identify the results of each test.
names(p.vals) = paste("Test",LETTERS[1:10])
p.vals

# Which ones are statistically-significant?
p.vals[p.vals<0.05]

length(p.vals[p.vals<0.05])
```

```{r}
# To control for Type I error, we have a few options, all of which are built in to p.adjust().

# We can calculate the adjusted p-values
p.adjust(p.vals, method = "holm")

# And compare to the original statistical significance level
p.vals[p.adjust(p.vals, method = "holm")<0.05]

# Note that as we get less conservative, the number of significant tests increases
p.vals[p.adjust(p.vals, method = "bonferroni")<0.05]

# same thing
p.vals[p.vals<0.05/10]

# Note that as we get less conservative, the number of significant tests increases
p.vals[p.adjust(p.vals, method = "bonferroni")<0.05]
# 1 significant result
p.vals[p.adjust(p.vals, method = "holm")<0.05]
# 2 significant result
p.vals[p.adjust(p.vals, method = "fdr")<0.05]
```

```{r}
# What is the expected number of false positives when we use FDR?
```


***
#### 2. Reporting P values

```{r}
# load biomarker data; compare group A and C
library(tidyverse)
csf = read_csv("https://jlucasmckay.bmi.emory.edu/global/bmi510/Labs-Materials/csf.csv") |> filter(Group %in% c("A","C"))

# Extract group
G = csf$Group

# Extract numerical values
vals = csf |> select(-Group) |> as.matrix()

# Calculate p-values. We can use "apply" for this:
apply(vals,2,mean)

nanmean = function(x) mean(x,na.rm = T) |> round(2)
vals |> apply(2,nanmean)

p.values = vals |> apply(2,function(x) t.test(x[G=="A"],x[G=="C"])$p.value)

sort(p.values)

# We can directly adjust with FDR:
sort(p.adjust(p.values, "fdr"))

# But with Holm, remember that no adjustment is made for the higher p-values; sometimes this leads to curious repeated numbers, etc.
sort(p.adjust(p.values, "holm"))
```
```{r}
# Often what we'd like to do is something like the following.
report.values = tibble(Biomarker = names(p.values), P = as.numeric(p.values))


report.values$padj = p.adjust(report.values$P,"fdr")
report.values = report.values |> mutate(
  `FDR` = case_when(
    padj<0.001 ~ "***",
    padj<0.01 ~ "**",
    padj<0.05 ~ "*",
    TRUE ~ ""
  ))

report.values |>
  select(-padj) |>
  mutate(P = round(P,3)) |>
  arrange(P) |> 
  knitr::kable()
```