# Lab_01.R
library(tidyverse)

# Load the German credit data
d <- read_csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
)

# Typing the object's name prints it, much as it would for a data frame
d

# Three quick overviews: per-column summaries, a column-wise glimpse,
# and the column names
summary(d)
glimpse(d)
names(d)

# The pipe: the left-hand side becomes the first argument of the call
d |> names()

# Create a new variable two ways
# Convert Deutsche Mark amounts to US dollars (going by the function name).
#   dm:   numeric vector of amounts in DM
#   rate: dollars per mark; defaults to 0.56, the rate this lab has always used
# Returns dm * rate, vectorized over dm.
dm.usd <- function(dm, rate = 0.56) {
  dm * rate
}
d$amount.usd <- dm.usd(d$amount)

# or, with dplyr
d <- mutate(d, amt.usd = dm.usd(amount))

# Access:
d[4]        # a single index selects a column; the result is still a tibble
d$purpose   # $ extracts the column as a vector

rec.num <- 17
d[rec.num, ]            # row 17 of the tibble
d[rec.num, "purpose"]   # a tibble with one column and one row
d[, "purpose"]          # a tibble with all rows
d$purpose[rec.num]      # entry 17 of the vector

# Let's do a little exploration
table(d$personal_status_sex)

# Clearly there is something odd here: did no single women request a loan?
# (Probably because they were not allowed to do so.)

# Split the combined column into two new variables: sex and marital status
d <- d |>
  mutate(
    sex = if_else(
      startsWith(personal_status_sex, "female"), "female", "male"
    ),
    # Strip "female : " first; stripping "male : " first would leave "fe" behind
    marital_status = personal_status_sex |>
      str_remove_all("female : ") |>
      str_remove_all("male : ")
  )

table(d$sex)
table(d$marital_status)

table(d$sex, d$marital_status)

# The janitor package is very useful
d |> janitor::tabyl(sex, marital_status)
d |> janitor::tabyl(sex, personal_status_sex)

# Crosstabs!
d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals()

# How to add totals? Check the help page. Note that an argument can be
# given by position or by name.
?janitor::adorn_totals

d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals(where = c("row", "col"))
d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals(c("row", "col"))
tb <- d |> janitor::tabyl(sex, marital_status)

# Note that the table itself is a data frame (of class "tabyl"),
# so it has column names
names(tb)


# Let's look at credit worthiness: counts, counts with totals, row proportions
d |> janitor::tabyl(sex, credit_risk)
d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_totals(where = c("row", "col"))
d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")

# Keep the row proportions and turn them into whole-number percentages
# for display (column 1 holds the sex labels, columns 2 and 3 the proportions)
t <- d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")
names(t)
t[2] <- round(100 * t[2], 0)
t[3] <- round(100 * t[3], 0)
t

# Do a chi-squared test (we'll get to this).
# The test has to be run on the observed counts, not on the rounded row
# percentages in t: percentages throw away the sample size (every row sums
# to about 100), which would give the wrong statistic and p-value.
chisq.test(table(d$sex, d$credit_risk))


# Do some initial plotting
table(d$savings)

# Fully spelled-out form: named data and mapping arguments
ggplot(data = d, mapping = aes(x = sex, y = age)) +
  geom_jitter()
ggplot(data = d, mapping = aes(x = sex, y = age, color = savings)) +
  geom_jitter()

# The same plot with the data piped in and arguments matched by position,
# then with a narrower jitter, then with a viridis colour scale
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter()
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2)
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()

# Creating a factor variable
# First list the distinct savings labels, one per line
d$savings |>
  unique() |>
  cat(sep = "\n")

# Preview the factor with its levels in a chosen order (result is not stored)
factor(
  x = d$savings,
  levels = c(
    "unknown/no savings account",
    "... < 100 DM",
    "100 <= ... < 500 DM",
    "500 <= ... < 1000 DM",
    "... >= 1000 DM"
  )
)

# Now overwrite the column with that factor
d$savings <- d$savings |>
  factor(
    levels = c(
      "unknown/no savings account",
      "... < 100 DM",
      "100 <= ... < 500 DM",
      "500 <= ... < 1000 DM",
      "... >= 1000 DM"
    )
  )

# Do some initial plotting
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()

# Reverse the savings levels for the colour mapping, add readable labels,
# and apply a ggthemes theme
d |>
  ggplot(aes(sex, age, color = forcats::fct_rev(savings))) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d() +
  labs(color = "Savings", x = "Sex", y = "Age") +
  ggthemes::theme_gdocs()

# Amount against age: first coloured by credit_risk as-is, then by
# factor(credit_risk) with one panel per risk level, then per sex and risk level
d |>
  ggplot(aes(age, amount, color = credit_risk)) +
  geom_point()
d |>
  ggplot(aes(age, amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(~ factor(credit_risk))
d |>
  ggplot(aes(age, amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(sex ~ factor(credit_risk))


# Convert labels to sentence case.
# The input is coerced to character first, so non-character labels
# are accepted as well.
sentence_case <- function(text) {
  text |>
    as.character() |>
    str_to_sentence()
}


# Amount against age, one panel per sex / credit-risk combination, with the
# facet strips, axis titles, and legend title run through sentence_case()
d |>
  ggplot(aes(x = age, y = amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(
    sex ~ factor(credit_risk),
    labeller = labeller(.default = sentence_case)
  ) +
  labs(
    x = sentence_case("age"),
    y = sentence_case("amount"),
    color = sentence_case("credit risk")
  ) +
  # Plain (non-bold) text for the strip labels and the legend title
  theme(
    strip.text = element_text(face = "plain"),
    legend.title = element_text(face = "plain")
  )