# Lab_01.R
library(tidyverse)

# Load the German credit data ----
d <- read_csv("https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv")

# Printing the dataset is a lot like printing a data frame
d
summary(d)
glimpse(d)
names(d)

# The pipe: `x |> f()` is the same as `f(x)`
d |> names()

# Create a new variable two ways ----
# dm.usd() multiplies its input by 0.56 (the name suggests a fixed
# Deutsche Mark -> US dollar rate; confirm the rate before reusing it).
dm.usd <- function(dm) dm * 0.56

# Way 1: base-R column assignment
d$amount.usd <- d$amount |> dm.usd()
# or
# Way 2: dplyr::mutate()
d <- d |> mutate(amt.usd = dm.usd(amount))

# Access ----
d[4] # Returns a tibble
d$purpose # Returns a vector
rec.num <- 17
d[rec.num, ] # Returns row 17 of the tibble
d[rec.num, "purpose"] # Returns a tibble with one column and one row
d[, "purpose"] # Returns a tibble with all rows
d$purpose[rec.num] # Returns entry 17 of the vector

# Let's do a little exploration ----
d$personal_status_sex |> table()

# Clearly there is something odd here - did no single women request a loan?
# (Probably because they were not allowed to do so)

# Split personal_status_sex into two new variables: `sex` keeps only the
# "female"/"male" part, `marital_status` keeps what is left after the
# "female : " / "male : " prefix is stripped.
d <- d |>
  mutate(
    sex = if_else(startsWith(personal_status_sex, "female"), "female", "male")
  ) |>
  mutate(
    marital_status = personal_status_sex |>
      str_remove_all("female : ") |>
      str_remove_all("male : ")
  )

d$sex |> table()
d$marital_status |> table()
table(d$sex, d$marital_status)

# The janitor package is very useful
d |> janitor::tabyl(sex, marital_status)
d |> janitor::tabyl(sex, personal_status_sex)

# Crosstabs!
d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals()

# How to add totals? Note variable by position or name.
?janitor::adorn_totals

# The `where` argument can be passed by name ...
d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals(where = c("row", "col"))
# ... or by position
d |>
  janitor::tabyl(sex, marital_status) |>
  janitor::adorn_totals(c("row", "col"))

tb <- d |> janitor::tabyl(sex, marital_status)
# Note that the table itself is a data frame (with the extra class "tabyl")
names(tb)

# Let's look at credit worthiness ----
d |> janitor::tabyl(sex, credit_risk)

d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_totals(where = c("row", "col"))

d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")

# Row proportions, converted below to whole-number percentages for display.
# NOTE(review): `t` masks the name of base::t() (transpose) for non-call
# lookups; the name is kept here in case later code refers to it.
t <- d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")
names(t)
t[2] <- round(100 * t[2], 0)
t[3] <- round(100 * t[3], 0)
t

# Do a chi-squared test (we'll get to this)
# The test must be run on observed counts. `t` now holds rounded row
# percentages, which would make every row look like a sample of 100 and give
# a wrong statistic and p-value, so the count table is built directly.
chisq.test(table(d$sex, d$credit_risk))

# Do some initial plotting ----
d$savings |> table()

ggplot(data = d, mapping = aes(x = sex, y = age)) +
  geom_jitter()

ggplot(data = d, mapping = aes(x = sex, y = age, color = savings)) +
  geom_jitter()

# Same plot, piping the data in and passing aesthetics by position
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter()

d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2)

d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()

# Creating a factor variable ----
d$savings |> unique() |> cat(sep = "\n")

# Levels in their intended order. Each string has to match the data value
# exactly (single spaces, no line breaks); a value that matches no level
# becomes NA.
savings_levels <- c(
  "unknown/no savings account",
  "... < 100 DM",
  "100 <= ... < 500 DM",
  "500 <= ... < 1000 DM",
  "... >= 1000 DM"
)

# Preview the factor, naming the arguments ...
factor(x = d$savings, levels = savings_levels)
# ... then store it, this time passing `levels` by position
d$savings <- d$savings |> factor(savings_levels)

# Do some initial plotting ----
# With `savings` as an ordered-level factor the legend follows the level order
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()

# Reverse the level order and tidy up labels and theme
d |>
  ggplot(aes(sex, age, color = forcats::fct_rev(savings))) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d() +
  labs(color = "Savings", x = "Sex", y = "Age") +
  ggthemes::theme_gdocs()

# credit_risk mapped to color as-is ...
d |>
  ggplot(aes(age, amount, color = credit_risk)) +
  geom_point()

# ... and wrapped in factor() so it is treated as discrete
d |>
  ggplot(aes(age, amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(~ factor(credit_risk))

d |>
  ggplot(aes(age, amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(sex ~ factor(credit_risk))

# Convert labels to sentence case
# Coerces `text` to character first so it also works on non-character input
# (e.g. the facet values handed over by labeller()).
sentence_case <- function(text) {
  str_to_sentence(as.character(text))
}

ggplot(d, aes(age, amount, color = factor(credit_risk))) +
  geom_point() +
  facet_wrap(
    sex ~ factor(credit_risk),
    labeller = labeller(.default = sentence_case)
  ) +
  labs(
    x = sentence_case("age"),
    y = sentence_case("amount"),
    color = sentence_case("credit risk")
  ) +
  theme(
    strip.text = element_text(face = "plain"),
    legend.title = element_text(face = "plain")
  )