# Lab_01.R
#
# Exploratory walk-through of the German credit data: loading, subsetting,
# deriving variables, cross-tabulating, and plotting.

library(tidyverse)

# Load the German credit data ----
d <- read_csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
)

# Printing a tibble works much like printing a data frame
d
summary(d)
glimpse(d)
d |> names()

# Create a new variable two ways ----

# Convert an amount in Deutsche Mark to US dollars.
#
# dm:   numeric vector of amounts in DM.
# rate: USD per DM; defaults to the 0.56 used throughout this lab
#       (presumably an approximate historical rate -- confirm before reuse).
# Returns a numeric vector the same length as `dm`.
dm.usd <- function(dm, rate = 0.56) dm * rate

d$amount.usd <- d$amount |> dm.usd()
# or
d <- d |> mutate(amt.usd = dm.usd(amount))

# Access ----
d[4]                   # Returns a tibble
d$purpose              # Returns a vector
rec.num <- 17
d[rec.num, ]           # Returns row 17 of the tibble
d[rec.num, "purpose"]  # Returns a tibble with one column and one row
d[, "purpose"]         # Returns a tibble with all rows
d$purpose[rec.num]     # Returns entry 17 of the vector

# Let's do a little exploration ----
d$personal_status_sex |> table()
# Clearly there is something odd here - did no single women request a loan?

# Split the combined column into sex and marital status.
# A single anchored pattern strips either prefix; removing "male : " and
# "female : " in separate steps only works in one order, because the former
# is a substring of the latter.
d <- d |>
  mutate(
    sex = if_else(startsWith(personal_status_sex, "female"), "female", "male"),
    marital_status = str_remove(personal_status_sex, "^(fe)?male : ")
  )
d$marital_status |> table()

# The janitor package is very useful
d |> janitor::tabyl(sex, marital_status)
d |> janitor::tabyl(sex, personal_status_sex)

# Let's look at credit worthiness ----
d |> janitor::tabyl(sex, credit_risk)
d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_totals(where = c("row", "col"))
d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")

# Row percentages, rounded to whole numbers for display.
# Note: `t` shadows the name of base::t(); the name is kept as-is here.
t <- d |>
  janitor::tabyl(sex, credit_risk) |>
  janitor::adorn_percentages(denominator = "row")
names(t)
t[2] <- round(100 * t[2], 0)
t[3] <- round(100 * t[3], 0)
t

# Do a chi-squared test (we'll get to this).
# The test must be run on observed counts, not on the rounded percentages
# stored in `t`, so it uses the raw sex x credit_risk cross-tabulation.
chisq.test(table(d$sex, d$credit_risk))

# Do some initial plotting ----
d$savings |> table()
ggplot(data = d, mapping = aes(x = sex, y = age)) +
  geom_jitter()
ggplot(data = d, mapping = aes(x = sex, y = age, color = savings)) +
  geom_jitter()
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter()
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2)
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()

# Creating a factor variable ----
d$savings |> unique() |> cat(sep = "\n")

# Explicit level order for the savings categories, defined once so the
# preview and the assignment below cannot drift apart.
savings_levels <- c(
  "unknown/no savings account",
  "... < 100 DM",
  "100 <= ... < 500 DM",
  "500 <= ... < 1000 DM",
  "... >= 1000 DM"
)

# Preview the factor, then store it back on the tibble
factor(x = d$savings, levels = savings_levels)
d$savings <- d$savings |> factor(levels = savings_levels)

# Plot again, now with ordered savings levels ----
d |>
  ggplot(aes(sex, age, color = savings)) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d()
d |>
  ggplot(aes(sex, age, color = forcats::fct_rev(savings))) +
  geom_jitter(width = 0.2) +
  scale_color_viridis_d() +
  labs(color = "Savings", x = "Sex", y = "Age") +
  ggthemes::theme_gdocs()