diff --git a/.Rhistory b/.Rhistory index f95dad6..58f21e8 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,231 +1,3 @@ -rstatix::t_test(simon_effect ~ similarity, data = simon_effect, paired = TRUE) -View(simon_effect) -simon_effect <- zwaan_data %>% -pivot_longer(cols = session1_congruent:session2_incongruent, names_to = "col_headings", values_to = "RT") %>% -separate(col_headings, into = c("Session_number", "congruency"), sep = "_") %>% -group_by(participant, similarity, congruency) %>% -summarise(mean_RT = mean(RT)) %>% -ungroup() %>% -pivot_wider(names_from = congruency, values_from = mean_RT) %>% -mutate(simon_effect = incongruent - congruent) %>% -full_join(zwaan_demo, by = join_by(participant == twosubjectnumber)) %>% -select(participant, gender = gender_response, age = age_response, education = education_response, similarity:simon_effect) -rstatix::t_test(simon_effect ~ similarity, data = simon_effect, paired = TRUE) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -simon_effect <- zwaan_data %>% -pivot_longer(cols = session1_congruent:session2_incongruent, names_to = "col_headings", values_to = "RT") %>% -separate(col_headings, into = c("Session_number", "congruency"), sep = "_") %>% -group_by(participant, similarity, congruency) %>% -summarise(mean_RT = mean(RT)) %>% -ungroup() -rstatix::t_test(simon_effect ~ congruency, data = simon_effect, paired = TRUE) -rstatix::t_test(simon_effect, simon_effect ~ congruency, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect, paired = TRUE) -simon_effect2 <- zwaan_data %>% -pivot_longer(cols = session1_congruent:session2_incongruent, names_to = "col_headings", values_to = "RT") %>% -separate(col_headings, into = c("Session_number", "congruency"), sep = "_") %>% -group_by(participant, similarity, congruency) %>% -summarise(mean_RT = mean(RT)) %>% -ungroup() #%>% -pivot_wider(names_from = congruency, values_from = mean_RT) %>% -mutate(simon_effect = incongruent - congruent) 
%>% -full_join(zwaan_demo, by = join_by(participant == twosubjectnumber)) %>% -select(participant, gender = gender_response, age = age_response, education = education_response, similarity:simon_effect) -simon_effect <- zwaan_data %>% -pivot_longer(cols = session1_congruent:session2_incongruent, names_to = "col_headings", values_to = "RT") %>% -separate(col_headings, into = c("Session_number", "congruency"), sep = "_") %>% -group_by(participant, similarity, congruency) %>% -summarise(mean_RT = mean(RT)) %>% -ungroup() %>% -pivot_wider(names_from = congruency, values_from = mean_RT) %>% -mutate(simon_effect = incongruent - congruent) %>% -full_join(zwaan_demo, by = join_by(participant == twosubjectnumber)) %>% -select(participant, gender = gender_response, age = age_response, education = education_response, similarity:simon_effect) -View(simon_effect) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect2, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect2, paired = TRUE) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect2, paired = TRUE) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect2, paired = TRUE, detailed = TRUE) -simon_effect <- zwaan_data %>% -pivot_longer(cols = session1_congruent:session2_incongruent, names_to = "col_headings", values_to = "RT") %>% -separate(col_headings, into = c("Session_number", "congruency"), sep = "_") %>% -group_by(participant, similarity, congruency) %>% -summarise(mean_RT = mean(RT)) %>% -ungroup() %>% -pivot_wider(names_from = congruency, values_from = mean_RT) %>% -mutate(simon_effect = incongruent - congruent) %>% -full_join(zwaan_demo, by = join_by(participant == twosubjectnumber)) %>% -select(participant, gender = gender_response, age = age_response, education = 
education_response, similarity:simon_effect) -head(simon_effect, n = 5) -View(simon_effect) -descriptives <- simon_effect %>% -summarise(mean_congruent = mean(congruent), -sd_congruent = sd(congruent), -mean_incongruent = mean(incongruent), -sd_incongruent = sd(incongruent), -mean_simon_effect = mean(simon_effect), -diff = mean_incongruent - mean_congruent) -View(descriptives) -descriptives <- simon_effect %>% -summarise(mean_congruent = mean(congruent), -sd_congruent = sd(congruent), -mean_incongruent = mean(incongruent), -sd_incongruent = sd(incongruent), -diff = mean_incongruent - mean_congruent) -simon_effect_long <- simon_effect %>% -pivot_longer(cols = congruent:incongruent, names_to = "col_headings", values_to = "RT") -View(simon_effect_long) -simon_effect_long <- simon_effect %>% -pivot_longer(cols = congruent:incongruent, names_to = "congruency", values_to = "mean_RT") -ggplot(simon_effect_long, aes(x = congruency, y = mean_RT, fill = congruency)) + -geom_violin(alpha = 0.5) + -geom_boxplot(width = 0.4, alpha = 0.8) + -scale_fill_viridis_d(guide = "none") + -theme_classic() + -labs(x = "Congruency", y = "mean Response Time") -rstatix::t_test(mean_RT ~ congruency, data = simon_effect2, paired = TRUE, detailed = TRUE) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect_long, paired = TRUE, detailed = TRUE) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -rstatix::t_test(mean_RT ~ congruency, data = simon_effect_long, paired = TRUE, detailed = TRUE) -## congruent group -shapiro.test(simon_effect$congruent) -## congruent group -shapiro.test(simon_effect$congruent) -## incongruent group -shapiro.test(simon_effect$incongruent) -View(simon_effect) -ggplot(simon_effect, aes(x = "", y = simon_effect)) + -geom_violin(fill = "#FB8D61", alpha = 0.4) + # alpha for opacity, fill for adding colour -geom_boxplot(fill = "#FB8D61", width = 0.5) + # change width of the 
boxes -theme_classic() + -labs(x = "", -y = "Difference in mean Response Time scores") -# Version 2 with package qqplotr -ggplot(simon_effect, aes(sample = simon_effect)) + -stat_qq_band(fill = "#FB8D61", alpha = 0.4) + -stat_qq_line(colour = "#FB8D61") + -stat_qq_point() -library(qqplotr) -# Version 2 with package qqplotr -ggplot(simon_effect, aes(sample = simon_effect)) + -stat_qq_band(fill = "#FB8D61", alpha = 0.4) + -stat_qq_line(colour = "#FB8D61") + -stat_qq_point() -simon_effect_long <- simon_effect %>% -pivot_longer(cols = c(congruent, incongruent), names_to = "congruency", values_to = "mean_RT") -shapiro.test(simon_effect$simon_effect) -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -library(lsr) -cohensD(simon_effect ~ similarity, data = simon_effect) -cohensD(simon_effect ~ similarity, data = simon_effect, method = "unequal") -cohensD(simon_effect ~ similarity, data = simon_effect, method = "unequal") -cohensD(congruent, incongrunet, data = simon_effect, method = "paired") -cohensD(congruent, incongruent, data = simon_effect, method = "paired") -cohensD(simon_effect$congruent, simon_effect$incongruent, method = "paired") -cohensD(simon_effect$congruent, simon_effect$incongruent) -cohensD(simon_effect$congruent, simon_effect$incongruent, method = "paired") -View(simon_effect_long) -cohensD(mean_RT ~ congruency, data = simon_effect_long, method = "paired") -View(simon_effect_long) -cohensD(mean_RT ~ congruency, data = simon_effect_long, method = "paired") -cohensD(simon_effect ~ similarity, data = simon_effect, method = "unequal") -cohensD(simon_effect$congruent, simon_effect$incongruent, method = "paired") -cohensD(mean_RT ~ congruency, data = simon_effect_long, method = "paired") -library(pwr) -pwr.t.test(n = 160, sig.level = 0.05, power = 0.8, type = "paired", alternative = "two.sided") -descriptives <- simon_effect %>% -summarise(mean_congruent = mean(congruent), -sd_congruent = sd(congruent), -mean_incongruent = 
mean(incongruent), -sd_incongruent = sd(incongruent), -diff = mean_incongruent - mean_congruent) -descriptives -View(descriptives) -descriptives <- simon_effect %>% -summarise(mean_congruent = mean(congruent), -sd_congruent = sd(congruent), -mean_incongruent = mean(incongruent), -sd_incongruent = sd(incongruent), -diff = mean_incongruent - mean_congruent, # diff = mean(simon_effect) would also work -sd_diff = sd(simon_effect)) -descriptives -t.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -cohensD(simon_effect$congruent, simon_effect$incongruent, method = "paired") -wilcox.test(mean_RT ~ congruency, data = simon_effect_long, paired = TRUE) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -wilcox.test(simon_effect$simon_effect, mu = 0) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -wilcox.test(simon_effect$simon_effect, mu = 0) -wilcox.test(simon_effect$incongruent, simon_effect$congruent, paired = TRUE) -wilcox.test(simon_effect$simon_effect, mu = 0) -cohensD(simon_effect$incongruent, simon_effect$congruent, method = "paired") -t.test(simon_effect$incongruent, simon_effect$congruent, paired = TRUE) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE)$p -wilcox.test(simon_effect$simon_effect, mu = 0) -wilcox.test(simon_effect ~ similarity, data = simon_effect) -wilcox_effsize(data = simon_effect_long, formula = mean_RT ~ congruency, paired = TRUE) -# load in the packages -library(rstatix) -wilcox_effsize(data = simon_effect_long, formula = mean_RT ~ congruency, paired = TRUE) -summary(simon_effect) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -library(psych) -install.packages("psych") -library(psych) -describeBy(simon_effect ~ similarity, data = simon_effect) -describeBy(simon_effect ~ similarity, data = simon_effect) -wilcox_effsize(data = 
simon_effect, formula = simon_effect ~ similarity) -# storing the p-value -p_wilcoxon <- wilcox.test(simon_effect ~ similarity, data = simon_effect)$p.value -# calculate the z value from half the p-value -z = qnorm(p_wilcoxon/2) -z -wilcox.test(one_sample$wemwbs_sum, mu = 53.0) -# storing the p-value -p_wilcoxon <- wilcox.test(simon_effect ~ similarity, data = simon_effect)$p.value -# calculate the z value from half the p-value -z = qnorm(p_wilcoxon/2) -z -# storing the p-value -p_wilcoxon <- wilcox.test(simon_effect$incongruent, simon_effect$congruent, paired = TRUE)$p.value -# calculate the z value from half the p-value -z = qnorm(p_wilcoxon/2) -z -summary(simon_effect) -wilcox.test(simon_effect$incongruent, simon_effect$congruent, paired = TRUE) -wilcox.test(simon_effect$congruent, simon_effect$incongruent, paired = TRUE) -wilcoxonOneSampleR(one_sample$wemwbs_sum, mu = 53.0, digits = 3) -library(tidyverse) -## I basically have to have 2 code chunks since I tell them to put the data files next to the project, and mine are in a separate folder called data - unless I'll turn this into a fixed path -library(tidyverse) -data_prp <- read_csv("data/prp_data_reduced.csv") -demo_total <- data_prp %>% -summarise(n = n(), # participant number -mean_age = mean(Age), # mean age -sd_age = sd(Age)) # standard deviation of age -demo_total -age_distinct <- data_prp %>% -distinct(Age) -age_distinct -age_distinct -data_prp <- data_prp %>% -mutate(Age = parse_number(Age)) -typeof(data_prp$Age) # fixed -demo_total <- data_prp %>% -summarise(n = n(), # participant number -mean_age = mean(Age), # mean age -sd_age = sd(Age)) # standard deviation of age -demo_total -demo_total <- data_prp %>% -summarise(n = n(), # participant number -mean_age = mean(Age, na.rm = TRUE), # mean age -sd_age = sd(Age, na.rm = TRUE)) # standard deviation of age -demo_total demo_by_gender <- data_prp %>% group_by(Gender) %>% # split data up into groups (here Gender) summarise(n = n(), # participant number @@ 
-510,3 +282,231 @@ View(test) test2 <- test %>% ungroup() %>% group_by(gender_label) %>% summarise(Percentage = max_age*22/100) View(test2) test2 <- test %>% ungroup() %>% group_by(gender_label, mean_age) %>% summarise(Percentage = max_age*22/100) +test <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 20.0), +sd_age = c(0.5725791, 0.5582028, 0.4677369), +min_age = c(18, 18, 19), +max_age = c(22, 20, 21)) +library(tidyverse) +test <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 20.0), +sd_age = c(0.5725791, 0.5582028, 0.4677369), +min_age = c(18, 18, 19), +max_age = c(22, 20, 21)) +test2 <- tibble(gender_label = c("female", "male", "non-binary"), +new_col = "column") +View(test2) +test3 <- inner_join(test, test2) +test3 <- inner_join(test, test2, by = "gender_label") +test4 <- inner_join(test, test2, by = "gender_label") +View(test4) +test4 <- inner_join(test, test2, by = gender_label) +# working +test4 <- inner_join(test, test2, by = join_by(gender_label)) +test5 <- inner_join(test, test2, by = "gender_label") +rm(test3, test4, test5) +# not working +test3 <- inner_join(test, test2, by = gender_label) +test4 <- inner_join(test, test2, by = join_by("gender_label")) +test2 <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 22.0), +new_col = "column") +# not working +test3 <- inner_join(test, test2, by = gender_label) +# working +test7 <- inner_join(test, test2) +View(test7) +test8 <- inner_join(test, test2, by = join_by(gender_label)) +View(test8) +test8 <- inner_join(test, test2, by = join_by(gender_label, age_mean)) +test8 <- inner_join(test, test2, by = join_by(gender_label, mean_age)) +View(test8) +test9 <- inner_join(test, test2, by = join_by("gender_label", "mean_age")) +View(test9) +test10 <- inner_join(test, test2, by = "gender_label") +knitr::opts_chunk$set(echo = TRUE) +test <- tibble(group = c("group1", "group2"), +n = c(157, 99), +mean = c(NA, 
100.58), +sd = c(NA, 1.25)) +View(test) +library(tidyverse) +test <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 20.0), +sd_age = c(0.5725791, 0.5582028, 0.4677369), +min_age = c(18, 18, 19), +max_age = c(22, 20, 21)) +test2 <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 22.0), +new_col = "column") +# not working +test3 <- inner_join(test, test2, by = gender_label) +# working +test7 <- inner_join(test, test2) +test8 <- inner_join(test, test2, by = join_by(gender_label)) +test9 <- inner_join(test, test2, by = join_by("gender_label", "mean_age")) +test10 <- inner_join(test, test2, by = "gender_label") +library(tidyverse) +test <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 20.0), +sd_age = c(0.5725791, 0.5582028, 0.4677369), +min_age = c(18, 18, 19), +max_age = c(22, 20, 21)) +test2 <- tibble(gender_label = c("female", "male", "non-binary"), +mean_age = c(19.5, 19.3, 22.0), +new_col = "column") +# not working +test3 <- inner_join(test, test2, by = gender_label) +# working +test4 <- inner_join(test, test2) +test5 <- inner_join(test, test2, by = join_by(gender_label)) +test6 <- inner_join(test, test2, by = join_by("gender_label", "mean_age")) +test7 <- inner_join(test, test2, by = "gender_label") +# working +test4 <- inner_join(test, test2) +test5 <- inner_join(test, test2, by = join_by(gender_label)) +test6 <- inner_join(test, test2, by = join_by("gender_label", "mean_age")) +test7 <- inner_join(test, test2, by = "gender_label") +# working +test4 <- inner_join(test, test2) +test5 <- inner_join(test, test2, by = join_by(gender_label)) +test6 <- inner_join(test, test2, by = join_by("gender_label", "mean_age")) +test7 <- inner_join(test, test2, by = "gender_label") +test_mean <- tibble(group = c("group1", "group2"), +n = c(157, 99), +mean = c(NA, 100.58), +sd = c(NA, 1.25)) +test_less_mean <- tibble(group = c("group1", "group2"), +n = c(157, 99), +mean = 
c(101.33, 100.58), +sd = c(NA, NA)) +View(test_mean) +View(test_less_mean) +test_before <- tibble(col_A = c("value_1", "value_2"), +col_B = c("value_3", "value_4"), +`col c` = c("value_5", "value_6")) +test_after <- tibble(col_A = c("value_1", "value_2"), +col_B = c("value_3", "value_4"), +col_C = c("value_5", "value_6")) +View(test_before) +View(test_after) +View(test_less_mean) +View(test_mean) +test_mean %>% filter(mean != NA) +test10 <- test_mean %>% filter(mean != NA) +View(test10) +test10 <- test_mean %>% filter(mean != "NA") +View(test10) +## I basically have to have 2 code chunks since I tell them to put the data files next to the project, and mine are in a separate folder called data - unless I'll turn this into a fixed path +library(tidyverse) +data_prp <- read_csv("data/prp_data_reduced.csv") +qrp_t1 <- read_csv("data/qrp_t1.csv") +qrp_t1 <- read_csv("data/qrp_t1.csv") +library(tidyverse) +data_prp <- read_csv("prp_data_reduced.csv") +understanding_t1 <- data_prp %>% +# Step 1 +select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% +# Step 2 - I picked different column labels this time for some variety +pivot_longer(cols = Understanding_OS_1_Time1:Understanding_OS_12_Time1, names_to = "Understanding_Qs", values_to = "Responses") +understanding_t1 <- understanding_t1 %>% +mutate(Responses_corrected = case_match(Responses, # column of the values to recode +"Not at all confident" ~ 1, # values to recode +"Entirely confident" ~ 7, +.default = Responses # all other values taken from column Responses +)) +understanding_t1_step3_v1 <- understanding_t1 %>% +mutate(Responses_corrected = case_match(Responses, # column of the values to recode +"Not at all confident" ~ 1, # values to recode +"Entirely confident" ~ 7, +.default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type +)) +understanding_t1_step3_v2 <- understanding_t1 %>% +mutate(Responses_corrected = case_match(Responses, # column of the values to 
recode +"Not at all confident" ~ "1", +"Entirely confident" ~ "7", +.default = Responses # all other values taken from column Responses (character) +), +Responses_corrected = parse_number(Responses_corrected)) # turning Responses_corrected into a numeric column +understanding_t1_step3_v2 <- understanding_t1 %>% +mutate(Responses_recoded = case_match(Responses, # column of the values to recode +"Not at all confident" ~ 1, # recode all of them +"2" ~ 2, +"3" ~ 3, +"4" ~ 4, +"5" ~ 5, +"6" ~ 6, +"Entirely confident" ~ 7)) +understanding_t1 <- understanding_t1 %>% +mutate(Responses_corrected = case_match(Responses, # column of the values to recode +"Not at all confident" ~ 1, # values to recode +"Entirely confident" ~ 7, +.default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type +)) %>% +# Step 4: calculating averages per participant +group_by(Code) %>% +summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>% +ungroup() +understanding_t1 <- data_prp %>% +# Step 1 +select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% +# Step 2 +pivot_longer(cols = -Code, names_to = "Understanding_Qs", values_to = "Responses") %>% +# Step 3 +mutate(Responses_corrected = case_match(Responses, # column of the values to recode +"Not at all confident" ~ 1, # values to recode +"Entirely confident" ~ 7, +.default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type +)) %>% +# Step 4 +group_by(Code) %>% +summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>% +ungroup() +sats_t1 <- data_prp %>% +select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% # Step 1 +pivot_longer(cols = -Code, names_to = "Items", values_to = "Response") # Step 2 +sats_t1 <- data_prp %>% +# Step 1 +select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% +# Step 2 +pivot_longer(cols = -Code, names_to = "Items", values_to = "Response") %>% +# Step 3 +separate(Items, into = 
c(NA, "Item_number", "Subscale", NA), sep = "_", convert = TRUE) %>% +# step 4 +mutate(FW_RV = case_when( +Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ "Reverse", +.default = "Forward" +), +Scores_corrected = case_when( +FW_RV == "Reverse" ~ 8-Response, +.default = Response +)) %>% +# step 5 +group_by(Code, Subscale) %>% +summarise(mean_score = mean(Scores_corrected)) %>% +ungroup() %>% +# step 6 +pivot_wider(names_from = Subscale, values_from = mean_score) %>% +rename(SATS28_Affect_Time1_mean = Affect, +SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence, +SATS28_Value_Time1_mean = Value, +SATS28_Difficulty_Time1_mean = Difficulty) +super <- data_ppr %>% +mutate(Supervisor_15 = 9-supervisor_15_R) %>% +filter(Supervisor_7 = 1) %>% +select(Code, starts_with("Super"), -Supervisor_7, -Supervisor_15_R) +super <- data_prp %>% # spelling mistake in data object +mutate(Supervisor_15 = 8-Supervisor_15_R) %>% # semantic error: 8 minus response for a 7-point scale and supervisor_15_R needs a capital S +filter(Supervisor_7 == 1) %>% # needs a Boolean expression == instead of = +select(Code, starts_with("Super"), -Supervisor_7, -Supervisor_15_R) %>% # no pipe at the end, the rest is actually legit +pivot_longer(cols = -Code, names_to = "Item", values_to = "Response") %>% # pivot_longer instead of pivot_wider +group_by(Code) %>% # Code rather than Time2_Code - the reduced dataset does not contain Time2_Code +summarise(Mean_Supervisor_Support = mean(Response, na.rm = TRUE)) %>% # Score_corrected doesn't exist; needs to be Response +ungroup() +data_prp_final <- data_prp %>% +select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% +full_join(qrp_t1) %>% +full_join(understanding_t1) %>% +full_join(sats_t1) %>% +full_join(super) %>% +select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, 
Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support) diff --git a/.Rproj.user/EEE9B81B/pcs/source-pane.pper b/.Rproj.user/EEE9B81B/pcs/source-pane.pper index ea660b4..bc7dc99 100644 --- a/.Rproj.user/EEE9B81B/pcs/source-pane.pper +++ b/.Rproj.user/EEE9B81B/pcs/source-pane.pper @@ -1,3 +1,3 @@ { - "activeTab": 6 + "activeTab": 4 } \ No newline at end of file diff --git a/.Rproj.user/EEE9B81B/rmd-outputs b/.Rproj.user/EEE9B81B/rmd-outputs index f1e4c9b..2180aab 100644 --- a/.Rproj.user/EEE9B81B/rmd-outputs +++ b/.Rproj.user/EEE9B81B/rmd-outputs @@ -1,5 +1,4 @@ -D:/OneDrive - University of Glasgow/Teaching_2/2024-25/Level 2 labs/2A_chapter1/chapter_0102.html -C:/Users/Lenovo/Downloads/test.html +D:/OneDrive - University of Glasgow/Teaching_2/2024-25/Level 2 labs/project/Rmd_test.html D:/OneDrive - University of Glasgow/R_Book/analysis-v3-main/data/ch1_2.html D:/OneDrive - University of Glasgow/Teaching_2/2024-25/Level 2/datasets/Alter teaching SPSS/data_ch4/Alter.html diff --git a/.Rproj.user/EEE9B81B/sources/prop/INDEX b/.Rproj.user/EEE9B81B/sources/prop/INDEX index e857b1c..5fe33f6 100644 --- a/.Rproj.user/EEE9B81B/sources/prop/INDEX +++ b/.Rproj.user/EEE9B81B/sources/prop/INDEX @@ -123,6 +123,7 @@ D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FR_Book_L2%2Fanalysis-v3%2F_qua D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FR_Book_L2%2Fanalysis-v3%2Findex.qmd="83BB1C5B" D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%20labs%2F2A_chapter1%2Fchapter_0102.Rmd="8C2681BF" D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%20labs%2F2A_chapter3%2F03_data_viz.Rmd="C157BD41" +D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%20labs%2Fproject%2FRmd_test.Rmd="F4EAD707" D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%2Fdatasets%2FAlter%20teaching%20SPSS%2FVSSL%20CODE%20ANALYSIS%20OUTPUT.Rmd="933B0F5C" 
D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%2Fdatasets%2FAlter%20teaching%20SPSS%2Fdata_ch4%2FAlter.Rmd="4C6D86E7" D%3A%2FOneDrive%20-%20University%20of%20Glasgow%2FTeaching_2%2F2024-25%2FLevel%202%2Fdatasets%2FBBC_Loneliness%2Fchapter_1.Rmd="AB65009E" diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index e1398dc..45c7af3 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -7,3 +7,4 @@ D:/OneDrive - University of Glasgow/R_Book_L2/analysis-v3/02-wrangling.qmd="239B D:/OneDrive - University of Glasgow/R_Book_L2/analysis-v3/03-wrangling2.qmd="44869D8D" D:/OneDrive - University of Glasgow/R_Book_L2/analysis-v3/04-dataviz.qmd="774ACD6C" D:/OneDrive - University of Glasgow/R_Book_L2/analysis-v3/index.qmd="D9578480" +D:/OneDrive - University of Glasgow/Teaching_2/2024-25/Level 2 labs/project/Rmd_test.Rmd="CCDEBB5C" diff --git a/.quarto/_freeze/03-wrangling2/execute-results/html.json b/.quarto/_freeze/03-wrangling2/execute-results/html.json index a71947f..1cef382 100644 --- a/.quarto/_freeze/03-wrangling2/execute-results/html.json +++ b/.quarto/_freeze/03-wrangling2/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "28db4924dbdf8795f0bba0859a044877", + "hash": "231b72f65fb32087caaf7d6986543529", "result": { - "markdown": "# Data wrangling II {#sec-wrangling2}\n\n## Intended Learning Outcomes {.unnumbered}\n\nBy the end of this chapter, you should be able to:\n\n- apply familiar data wrangling functions to novel datasets\n- read and interpret error messages\n- realise there are several ways of getting to the results\n\nIn this chapter, we will pick up where we left off in @sec-wrangling. We will calculate average scores for two of the questionnaires, address an error mode problem, and finally, join all data objects together. 
This will finalise our data for the upcoming data visualization sections (@sec-dataviz and @sec-dataviz2).\n\n\n## [Individual Walkthrough]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n## Activity 1: Setup\n\n* Go to the project folder we have been using in the last two weeks and double-click on the project icon to **open the project** in RStudio\n* Either **Create a new `.Rmd` file** for chapter 3 and save it to your project folder or continue the one from last week. See @sec-rmd if you need some guidance.\n\n\n\n## Activity 2: Load in the libraries and read in the data\n\nToday, we will be using `tidyverse` along with the two csv files created at the end of the last chapter: `data_prp_for_ch3.csv` and `qrp_t1.csv`. If you need to download them again for any reason, click on the following links: [data_prp_for_ch3.csv](data/data_prp_for_ch3.csv \"download\") and [qrp_t1.csv](data/qrp_t1.csv \"download\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(???)\ndata_prp <- read_csv(\"???\")\nqrp_t1 <- read_csv(\"???\")\n```\n:::\n\n\n\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidyverse)\ndata_prp <- read_csv(\"prp_data_reduced.csv\")\nqrp_t1 <- read_csv(\"qrp_t1.csv\")\n```\n:::\n\n\n:::\n\nIf you need a quick reminder what the dataset was about, have a look at the abstract in @sec-download_data_ch1. We also addressed the changes we made to the dataset there.\n\nAnd remember to have a quick `glimpse()` at your data.\n\n\n\n## Activity 3: Confidence in understanding Open Science practices\n\n#### The main goal is to compute the mean Understanding score per participant. 
{.unnumbered}\n\nThe mean Understanding score for time point 2 has already been calculated (in the `Time2_Understanding_OS` column), but we still need to compute it for time point 1.\n\nLooking at the Understanding data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nThe steps are quite similar to those for QRP, but we need to add an extra step: converting the character labels into numbers.\n\nAgain, let's do this step by step:\n\n* **Step 1**: Select the relevant columns `Code`, and every Understanding column from time point 1 (e.g., from `Understanding_OS_1_Time1` to `Understanding_OS_12_Time1`) and store them in an object called `understanding_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: Recode the values \"Not at all confident\" as 1 and \"Entirely confident\" as 7. All other values are already numbers. We can use functions `mutate()` in combination with `case_match()` for that\n* **Step 4**: Calculate the average QRP score (`QRPs_Acceptance_Time1_mean`) per participant using `group_by()` and `summarise()`\n\n#### Steps 1 and 2: Select and pivot {.unnumbered}\n\nHow about you try the first 2 steps yourself using the code from Chapter 2 Activity 4 (@sec-ch2_act4) as a template?\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n select(???) 
%>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2 - I picked different column labels this time for some variety\n pivot_longer(cols = Understanding_OS_1_Time1:Understanding_OS_12_Time1, names_to = \"Understanding_Qs\", values_to = \"Responses\") \n```\n:::\n\n\n:::\n\n#### Step 3: recoding the values {.unnumbered}\n\nOK, we now want to recode the values in the `Responses` column (or whatever name you picked for your column that has some of the numbers in it) so that \"Not at all confident\" = 1 and \"Entirely confident\" = 7. We want to keep all other values as they are (2-6 look already quite \"numeric\").\n\nLet's create a new column `Responses_corrected` that stores the new values with `mutate()`. Then we can combine that with the `case_match()` function.\n\n* The first argument in `case_match()` is the column name of the variable you want to recode.\n* Then you can start recoding the values in the way of `CurrentValue ~ NewValue` (~ is a tilde). Make sure you use the `~` and not `=`!\n* Lastly, the `.default` argument tells R what to do with values that are neither \"Not at all confident\" nor \"Entirely confident\". Here, we want to replace them with the original value of the `Responses` column. 
In other datasets, you may want to set the default to `NA` for missing values, a character string or a number, and `case_match()` is happy to oblige.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = Responses # all other values taken from column Responses\n ))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `mutate()`:\nℹ In argument: `Responses_corrected = case_match(...)`.\nCaused by error in `case_match()`:\n! Can't combine `..1 (right)` and `.default` .\n```\n:::\n:::\n\n\n::: {.callout-important collapse=\"true\"}\n\n## Error!!! Can you explain what is happening here?\n\nHave a look at the error message. It's pretty helpful this time. It says `Can't combine ..1 (right) and .default .` It means that the replacement values are expected to be data type character since the original column type was type character.\n\n:::\n\n**So how do we fix this?** Actually, there are several ways this could be done. Click on the tabs below to check out 3 possible solutions.\n\n::: {.panel-tabset group=\"layers\"}\n\n## Fix option 1\n\nOne option is to modify the `.default` argument `Responses` so that the values are copied over from the original column but as a number rather than the original character value. 
The function `as.numeric()` does the conversion.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n ))\n```\n:::\n\n\n## Fix option 2\n\nChange the numeric values on the right side of the `~` to character. Then in a second step, we would need to turn the character column into a numeric type. Again, we have several options to do so. We could either use the `parse_number()` function we encountered earlier during the demographics wrangling or the `as.numeric()` function.\n\n* V1: `Responses_corrected = parse_number(Responses_corrected)`\n* V2: `Responses_corrected = as.numeric(Responses_corrected)`\n\nJust pay attention that you are still working *within* the `mutate()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ \"1\",\n \"Entirely confident\" ~ \"7\",\n .default = Responses # all other values taken from column Responses (character)\n ),\n Responses_corrected = parse_number(Responses_corrected)) # turning Responses_corrected into a numeric column\n```\n:::\n\n\n\n## Fix option 3\n\nIf you recode all the labels into numbers (e.g., \"2\" into 2, \"3\" into 3, etc.) 
from the start, you won’t need to perform any additional conversions later.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_recoded = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # recode all of them\n \"2\" ~ 2,\n \"3\" ~ 3,\n \"4\" ~ 4,\n \"5\" ~ 5,\n \"6\" ~ 6,\n \"Entirely confident\" ~ 7))\n```\n:::\n\n\n:::\n\n::: {.callout-note icon=\"false\"}\n\n## Your Turn\n\nChoose the option that works best for you to modify the code above that didn't work. You should now be able to calculate the **mean Understanding Score per participant**. Store the average scores in a variable called `Time1_Understanding_OS`. If you need help, refer to the hint below or use Chapter 2 Activity 4 (@sec-ch2_act4) as guidance.\n\n::: {.callout-caution icon=\"false\" collapse=\"true\"}\n\n## One solution for Steps 3 and 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4: calculating averages per participant\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\nOf course, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 3\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Understanding_Qs\", values_to = \"Responses\") %>% \n # Step 3\n mutate(Responses_corrected = 
case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n## Activity 4: Survey of Attitudes Toward Statistics (SATS-28)\n\n#### The main goal is to compute the mean SATS-28 score for each of the 4 subscales per participant for time point 1. {.unnumbered}\n\nLooking at the SATS data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n* Additionally, we are looking to compute the means for the 4 different subscales of the SATS-28 which are , , , and .\n\nThis scenario is slightly more tricky than the previous ones due to the reverse-coding and the 4 subscales. So, let's tackle this step by step again:\n\n* **Step 1**: Select the relevant columns `Code`, and every SATS28 column from time point 1 (e.g., from `SATS28_1_Affect_Time1` to `SATS28_28_Difficulty_Time1`) and store them in an object called `sats_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can reverse-score the items (in step 4) and calculate the average scores (in step 5) more easily\n* **Step 3**: We need to know which items belong to which subscale - fortunately, we have that information in the variable name and can use the `separate()` function to access it.\n* **Step 4**: We need to know which items are reverse-coded and then reverse-score them - unfortunately, the info is only in the codebook and we need to find a work-around. 
`case_when()` can help identify and re-score the reverse-coded items.\n* **Step 5**: Calculate the average SATS score per participant and subscale using `group_by()` and `summarise()`\n* **Step 6**: use `pivot_wider()` to spread out the dataframe into wide format and `rename()` to tidy up the column names\n\n#### Steps 1 and 2: select and pivot {.unnumbered}\n\nThe selecting and pivoting are exactly the same way as we already practiced in the other 2 questionnaires. Apply them here to this questionnaire.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(???) %>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% # Step 1\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") # Step 2\n```\n:::\n\n\n:::\n\n:::\n\n#### Step 3: separate Subscale information {.unnumbered}\n\nIf you look at the `Items` column more closely, you can see that there is information on the `Questionnaire`, the `Item_number`, the `Subscale`, and the `Timepoint` the data was collected at.\n\nWe can separate the information into separate columns using the `separate()` function. The function's first argument is the column to separate, then define `into` which columns you want the original column to split up, and lastly, define the separator `sep` (here an underscore). 
For our example, we would write:\n\n* V1: `separate(Items, into = c(\"SATS\", \"Item_number\", \"Subscale\", \"Time\"), sep = \"_\")`\n\nHowever, we don't need all of those columns, so we could just drop the ones we are not interested in by replacing them with `NA`.\n\n* V2: `separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\")`\n\nWe might also add an extra argument of `convert = TRUE` to have numeric columns (i.e., `Item_number`) converted to numeric as opposed to keeping them as characters. Saves us typing a few quotation marks later in Step 4.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE)\n```\n:::\n\n\n#### Step 4: identifying reverse-coded items and then correct them {.unnumbered}\n\nWe can use `case_when()` within the `mutate()` function here to create a new column `FW_RV` that stores information on whether the item is a reverse-coded item or not.\n\n`case_when()` works similarly to `case_match()`, however `case_match()` only allows us to \"recode\" values (i.e., replace one value with another), whereas `case_when()` is more flexible. It allows us to use **conditional statements** on the left side of the tilde which is useful when you want to change only *some* of the data based on specific conditions.\n\nLooking at the codebook, it seems that items 2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, and 28 are reverse-coded. The rest are forward-coded.\n\nWe want to tell R now, that\n\n* **if** the `Item_number` is any of those numbers listed above, R should write \"Reverse\" into the new column `FW_RV` we are creating. Since we have a few possible matches for `Item_number`, we need the Boolean expression `%in%` rather than `==`.\n* **if** `Item_number` is none of those numbers, then we would like the word \"Forward\" in the `FW_RV` column to appear. 
We can achieve that by specifying a `.default` argument again, but this time we want a \"word\" rather than a value from another column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ))\n```\n:::\n\n\nMoving on to correcting the scores: Once again, we can use `case_when()` within the `mutate()` function to create another **conditional statement**. This time, the condition is:\n\n* **if** the `FW_RV` column has a value of \"Reverse\" then we would like to turn every 1 into 7, every 2 into 6, etc.\n* **if** the `FW_RV` column has a value of \"Forward\" then we would like to keep the score from the `Response` column\n\nThere is a quick way and a not-so-quick way to achieve the actual **reverse-coding**.\n\n* **Option 1 (quick)**: The easiest way to reverse-code scores is to take the maximum value of the scale, add 1 unit, and subtract the original value. For example, on a 5-point Likert scale, it would be 6 minus the original rating; for a 7-point Likert scale, 8 minus the original rating, etc. (see *Option 1* tab).\n* **Option 2 (not so quick)**: This involves using two conditional statements (see *Option 2* tab).\n\nUse the one you find more intuitive.\n\n::: panel-tabset\n\n## Option 1\n\nHere we are using a Boolean expression to check if the string \"Reverse\" is present in the `FW_RV` column. If this condition is `TRUE`, the value in the new column we're creating, `Scores_corrected`, will be calculated as 8 minus the value from the `Response` column. 
If the condition is FALSE (handled by the .default argument), the original values from the `Response` column will be retained.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n ))\n```\n:::\n\n\n## Option 2\n\nAs stated above, the longer approach involves using two conditional statements. The first condition checks if the value in the `FW_RV` column is \"Reverse\", while the second condition checks if the value in the `Response` column equals a specific number. **When both conditions are met**, the corresponding value on the right side of the tilde is placed in the newly created `Scores_corrected_v2` column.\n\nFor example, line 3 would read: if the value in the `FW_RV` column is \"Reverse\" **AND** the value in the `Response` column is 1, then assign a value of 7 to the `Scores_corrected_v2` column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected_v2 = case_when(\n FW_RV == \"Reverse\" & Response == 1 ~ 7,\n FW_RV == \"Reverse\" & Response == 2 ~ 6,\n FW_RV == \"Reverse\" & Response == 3 ~ 5,\n # no need to recode 4 as 4\n FW_RV == \"Reverse\" & Response == 5 ~ 3,\n FW_RV == \"Reverse\" & Response == 6 ~ 2,\n FW_RV == \"Reverse\" & Response == 7 ~ 1,\n .default = Response\n ))\n```\n:::\n\n\nAs you can see now in `sats_t1`, both columns `Scores_corrected` and `Scores_corrected_v2` are identical.\n\n:::\n\nOne way to **check whether our reverse-coding worked** is by examining the `distinct` values in the original `Response` column and comparing them with the `Scores_corrected`. We should also retain the `FW_RV` column to observe how the reverse-coding applied.\n\nTo see the patterns more clearly, we can use `arrange()` to sort the values in a meaningful order. 
Remember, the default sorting order is ascending, so if you want to sort values in descending order, you’ll need to wrap your variable in the `desc()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding <- sats_t1 %>% \n distinct(FW_RV, Response, Scores_corrected) %>% \n arrange(desc(FW_RV), Response)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show `check_coding` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding\n```\n\n::: {.cell-output-display}\n
\n\n|FW_RV | Response| Scores_corrected|\n|:-------|--------:|----------------:|\n|Reverse | 1| 7|\n|Reverse | 2| 6|\n|Reverse | 3| 5|\n|Reverse | 4| 4|\n|Reverse | 5| 3|\n|Reverse | 6| 2|\n|Reverse | 7| 1|\n|Forward | 1| 1|\n|Forward | 2| 2|\n|Forward | 3| 3|\n|Forward | 4| 4|\n|Forward | 5| 5|\n|Forward | 6| 6|\n|Forward | 7| 7|\n\n
\n:::\n:::\n\n\n:::\n\n#### Step 5 {.unnumbered}\n\nNow that we know everything worked out as intended, we can calculate the mean scores of each subscale for each participant in `sats_t1`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(???, ???) %>% \n summarise(mean_score = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`summarise()` has grouped output by 'Code'. You can override using the\n`.groups` argument.\n```\n:::\n:::\n\n\n:::\n\n:::\n\n#### Step 6 {.unnumbered}\n\nThe final step is to transform the data back into wide format, ensuring that each subscale has its own column. This will make it easier to join the data objects later on. In `pivot_wider()`, the first argument, `names_from`, specifies the column you want to use for your new column headings. The second argument, `values_from`, tells R which column should provide the cell values.\n\nWe should also **rename the column names** to match those in the codebook. 
Conveniently, we can use a function called `rename()` that works exactly like `select()` (following the pattern `new_name = old_name`), but it keeps all other column names the same rather than reducing the number of columns.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show final `sats_t1` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nhead(sats_t1, n = 5)\n```\n\n::: {.cell-output-display}\n
\n\n|Code | SATS28_Affect_Time1_mean| SATS28_CognitiveCompetence_Time1_mean| SATS28_Difficulty_Time1_mean| SATS28_Value_Time1_mean|\n|:----|------------------------:|-------------------------------------:|----------------------------:|-----------------------:|\n|AD03 | 2.333333| 3.833333| 3.428571| 5.555556|\n|AD05 | 3.500000| 5.000000| 2.142857| 4.777778|\n|Ab01 | 5.166667| 5.666667| 4.142857| 5.444444|\n|Al05 | 2.166667| 2.666667| 2.857143| 3.777778|\n|Am05 | 4.166667| 5.666667| 5.571429| 4.888889|\n\n
\n:::\n:::\n\n\n:::\n\nAgain, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n # Step 1\n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE) %>% \n # step 4\n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ),\n Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n )) %>% \n # step 5\n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup() %>% \n # step 6\n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n:::\n\n\n\n## Activity 5 (Error Mode): Perceptions of supervisory support\n\n#### The main goal is to compute the mean score for perceived supervisory support per participant. {.unnumbered}\n\nLooking at the supervisory support data, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nI have outlined my steps as follows:\n\n* **Step 1**: Reverse-code the single column first because that's less hassle than having to do that with conditional statements (`Supervisor_15_R`). `mutate()` is my friend.\n* **Step 2**: I want to filter out everyone who failed the attention check in `Supervisor_7`. I can do this with a Boolean expression within the `filter()` function. 
The correct response was \"completely disagree\" which is 1.\n* **Step 3**: Select their id from time point 2 and all the columns that start with the word \"super\", apart from `Supervisor_7` and the original `Supervisor_15_R` column\n* **Step 4**: pivot into long format so I can calculate the averages better\n* **Step 5**: calculate the average scores per participant\n\nI've started coding but there are some errors in my code. Help me find and fix all of them. Try to go through the code line by line and read the error messages.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_ppr %>% \n mutate(Supervisor_15 = 9-supervisor_15_R) %>% \n filter(Supervisor_7 = 1) %>% \n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) \npivot_wider(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% \n group_by(Time2_Code) %>% \n summarise(Mean_Supervisor_Support = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## How many mistakes am I supposed to find?\n\nThere are 8 mistakes in the code.\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Reveal solution\n\nDid you spot all 8 mistakes? 
Let's go through them line by line.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_prp %>% # spelling mistake in data object\n mutate(Supervisor_15 = 8-Supervisor_15_R) %>% # semantic error: 8 minus response for a 7-point scale and supervisor_15_R needs a capital S\n filter(Supervisor_7 == 1) %>% # needs a Boolean expression == instead of =\n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) %>% # no pipe at the end, the rest is actually legit\n pivot_longer(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% # pivot_longer instead of pivot_wider\n group_by(Code) %>% # Code rather than Time2_Code - the reduced dataset does not contain Time2_Code\n summarise(Mean_Supervisor_Support = mean(Response, na.rm = TRUE)) %>% # Score_corrected doesn't exist; needs to be Response\n ungroup()\n```\n:::\n\n\n* Note that the **semantic error** in line 2 (subtracting from 9 instead of 8) will not give you an error message.\n* Were you thrown off by the `starts_with(\"Super\")` expression in line 4? `starts_with()` and `ends_with()` are great alternatives to selecting columns via `:`. But using `select(Code, Supervisor_1:Supervisor_6, Supervisor_8:Supervisor_14, Supervisor_15)` would have given us the same result - just remember to include the newly created `Supervisor_15` column. *[I admit, that one was perhaps a bit mean]*\n\n:::\n\n## Activity 6: Join everything together with `???_join()`\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: `inner_join()`, `left_join()`, `right_join()`, and `full_join()`. Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n::: {.callout-note icon=\"false\"}\n\n## Info on mutating joins\n\nYou have 4 types of join functions you could make use of. Click on the panels to learn more.\n\n::: panel-tabset\n\nA mutating join allows you to combine variables from two tables. 
It first matches observations by their keys, then copies across variables from one table to the other.\n\n## `inner_join()`\n\n`inner_join()` returns only the rows where the values in the column specified in the `by =` statement match in both tables.\n\n![inner_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/inner-join.gif)\n\n## `left_join()`\n\n`left_join()` retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the `by =` statement. Rows in the left table with no match in the right table will have missing values (`NA`) in the new columns.\n\n![left_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/left-join.gif)\n\n## `right_join()`\n\n`right_join()` retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the `by =` statement. Rows in the right table with no match in the left table will have missing values (`NA`) in the new columns.\n\n![right_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/right-join.gif)\n\n## `full_join()`\n\n`full_join()` returns all rows and all columns from both tables. `NA` values fill unmatched rows.\n\n![full_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/full-join.gif)\n\n:::\n\n:::\n\nFrom our original `data_prp`, we need to select demographics data and all summarised questionnaire data from time point 2. Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the `Global Environment`.\n\nWhile you may be familiar with `inner_join()` from last year, for this task, we want to retain all data from all the data objects. 
Therefore, we will use `full_join()`. Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\n\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use `select()` at the end to order them better.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndata_prp_final <- data_prp %>% \n select(Code:Plan_prereg, Other_OS_behav_2:Time2_Understanding_OS) %>% \n full_join(qrp_t1) %>% \n full_join(understanding_t1) %>% \n full_join(sats_t1) %>% \n full_join(super) %>% \n select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n```\n:::\n\n\n\n::: {.callout-important icon=\"false\"}\n## No `by` argument in the code above? \n\nNote how I didn't include a `by` argument in the code above. If you leave `by =` out, R will join the 2 data objects by **ALL** columns that have the same name.\n\n**Special case 1: matching column names but different values**\n\nIf you want more control, you should include the `by` argument; for example, if both data objects include a column `age` but data was recorded at 2 different time points. In that case, the information from both `age` columns should be retained and the `by` argument would not include `age`.\n\n**Special case 2: different column names but matching values**\n\nAnother special case presents when both data objects contain identical information but the variable names don't match. Let's say, both data objects contain gender information, but in one data object the variable is named `gender` and in the other one `gender_label`. 
In that case, your `by` argument needs to be modified as: `by = join_by(gender == gender_label)`.\n\nMore info on joins can be found at [https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/](https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/){target=\"_blank\"}.\n:::\n\nAnd this is basically the dataset we need for @sec-dataviz and @sec-dataviz2.\n\n\n\n## Activity 7: Knit and export\n\nKnit the `.Rmd` file to ensure everything runs as expected. Once it does, export the data object `data_prp_final` as a csv file for use in @sec-dataviz. Give it a meaningful name, something like `data_prp_for_ch4.csv`.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nwrite_csv(data_prp_final, \"data_prp_for_ch4.csv\")\n```\n:::\n\n\n:::\n\n\n## [Pair-coding]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\nWe will once again be working with data from Binfet et al. (2021), a randomised controlled trial of therapy dog interventions. Today, our goal is to calculate the average `Loneliness` score for each participant measured at time point 1 (pre-intervention) using the raw data file `dog_data_raw`. Currently, the data looks like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| L1_1| L1_2| L1_3| L1_4| L1_5| L1_6| L1_7| L1_8| L1_9| L1_10| L1_11| L1_12| L1_13| L1_14| L1_15| L1_16| L1_17| L1_18| L1_19| L1_20|\n|---:|----:|----:|----:|----:|----:|----:|----:|----:|----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|\n| 1| 3| 3| 4| 3| 2| 3| 1| 2| 3| 4| 3| 1| 3| 1| 2| 3| 2| 3| 2| 4|\n| 2| 3| 2| 3| 3| 4| 3| 2| 2| 4| 3| 2| 2| 1| 2| 4| 3| 3| 2| 4| 3|\n| 3| 3| 3| 2| 3| 3| 4| 2| 3| 3| 3| 2| 2| 2| 2| 3| 3| 4| 3| 3| 3|\n| 4| 4| 2| 2| 3| 4| 4| 1| 3| 3| 4| 2| 1| 2| 2| 4| 4| 3| 3| 4| 3|\n| 5| 2| 3| 3| 3| 4| 3| 2| 2| 3| 2| 4| 4| 4| 3| 2| 2| 3| 4| 3| 2|\n\n
\n:::\n:::\n\n\nBut we want the data to look like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| Loneliness_pre|\n|---:|--------------:|\n| 1| 2.25|\n| 2| 1.90|\n| 3| 2.25|\n| 4| 1.75|\n| 5| 2.85|\n\n
\n:::\n:::\n\n\nThis task is a bit more challenging compared to last week's lab activity, as the Loneliness scale includes some reverse-coded items.\n\n### Task 1: Open the R project for the lab {.unnumbered}\n\n### Task 2: Open your `.Rmd` file from last week or create a new `.Rmd` file {.unnumbered}\n\nYou could continue the `.Rmd` file you used last week, or create a new `.Rmd`. If you need some guidance, have a look at @sec-rmd.\n\n### Task 3: Load in the library and read in the data {.unnumbered}\n\nThe data should already be in your project folder. If you want a fresh copy, you can download the data again here: [data_pair_ch1](data/data_pair_ch1.zip \"download\").\n\nWe are using the package `tidyverse` today, and the datafile we should read in is `dog_data_raw.csv`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(???)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"???\")\n```\n:::\n\n\n:::\n\n### Task 4: Calculating the mean for `Loneliness_pre` {.unnumbered}\n\n* **Step 1**: Select all relevant columns, such as the participant ID and all 20 items of the `Loneliness` questionnaire completed by participants before the intervention. Store this data in an object called `data_loneliness`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nLook at the codebook. 
Try to figure out\n\n* the variable name of the column in which the participant id is stored, and\n* which items relate to the Loneliness scale at Stage \"pre\"\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* the participant id column is called `RID`\n* The Loneliness items at pre-intervention stage start with `L1_`\n\n:::\n\n:::\n\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily (in steps 3 and 4)\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n`pivot_`\n\nWe also need 3 arguments in that function:\n\n* the columns we want to select (e.g., all the loneliness items),\n* the name of the column in which the current column headings will be stored (e.g., \"Qs\"), and\n* the name of the column that should store all the values (e.g., \"Responses\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\")\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 3**: Reverse-scoring\n\nIdentify the items on the `Loneliness` scale that are reverse-coded, and then reverse-score them accordingly.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nWe need to figure out:\n\n* which are the items of the loneliness scale we need to reverse-score\n* what is the measuring scale of loneliness so we can determine the new values\n* which function to use to create a new column that has the corrected scores in it\n* which one of the `case_` functions will get us there\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* The items to be reverse-coded can be found in the codebook: L1_1, L1_5, L1_6, L1_9, L1_10, L1_15, L1_16, L1_19, L1_20\n* the loneliness scale ranges from 1 to 4, so we need to replace 1 with 4, 2 with 3, 3 with 2, and 4 with 1\n* the function to 
create a new column `mutate()`\n* it's a conditional statement rather than \"just\" replacing values, hence we need `case_when()`\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n mutate(Score_corrected = case_when(\n ??? ~ ???,\n .default = ???\n ))\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 4**: Calculate the average Loneliness score per participant. To match with the table above, we want to call this column `Loneliness_pre`\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\ngrouping and summarising\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n group_by(???) %>% \n summarise(Loneliness_pre = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(tidyverse)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"dog_data_raw.csv\")\n\n# Task 4: Tidying \nloneliness_tidy <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L1\")) %>% # select(RID, L1_1:L1_20) also works\n # Step 2\n pivot_longer(cols = -RID, names_to = \"Qs\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score_corrected = case_when(\n Qs %in% c(\"L1_1\", \"L1_5\", \"L1_6\", \"L1_9\", \"L1_10\", \"L1_15\", \"L1_16\", \"L1_19\", \"L1_20\") ~ 5-Response,\n .default = Response\n )) %>% \n # Step 4\n group_by(RID) %>% \n summarise(Loneliness_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n## [Test your knowledge and challenge yourself]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n### Knowledge check {.unnumbered}\n\n\n#### Question 1 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to recode an existing variable? 
\n\n\n#### Question 2 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to create a new variable based on one or multiple conditional statements? \n\n\n#### Question 3 {.unnumbered}\n\nWhich of the following functions would you use if you wanted to join two data sets by their shared identifier? \n\n\n#### Question 4 {.unnumbered}\n\nYour data object contains a column `Score` with numbers, but they have been read in incorrectly as a character datatype. Which of the following functions would *not* work for fixing this issue? \n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Explain this answer\n\n* `parse_number()` from the `readr` package extracts numeric values from strings, so this would work.\n* `factor(Score)`: This would *not* work as expected because it converts the column into a factor, not a numeric datatype, leading to incorrect results if numeric operations are needed.\n* `mutate(Score = as.numeric(Score))`: This would work too because `mutate()` can be used in combination with `as.numeric()` to create a new numeric column or override the existing character column.\n* `as.numeric()`: This would also work to convert a character column to numeric. Without mutate, you could use it in a BaseR way, e.g., `data$Score <- as.numeric(data$Score)` (*shudder, BaseR!!! But effective*)\n\n:::\n\n\n\n### Challenge yourself {.unnumbered}\n\nIf you want to **challenge yourself** and further apply the skills from Chapter 3, you could wrangle the data from `dog_data_raw` for one of the other questionnaires. 
There are plenty of options to choose from:\n\n::: {.callout-tip collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: easy\n\n* recode column `Live_Pets` so the values read yes and no rather than 1 and 2\n* recode `Year_of_Study` so they have the labels from the codebook rather than the numbers\n* reverse-code the `Homesickness` scale for `_pre` and `_post`\n* renaming the columns of the other one-item scales as `Stress_pre`, `Stress_post`, `Engagement_pre` and `Engagement_post`\n\nAny of these tasks should be doable in one step. No need to select or pivot anything. You could just modify `dog_data_raw`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* For the **recoding tasks**, you need to work out which function to use to recode one value as another - just plain replacing, no conditional statements\n* The **reverse-coding** might sound daunting to do in one step, but it is only a single value that needs to be recoded. Take some inspiration from Activity 5 (error mode).\n* For the **renaming tasks**, check how you would change column names without reducing the number of columns overall\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - easy**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Live_Pets\ndog_data_raw <- dog_data_raw %>%\n mutate(Live_Pets = case_match(Live_Pets,\n 1 ~ \"yes\",\n 2 ~ \"no\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Year of Study\ndog_data_raw <- dog_data_raw %>%\n mutate(Year_of_Study = case_match(Year_of_Study,\n 1 ~ \"First\",\n 2 ~ \"Second\",\n 3 ~ \"Third\",\n 4 ~ \"Fourth\",\n 5 ~ \"Fifth or above\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Reverse-coding of homesickness pre and post. 
It's a 5-point scale, hence you'd calculate 6-the original response column\ndog_data_raw <- dog_data_raw %>% \n mutate(Homesick_pre = 6-HO1_1,\n Homesick_post = 6-HO2_1)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Renaming of Stress and Engagement\ndog_data_raw <- dog_data_raw %>% \n rename(Stress_pre = S1_1, Stress_post = S2_1, Engagement_pre = HO1_2, Engagement_post = HO2_2)\n```\n:::\n\n:::\n:::\n\n::: {.callout-warning collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: medium\n\n* reverse-code the Social connectedness scale (pre-intervention) and compute a mean score per participant\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\nThis task would take 4 steps to complete. These are the exact same steps we applied to `Loneliness_pre` in the lab activity. You would just need to figure out which items are related to the Social connectedness scale (pre-intervention) and which ones of those are reverse-coded. The codebook has all the answers.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - medium**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS pre\nscs_pre <- dog_data_raw %>% \n select(RID, starts_with(\"SC1\")) %>% \n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n mutate(Score_corrected = case_when(\n Names %in% c(\"SC1_3\", \"SC1_6\", \"SC1_7\", \"SC1_9\", \"SC1_11\", \"SC1_13\", \"SC1_15\", \"SC1_17\", \"SC1_18\", \"SC1_20\") ~ 7-Response,\n .default = Response\n )) %>% \n group_by(RID) %>% \n summarise(SCS_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: hard\n\n* reverse-code the Loneliness scale (post-intervention) and compute a mean score per participant\n* reverse-code the Social connectedness scale (post-intervention) and compute a mean score per participant\n\nBoth 
activities are similar to Activity 3 from the individual walkthrough and would take about 5 steps to complete. **Start by mapping out the steps**.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* **Step 1**: Select all relevant columns, such as participant ID and all the items that belong to the questionnaire that participants completed after the intervention\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily\n* **Step 3**: Recode the initial responses so that the new column has numbers instead of labels\n* **Step 4**: Identify the items that are labelled as \"Reverse\" in the codebook and then reverse-score them\n* **Step 5**: Group by and summarise to calculate the mean Score\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## loneliness post\nlonely_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score = case_match(Response,\n \"never\" ~ 1,\n \"rarely\" ~ 2,\n \"sometimes\" ~ 3,\n \"often\" ~ 4,\n .default = NA\n ),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"L2_1\", \"L2_5\", \"L2_6\", \"L2_9\", \"L2_10\", \"L2_15\", \"L2_16\", \"L2_19\", \"L2_20\") ~ 5-Score,\n .default = Score\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(Loneliness_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS post\nscs_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"SC2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Response = case_match(Response,\n 
\"strongly disagree\" ~ \"1\",\n \"strongly agree\" ~ \"6\",\n .default = Response),\n Response = parse_number(Response),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"SC2_3\", \"SC2_6\", \"SC2_7\", \"SC2_9\", \"SC2_11\", \"SC2_13\", \"SC2_15\", \"SC2_17\", \"SC2_18\", \"SC2_20\") ~ 7-Response,\n .default = Response\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(SCS_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-important collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: extra hard\n\n* PANAS: positive and negative affect of pre- and post-intervention in a single pipe rather than in 4 different data objects (see last week's)\n\nThis task would take about 7 steps to get it from\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| PN1_1| PN1_2| PN1_3| PN1_4| PN1_5| PN1_6| PN1_7| PN1_8| PN1_9| PN1_10| PN2_1| PN2_2| PN2_3| PN2_4| PN2_5| PN2_6| PN2_7| PN2_8| PN2_9| PN2_10|\n|---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|\n| 1| 1| 1| 1| 1| 4| 1| 4| 3| 1| 4| 2| 1| 3| 1| 4| 1| 4| 4| 1| 4|\n| 2| 1| 2| 3| 2| 1| 3| 3| 4| 1| 4| 1| 1| 2| 1| 3| 1| 3| 4| 1| 4|\n| 3| 1| 1| 3| 1| 2| 4| 4| 3| 1| 2| 2| 2| 3| 1| 3| 2| 4| 3| 1| 2|\n| 4| 1| 1| 5| 1| 4| 3| 5| 5| 3| 2| 1| 1| 5| 1| 4| 3| 4| 4| 2| 2|\n| 5| 2| 3| 5| 2| 3| 2| 3| 4| 2| 2| 1| 2| 5| 2| 3| 2| 4| 5| 1| 3|\n\n
\n:::\n:::\n\n\nto\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID|Stage | PANAS_NA| PANAS_PA|\n|---:|:-----|--------:|--------:|\n| 1|post | 1.2| 3.8|\n| 1|pre | 1.0| 3.2|\n| 2|post | 1.0| 3.2|\n| 2|pre | 1.8| 3.0|\n| 3|post | 1.6| 3.0|\n\n
\n:::\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n**Start by mapping out the steps**\n\n* **Step 1**: select all relevant columns, such as participant ID and all the items that belong to the PANAS scale (pos, neg, pre, and post)\n* **Step 2**: pivot the data from wide format to long format. You want to do that for ALL columns that are not the participant id. The data object should have 3 columns and 5680 observations, i.e. each participant has 20 rows.\n* **Step 3**: All of the items will have the structure `PN1_1`. Use `separate()` to split the information across 2 columns. First column has information about the `Stage`, second column should turn into an `Item_number` and it should convert into a numeric column in the process to save you typing quotation marks in Step 5.\n* **Step 4**: recode the `Stage` column you just created so that everything that starts with PN1 is labelled \"pre\" and everything that starts with PN2 is labelled \"post\".\n* **Step 5**: identify the subscales positive affect (PA) and negative affect (NA) by item number and recode them. This requires a conditional statement.\n* **Step 6**: group by and summarise to calculate the mean Score\n* **Step 7**: pivot, so that you have the 2 PANAS subscales presented in separate columns (see table above). 
You might need an extra step if the columns aren't labelled exactly as shown in the table above.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - extra hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nPANAS <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"PN\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Items\", values_to = \"Scores\") %>% \n # Step 3\n separate(Items, into = c(\"Stage\", \"Item_number\"), sep = \"_\", convert = TRUE) %>% \n # Step 4 recode Stage\n mutate(Stage = case_match(Stage,\n \"PN1\" ~ \"pre\",\n \"PN2\" ~ \"post\")) %>% \n # Step 5 identify subscales by item number\n mutate(Subscales = case_when(\n Item_number %in% c(3, 5, 7, 8, 10) ~ \"PANAS_PA\",\n .default = \"PANAS_NA\"\n )) %>% \n # Step 6 \n group_by(RID, Stage, Subscales) %>% \n summarise(Score = mean(Scores)) %>% \n ungroup() %>% \n # Step 7 - to make the data look like the data in `dog_data_clean_long.csv`\n pivot_wider(names_from = Subscales, values_from = Score)\n```\n:::\n\n:::\n:::\n", + "markdown": "# Data wrangling II {#sec-wrangling2}\n\n## Intended Learning Outcomes {.unnumbered}\n\nBy the end of this chapter, you should be able to:\n\n- apply familiar data wrangling functions to novel datasets\n- read and interpret error messages\n- realise there are several ways of getting to the results\n\nIn this chapter, we will pick up where we left off in @sec-wrangling. We will calculate average scores for two of the questionnaires, address an error mode problem, and finally, join all data objects together. 
This will finalise our data for the upcoming data visualization sections (@sec-dataviz and @sec-dataviz2).\n\n\n## [Individual Walkthrough]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n## Activity 1: Setup\n\n* Go to the project folder we have been using in the last two weeks and double-click on the project icon to **open the project** in RStudio\n* Either **Create a new `.Rmd` file** for chapter 3 and save it to your project folder or continue the one from last week. See @sec-rmd if you need some guidance.\n\n\n\n## Activity 2: Load in the libraries and read in the data\n\nToday, we will be using `tidyverse` along with the two csv files created at the end of the last chapter: `data_prp_for_ch3.csv` and `qrp_t1.csv`. If you need to download them again for any reason, click on the following links: [data_prp_for_ch3.csv](data/data_prp_for_ch3.csv \"download\") and [qrp_t1.csv](data/qrp_t1.csv \"download\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(???)\ndata_prp <- read_csv(\"???\")\nqrp_t1 <- read_csv(\"???\")\n```\n:::\n\n\n\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidyverse)\ndata_prp <- read_csv(\"data_prp_for_ch3.csv\")\nqrp_t1 <- read_csv(\"qrp_t1.csv\")\n```\n:::\n\n\n:::\n\nIf you need a quick reminder of what the dataset was about, have a look at the abstract in @sec-download_data_ch1. We also addressed the changes we made to the dataset there.\n\nAnd remember to have a quick `glimpse()` at your data.\n\n\n\n## Activity 3: Confidence in understanding Open Science practices\n\n#### The main goal is to compute the mean Understanding score per participant. 
{.unnumbered}\n\nThe mean Understanding score for time point 2 has already been calculated (in the `Time2_Understanding_OS` column), but we still need to compute it for time point 1.\n\nLooking at the Understanding data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nThe steps are quite similar to those for QRP, but we need to add an extra step: converting the character labels into numbers.\n\nAgain, let's do this step by step:\n\n* **Step 1**: Select the relevant columns `Code`, and every Understanding column from time point 1 (e.g., from `Understanding_OS_1_Time1` to `Understanding_OS_12_Time1`) and store them in an object called `understanding_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: Recode the values \"Not at all confident\" as 1 and \"Entirely confident\" as 7. All other values are already numbers. We can use the function `mutate()` in combination with `case_match()` for that\n* **Step 4**: Calculate the average Understanding score (`Time1_Understanding_OS`) per participant using `group_by()` and `summarise()`\n\n#### Steps 1 and 2: Select and pivot {.unnumbered}\n\nHow about you try the first 2 steps yourself using the code from Chapter 2 Activity 4 (@sec-ch2_act4) as a template?\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n select(???) 
%>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2 - I picked different column labels this time for some variety\n pivot_longer(cols = Understanding_OS_1_Time1:Understanding_OS_12_Time1, names_to = \"Understanding_Qs\", values_to = \"Responses\") \n```\n:::\n\n\n:::\n\n#### Step 3: recoding the values {.unnumbered}\n\nOK, we now want to recode the values in the `Responses` column (or whatever name you picked for your column that has some of the numbers in it) so that \"Not at all confident\" = 1 and \"Entirely confident\" = 7. We want to keep all other values as they are (2-6 look already quite \"numeric\").\n\nLet's create a new column `Responses_corrected` that stores the new values with `mutate()`. Then we can combine that with the `case_match()` function.\n\n* The first argument in `case_match()` is the column name of the variable you want to recode.\n* Then you can start recoding the values in the way of `CurrentValue ~ NewValue` (~ is a tilde). Make sure you use the `~` and not `=`!\n* Lastly, the `.default` argument tells R what to do with values that are neither \"Not at all confident\" nor \"Entirely confident\". Here, we want to replace them with the original value of the `Responses` column. 
In other datasets, you may want to set the default to `NA` for missing values, a character string or a number, and `case_match()` is happy to oblige.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = Responses # all other values taken from column Responses\n ))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `mutate()`:\nℹ In argument: `Responses_corrected = case_match(...)`.\nCaused by error in `case_match()`:\n! Can't combine `..1 (right)` and `.default` .\n```\n:::\n:::\n\n\n::: {.callout-important collapse=\"true\"}\n\n## Error!!! Can you explain what is happening here?\n\nHave a look at the error message. It's pretty helpful this time. It says `Can't combine ..1 (right) and .default .` It means that the replacement values are expected to be data type character since the original column type was type character.\n\n:::\n\n**So how do we fix this?** Actually, there are several ways this could be done. Click on the tabs below to check out 3 possible solutions.\n\n::: {.panel-tabset group=\"layers\"}\n\n## Fix option 1\n\nOne option is to modify the `.default` argument `Responses` so that the values are copied over from the original column but as a number rather than the original character value. 
The function `as.numeric()` does the conversion.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n ))\n```\n:::\n\n\n## Fix option 2\n\nChange the numeric values on the right side of the `~` to character. Then in a second step, we would need to turn the character column into a numeric type. Again, we have several options to do so. We could either use the `parse_number()` function we encountered earlier during the demographics wrangling or the `as.numeric()` function.\n\n* V1: `Responses_corrected = parse_number(Responses_corrected)`\n* V2: `Responses_corrected = as.numeric(Responses_corrected)`\n\nJust pay attention that you are still working *within* the `mutate()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ \"1\",\n \"Entirely confident\" ~ \"7\",\n .default = Responses # all other values taken from column Responses (character)\n ),\n Responses_corrected = parse_number(Responses_corrected)) # turning Responses_corrected into a numeric column\n```\n:::\n\n\n\n## Fix option 3\n\nIf you recode all the labels into numbers (e.g., \"2\" into 2, \"3\" into 3, etc.) 
from the start, you won’t need to perform any additional conversions later.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v3 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # recode all of them\n \"2\" ~ 2,\n \"3\" ~ 3,\n \"4\" ~ 4,\n \"5\" ~ 5,\n \"6\" ~ 6,\n \"Entirely confident\" ~ 7))\n```\n:::\n\n\n:::\n\n::: {.callout-note icon=\"false\"}\n\n## Your Turn\n\nChoose the option that works best for you to modify the code above that didn't work. You should now be able to calculate the **mean Understanding Score per participant**. Store the average scores in a variable called `Time1_Understanding_OS`. If you need help, refer to the hint below or use Chapter 2 Activity 4 (@sec-ch2_act4) as guidance.\n\n::: {.callout-caution icon=\"false\" collapse=\"true\"}\n\n## One solution for Steps 3 and 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4: calculating averages per participant\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\nOf course, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 3\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Understanding_Qs\", values_to = \"Responses\") %>% \n # Step 3\n mutate(Responses_corrected = 
case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n## Activity 4: Survey of Attitudes Toward Statistics (SATS-28)\n\n#### The main goal is to compute the mean SATS-28 score for each of the 4 subscales per participant for time point 1. {.unnumbered}\n\nLooking at the SATS data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n* Additionally, we are looking to compute the means for the 4 different subscales of the SAT-28 which are , , , and .\n\nThis scenario is slightly more tricky than the previous ones due to the reverse-coding and the 4 subscales. So, let's tackle this step by step again:\n\n* **Step 1**: Select the relevant columns `Code`, and every SATS28 column from time point 1 (e.g., from `SATS28_1_Affect_Time1` to `SATS28_28_Difficulty_Time1`) and store them in an object called `sats_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: We need to know which items belong to which subscale - fortunately, we have that information in the variable name and can use the `separate()` function to access it.\n* **Step 4**: We need to know which items are reverse-coded and then reverse-score them - unfortunately, the info is only in the codebook and we need to find a work-around. 
`case_when()` can help identify and re-score the reverse-coded items.\n* **Step 5**: Calculate the average SATS score per participant and subscale using `group_by()` and `summarise()`\n* **Step 6**: use `pivot_wider()` to spread out the dataframe into wide format and `rename()` to tidy up the column names\n\n#### Steps 1 and 2: select and pivot {.unnumbered}\n\nThe selecting and pivoting are exactly the same way as we already practiced in the other 2 questionnaires. Apply them here to this questionnaire.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(???) %>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% # Step 1\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") # Step 2\n```\n:::\n\n\n:::\n\n:::\n\n#### Step 3: separate Subscale information {.unnumbered}\n\nIf you look at the `Items` column more closely, you can see that there is information on the `Questionnaire`, the `Item_number`, the `Subscale`, and the `Timepoint` the data was collected at.\n\nWe can separate the information into separate columns using the `separate()` function. The function's first argument is the column to separate, then define `into` which columns you want the original column to split up, and lastly, define the separator `sep` (here an underscore). 
For our example, we would write:\n\n* V1: `separate(Items, into = c(\"SATS\", \"Item_number\", \"Subscale\", \"Time\"), sep = \"_\")`\n\nHowever, we don't need all of those columns, so we could just drop the ones we are not interested in by replacing them with `NA`.\n\n* V2: `separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\")`\n\nWe might also add an extra argument of `convert = TRUE` to have numeric columns (i.e., `Item_number`) converted to numeric as opposed to keeping them as characters. Saves us typing a few quotation marks later in Step 4.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE)\n```\n:::\n\n\n#### Step 4: identifying reverse-coded items and then correct them {.unnumbered}\n\nWe can use `case_when()` within the `mutate()` function here to create a new column `FW_RV` that stores information on whether the item is a reverse-coded item or not.\n\n`case_when()` works similarly to `case_match()`, however `case_match()` only allows us to \"recode\" values (i.e., replace one value with another), whereas `case_when()` is more flexible. It allows us to use **conditional statements** on the left side of the tilde which is useful when you want to change only *some* of the data based on specific conditions.\n\nLooking at the codebook, it seems that items 2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, and 28 are reverse-coded. The rest are forward-coded.\n\nWe want to tell R now, that\n\n* **if** the `Item_number` is any of those numbers listed above, R should write \"Reverse\" into the new column `FW_RV` we are creating. Since we have a few possible matches for `Item_number`, we need the Boolean expression `%in%` rather than `==`.\n* **if** `Item_number` is none of those numbers, then we would like the word \"Forward\" in the `FW_RV` column to appear. 
We can achieve that by specifying a `.default` argument again, but this time we want a \"word\" rather than a value from another column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ))\n```\n:::\n\n\nMoving on to correcting the scores: Once again, we can use `case_when()` within the `mutate()` function to create another **conditional statement**. This time, the conditions are:\n\n* **if** the `FW_RV` column has a value of \"Reverse\" then we would like to turn all 1 into 7, 2 into 6, etc.\n* **if** the `FW_RV` column has a value of \"Forward\" then we would like to keep the score from the `Response` column\n\nThere is a quick way and a not-so-quick way to achieve the actual **reverse-coding**.\n\n* **Option 1 (quick)**: The easiest way to reverse-code scores is to take the maximum value of the scale, add 1 unit, and subtract the original value. For example, on a 5-point Likert scale, it would be 6 minus the original rating; for a 7-point Likert scale, 8 minus the original rating, etc. (see *Option 1* tab).\n* **Option 2 (not so quick)**: This involves using two conditional statements (see *Option 2* tab).\n\nUse the one you find more intuitive.\n\n::: panel-tabset\n\n## Option 1\n\nHere we are using a Boolean expression to check if the string \"Reverse\" is present in the `FW_RV` column. If this condition is `TRUE`, the value in the new column we're creating, `Scores_corrected`, will be calculated as 8 minus the value from the `Response` column. 
If the condition is FALSE (handled by the .default argument), the original values from the `Response` column will be retained.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n ))\n```\n:::\n\n\n## Option 2\n\nAs stated above, the longer approach involves using two conditional statements. The first condition checks if the value in the `FW_RV` column is \"Reverse\", while the second condition checks if the value in the `Response` column equals a specific number. **When both conditions are met**, the corresponding value on the right side of the tilde is placed in the newly created `Scores_corrected_v2` column.\n\nFor example, line 3 would read: if the value in the `FW_RV` column is \"Reverse\" **AND** the value in the `Response` column is 1, then assign a value of 7 to the `Scores_corrected_v2` column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected_v2 = case_when(\n FW_RV == \"Reverse\" & Response == 1 ~ 7,\n FW_RV == \"Reverse\" & Response == 2 ~ 6,\n FW_RV == \"Reverse\" & Response == 3 ~ 5,\n # no need to recode 4 as 4\n FW_RV == \"Reverse\" & Response == 5 ~ 3,\n FW_RV == \"Reverse\" & Response == 6 ~ 2,\n FW_RV == \"Reverse\" & Response == 7 ~ 1,\n .default = Response\n ))\n```\n:::\n\n\nAs you can see now in `sats_t1`, both columns `Scores_corrected` and `Scores_corrected_v2` are identical.\n\n:::\n\nOne way to **check whether our reverse-coding worked** is by examining the `distinct` values in the original `Response` column and comparing them with the `Scores_corrected`. We should also retain the `FW_RV` column to observe how the reverse-coding applied.\n\nTo see the patterns more clearly, we can use `arrange()` to sort the values in a meaningful order. 
Remember, the default sorting order is ascending, so if you want to sort values in descending order, you’ll need to wrap your variable in the `desc()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding <- sats_t1 %>% \n distinct(FW_RV, Response, Scores_corrected) %>% \n arrange(desc(FW_RV), Response)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show `check_coding` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding\n```\n\n::: {.cell-output-display}\n
\n\n|FW_RV | Response| Scores_corrected|\n|:-------|--------:|----------------:|\n|Reverse | 1| 7|\n|Reverse | 2| 6|\n|Reverse | 3| 5|\n|Reverse | 4| 4|\n|Reverse | 5| 3|\n|Reverse | 6| 2|\n|Reverse | 7| 1|\n|Forward | 1| 1|\n|Forward | 2| 2|\n|Forward | 3| 3|\n|Forward | 4| 4|\n|Forward | 5| 5|\n|Forward | 6| 6|\n|Forward | 7| 7|\n\n
\n:::\n:::\n\n\n:::\n\n#### Step 5 {.unnumbered}\n\nNow that we know everything worked out as intended, we can calculate the mean scores of each subscale for each participant in `sats_t1`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(???, ???) %>% \n summarise(mean_score = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`summarise()` has grouped output by 'Code'. You can override using the\n`.groups` argument.\n```\n:::\n:::\n\n\n:::\n\n:::\n\n#### Step 6 {.unnumbered}\n\nThe final step is to transform the data back into wide format, ensuring that each subscale has its own column. This will make it easier to join the data objects later on. In `pivot_wider()`, the first argument, `names_from`, specifies the column you want to use for your new column headings. The second argument, `values_from`, tells R which column should provide the cell values.\n\nWe should also **rename the column names** to match those in the codebook. 
Conveniently, we can use a function called `rename()` that works exactly like `select()` (following the pattern `new_name = old_name`), but it keeps all other column names the same rather than reducing the number of columns.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show final `sats_t1` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nhead(sats_t1, n = 5)\n```\n\n::: {.cell-output-display}\n
\n\n|Code | SATS28_Affect_Time1_mean| SATS28_CognitiveCompetence_Time1_mean| SATS28_Difficulty_Time1_mean| SATS28_Value_Time1_mean|\n|:----|------------------------:|-------------------------------------:|----------------------------:|-----------------------:|\n|AD03 | 2.333333| 3.833333| 3.428571| 5.555556|\n|AD05 | 3.500000| 5.000000| 2.142857| 4.777778|\n|Ab01 | 5.166667| 5.666667| 4.142857| 5.444444|\n|Al05 | 2.166667| 2.666667| 2.857143| 3.777778|\n|Am05 | 4.166667| 5.666667| 5.571429| 4.888889|\n\n
\n:::\n:::\n\n\n:::\n\nAgain, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n # Step 1\n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE) %>% \n # step 4\n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ),\n Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n )) %>% \n # step 5\n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup() %>% \n # step 6\n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n:::\n\n\n\n## Activity 5 (Error Mode): Perceptions of supervisory support\n\n#### The main goal is to compute the mean score for perceived supervisory support per participant. {.unnumbered}\n\nLooking at the supervisory support data, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nI have outlined my steps as follows:\n\n* **Step 1**: Reverse-code the single column first because that's less hassle than having to do that with conditional statements (`Supervisor_15_R`). `mutate()` is my friend.\n* **Step 2**: I want to filter out everyone who failed the attention check in `Supervisor_7`. I can do this with a Boolean expression within the `filter()` function. 
The correct response was \"completely disagree\" which is 1.\n* **Step 3**: Select their id from time point 2 and all the columns that start with the word \"super\", apart from `Supervisor_7` and the original `Supervisor_15_R` column\n* **Step 4**: pivot into long format so I can calculate the averages better\n* **Step 5**: calculate the average scores per participant\n\nI've started coding but there are some errors in my code. Help me find and fix all of them. Try to go through the code line by line and read the error messages.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_ppr %>% \n mutate(Supervisor_15 = 9-supervisor_15_R) %>% \n filter(Supervisor_7 = 1) %>% \n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) \npivot_wider(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% \n group_by(Time2_Code) %>% \n summarise(Mean_Supervisor_Support = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## How many mistakes am I supposed to find?\n\nThere are 8 mistakes in the code.\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Reveal solution\n\nDid you spot all 8 mistakes? 
Let's go through them line by line.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_prp %>% # spelling mistake in data object\n mutate(Supervisor_15 = 8-Supervisor_15_R) %>% # semantic error: 8 minus response for a 7-point scale and supervisor_15_R needs a capital S\n filter(Supervisor_7 == 1) %>% # needs a Boolean expression == instead of =\n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) %>% # no pipe at the end, the rest is actually legit\n pivot_longer(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% # pivot_longer instead of pivot_wider\n group_by(Code) %>% # Code rather than Time2_Code - the reduced dataset does not contain Time2_Code\n summarise(Mean_Supervisor_Support = mean(Response, na.rm = TRUE)) %>% # Score_corrected doesn't exist; needs to be Response\n ungroup()\n```\n:::\n\n\n* Note that the **semantic error** in line 2 will not give you an error message.\n* Were you thrown off by the `starts_with(\"Super\")` expression in line 4? `starts_with()` and `ends_with()` are great alternatives to selecting columns via `:` But, using `select(Code, Supervisor_1:Supervisor_6, Supervisor_8:Supervisor_14)` would have given us the same result. *[I admit, that one was perhaps a bit mean]*\n\n:::\n\n## Activity 6: Join everything together with `???_join()`\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: `inner_join()`, `left_join()`, `right_join()`, and `full_join()`. Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n::: {.callout-note icon=\"false\"}\n\n## Info on mutating joins\n\nYou have 4 types of join functions you could make use of. Click on the panels to know more\n\n::: panel-tabset\n\nA mutating join allows you to combine variables from two tables. 
It first matches observations by their keys, then copies across variables from one table to the other.\n\n## `inner_join()`\n\n`inner_join()` returns only the rows where the values in the column specified in the `by =` statement match in both tables.\n\n![inner_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/inner-join.gif)\n\n## `left_join()`\n\n`left_join()` retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the `by =` statement. Rows in the left table with no match in the right table will have missing values (`NA`) in the new columns.\n\n![left_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/left-join.gif)\n\n## `right_join()`\n\n`right_join()` retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the `by =` statement. Rows in the right table with no match in the left table will have missing values (`NA`) in the new columns.\n\n![right_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/right-join.gif)\n\n## `full_join()`\n\n`full_join()` returns all rows and all columns from both tables. `NA` values fill unmatched rows.\n\n![full_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/full-join.gif)\n\n:::\n\n:::\n\nFrom our original `data_prp`, we need to select demographics data and all summarised questionnaire data from time point 2. Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the `Global Environment`.\n\nWhile you may be familiar with `inner_join()` from last year, for this task, we want to retain all data from all the data objects. 
Therefore, we will use `full_join()`. Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\n\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use `select()` at the end to order them better.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndata_prp_final <- data_prp %>% \n select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% \n full_join(qrp_t1) %>% \n full_join(understanding_t1) %>% \n full_join(sats_t1) %>% \n full_join(super) %>% \n select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n```\n:::\n\n\n\n::: {.callout-important icon=\"false\"}\n## No `by` argument in the code above? \n\nNote how I didn't include a `by` argument in the code above. If you leave `by =` out, R will join the 2 data objects by **ALL** columns that have the same name.\n\n**Special case 1: matching column names but different values**\n\nIf you want more control, you should include the `by` argument; for example, if both data objects include a column `age` but data was recorded at 2 different time points. In that case, the information from both `age` columns should be retained and the `by` argument would not include `age`.\n\n**Special case 2: different column names but matching values**\n\nAnother special case presents when both data objects contain identical information but the variable names don't match. Let's say, both data objects contain gender information, but in one data object the variable is named `gender` and in the other one `gender_label`. 
In that case, your `by` argument needs to be modified as: `by = join_by(gender == gender_label)`.\n\nMore info on joins can be found at [https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/](https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/){target=\"_blank\"}\n:::\n\nAnd this is basically the dataset we need for @sec-dataviz and @sec-dataviz2.\n\n\n\n## Activity 7: Knit and export\n\nKnit the `.Rmd` file to ensure everything runs as expected. Once it does, export the data object `data_prp_final` as a csv for use in @sec-dataviz. Name it something meaningful, something like `data_prp_for_ch4.csv`.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nwrite_csv(data_prp_final, \"data_prp_for_ch4.csv\")\n```\n:::\n\n\n:::\n\n\n## [Pair-coding]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\nWe will once again be working with data from Binfet et al. (2021), which focuses on the randomised controlled trials data involving therapy dog interventions. Today, our goal is to calculate the average `Loneliness` score for each participant measured at time point 1 (pre-intervention) using the raw data file `dog_data_raw`. Currently, the data looks like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| L1_1| L1_2| L1_3| L1_4| L1_5| L1_6| L1_7| L1_8| L1_9| L1_10| L1_11| L1_12| L1_13| L1_14| L1_15| L1_16| L1_17| L1_18| L1_19| L1_20|\n|---:|----:|----:|----:|----:|----:|----:|----:|----:|----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|\n| 1| 3| 3| 4| 3| 2| 3| 1| 2| 3| 4| 3| 1| 3| 1| 2| 3| 2| 3| 2| 4|\n| 2| 3| 2| 3| 3| 4| 3| 2| 2| 4| 3| 2| 2| 1| 2| 4| 3| 3| 2| 4| 3|\n| 3| 3| 3| 2| 3| 3| 4| 2| 3| 3| 3| 2| 2| 2| 2| 3| 3| 4| 3| 3| 3|\n| 4| 4| 2| 2| 3| 4| 4| 1| 3| 3| 4| 2| 1| 2| 2| 4| 4| 3| 3| 4| 3|\n| 5| 2| 3| 3| 3| 4| 3| 2| 2| 3| 2| 4| 4| 4| 3| 2| 2| 3| 4| 3| 2|\n\n
\n:::\n:::\n\n\nBut we want the data to look like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| Loneliness_pre|\n|---:|--------------:|\n| 1| 2.25|\n| 2| 1.90|\n| 3| 2.25|\n| 4| 1.75|\n| 5| 2.85|\n\n
\n:::\n:::\n\n\nThis task is a bit more challenging compared to last week's lab activity, as the Loneliness scale includes some reverse-coded items.\n\n### Task 1: Open the R project for the lab {.unnumbered}\n\n### Task 2: Open your `.Rmd` file from last week or create a new `.Rmd` file {.unnumbered}\n\nYou could continue the `.Rmd` file you used last week, or create a new `.Rmd`. If you need some guidance, have a look at @sec-rmd.\n\n### Task 3: Load in the library and read in the data {.unnumbered}\n\nThe data should already be in your project folder. If you want a fresh copy, you can download the data again here: [data_pair_ch1](data/data_pair_ch1.zip \"download\").\n\nWe are using the package `tidyverse` today, and the datafile we should read in is `dog_data_raw.csv`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(???)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"???\")\n```\n:::\n\n\n:::\n\n### Task 4: Calculating the mean for `Loneliness_pre` {.unnumbered}\n\n* **Step 1**: Select all relevant columns, such as the participant ID and all 20 items of the `Loneliness` questionnaire completed by participants before the intervention. Store this data in an object called `data_loneliness`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nLook at the codebook. 
Try to figure out\n\n* the variable name of the column in which the participant id is stored, and\n* which items relate to the Loneliness scale at Stage \"pre\"\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* the participant id column is called `RID`\n* The Loneliness items at pre-intervention stage start with `L1_`\n\n:::\n\n:::\n\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily (in step 3)\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n`pivot_`\n\nWe also need 3 arguments in that function:\n\n* the columns we want to select (e.g., all the loneliness items),\n* the name of the column in which the current column headings will be stored (e.g., \"Qs\"), and\n* the name of the column that should store all the values (e.g., \"Responses\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\")\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 3**: Reverse-scoring\n\nIdentify the items on the `Loneliness` scale that are reverse-coded, and then reverse-score them accordingly.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nWe need to figure out:\n\n* which are the items of the loneliness scale we need to reverse-score\n* what is the measuring scale of loneliness so we can determine the new values\n* which function to use to create a new column that has the corrected scores in it\n* which one of the `case_` functions will get us there\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* The items to be reverse-coded can be found in the codebook: L1_1, L1_5, L1_6, L1_9, L1_10, L1_15, L1_16, L1_19, L1_20\n* the loneliness scale ranges from 1 to 4, so we need to replace 1 with 4, 2 with 3, 3 with 2, and 4 with 1\n* the function to 
create a new column `mutate()`\n* it's a conditional statement rather than \"just\" replacing values, hence we need `case_when()`\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n mutate(Score_corrected = case_when(\n ??? ~ ???,\n .default = ???\n ))\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 4**: Calculate the average Loneliness score per participant. To match with the table above, we want to call this column `Loneliness_pre`\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\ngrouping and summarising\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n group_by(???) %>% \n summarise(Loneliness_pre = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(tidyverse)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"dog_data_raw.csv\")\n\n# Task 4: Tidying \nloneliness_tidy <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L1\")) %>% # select(RID, L1_1:L1_20) also works\n # Step 2\n pivot_longer(cols = -RID, names_to = \"Qs\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score_corrected = case_when(\n Qs %in% c(\"L1_1\", \"L1_5\", \"L1_6\", \"L1_9\", \"L1_10\", \"L1_15\", \"L1_16\", \"L1_19\", \"L1_20\") ~ 5-Response,\n .default = Response\n )) %>% \n # Step 4\n group_by(RID) %>% \n summarise(Loneliness_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n## [Test your knowledge and challenge yourself]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n### Knowledge check {.unnumbered}\n\n\n#### Question 1 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to recode an existing variable? 
\n\n\n#### Question 2 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to create a new variable based on one or multiple conditional statements? \n\n\n#### Question 3 {.unnumbered}\n\nWhich of the following functions would you use if you wanted to join two data sets by their shared identifier? \n\n\n#### Question 4 {.unnumbered}\n\nYour data object contains a column `Score` with numbers, but they have been read in incorrectly as a character datatype. Which of the following functions would *not* work for fixing this issue? \n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Explain this answer\n\n* `parse_number()` from the `readr` package extracts numeric values from strings, so this would work.\n* `factor(Score)`: This would *not* work as expected because it converts the column into a factor, not a numeric datatype, leading to incorrect results if numeric operations are needed.\n* `mutate(Score = as.numeric(Score))`: This would work too because `mutate()` can be used in combination with `as.numeric()` to create a new numeric column or override the existing character column.\n* `as.numeric()`: This would also work to convert a character column to numeric. Without mutate, you could use it in a BaseR way, e.g., `data$Score <- as.numeric(data$Score)` (*shudder, BaseR!!! But effective*)\n\n:::\n\n\n\n### Challenge yourself {.unnumbered}\n\nIf you want to **challenge yourself** and further apply the skills from Chapter 3, you could wrangle the data from `dog_data_raw` for one of the other questionnaires. 
There are plenty of options to choose from:\n\n::: {.callout-tip collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: easy\n\n* recode column `Live_Pets` so the values read yes and no rather than 1 and 2\n* recode `Year_of_Study` so they have the labels from the codebook rather than the numbers\n* reverse-code the `Homesickness` scale for `_pre` and `_post`\n* renaming the columns of the other one-item scales as `Stress_pre`, `Stress_post`, `Engagement_pre` and `Engagement_post`\n\nAny of these tasks should be doable in one step. No need to select or pivot anything. You could just modify `dog_data_raw`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* For the **recoding tasks**, you need to work out which function to use to recode one value as another - just plain replacing, no conditional statements\n* The **reverse-coding** might sound daunting to do in one step, but it is only a single value that needs to be recoded. Take some inspiration from Activity 5 (error mode).\n* For the **renaming tasks**, check how you would change column names without reducing the number of columns overall\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - easy**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Live_Pets\ndog_data_raw <- dog_data_raw %>%\n mutate(Live_Pets = case_match(Live_Pets,\n 1 ~ \"yes\",\n 2 ~ \"no\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Year of Study\ndog_data_raw <- dog_data_raw %>%\n mutate(Year_of_Study = case_match(Year_of_Study,\n 1 ~ \"First\",\n 2 ~ \"Second\",\n 3 ~ \"Third\",\n 4 ~ \"Fourth\",\n 5 ~ \"Fifth or above\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Reverse-coding of homesickness pre and post. 
It's a 5-point scale, hence you'd calculate 6-the original response column\ndog_data_raw <- dog_data_raw %>% \n mutate(Homesick_pre = 6-HO1_1,\n Homesick_post = 6-HO2_1)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Renaming of Stress and Engagement\ndog_data_raw <- dog_data_raw %>% \n rename(Stress_pre = S1_1, Stress_post = S2_1, Engagement_pre = HO1_2, Engagement_post = HO2_2)\n```\n:::\n\n:::\n:::\n\n::: {.callout-warning collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: medium\n\n* reverse-code the Social connectedness scale (pre-intervention) and compute a mean score per participant\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\nThis task would take 4 steps to complete. These are the exact same steps we applied to `Loneliness_pre` in the lab activity. You would just need to figure out which items are related to the Social connectedness scale (pre-intervention) and which ones of those are reverse-coded. The codebook has all the answers.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - medium**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS pre\nscs_pre <- dog_data_raw %>% \n select(RID, starts_with(\"SC1\")) %>% \n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n mutate(Score_corrected = case_when(\n Names %in% c(\"SC1_3\", \"SC1_6\", \"SC1_7\", \"SC1_9\", \"SC1_11\", \"SC1_13\", \"SC1_15\", \"SC1_17\", \"SC1_18\", \"SC1_20\") ~ 7-Response,\n .default = Response\n )) %>% \n group_by(RID) %>% \n summarise(SCS_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: hard\n\n* reverse-code the Loneliness scale (post-intervention) and compute a mean score per participant\n* reverse-code the Social connectedness scale (post-intervention) and compute a mean score per participant\n\nBoth 
activities are similar to Activity 3 from the individual walkthrough and would take about 5 steps to complete. **Start by mapping out the steps**.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* **Step 1**: Select all relevant columns, such as participant ID and all the items that belong to the questionnaire that participants completed after the intervention\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily\n* **Step 3**: Recode the initial responses so that the new column has numbers instead of labels\n* **Step 4**: Identify the items that are labelled as \"Reverse\" in the codebook and then reverse-score them\n* **Step 5**: Group by and summarise to calculate the mean Score\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## loneliness post\nlonely_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score = case_match(Response,\n \"never\" ~ 1,\n \"rarely\" ~ 2,\n \"sometimes\" ~ 3,\n \"often\" ~ 4,\n .default = NA\n ),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"L2_1\", \"L2_5\", \"L2_6\", \"L2_9\", \"L2_10\", \"L2_15\", \"L2_16\", \"L2_19\", \"L2_20\") ~ 5-Score,\n .default = Score\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(Loneliness_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS post\nscs_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"SC2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Response = case_match(Response,\n 
\"strongly disagree\" ~ \"1\",\n \"strongly agree\" ~ \"6\",\n .default = Response),\n Response = parse_number(Response),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"SC2_3\", \"SC2_6\", \"SC2_7\", \"SC2_9\", \"SC2_11\", \"SC2_13\", \"SC2_15\", \"SC2_17\", \"SC2_18\", \"SC2_20\") ~ 7-Response,\n .default = Response\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(SCS_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-important collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: extra hard\n\n* PANAS: positive and negative affect of pre- and post-intervention in a single pipe rather than in 4 different data objects (see last week's)\n\nThis task would take about 7 steps to get it from\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| PN1_1| PN1_2| PN1_3| PN1_4| PN1_5| PN1_6| PN1_7| PN1_8| PN1_9| PN1_10| PN2_1| PN2_2| PN2_3| PN2_4| PN2_5| PN2_6| PN2_7| PN2_8| PN2_9| PN2_10|\n|---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|\n| 1| 1| 1| 1| 1| 4| 1| 4| 3| 1| 4| 2| 1| 3| 1| 4| 1| 4| 4| 1| 4|\n| 2| 1| 2| 3| 2| 1| 3| 3| 4| 1| 4| 1| 1| 2| 1| 3| 1| 3| 4| 1| 4|\n| 3| 1| 1| 3| 1| 2| 4| 4| 3| 1| 2| 2| 2| 3| 1| 3| 2| 4| 3| 1| 2|\n| 4| 1| 1| 5| 1| 4| 3| 5| 5| 3| 2| 1| 1| 5| 1| 4| 3| 4| 4| 2| 2|\n| 5| 2| 3| 5| 2| 3| 2| 3| 4| 2| 2| 1| 2| 5| 2| 3| 2| 4| 5| 1| 3|\n\n
\n:::\n:::\n\n\nto\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID|Stage | PANAS_NA| PANAS_PA|\n|---:|:-----|--------:|--------:|\n| 1|post | 1.2| 3.8|\n| 1|pre | 1.0| 3.2|\n| 2|post | 1.0| 3.2|\n| 2|pre | 1.8| 3.0|\n| 3|post | 1.6| 3.0|\n\n
\n:::\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n**Start by mapping out the steps**\n\n* **Step 1**: select all relevant columns, such as participant ID and all the items that belong to the PANAS scale (pos, neg, pre, and post)\n* **Step 2**: pivot the data from wide format to long format. You want to do that for ALL columns that are not the participant id. The data object should have 3 columns and 5680 observations, i.e. each participant has 20 rows.\n* **Step 3**: All of the items will have the structure `PN1_1`. Use `separate()` to split the information across 2 columns. First column has information about the `Stage`, second column should turn into an `Item_number` and it should convert into a numeric column in the process to save you typing quotation marks in Step 5.\\\n* **Step 4**: recode the `Stage` column you just created so that everything that starts with PN1 is recoded as \"pre\" and everything that starts with PN2 as \"post\".\n* **Step 5**: identify the subscales positive affect (PA) and negative affect (NA) by item number and recode them. This requires a conditional statement.\n* **Step 6**: group by and summarise to calculate the mean Score\n* **Step 7**: pivot, so that you have the 2 PANAS subscales presented in separate columns (see table above). 
You might need an extra step if the columns aren't labelled exactly as shown in the table above.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - extra hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nPANAS <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"PN\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Items\", values_to = \"Scores\") %>% \n # Step 3\n separate(Items, into = c(\"Stage\", \"Item_number\"), sep = \"_\", convert = TRUE) %>% \n # Step 4 recode Stage\n mutate(Stage = case_match(Stage,\n \"PN1\" ~ \"pre\",\n \"PN2\" ~ \"post\")) %>% \n # Step 5 identify subscales by item number\n mutate(Subscales = case_when(\n Item_number %in% c(3, 5, 7, 8, 10) ~ \"PANAS_PA\",\n .default = \"PANAS_NA\"\n )) %>% \n # Step 6 \n group_by(RID, Stage, Subscales) %>% \n summarise(Score = mean(Scores)) %>% \n ungroup() %>% \n # Step 7 - to make the data look like the data in `dog_data_clean_long.csv`\n pivot_wider(names_from = Subscales, values_from = Score)\n```\n:::\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/.quarto/cites/index.json b/.quarto/cites/index.json index d4ca146..fe50b04 100644 --- a/.quarto/cites/index.json +++ b/.quarto/cites/index.json @@ -1 +1 @@ 
-{"04-prob-binom-one-sample.qmd":[],"12-one-way-anova.qmd":[],"04-dataviz2.qmd":[],"11-multiple-regression.qmd":[],"07-independent.qmd":[],"05-independent.qmd":[],"06-chi-square-one-sample.qmd":[],"12-factorial-anova.qmd":[],"04-dataviz.qmd":[],"07-apes.qmd":[],"04-chi-square-one-sample.qmd":[],"webexercises.qmd":[],"references.qmd":[],"07-paired.qmd":[],"05-dataviz2.qmd":[],"11-one-way-anova.qmd":[],"instructions.qmd":["usethis"],"08-paired.qmd":[],"02-wrangling.qmd":[],"06-independent.qmd":[],"03-dataviz.qmd":[],"09-simple-regression.qmd":[],"01-basics.qmd":[],"appendix-b-updating-packages.qmd":[],"appendix-y-license.qmd":[],"13-factorial-anova.qmd":[],"10-multiple-regression.qmd":[],"09-correlation.qmd":[],"05-chi-square-one-sample.qmd":[],"08-correlation.qmd":[],"appendix-d-symbols.qmd":[],"10-regression.qmd":[],"appendix-a-installing-r.qmd":[],"appendix-c-exporting-server.qmd":[],"03-wrangling2.qmd":[],"appendix-x-How-to-cite-R.qmd":[],"06-paired.qmd":[],"index.qmd":[]} +{"webexercises.qmd":[],"appendix-a-installing-r.qmd":[],"appendix-c-exporting-server.qmd":[],"references.qmd":[],"04-dataviz.qmd":[],"10-regression.qmd":[],"05-dataviz2.qmd":[],"11-one-way-anova.qmd":[],"05-independent.qmd":[],"07-paired.qmd":[],"05-chi-square-one-sample.qmd":[],"02-wrangling.qmd":[],"appendix-b-updating-packages.qmd":[],"07-apes.qmd":[],"08-correlation.qmd":[],"appendix-y-license.qmd":[],"01-basics.qmd":[],"03-dataviz.qmd":[],"04-dataviz2.qmd":[],"12-one-way-anova.qmd":[],"index.qmd":[],"11-multiple-regression.qmd":[],"06-paired.qmd":[],"06-independent.qmd":[],"appendix-x-How-to-cite-R.qmd":[],"08-paired.qmd":[],"03-wrangling2.qmd":[],"09-simple-regression.qmd":[],"07-independent.qmd":[],"04-chi-square-one-sample.qmd":[],"10-multiple-regression.qmd":[],"06-chi-square-one-sample.qmd":[],"12-factorial-anova.qmd":[],"13-factorial-anova.qmd":[],"04-prob-binom-one-sample.qmd":[],"09-correlation.qmd":[],"instructions.qmd":["usethis"],"appendix-d-symbols.qmd":[]} diff --git 
a/.quarto/xref/5f34e12a b/.quarto/xref/5f34e12a index cf6f5ad..9d813ce 100644 --- a/.quarto/xref/5f34e12a +++ b/.quarto/xref/5f34e12a @@ -1 +1 @@ -{"entries":[{"key":"sec-wrangling2","order":{"section":[3,0,0,0,0,0,0],"number":1}}],"headings":["intended-learning-outcomes","individual-walkthrough","activity-1-setup","activity-2-load-in-the-libraries-and-read-in-the-data","activity-3-confidence-in-understanding-open-science-practices","the-main-goal-is-to-compute-the-mean-understanding-score-per-participant.","steps-1-and-2-select-and-pivot","step-3-recoding-the-values","activity-4-survey-of-attitudes-toward-statistics-sats-28","the-main-goal-is-to-compute-the-mean-sats-28-score-for-each-of-the-4-subscales-per-participant-for-time-point-1.","steps-1-and-2-select-and-pivot-1","step-3-separate-subscale-information","step-4-identifying-reverse-coded-items-and-then-correct-them","step-5","step-6","activity-5-error-mode-perceptions-of-supervisory-support","the-main-goal-is-to-compute-the-mean-score-for-perceived-supervisory-support-per-participant.","activity-6-join-everything-together-with-_join","activity-7-knit-and-export","pair-coding","task-1-open-the-r-project-for-the-lab","task-2-open-your-.rmd-file-from-last-week-or-create-a-new-.rmd-file","task-3-load-in-the-library-and-read-in-the-data","task-4-calculating-the-mean-for-loneliness_pre","test-your-knowledge-and-challenge-yourself","knowledge-check","question-1","question-2","question-3","question-4","challenge-yourself","sec-wrangling2"],"options":{"chapters":true,"chapter-id":"sec-wrangling2"}} \ No newline at end of file 
+{"headings":["intended-learning-outcomes","individual-walkthrough","activity-1-setup","activity-2-load-in-the-libraries-and-read-in-the-data","activity-3-confidence-in-understanding-open-science-practices","the-main-goal-is-to-compute-the-mean-understanding-score-per-participant.","steps-1-and-2-select-and-pivot","step-3-recoding-the-values","activity-4-survey-of-attitudes-toward-statistics-sats-28","the-main-goal-is-to-compute-the-mean-sats-28-score-for-each-of-the-4-subscales-per-participant-for-time-point-1.","steps-1-and-2-select-and-pivot-1","step-3-separate-subscale-information","step-4-identifying-reverse-coded-items-and-then-correct-them","step-5","step-6","activity-5-error-mode-perceptions-of-supervisory-support","the-main-goal-is-to-compute-the-mean-score-for-perceived-supervisory-support-per-participant.","activity-6-join-everything-together-with-_join","activity-7-knit-and-export","pair-coding","task-1-open-the-r-project-for-the-lab","task-2-open-your-.rmd-file-from-last-week-or-create-a-new-.rmd-file","task-3-load-in-the-library-and-read-in-the-data","task-4-calculating-the-mean-for-loneliness_pre","test-your-knowledge-and-challenge-yourself","knowledge-check","question-1","question-2","question-3","question-4","challenge-yourself","sec-wrangling2"],"entries":[{"key":"sec-wrangling2","order":{"number":1,"section":[3,0,0,0,0,0,0]}}],"options":{"chapter-id":"sec-wrangling2","chapters":true}} \ No newline at end of file diff --git a/03-wrangling2.qmd b/03-wrangling2.qmd index b7b2469..2b76305 100644 --- a/03-wrangling2.qmd +++ b/03-wrangling2.qmd @@ -605,7 +605,7 @@ Note: Since I (Gaby) like my columns arranged in a meaningful way, I will use `s ```{r eval=FALSE} data_prp_final <- data_prp %>% - select(Code:Plan_prereg, Other_OS_behav_2:Time2_Understanding_OS) %>% + select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% full_join(qrp_t1) %>% full_join(understanding_t1) %>% full_join(sats_t1) %>% diff --git 
a/_freeze/03-wrangling2/execute-results/html.json b/_freeze/03-wrangling2/execute-results/html.json index a71947f..1cef382 100644 --- a/_freeze/03-wrangling2/execute-results/html.json +++ b/_freeze/03-wrangling2/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "28db4924dbdf8795f0bba0859a044877", + "hash": "231b72f65fb32087caaf7d6986543529", "result": { - "markdown": "# Data wrangling II {#sec-wrangling2}\n\n## Intended Learning Outcomes {.unnumbered}\n\nBy the end of this chapter, you should be able to:\n\n- apply familiar data wrangling functions to novel datasets\n- read and interpret error messages\n- realise there are several ways of getting to the results\n\nIn this chapter, we will pick up where we left off in @sec-wrangling. We will calculate average scores for two of the questionnaires, address an error mode problem, and finally, join all data objects together. This will finalise our data for the upcoming data visualization sections (@sec-dataviz and @sec-dataviz2).\n\n\n## [Individual Walkthrough]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n## Activity 1: Setup\n\n* Go to the project folder we have been using in the last two weeks and double-click on the project icon to **open the project** in RStudio\n* Either **Create a new `.Rmd` file** for chapter 3 and save it to your project folder or continue the one from last week. See @sec-rmd if you need some guidance.\n\n\n\n## Activity 2: Load in the libraries and read in the data\n\nToday, we will be using `tidyverse` along with the two csv files created at the end of the last chapter: `data_prp_for_ch3.csv` and `qrp_t1.csv`. 
If you need to download them again for any reason, click on the following links: [data_prp_for_ch3.csv](data/data_prp_for_ch3.csv \"download\") and [qrp_t1.csv](data/qrp_t1.csv \"download\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(???)\ndata_prp <- read_csv(\"???\")\nqrp_t1 <- read_csv(\"???\")\n```\n:::\n\n\n\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidyverse)\ndata_prp <- read_csv(\"prp_data_reduced.csv\")\nqrp_t1 <- read_csv(\"qrp_t1.csv\")\n```\n:::\n\n\n:::\n\nIf you need a quick reminder what the dataset was about, have a look at the abstract in @sec-download_data_ch1. We also addressed the changes we made to the dataset there.\n\nAnd remember to have a quick `glimpse()` at your data.\n\n\n\n## Activity 3: Confidence in understanding Open Science practices\n\n#### The main goal is to compute the mean Understanding score per participant. 
{.unnumbered}\n\nThe mean Understanding score for time point 2 has already been calculated (in the `Time2_Understanding_OS` column), but we still need to compute it for time point 1.\n\nLooking at the Understanding data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nThe steps are quite similar to those for QRP, but we need to add an extra step: converting the character labels into numbers.\n\nAgain, let's do this step by step:\n\n* **Step 1**: Select the relevant columns `Code`, and every Understanding column from time point 1 (e.g., from `Understanding_OS_1_Time1` to `Understanding_OS_12_Time1`) and store them in an object called `understanding_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: Recode the values \"Not at all confident\" as 1 and \"Entirely confident\" as 7. All other values are already numbers. We can use functions `mutate()` in combination with `case_match()` for that\n* **Step 4**: Calculate the average QRP score (`QRPs_Acceptance_Time1_mean`) per participant using `group_by()` and `summarise()`\n\n#### Steps 1 and 2: Select and pivot {.unnumbered}\n\nHow about you try the first 2 steps yourself using the code from Chapter 2 Activity 4 (@sec-ch2_act4) as a template?\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n select(???) 
%>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2 - I picked different column labels this time for some variety\n pivot_longer(cols = Understanding_OS_1_Time1:Understanding_OS_12_Time1, names_to = \"Understanding_Qs\", values_to = \"Responses\") \n```\n:::\n\n\n:::\n\n#### Step 3: recoding the values {.unnumbered}\n\nOK, we now want to recode the values in the `Responses` column (or whatever name you picked for your column that has some of the numbers in it) so that \"Not at all confident\" = 1 and \"Entirely confident\" = 7. We want to keep all other values as they are (2-6 look already quite \"numeric\").\n\nLet's create a new column `Responses_corrected` that stores the new values with `mutate()`. Then we can combine that with the `case_match()` function.\n\n* The first argument in `case_match()` is the column name of the variable you want to recode.\n* Then you can start recoding the values in the way of `CurrentValue ~ NewValue` (~ is a tilde). Make sure you use the `~` and not `=`!\n* Lastly, the `.default` argument tells R what to do with values that are neither \"Not at all confident\" nor \"Entirely confident\". Here, we want to replace them with the original value of the `Responses` column. 
In other datasets, you may want to set the default to `NA` for missing values, a character string or a number, and `case_match()` is happy to oblige.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = Responses # all other values taken from column Responses\n ))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `mutate()`:\nℹ In argument: `Responses_corrected = case_match(...)`.\nCaused by error in `case_match()`:\n! Can't combine `..1 (right)` and `.default` .\n```\n:::\n:::\n\n\n::: {.callout-important collapse=\"true\"}\n\n## Error!!! Can you explain what is happening here?\n\nHave a look at the error message. It's pretty helpful this time. It says `Can't combine ..1 (right) and .default .` It means that the replacement values are expected to be data type character since the original column type was type character.\n\n:::\n\n**So how do we fix this?** Actually, there are several ways this could be done. Click on the tabs below to check out 3 possible solutions.\n\n::: {.panel-tabset group=\"layers\"}\n\n## Fix option 1\n\nOne option is to modify the `.default` argument `Responses` so that the values are copied over from the original column but as a number rather than the original character value. 
The function `as.numeric()` does the conversion.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n ))\n```\n:::\n\n\n## Fix option 2\n\nChange the numeric values on the right side of the `~` to character. Then in a second step, we would need to turn the character column into a numeric type. Again, we have several options to do so. We could either use the `parse_number()` function we encountered earlier during the demographics wrangling or the `as.numeric()` function.\n\n* V1: `Responses_corrected = parse_number(Responses_corrected)`\n* V2: `Responses_corrected = as.numeric(Responses_corrected)`\n\nJust pay attention that you are still working *within* the `mutate()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ \"1\",\n \"Entirely confident\" ~ \"7\",\n .default = Responses # all other values taken from column Responses (character)\n ),\n Responses_corrected = parse_number(Responses_corrected)) # turning Responses_corrected into a numeric column\n```\n:::\n\n\n\n## Fix option 3\n\nIf you recode all the labels into numbers (e.g., \"2\" into 2, \"3\" into 3, etc.) 
from the start, you won’t need to perform any additional conversions later.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_recoded = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # recode all of them\n \"2\" ~ 2,\n \"3\" ~ 3,\n \"4\" ~ 4,\n \"5\" ~ 5,\n \"6\" ~ 6,\n \"Entirely confident\" ~ 7))\n```\n:::\n\n\n:::\n\n::: {.callout-note icon=\"false\"}\n\n## Your Turn\n\nChoose the option that works best for you to modify the code above that didn't work. You should now be able to calculate the **mean Understanding Score per participant**. Store the average scores in a variable called `Time1_Understanding_OS`. If you need help, refer to the hint below or use Chapter 2 Activity 4 (@sec-ch2_act4) as guidance.\n\n::: {.callout-caution icon=\"false\" collapse=\"true\"}\n\n## One solution for Steps 3 and 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4: calculating averages per participant\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\nOf course, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 3\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Understanding_Qs\", values_to = \"Responses\") %>% \n # Step 3\n mutate(Responses_corrected = 
case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n## Activity 4: Survey of Attitudes Toward Statistics (SATS-28)\n\n#### The main goal is to compute the mean SATS-28 score for each of the 4 subscales per participant for time point 1. {.unnumbered}\n\nLooking at the SATS data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n* Additionally, we are looking to compute the means for the 4 different subscales of the SAT-28 which are , , , and .\n\nThis scenario is slightly more tricky than the previous ones due to the reverse-coding and the 4 subscales. So, let's tackle this step by step again:\n\n* **Step 1**: Select the relevant columns `Code`, and every SATS28 column from time point 1 (e.g., from `SATS28_1_Affect_Time1` to `SATS28_28_Difficulty_Time1`) and store them in an object called `sats_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: We need to know which items belong to which subscale - fortunately, we have that information in the variable name and can use the `separate()` function to access it.\n* **Step 4**: We need to know which items are reverse-coded and then reverse-score them - unfortunately, the info is only in the codebook and we need to find a work-around. 
`case_when()` can help identify and re-score the reverse-coded items.\n* **Step 5**: Calculate the average SATS score per participant and subscale using `group_by()` and `summarise()`\n* **Step 6**: use `pivot_wider()` to spread out the dataframe into wide format and `rename()` to tidy up the column names\n\n#### Steps 1 and 2: select and pivot {.unnumbered}\n\nThe selecting and pivoting are exactly the same way as we already practiced in the other 2 questionnaires. Apply them here to this questionnaire.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(???) %>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% # Step 1\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") # Step 2\n```\n:::\n\n\n:::\n\n:::\n\n#### Step 3: separate Subscale information {.unnumbered}\n\nIf you look at the `Items` column more closely, you can see that there is information on the `Questionnaire`, the `Item_number`, the `Subscale`, and the `Timepoint` the data was collected at.\n\nWe can separate the information into separate columns using the `separate()` function. The function's first argument is the column to separate, then define `into` which columns you want the original column to split up, and lastly, define the separator `sep` (here an underscore). 
For our example, we would write:\n\n* V1: `separate(Items, into = c(\"SATS\", \"Item_number\", \"Subscale\", \"Time\"), sep = \"_\")`\n\nHowever, we don't need all of those columns, so we could just drop the ones we are not interested in by replacing them with `NA`.\n\n* V2: `separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\")`\n\nWe might also add an extra argument of `convert = TRUE` to have numeric columns (i.e., `Item_number`) converted to numeric as opposed to keeping them as characters. Saves us typing a few quotation marks later in Step 4.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE)\n```\n:::\n\n\n#### Step 4: identifying reverse-coded items and then correct them {.unnumbered}\n\nWe can use `case_when()` within the `mutate()` function here to create a new column `FW_RV` that stores information on whether the item is a reverse-coded item or not.\n\n`case_when()` works similarly to `case_match()`, however `case_match()` only allows us to \"recode\" values (i.e., replace one value with another), whereas `case_when()` is more flexible. It allows us to use **conditional statements** on the left side of the tilde which is useful when you want to change only *some* of the data based on specific conditions.\n\nLooking at the codebook, it seems that items 2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, and 28 are reverse-coded. The rest are forward-coded.\n\nWe want to tell R now, that\n\n* **if** the `Item_number` is any of those numbers listed above, R should write \"Reverse\" into the new column `FW_RV` we are creating. Since we have a few possible matches for `Item_number`, we need the Boolean expression `%in%` rather than `==`.\n* **if** `Item_number` is none of those numbers, then we would like the word \"Forward\" in the `FW_RV` column to appear. 
We can achieve that by specifying a `.default` argument again, but this time we want a \"word\" rather than a value from another column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ))\n```\n:::\n\n\nMoving on to correcting the scores: Once again, we can use `case_when ()` within the `mutate()` function to create another **conditional statement**. This time, the condition is:\n\n* **if** `FW_RV` column has a value of \"Reverse\" then we would like to turn all 1 into 7, 2 into 6, etc.\n* **if** `FW_RV` column has a value of \"Forward\" then we would like to keep the score from the `Response` column\n\nThere is a quick way and a not-so-quick way to achieve the actual **reverse-coding**.\n\n* **Option 1 (quick)**: The easiest way to reverse-code scores is to take the maximum value of the scale, add 1 unit, and subtract the original value. For example, on a 5-point Likert scale, it would be 6 minus the original rating; for a 7-point Likert scale, 8 minus the original rating, etc. (see *Option 1* tab).\n* **Option 2 (not so quick)**: This involves using two conditional statements (see *Option 2* tab).\n\nUse the one you find more intuitive.\n\n::: panel-tabset\n\n## Option 1\n\nHere we are using a Boolean expression to check if the string \"Reverse\" is present in the `FW_RV` column. If this condition is `TRUE`, the value in the new column we're creating, `Scores_corrected`, will be calculated as 8 minus the value from the Response column. 
If the condition is FALSE (handled by the .default argument), the original values from the `Response` column will be retained.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n ))\n```\n:::\n\n\n## Option 2\n\nAs stated above, the longer approach involves using two conditional statements. The first condition checks if the value in the `FW_RV` column is \"Reverse\", while the second condition checks if the value in the `Response` column equals a specific number. **When both conditions are met**, the corresponding value on the right side of the tilde is placed in the newly created `Scores_corrected_v2` column.\n\nFor example, line 3 would read: if the value in the `FW_RV` column is \"Reverse\" **AND** the value in the `Response` column is 1, then assign a value of 7 to the `Scores_corrected_v2` column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected_v2 = case_when(\n FW_RV == \"Reverse\" & Response == 1 ~ 7,\n FW_RV == \"Reverse\" & Response == 2 ~ 6,\n FW_RV == \"Reverse\" & Response == 3 ~ 5,\n # no need to recode 4 as 4\n FW_RV == \"Reverse\" & Response == 5 ~ 3,\n FW_RV == \"Reverse\" & Response == 6 ~ 2,\n FW_RV == \"Reverse\" & Response == 7 ~ 1,\n .default = Response\n ))\n```\n:::\n\n\nAs you can see now in `sats_t1`, both columns `Scores_corrected` and `Scores_corrected_v2` are identical.\n\n:::\n\nOne way to **check whether our reverse-coding worked** is by examining the `distinct` values in the original `Response` column and comparing them with the `Scores_corrected`. We should also retain the `FW_RV` column to observe how the reverse-coding applied.\n\nTo see the patterns more clearly, we can use `arrange()` to sort the values in a meaningful order. 
Remember, the default sorting order is ascending, so if you want to sort values in descending order, you’ll need to wrap your variable in the desc() function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding <- sats_t1 %>% \n distinct(FW_RV, Response, Scores_corrected) %>% \n arrange(desc(FW_RV), Response)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show `check_coding` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding\n```\n\n::: {.cell-output-display}\n
\n\n|FW_RV | Response| Scores_corrected|\n|:-------|--------:|----------------:|\n|Reverse | 1| 7|\n|Reverse | 2| 6|\n|Reverse | 3| 5|\n|Reverse | 4| 4|\n|Reverse | 5| 3|\n|Reverse | 6| 2|\n|Reverse | 7| 1|\n|Forward | 1| 1|\n|Forward | 2| 2|\n|Forward | 3| 3|\n|Forward | 4| 4|\n|Forward | 5| 5|\n|Forward | 6| 6|\n|Forward | 7| 7|\n\n
\n:::\n:::\n\n\n:::\n\n#### Step 5 {.unnumbered}\n\nNow that we know everything worked out as intended, we can calculate the mean scores of each subscale for each participant in `sats_t1`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(???, ???) %>% \n summarise(mean_score = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`summarise()` has grouped output by 'Code'. You can override using the\n`.groups` argument.\n```\n:::\n:::\n\n\n:::\n\n:::\n\n#### Step 6 {.unnumbered}\n\nThe final step is to transform the data back into wide format, ensuring that each subscale has its own column. This will make it easier to join the data objects later on. In `pivot_wider()`, the first argument, `names_from`, specifies the column you want to use for your new column headings. The second argument, `values_from`, tells R which column should provide the cell values.\n\nWe should also **rename the column names** to match those in the codebook. 
Conveniently, we can use a function called `rename()` that works exactly like `select()` (following the pattern `new_name = old_name`), but it keeps all other column names the same rather than reducing the number of columns.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show final `sats_t1` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nhead(sats_t1, n = 5)\n```\n\n::: {.cell-output-display}\n
\n\n|Code | SATS28_Affect_Time1_mean| SATS28_CognitiveCompetence_Time1_mean| SATS28_Difficulty_Time1_mean| SATS28_Value_Time1_mean|\n|:----|------------------------:|-------------------------------------:|----------------------------:|-----------------------:|\n|AD03 | 2.333333| 3.833333| 3.428571| 5.555556|\n|AD05 | 3.500000| 5.000000| 2.142857| 4.777778|\n|Ab01 | 5.166667| 5.666667| 4.142857| 5.444444|\n|Al05 | 2.166667| 2.666667| 2.857143| 3.777778|\n|Am05 | 4.166667| 5.666667| 5.571429| 4.888889|\n\n
\n:::\n:::\n\n\n:::\n\nAgain, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n # Step 1\n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE) %>% \n # step 4\n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ),\n Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n )) %>% \n # step 5\n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup() %>% \n # step 6\n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n:::\n\n\n\n## Activity 5 (Error Mode): Perceptions of supervisory support\n\n#### The main goal is to compute the mean score for perceived supervisory support per participant. {.unnumbered}\n\nLooking at the supervisory support data, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nI have outlined my steps as follows:\n\n* **Step 1**: Reverse-code the single column first because that's less hassle than having to do that with conditional statements (`Supervisor_15_R`). `mutate()` is my friend.\n* **Step 2**: I want to filter out everyone who failed the attention check in `Supervisor_7`. I can do this with a Boolean expression within the `filter()` function. 
The correct response was \"completely disagree\" which is 1.\n* **Step 3**: Select their id from time point 2 and all the columns that start with the word \"super\", apart from `Supervisor_7` and the original `Supervisor_15_R` column\n* **Step 4**: pivot into long format so I can calculate the averages better\n* **Step 5**: calculate the average scores per participant\n\nI've started coding but there are some errors in my code. Help me find and fix all of them. Try to go through the code line by line and read the error messages.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_ppr %>% \n mutate(Supervisor_15 = 9-supervisor_15_R) %>% \n filter(Supervisor_7 = 1) %>% \n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) \npivot_wider(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% \n group_by(Time2_Code) %>% \n summarise(Mean_Supervisor_Support = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## How many mistakes am I supposed to find?\n\nThere are 8 mistakes in the code.\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Reveal solution\n\nDid you spot all 8 mistakes? 
Let's go through them line by line.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_prp %>% # spelling mistake in data object\n mutate(Supervisor_15 = 8-Supervisor_15_R) %>% # semantic error: 8 minus response for a 7-point scale and supervisor_15_R needs a capital S\n filter(Supervisor_7 == 1) %>% # needs a Boolean expression == instead of =\n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) %>% # no pipe at the end, the rest is actually legit\n pivot_longer(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% # pivot_longer instead of pivot_wider\n group_by(Code) %>% # Code rather than Time2_Code - the reduced dataset does not contain Time2_Code\n summarise(Mean_Supervisor_Support = mean(Response, na.rm = TRUE)) %>% # Score_corrected doesn't exist; needs to be Response\n ungroup()\n```\n:::\n\n\n* Note that the **semantic error** in line 2 will not give you an error message.\n* Were you thrown off by the `starts_with(\"Super\")` expression in line 4? `starts_with()` and `ends_with()` are great alternatives to selecting columns via `:` But, using `select(Code, Supervisor_1:Supervisor_6, Supervisor_8:Supervisor_14)` would have given us the same result. *[I admit, that one was perhaps a bit mean]*\n\n:::\n\n## Activity 6: Join everything together with `???_join()`\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: `inner_join()`, `left_join()`, `right_join()`, and `full_join()`. Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n::: {.callout-note icon=\"false\"}\n\n## Info on mutating joins\n\nYou have 4 types of join functions you could make use of. Click on the panels to know more\n\n::: panel-tabset\n\nA mutating join allows you to combine variables from two tables. 
It first matches observations by their keys, then copies across variables from one table to the other.\n\n## `inner_join()`\n\n`inner_join()` returns only the rows where the values in the column specified in the `by =` statement match in both tables.\n\n![inner_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/inner-join.gif)\n\n## `left_join()`\n\n`left_join()` retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the `by =` statement. Rows in the left table with no match in the right table will have missing values (`NA`) in the new columns.\n\n![left_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/left-join.gif)\n\n## `right_join()`\n\n`right_join()` retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the `by =` statement. Rows in the right table with no match in the left table will have missing values (`NA`) in the new columns.\n\n![right_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/right-join.gif)\n\n## `full_join()`\n\n`full_join()` returns all rows and all columns from both tables. `NA` values fill unmatched rows.\n\n![full_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/full-join.gif)\n\n:::\n\n:::\n\nFrom our original `data_prp`, we need to select demographics data and all summarised questionnaire data from time point 2. Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the `Global Environment`.\n\nWhile you may be familiar with `inner_join()` from last year, for this task, we want to retain all data from all the data objects. 
Therefore, we will use `full_join()`. Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\n\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use `select()` at the end to order them better.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndata_prp_final <- data_prp %>% \n select(Code:Plan_prereg, Other_OS_behav_2:Time2_Understanding_OS) %>% \n full_join(qrp_t1) %>% \n full_join(understanding_t1) %>% \n full_join(sats_t1) %>% \n full_join(super) %>% \n select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n```\n:::\n\n\n\n::: {.callout-important icon=\"false\"}\n## No `by` argument in the code above? \n\nNote how I didn't include a `by` argument in the code above. If you leave `by =` out, R will join the 2 data objects by **ALL** columns that have the same name.\n\n**Special case 1: matching column names but different values**\n\nIf you want more control, you should include the `by` argument; for example, if both data objects include a column `age` but data was recorded at 2 different time points. In that case, the information from both `age` columns should be retained and the `by` argument would not include `age`.\n\n**Special case 2: different column names but matching values**\n\nAnother special case presents when both data objects contain identical information but the variable names don't match. Let's say, both data objects contain gender information, but in one data object the variable is named `gender` and in the other one `gender_label`. 
In that case, your `by` argument needs to be modified as: `by = join_by(gender == gender_label)`.\n\nMore info on joins can be found [https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/](https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/){target=\"_blank\"}\n:::\n\nAnd this is basically the dataset we need for @sec-dataviz and @sec-dataviz2.\n\n\n\n## Activity 7: Knit and export\n\nKnit the `.Rmd` file to ensure everything runs as expected. Once it does, export the data object `data_prp_final` as a csv for use in the @sec-dataviz. Name it something meaningful, something like `data_prp_for_ch4.csv`.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nwrite_csv(data_prp_final, \"data_prp_for_ch4.csv\")\n```\n:::\n\n\n:::\n\n\n## [Pair-coding]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\nWe will once again be working with data from Binfet et al. (2021), which focuses on the randomised controlled trials data involving therapy dog interventions. Today, our goal is to calculate the average `Loneliness` score for each participant measured at time point 1 (pre-intervention) using the raw data file `dog_data_raw`. Currently, the data looks like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| L1_1| L1_2| L1_3| L1_4| L1_5| L1_6| L1_7| L1_8| L1_9| L1_10| L1_11| L1_12| L1_13| L1_14| L1_15| L1_16| L1_17| L1_18| L1_19| L1_20|\n|---:|----:|----:|----:|----:|----:|----:|----:|----:|----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|\n| 1| 3| 3| 4| 3| 2| 3| 1| 2| 3| 4| 3| 1| 3| 1| 2| 3| 2| 3| 2| 4|\n| 2| 3| 2| 3| 3| 4| 3| 2| 2| 4| 3| 2| 2| 1| 2| 4| 3| 3| 2| 4| 3|\n| 3| 3| 3| 2| 3| 3| 4| 2| 3| 3| 3| 2| 2| 2| 2| 3| 3| 4| 3| 3| 3|\n| 4| 4| 2| 2| 3| 4| 4| 1| 3| 3| 4| 2| 1| 2| 2| 4| 4| 3| 3| 4| 3|\n| 5| 2| 3| 3| 3| 4| 3| 2| 2| 3| 2| 4| 4| 4| 3| 2| 2| 3| 4| 3| 2|\n\n
\n:::\n:::\n\n\nBut we want the data to look like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| Loneliness_pre|\n|---:|--------------:|\n| 1| 2.25|\n| 2| 1.90|\n| 3| 2.25|\n| 4| 1.75|\n| 5| 2.85|\n\n
\n:::\n:::\n\n\nThis task is a bit more challenging compared to last week's lab activity, as the Loneliness scale includes some reverse-coded items.\n\n### Task 1: Open the R project for the lab {.unnumbered}\n\n### Task 2: Open your `.Rmd` file from last week or create a new `.Rmd` file {.unnumbered}\n\nYou could continue the `.Rmd` file you used last week, or create a new `.Rmd`. If you need some guidance, have a look at @sec-rmd.\n\n### Task 3: Load in the library and read in the data {.unnumbered}\n\nThe data should already be in your project folder. If you want a fresh copy, you can download the data again here: [data_pair_ch1](data/data_pair_ch1.zip \"download\").\n\nWe are using the package `tidyverse` today, and the datafile we should read in is `dog_data_raw.csv`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(???)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"???\")\n```\n:::\n\n\n:::\n\n### Task 4: Calculating the mean for `Loneliness_pre` {.unnumbered}\n\n* **Step 1**: Select all relevant columns, such as the participant ID and all 20 items of the `Loneliness` questionnaire completed by participants before the intervention. Store this data in an object called `data_loneliness`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nLook at the codebook. 
Try to figure out\n\n* the variable name of the column in which the participant id is stored, and\n* which items relate to the Loneliness scale at Stage \"pre\"\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* the participant id column is called `RID`\n* The Loneliness items at pre-intervention stage start with `L1_`\n\n:::\n\n:::\n\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily (in step 3)\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n`pivot_`\n\nWe also need 3 arguments in that function:\n\n* the columns we want to select (e.g., all the loneliness items),\n* the name of the column in which the current column headings will be stored (e.g., \"Qs\"), and\n* the name of the column that should store all the values (e.g., \"Responses\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\")\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 3**: Reverse-scoring\n\nIdentify the items on the `Loneliness` scale that are reverse-coded, and then reverse-score them accordingly.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nWe need to figure out:\n\n* which are the items of the loneliness scale we need to reverse-score\n* what is the measuring scale of loneliness so we can determine the new values\n* which function to use to create a new column that has the corrected scores in it\n* which one of the `case_` functions will get us there\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* The items to be reverse-coded items can be found in the codebook: L1_1, L1_5, L1_6, L1_9, L1_10, L1_15, L1_16, L1_19, L1_20\n* the loneliness scale ranges from 1 to 4, so we need to replace 1 with 4, 2 with 3, 3 with 2, and 4 with 1\n* the function to 
create a new column `mutate()`\n* it's a conditional statement rather than \"just\" replacing values, hence we need `case_when()`\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n mutate(Score_corrected = case_when(\n ??? ~ ???,\n .default = ???\n ))\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 4**: Calculate the average Loneliness score per participant. To match with the table above, we want to call this column `Loneliness_pre`\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\ngrouping and summarising\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n group_by(???) %>% \n summarise(Loneliness_pre = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(tidyverse)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"dog_data_raw.csv\")\n\n# Task 4: Tidying \nloneliness_tidy <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L1\")) %>% # select(RID, L1_1:L1_20) also works\n # Step 2\n pivot_longer(cols = -RID, names_to = \"Qs\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score_corrected = case_when(\n Qs %in% c(\"L1_1\", \"L1_5\", \"L1_6\", \"L1_9\", \"L1_10\", \"L1_15\", \"L1_16\", \"L1_19\", \"L1_20\") ~ 5-Response,\n .default = Response\n )) %>% \n # Step 4\n group_by(RID) %>% \n summarise(Loneliness_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n## [Test your knowledge and challenge yourself]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n### Knowledge check {.unnumbered}\n\n\n#### Question 1 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to recode an existing variable? 
\n\n\n#### Question 2 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to create a new variable based on one or multiple conditional statements? \n\n\n#### Question 3 {.unnumbered}\n\nWhich of the following functions would you use if you wanted to join two data sets by their shared identifier? \n\n\n#### Question 4 {.unnumbered}\n\nYour data object contains a column `Score` with numbers, but they have been read in incorrectly as a character datatype. Which of the following functions would *not* work for fixing this issue? \n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Explain this answer\n\n* `parse_number()` from the `readr` package extracts numeric values from strings, so this would work.\n* `factor(Score)`: This would *not* work as expected because it converts the column into a factor, not a numeric datatype, leading to incorrect results if numeric operations are needed.\n* `mutate(Score = as.numeric(Score))`: This would work too because `mutate()` can be used in combination with `as.numeric()` to create a new numeric column or override the existing character column.\n* `as.numeric()`: This would also work to convert a character column to numeric. Without mutate, you could use it in a BaseR way, e.g., `data$Score <- as.numeric(data$Score)` (*shudder, BaseR!!! But effective*)\n\n:::\n\n\n\n### Challenge yourself {.unnumbered}\n\nIf you want to **challenge yourself** and further apply the skills from Chapter 3, you could wrangle the data from `dog_data_raw` for one of the other questionnaires. 
There are plenty of options to choose from:\n\n::: {.callout-tip collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: easy\n\n* recode column `Live_Pets` so the values read yes and no rather than 1 and 2\n* recode `Year_of_Study` so they have the labels from the codebook rather than the numbers\n* reverse-code the `Homesickness` scale for `_pre` and `_post`\n* renaming the columns of the other one-item scales as `Stress_pre`, `Stress_post`, `Engagement_pre` and `Engagement_post`\n\nAny of these tasks should be doable in one step. No need to select or pivot anything. You could just modify `dog_data_raw`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* For the **recoding tasks**, you need to work out which function to use to recode one value as another - just plain replacing, no conditional statements\n* The **reverse-coding** might sound daunting to do in one step, but it is only a single value that needs to be recoded. Take some inspiration from Activity 5 (error mode).\n* For the **renaming tasks**, check how you would change column names without reducing the number of columns overall\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - easy**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Live_Pets\ndog_data_raw <- dog_data_raw %>%\n mutate(Live_Pets = case_match(Live_Pets,\n 1 ~ \"yes\",\n 2 ~ \"no\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Year of Study\ndog_data_raw <- dog_data_raw %>%\n mutate(Year_of_Study = case_match(Year_of_Study,\n 1 ~ \"First\",\n 2 ~ \"Second\",\n 3 ~ \"Third\",\n 4 ~ \"Fourth\",\n 5 ~ \"Fifth or above\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Reverse-coding of homesickness pre and post. 
It's a 5-point scale, hence you'd calculate 6-the original response column\ndog_data_raw <- dog_data_raw %>% \n mutate(Homesick_pre = 6-HO1_1,\n Homesick_post = 6-HO2_1)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Renaming of Stress and Engagement\ndog_data_raw <- dog_data_raw %>% \n rename(Stress_pre = S1_1, Stress_post = S2_1, Engagement_pre = HO1_2, Engagement_post = HO2_2)\n```\n:::\n\n:::\n:::\n\n::: {.callout-warning collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: medium\n\n* reverse-code the Social connectedness scale (pre-intervention) and compute a mean score per participant\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\nThis task would take 4 steps to complete. These are the exact same steps we applied to `Loneliness_pre` in the lab activity. You would just need to figure out which items are related to the Social connectedness scale (pre-intervention) and which ones of those are reverse-coded. The codebook has all the answers.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - medium**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS pre\nscs_pre <- dog_data_raw %>% \n select(RID, starts_with(\"SC1\")) %>% \n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n mutate(Score_corrected = case_when(\n Names %in% c(\"SC1_3\", \"SC1_6\", \"SC1_7\", \"SC1_9\", \"SC1_11\", \"SC1_13\", \"SC1_15\", \"SC1_17\", \"SC1_18\", \"SC1_20\") ~ 7-Response,\n .default = Response\n )) %>% \n group_by(RID) %>% \n summarise(SCS_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: hard\n\n* reverse-code the Loneliness scale (post-intervention) and compute a mean score per participant\n* reverse-code the Social connectedness scale (post-intervention) and compute a mean score per participant\n\nBoth 
activities are similar to Activity 3 from the individual walkthrough and would take about 5 steps to complete. **Start by mapping out the steps**.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* **Step 1**: Select all relevant columns, such as participant ID and all the items that belong to the questionnaire that participants completed after the intervention\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily\n* **Step 3**: Recode the initial responses so that the new column has numbers instead of labels\n* **Step 4**: Reverse-score the items that are labelled as \"Reverse\" in the codebook and then reverse-score them\n* **Step 5**: Group by and summarise to calculate the mean Score\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## loneliness post\nlonely_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score = case_match(Response,\n \"never\" ~ 1,\n \"rarely\" ~ 2,\n \"sometimes\" ~ 3,\n \"often\" ~ 4,\n .default = NA\n ),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"L2_1\", \"L2_5\", \"L2_6\", \"L2_9\", \"L2_10\", \"L2_15\", \"L2_16\", \"L2_19\", \"L2_20\") ~ 5-Score,\n .default = Score\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(Loneliness_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS post\nscs_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"SC2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Response = case_match(Response,\n 
\"strongly disagree\" ~ \"1\",\n \"strongly agree\" ~ \"6\",\n .default = Response),\n Response = parse_number(Response),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"SC2_3\", \"SC2_6\", \"SC2_7\", \"SC2_9\", \"SC2_11\", \"SC2_13\", \"SC2_15\", \"SC2_17\", \"SC2_18\", \"SC2_20\") ~ 7-Response,\n .default = Response\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(SCS_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-important collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: extra hard\n\n* PANAS: positive and negative affect of pre- and post-intervention in a single pipe rather than in 4 different data objects (see last week's)\n\nThis task would take about 7 steps to get it from\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| PN1_1| PN1_2| PN1_3| PN1_4| PN1_5| PN1_6| PN1_7| PN1_8| PN1_9| PN1_10| PN2_1| PN2_2| PN2_3| PN2_4| PN2_5| PN2_6| PN2_7| PN2_8| PN2_9| PN2_10|\n|---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|\n| 1| 1| 1| 1| 1| 4| 1| 4| 3| 1| 4| 2| 1| 3| 1| 4| 1| 4| 4| 1| 4|\n| 2| 1| 2| 3| 2| 1| 3| 3| 4| 1| 4| 1| 1| 2| 1| 3| 1| 3| 4| 1| 4|\n| 3| 1| 1| 3| 1| 2| 4| 4| 3| 1| 2| 2| 2| 3| 1| 3| 2| 4| 3| 1| 2|\n| 4| 1| 1| 5| 1| 4| 3| 5| 5| 3| 2| 1| 1| 5| 1| 4| 3| 4| 4| 2| 2|\n| 5| 2| 3| 5| 2| 3| 2| 3| 4| 2| 2| 1| 2| 5| 2| 3| 2| 4| 5| 1| 3|\n\n
\n:::\n:::\n\n\nto\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID|Stage | PANAS_NA| PANAS_PA|\n|---:|:-----|--------:|--------:|\n| 1|post | 1.2| 3.8|\n| 1|pre | 1.0| 3.2|\n| 2|post | 1.0| 3.2|\n| 2|pre | 1.8| 3.0|\n| 3|post | 1.6| 3.0|\n\n
\n:::\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n**Start by mapping out the steps**\n\n* **Step 1**: select all relevant columns, such as participant ID and all the items that belong to the PANAS scale (pos, neg, pre, and post)\n* **Step 2**: pivot the data from wide format to long format. You want to do that for ALL columns that are not the participant id. The data object should have 3 columns and 5680 observations, i.e. each participant has 20 rows.\n* **Step 3**: All of the items will have the structure `PN1_1`. Use `separate()` to split the information across 2 columns. The first column holds information about the `Stage`; the second column should turn into an `Item_number` and be converted into a numeric column in the process, to save you typing quotation marks in Step 5.\n* **Step 4**: recode the `Stage` column you just created so that everything that starts with PN1 is labelled \"pre\" and everything that starts with PN2 is labelled \"post\".\n* **Step 5**: identify the subscales positive affect (PA) and negative affect (NA) by item number and recode them. This requires a conditional statement.\n* **Step 6**: group by and summarise to calculate the mean Score\n* **Step 7**: pivot, so that you have the 2 PANAS subscales presented in separate columns (see table above). 
You might need an extra step if the columns aren't labelled exactly as shown in the table above.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - extra hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nPANAS <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"PN\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Items\", values_to = \"Scores\") %>% \n # Step 3\n separate(Items, into = c(\"Stage\", \"Item_number\"), sep = \"_\", convert = TRUE) %>% \n # Step 4 recode Stage\n mutate(Stage = case_match(Stage,\n \"PN1\" ~ \"pre\",\n \"PN2\" ~ \"post\")) %>% \n # Step 5 identify subscales by item number\n mutate(Subscales = case_when(\n Item_number %in% c(3, 5, 7, 8, 10) ~ \"PANAS_PA\",\n .default = \"PANAS_NA\"\n )) %>% \n # Step 6 \n group_by(RID, Stage, Subscales) %>% \n summarise(Score = mean(Scores)) %>% \n ungroup() %>% \n # Step 7 - to make the data look like the data in `dog_data_clean_long.csv`\n pivot_wider(names_from = Subscales, values_from = Score)\n```\n:::\n\n:::\n:::\n", + "markdown": "# Data wrangling II {#sec-wrangling2}\n\n## Intended Learning Outcomes {.unnumbered}\n\nBy the end of this chapter, you should be able to:\n\n- apply familiar data wrangling functions to novel datasets\n- read and interpret error messages\n- realise there are several ways of getting to the results\n\nIn this chapter, we will pick up where we left off in @sec-wrangling. We will calculate average scores for two of the questionnaires, address an error mode problem, and finally, join all data objects together. 
This will finalise our data for the upcoming data visualization sections (@sec-dataviz and @sec-dataviz2).\n\n\n## [Individual Walkthrough]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n## Activity 1: Setup\n\n* Go to the project folder we have been using in the last two weeks and double-click on the project icon to **open the project** in RStudio\n* Either **Create a new `.Rmd` file** for chapter 3 and save it to your project folder or continue the one from last week. See @sec-rmd if you need some guidance.\n\n\n\n## Activity 2: Load in the libraries and read in the data\n\nToday, we will be using `tidyverse` along with the two csv files created at the end of the last chapter: `data_prp_for_ch3.csv` and `qrp_t1.csv`. If you need to download them again for any reason, click on the following links: [data_prp_for_ch3.csv](data/data_prp_for_ch3.csv \"download\") and [qrp_t1.csv](data/qrp_t1.csv \"download\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(???)\ndata_prp <- read_csv(\"???\")\nqrp_t1 <- read_csv(\"???\")\n```\n:::\n\n\n\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidyverse)\ndata_prp <- read_csv(\"prp_data_reduced.csv\")\nqrp_t1 <- read_csv(\"qrp_t1.csv\")\n```\n:::\n\n\n:::\n\nIf you need a quick reminder what the dataset was about, have a look at the abstract in @sec-download_data_ch1. We also addressed the changes we made to the dataset there.\n\nAnd remember to have a quick `glimpse()` at your data.\n\n\n\n## Activity 3: Confidence in understanding Open Science practices\n\n#### The main goal is to compute the mean Understanding score per participant. 
{.unnumbered}\n\nThe mean Understanding score for time point 2 has already been calculated (in the `Time2_Understanding_OS` column), but we still need to compute it for time point 1.\n\nLooking at the Understanding data at time point 1, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nThe steps are quite similar to those for QRP, but we need to add an extra step: converting the character labels into numbers.\n\nAgain, let's do this step by step:\n\n* **Step 1**: Select the relevant columns `Code`, and every Understanding column from time point 1 (e.g., from `Understanding_OS_1_Time1` to `Understanding_OS_12_Time1`) and store them in an object called `understanding_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can recode the labels into values (step 3) and calculate the average score (in step 4) more easily\n* **Step 3**: Recode the values \"Not at all confident\" as 1 and \"Entirely confident\" as 7. All other values are already numbers. We can use the function `mutate()` in combination with `case_match()` for that\n* **Step 4**: Calculate the average Understanding score (`Time1_Understanding_OS`) per participant using `group_by()` and `summarise()`\n\n#### Steps 1 and 2: Select and pivot {.unnumbered}\n\nHow about you try the first 2 steps yourself using the code from Chapter 2 Activity 4 (@sec-ch2_act4) as a template?\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n select(???) 
%>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2 - I picked different column labels this time for some variety\n pivot_longer(cols = Understanding_OS_1_Time1:Understanding_OS_12_Time1, names_to = \"Understanding_Qs\", values_to = \"Responses\") \n```\n:::\n\n\n:::\n\n#### Step 3: recoding the values {.unnumbered}\n\nOK, we now want to recode the values in the `Responses` column (or whatever name you picked for your column that has some of the numbers in it) so that \"Not at all confident\" = 1 and \"Entirely confident\" = 7. We want to keep all other values as they are (2-6 look already quite \"numeric\").\n\nLet's create a new column `Responses_corrected` that stores the new values with `mutate()`. Then we can combine that with the `case_match()` function.\n\n* The first argument in `case_match()` is the column name of the variable you want to recode.\n* Then you can start recoding the values in the way of `CurrentValue ~ NewValue` (~ is a tilde). Make sure you use the `~` and not `=`!\n* Lastly, the `.default` argument tells R what to do with values that are neither \"Not at all confident\" nor \"Entirely confident\". Here, we want to replace them with the original value of the `Responses` column. 
In other datasets, you may want to set the default to `NA` for missing values, a character string or a number, and `case_match()` is happy to oblige.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = Responses # all other values taken from column Responses\n ))\n```\n\n::: {.cell-output .cell-output-error}\n```\nError in `mutate()`:\nℹ In argument: `Responses_corrected = case_match(...)`.\nCaused by error in `case_match()`:\n! Can't combine `..1 (right)` <double> and `.default` <character>.\n```\n:::\n:::\n\n\n::: {.callout-important collapse=\"true\"}\n\n## Error!!! Can you explain what is happening here?\n\nHave a look at the error message. It's pretty helpful this time. It says `Can't combine ..1 (right) <double> and .default <character>.` It means that all replacement values must share the same data type: the recoded values 1 and 7 are numeric (`<double>`), whereas `.default = Responses` copies values from the original column, which is of type character, and R cannot combine the two types in a single column.\n\n:::\n\n**So how do we fix this?** Actually, there are several ways this could be done. Click on the tabs below to check out 3 possible solutions.\n\n::: {.panel-tabset group=\"layers\"}\n\n## Fix option 1\n\nOne option is to modify the `.default` argument `Responses` so that the values are copied over from the original column but as a number rather than the original character value. 
The function `as.numeric()` does the conversion.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n ))\n```\n:::\n\n\n## Fix option 2\n\nChange the numeric values on the right side of the `~` to character. Then in a second step, we would need to turn the character column into a numeric type. Again, we have several options to do so. We could either use the `parse_number()` function we encountered earlier during the demographics wrangling or the `as.numeric()` function.\n\n* V1: `Responses_corrected = parse_number(Responses_corrected)`\n* V2: `Responses_corrected = as.numeric(Responses_corrected)`\n\nJust pay attention that you are still working *within* the `mutate()` function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ \"1\",\n \"Entirely confident\" ~ \"7\",\n .default = Responses # all other values taken from column Responses (character)\n ),\n Responses_corrected = parse_number(Responses_corrected)) # turning Responses_corrected into a numeric column\n```\n:::\n\n\n\n## Fix option 3\n\nIf you recode all the labels into numbers (e.g., \"2\" into 2, \"3\" into 3, etc.) 
from the start, you won’t need to perform any additional conversions later.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1_step3_v2 <- understanding_t1 %>% \n mutate(Responses_recoded = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # recode all of them\n \"2\" ~ 2,\n \"3\" ~ 3,\n \"4\" ~ 4,\n \"5\" ~ 5,\n \"6\" ~ 6,\n \"Entirely confident\" ~ 7))\n```\n:::\n\n\n:::\n\n::: {.callout-note icon=\"false\"}\n\n## Your Turn\n\nChoose the option that works best for you to modify the code above that didn't work. You should now be able to calculate the **mean Understanding Score per participant**. Store the average scores in a variable called `Time1_Understanding_OS`. If you need help, refer to the hint below or use Chapter 2 Activity 4 (@sec-ch2_act4) as guidance.\n\n::: {.callout-caution icon=\"false\" collapse=\"true\"}\n\n## One solution for Steps 3 and 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- understanding_t1 %>% \n mutate(Responses_corrected = case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4: calculating averages per participant\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\nOf course, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 3\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nunderstanding_t1 <- data_prp %>% \n # Step 1\n select(Code, Understanding_OS_1_Time1:Understanding_OS_12_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Understanding_Qs\", values_to = \"Responses\") %>% \n # Step 3\n mutate(Responses_corrected = 
case_match(Responses, # column of the values to recode\n \"Not at all confident\" ~ 1, # values to recode\n \"Entirely confident\" ~ 7,\n .default = as.numeric(Responses) # all other values taken from column Responses but as numeric data type \n )) %>% \n # Step 4\n group_by(Code) %>%\n summarise(Time1_Understanding_OS = mean(Responses_corrected)) %>%\n ungroup()\n```\n:::\n\n\n:::\n\n## Activity 4: Survey of Attitudes Toward Statistics (SATS-28)\n\n#### The main goal is to compute the mean SATS-28 score for each of the 4 subscales per participant for time point 1. {.unnumbered}\n\nLooking at the SATS data at time point 1, you determine that\n\n* individual item columns are numeric, and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n* Additionally, we are looking to compute the means for the 4 different subscales of the SATS-28 which are Affect, Cognitive Competence, Value, and Difficulty.\n\nThis scenario is slightly more tricky than the previous ones due to the reverse-coding and the 4 subscales. So, let's tackle this step by step again:\n\n* **Step 1**: Select the relevant columns `Code`, and every SATS28 column from time point 1 (e.g., from `SATS28_1_Affect_Time1` to `SATS28_28_Difficulty_Time1`) and store them in an object called `sats_t1`\n* **Step 2**: Pivot the data from wide format to long format using `pivot_longer()` so we can reverse-score the items (in step 4) and calculate the average score (in step 5) more easily\n* **Step 3**: We need to know which items belong to which subscale - fortunately, we have that information in the variable name and can use the `separate()` function to access it.\n* **Step 4**: We need to know which items are reverse-coded and then reverse-score them - unfortunately, the info is only in the codebook and we need to find a work-around. 
`case_when()` can help identify and re-score the reverse-coded items.\n* **Step 5**: Calculate the average SATS score per participant and subscale using `group_by()` and `summarise()`\n* **Step 6**: use `pivot_wider()` to spread out the dataframe into wide format and `rename()` to tidy up the column names\n\n#### Steps 1 and 2: select and pivot {.unnumbered}\n\nThe selecting and pivoting are exactly the same way as we already practiced in the other 2 questionnaires. Apply them here to this questionnaire.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(???) %>% # Step 1\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\") # Step 2\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for steps 1 and 2\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% # Step 1\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") # Step 2\n```\n:::\n\n\n:::\n\n:::\n\n#### Step 3: separate Subscale information {.unnumbered}\n\nIf you look at the `Items` column more closely, you can see that there is information on the `Questionnaire`, the `Item_number`, the `Subscale`, and the `Timepoint` the data was collected at.\n\nWe can separate the information into separate columns using the `separate()` function. The function's first argument is the column to separate, then define `into` which columns you want the original column to split up, and lastly, define the separator `sep` (here an underscore). 
For our example, we would write:\n\n* V1: `separate(Items, into = c(\"SATS\", \"Item_number\", \"Subscale\", \"Time\"), sep = \"_\")`\n\nHowever, we don't need all of those columns, so we could just drop the ones we are not interested in by replacing them with `NA`.\n\n* V2: `separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\")`\n\nWe might also add an extra argument of `convert = TRUE` to have numeric columns (i.e., `Item_number`) converted to numeric as opposed to keeping them as characters. Saves us typing a few quotation marks later in Step 4.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE)\n```\n:::\n\n\n#### Step 4: identifying reverse-coded items and then correct them {.unnumbered}\n\nWe can use `case_when()` within the `mutate()` function here to create a new column `FW_RV` that stores information on whether the item is a reverse-coded item or not.\n\n`case_when()` works similarly to `case_match()`, however `case_match()` only allows us to \"recode\" values (i.e., replace one value with another), whereas `case_when()` is more flexible. It allows us to use **conditional statements** on the left side of the tilde which is useful when you want to change only *some* of the data based on specific conditions.\n\nLooking at the codebook, it seems that items 2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, and 28 are reverse-coded. The rest are forward-coded.\n\nWe want to tell R now, that\n\n* **if** the `Item_number` is any of those numbers listed above, R should write \"Reverse\" into the new column `FW_RV` we are creating. Since we have a few possible matches for `Item_number`, we need the Boolean expression `%in%` rather than `==`.\n* **if** `Item_number` is none of those numbers, then we would like the word \"Forward\" in the `FW_RV` column to appear. 
We can achieve that by specifying a `.default` argument again, but this time we want a \"word\" rather than a value from another column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ))\n```\n:::\n\n\nMoving on to correcting the scores: Once again, we can use `case_when()` within the `mutate()` function to create another **conditional statement**. This time, the condition is:\n\n* **if** `FW_RV` column has a value of \"Reverse\" then we would like to turn all 1 into 7, 2 into 6, etc.\n* **if** `FW_RV` column has a value of \"Forward\" then we would like to keep the score from the `Response` column\n\nThere is a quick way and a not-so-quick way to achieve the actual **reverse-coding**.\n\n* **Option 1 (quick)**: The easiest way to reverse-code scores is to take the maximum value of the scale, add 1 unit, and subtract the original value. For example, on a 5-point Likert scale, it would be 6 minus the original rating; for a 7-point Likert scale, 8 minus the original rating, etc. (see *Option 1* tab).\n* **Option 2 (not so quick)**: This involves using two conditional statements (see *Option 2* tab).\n\nUse the one you find more intuitive.\n\n::: panel-tabset\n\n## Option 1\n\nHere we are using a Boolean expression to check if the string \"Reverse\" is present in the `FW_RV` column. If this condition is `TRUE`, the value in the new column we're creating, `Scores_corrected`, will be calculated as 8 minus the value from the `Response` column. 
If the condition is FALSE (handled by the .default argument), the original values from the `Response` column will be retained.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n ))\n```\n:::\n\n\n## Option 2\n\nAs stated above, the longer approach involves using two conditional statements. The first condition checks if the value in the `FW_RV` column is \"Reverse\", while the second condition checks if the value in the `Response` column equals a specific number. **When both conditions are met**, the corresponding value on the right side of the tilde is placed in the newly created `Scores_corrected_v2` column.\n\nFor example, line 3 would read: if the value in the `FW_RV` column is \"Reverse\" **AND** the value in the `Response` column is 1, then assign a value of 7 to the `Scores_corrected_v2` column.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n mutate(Scores_corrected_v2 = case_when(\n FW_RV == \"Reverse\" & Response == 1 ~ 7,\n FW_RV == \"Reverse\" & Response == 2 ~ 6,\n FW_RV == \"Reverse\" & Response == 3 ~ 5,\n # no need to recode 4 as 4\n FW_RV == \"Reverse\" & Response == 5 ~ 3,\n FW_RV == \"Reverse\" & Response == 6 ~ 2,\n FW_RV == \"Reverse\" & Response == 7 ~ 1,\n .default = Response\n ))\n```\n:::\n\n\nAs you can see now in `sats_t1`, both columns `Scores_corrected` and `Scores_corrected_v2` are identical.\n\n:::\n\nOne way to **check whether our reverse-coding worked** is by examining the `distinct` values in the original `Response` column and comparing them with the `Scores_corrected`. We should also retain the `FW_RV` column to observe how the reverse-coding applied.\n\nTo see the patterns more clearly, we can use `arrange()` to sort the values in a meaningful order. 
Remember, the default sorting order is ascending, so if you want to sort values in descending order, you’ll need to wrap your variable in the desc() function.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding <- sats_t1 %>% \n distinct(FW_RV, Response, Scores_corrected) %>% \n arrange(desc(FW_RV), Response)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show `check_coding` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncheck_coding\n```\n\n::: {.cell-output-display}\n
\n\n|FW_RV | Response| Scores_corrected|\n|:-------|--------:|----------------:|\n|Reverse | 1| 7|\n|Reverse | 2| 6|\n|Reverse | 3| 5|\n|Reverse | 4| 4|\n|Reverse | 5| 3|\n|Reverse | 6| 2|\n|Reverse | 7| 1|\n|Forward | 1| 1|\n|Forward | 2| 2|\n|Forward | 3| 3|\n|Forward | 4| 4|\n|Forward | 5| 5|\n|Forward | 6| 6|\n|Forward | 7| 7|\n\n
\n:::\n:::\n\n\n:::\n\n#### Step 5 {.unnumbered}\n\nNow that we know everything worked out as intended, we can calculate the mean scores of each subscale for each participant in `sats_t1`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(???, ???) %>% \n summarise(mean_score = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\n`summarise()` has grouped output by 'Code'. You can override using the\n`.groups` argument.\n```\n:::\n:::\n\n\n:::\n\n:::\n\n#### Step 6 {.unnumbered}\n\nThe final step is to transform the data back into wide format, ensuring that each subscale has its own column. This will make it easier to join the data objects later on. In `pivot_wider()`, the first argument, `names_from`, specifies the column you want to use for your new column headings. The second argument, `values_from`, tells R which column should provide the cell values.\n\nWe should also **rename the column names** to match those in the codebook. 
Conveniently, we can use a function called `rename()` that works exactly like `select()` (following the pattern `new_name = old_name`), but it keeps all other column names the same rather than reducing the number of columns.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- sats_t1 %>% \n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Show final `sats_t1` output\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nhead(sats_t1, n = 5)\n```\n\n::: {.cell-output-display}\n
\n\n|Code | SATS28_Affect_Time1_mean| SATS28_CognitiveCompetence_Time1_mean| SATS28_Difficulty_Time1_mean| SATS28_Value_Time1_mean|\n|:----|------------------------:|-------------------------------------:|----------------------------:|-----------------------:|\n|AD03 | 2.333333| 3.833333| 3.428571| 5.555556|\n|AD05 | 3.500000| 5.000000| 2.142857| 4.777778|\n|Ab01 | 5.166667| 5.666667| 4.142857| 5.444444|\n|Al05 | 2.166667| 2.666667| 2.857143| 3.777778|\n|Am05 | 4.166667| 5.666667| 5.571429| 4.888889|\n\n
\n:::\n:::\n\n\n:::\n\nAgain, this could have been written up as a single pipe.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Single pipe of activity 4\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsats_t1 <- data_prp %>% \n # Step 1\n select(Code, SATS28_1_Affect_Time1:SATS28_28_Difficulty_Time1) %>% \n # Step 2\n pivot_longer(cols = -Code, names_to = \"Items\", values_to = \"Response\") %>% \n # Step 3\n separate(Items, into = c(NA, \"Item_number\", \"Subscale\", NA), sep = \"_\", convert = TRUE) %>% \n # step 4\n mutate(FW_RV = case_when(\n Item_number %in% c(2, 3, 4, 6, 7, 8, 9, 12, 13, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28) ~ \"Reverse\",\n .default = \"Forward\"\n ),\n Scores_corrected = case_when(\n FW_RV == \"Reverse\" ~ 8-Response,\n .default = Response\n )) %>% \n # step 5\n group_by(Code, Subscale) %>% \n summarise(mean_score = mean(Scores_corrected)) %>% \n ungroup() %>% \n # step 6\n pivot_wider(names_from = Subscale, values_from = mean_score) %>% \n rename(SATS28_Affect_Time1_mean = Affect,\n SATS28_CognitiveCompetence_Time1_mean = CognitiveCompetence,\n SATS28_Value_Time1_mean = Value,\n SATS28_Difficulty_Time1_mean = Difficulty)\n```\n:::\n\n\n:::\n\n\n\n## Activity 5 (Error Mode): Perceptions of supervisory support\n\n#### The main goal is to compute the mean score for perceived supervisory support per participant. {.unnumbered}\n\nLooking at the supervisory support data, you determine that\n\n* individual item columns are , and\n* according to the codebook, there are reverse-coded items in this questionnaire.\n\nI have outlined my steps as follows:\n\n* **Step 1**: Reverse-code the single column first because that's less hassle than having to do that with conditional statements (`Supervisor_15_R`). `mutate()` is my friend.\n* **Step 2**: I want to filter out everyone who failed the attention check in `Supervisor_7`. I can do this with a Boolean expression within the `filter()` function. 
The correct response was \"completely disagree\" which is 1.\n* **Step 3**: Select their id from time point 2 and all the columns that start with the word \"super\", apart from `Supervisor_7` and the original `Supervisor_15_R` column\n* **Step 4**: pivot into long format so I can calculate the averages better\n* **Step 5**: calculate the average scores per participant\n\nI've started coding but there are some errors in my code. Help me find and fix all of them. Try to go through the code line by line and read the error messages.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_ppr %>% \n mutate(Supervisor_15 = 9-supervisor_15_R) %>% \n filter(Supervisor_7 = 1) %>% \n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) \npivot_wider(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% \n group_by(Time2_Code) %>% \n summarise(Mean_Supervisor_Support = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## How many mistakes am I supposed to find?\n\nThere are 8 mistakes in the code.\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Reveal solution\n\nDid you spot all 8 mistakes? 
Let's go through them line by line.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsuper <- data_prp %>% # spelling mistake in data object\n mutate(Supervisor_15 = 8-Supervisor_15_R) %>% # semantic error: 8 minus response for a 7-point scale and supervisor_15_R needs a capital S\n filter(Supervisor_7 == 1) %>% # needs a Boolean expression == instead of =\n select(Code, starts_with(\"Super\"), -Supervisor_7, -Supervisor_15_R) %>% # no pipe at the end, the rest is actually legit\n pivot_longer(cols = -Code, names_to = \"Item\", values_to = \"Response\") %>% # pivot_longer instead of pivot_wider\n group_by(Code) %>% # Code rather than Time2_Code - the reduced dataset does not contain Time2_Code\n summarise(Mean_Supervisor_Support = mean(Response, na.rm = TRUE)) %>% # Score_corrected doesn't exist; needs to be Response\n ungroup()\n```\n:::\n\n\n* Note that the **semantic error** in line 2 will not give you an error message.\n* Were you thrown off by the `starts_with(\"Super\")` expression in line 4? `starts_with()` and `ends_with()` are great alternatives to selecting columns via `:` But, using `select(Code, Supervisor_1:Supervisor_6, Supervisor_8:Supervisor_14)` would have given us the same result. *[I admit, that one was perhaps a bit mean]*\n\n:::\n\n## Activity 6: Join everything together with `???_join()`\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: `inner_join()`, `left_join()`, `right_join()`, and `full_join()`. Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n::: {.callout-note icon=\"false\"}\n\n## Info on mutating joins\n\nYou have 4 types of join functions you could make use of. Click on the panels to know more\n\n::: panel-tabset\n\nA mutating join allows you to combine variables from two tables. 
It first matches observations by their keys, then copies across variables from one table to the other.\n\n## `inner_join()`\n\n`inner_join()` returns only the rows where the values in the column specified in the `by =` statement match in both tables.\n\n![inner_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/inner-join.gif)\n\n## `left_join()`\n\n`left_join()` retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the `by =` statement. Rows in the left table with no match in the right table will have missing values (`NA`) in the new columns.\n\n![left_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/left-join.gif)\n\n## `right_join()`\n\n`right_join()` retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the `by =` statement. Rows in the right table with no match in the left table will have missing values (`NA`) in the new columns.\n\n![right_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/right-join.gif)\n\n## `full_join()`\n\n`full_join()` returns all rows and all columns from both tables. `NA` values fill unmatched rows.\n\n![full_join(): gif by [Garrick Aden-Buie](https://www.garrickadenbuie.com/project/tidyexplain/){target=\"_blank\"}](images/full-join.gif)\n\n:::\n\n:::\n\nFrom our original `data_prp`, we need to select demographics data and all summarised questionnaire data from time point 2. Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the `Global Environment`.\n\nWhile you may be familiar with `inner_join()` from last year, for this task, we want to retain all data from all the data objects. 
Therefore, we will use `full_join()`. Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\n\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use `select()` at the end to order them better.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndata_prp_final <- data_prp %>% \n select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% \n full_join(qrp_t1) %>% \n full_join(understanding_t1) %>% \n full_join(sats_t1) %>% \n full_join(super) %>% \n select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n```\n:::\n\n\n\n::: {.callout-important icon=\"false\"}\n## No `by` argument in the code above? \n\nNote how I didn't include a `by` argument in the code above. If you leave `by =` out, R will join the 2 data objects by **ALL** columns that have the same name.\n\n**Special case 1: matching column names but different values**\n\nIf you want more control, you should include the `by` argument; for example, if both data objects include a column `age` but data was recorded at 2 different time points. In that case, the information from both `age` columns should be retained and the `by` argument would not include `age`.\n\n**Special case 2: different column names but matching values**\n\nAnother special case presents when both data objects contain identical information but the variable names don't match. Let's say, both data objects contain gender information, but in one data object the variable is named `gender` and in the other one `gender_label`. 
In that case, your `by` argument needs to be modified as: `by = join_by(gender == gender_label)`.\n\nMore info on joins can be found at [https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/](https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/){target=\"_blank\"}\n:::\n\nAnd this is basically the dataset we need for @sec-dataviz and @sec-dataviz2.\n\n\n\n## Activity 7: Knit and export\n\nKnit the `.Rmd` file to ensure everything runs as expected. Once it does, export the data object `data_prp_final` as a csv for use in @sec-dataviz. Name it something meaningful, something like `data_prp_for_ch4.csv`.\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nwrite_csv(data_prp_final, \"data_prp_for_ch4.csv\")\n```\n:::\n\n\n:::\n\n\n## [Pair-coding]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\nWe will once again be working with data from Binfet et al. (2021), which focuses on the randomised controlled trials data involving therapy dog interventions. Today, our goal is to calculate the average `Loneliness` score for each participant measured at time point 1 (pre-intervention) using the raw data file `dog_data_raw`. Currently, the data looks like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| L1_1| L1_2| L1_3| L1_4| L1_5| L1_6| L1_7| L1_8| L1_9| L1_10| L1_11| L1_12| L1_13| L1_14| L1_15| L1_16| L1_17| L1_18| L1_19| L1_20|\n|---:|----:|----:|----:|----:|----:|----:|----:|----:|----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|\n| 1| 3| 3| 4| 3| 2| 3| 1| 2| 3| 4| 3| 1| 3| 1| 2| 3| 2| 3| 2| 4|\n| 2| 3| 2| 3| 3| 4| 3| 2| 2| 4| 3| 2| 2| 1| 2| 4| 3| 3| 2| 4| 3|\n| 3| 3| 3| 2| 3| 3| 4| 2| 3| 3| 3| 2| 2| 2| 2| 3| 3| 4| 3| 3| 3|\n| 4| 4| 2| 2| 3| 4| 4| 1| 3| 3| 4| 2| 1| 2| 2| 4| 4| 3| 3| 4| 3|\n| 5| 2| 3| 3| 3| 4| 3| 2| 2| 3| 2| 4| 4| 4| 3| 2| 2| 3| 4| 3| 2|\n\n
\n:::\n:::\n\n\nBut we want the data to look like this:\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| Loneliness_pre|\n|---:|--------------:|\n| 1| 2.25|\n| 2| 1.90|\n| 3| 2.25|\n| 4| 1.75|\n| 5| 2.85|\n\n
\n:::\n:::\n\n\nThis task is a bit more challenging compared to last week's lab activity, as the Loneliness scale includes some reverse-coded items.\n\n### Task 1: Open the R project for the lab {.unnumbered}\n\n### Task 2: Open your `.Rmd` file from last week or create a new `.Rmd` file {.unnumbered}\n\nYou could continue the `.Rmd` file you used last week, or create a new `.Rmd`. If you need some guidance, have a look at @sec-rmd.\n\n### Task 3: Load in the library and read in the data {.unnumbered}\n\nThe data should already be in your project folder. If you want a fresh copy, you can download the data again here: [data_pair_ch1](data/data_pair_ch1.zip \"download\").\n\nWe are using the package `tidyverse` today, and the datafile we should read in is `dog_data_raw.csv`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(???)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"???\")\n```\n:::\n\n\n:::\n\n### Task 4: Calculating the mean for `Loneliness_pre` {.unnumbered}\n\n* **Step 1**: Select all relevant columns, such as the participant ID and all 20 items of the `Loneliness` questionnaire completed by participants before the intervention. Store this data in an object called `data_loneliness`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nLook at the codebook. 
Try to figure out\n\n* the variable name of the column in which the participant id is stored, and\n* which items relate to the Loneliness scale at Stage \"pre\"\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* the participant id column is called `RID`\n* The Loneliness items at pre-intervention stage start with `L1_`\n\n:::\n\n:::\n\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score (in step 3) and calculate the average score (in step 4) more easily\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\n`pivot_`\n\nWe also need 3 arguments in that function:\n\n* the columns we want to select (e.g., all the loneliness items),\n* the name of the column in which the current column headings will be stored (e.g., \"Qs\"), and\n* the name of the column that should store all the values (e.g., \"Responses\").\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n pivot_longer(cols = ???, names_to = \"???\", values_to = \"???\")\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 3**: Reverse-scoring\n\nIdentify the items on the `Loneliness` scale that are reverse-coded, and then reverse-score them accordingly.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\nWe need to figure out:\n\n* which are the items of the loneliness scale we need to reverse-score\n* what is the measuring scale of loneliness so we can determine the new values\n* which function to use to create a new column that has the corrected scores in it\n* which one of the `case_` functions will get us there\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n* The items to be reverse-coded can be found in the codebook: L1_1, L1_5, L1_6, L1_9, L1_10, L1_15, L1_16, L1_19, L1_20\n* the loneliness scale ranges from 1 to 4, so we need to replace 1 with 4, 2 with 3, 3 with 2, and 4 with 1\n* the function to 
create a new column `mutate()`\n* it's a conditional statement rather than \"just\" replacing values, hence we need `case_when()`\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n mutate(Score_corrected = case_when(\n ??? ~ ???,\n .default = ???\n ))\n```\n:::\n\n\n:::\n\n:::\n\n* **Step 4**: Calculate the average Loneliness score per participant. To match with the table above, we want to call this column `Loneliness_pre`\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hint\n\ngrouping and summarising\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## More concrete hint\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n group_by(???) %>% \n summarise(Loneliness_pre = ???(???)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# loading tidyverse into the library\nlibrary(tidyverse)\n\n# reading in `dog_data_raw.csv`\ndog_data_raw <- read_csv(\"dog_data_raw.csv\")\n\n# Task 4: Tidying \nloneliness_tidy <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L1\")) %>% # select(RID, L1_1:L1_20) also works\n # Step 2\n pivot_longer(cols = -RID, names_to = \"Qs\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score_corrected = case_when(\n Qs %in% c(\"L1_1\", \"L1_5\", \"L1_6\", \"L1_9\", \"L1_10\", \"L1_15\", \"L1_16\", \"L1_19\", \"L1_20\") ~ 5-Response,\n .default = Response\n )) %>% \n # Step 4\n group_by(RID) %>% \n summarise(Loneliness_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n\n:::\n\n## [Test your knowledge and challenge yourself]{style=\"color: #F39C12; text-transform: uppercase;\"} {.unnumbered}\n\n### Knowledge check {.unnumbered}\n\n\n#### Question 1 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to recode an existing variable? 
\n\n\n#### Question 2 {.unnumbered}\n\nWhen using `mutate()`, which additional function could you use to create a new variable based on one or multiple conditional statements? \n\n\n#### Question 3 {.unnumbered}\n\nWhich of the following functions would you use if you wanted to join two data sets by their shared identifier? \n\n\n#### Question 4 {.unnumbered}\n\nYour data object contains a column `Score` with numbers, but they have been read in incorrectly as a character datatype. Which of the following functions would *not* work for fixing this issue? \n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Explain this answer\n\n* `parse_number()` from the `readr` package extracts numeric values from strings, so this would work.\n* `factor(Score)`: This would *not* work as expected because it converts the column into a factor, not a numeric datatype, leading to incorrect results if numeric operations are needed.\n* `mutate(Score = as.numeric(Score))`: This would work too because `mutate()` can be used in combination with `as.numeric()` to create a new numeric column or override the existing character column.\n* `as.numeric()`: This would also work to convert a character column to numeric. Without mutate, you could use it in a BaseR way, e.g., `data$Score <- as.numeric(data$Score)` (*shudder, BaseR!!! But effective*)\n\n:::\n\n\n\n### Challenge yourself {.unnumbered}\n\nIf you want to **challenge yourself** and further apply the skills from Chapter 3, you could wrangle the data from `dog_data_raw` for one of the other questionnaires. 
There are plenty of options to choose from:\n\n::: {.callout-tip collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: easy\n\n* recode column `Live_Pets` so the values read yes and no rather than 1 and 2\n* recode `Year_of_Study` so they have the labels from the codebook rather than the numbers\n* reverse-code the `Homesickness` scale for `_pre` and `_post`\n* renaming the columns of the other one-item scales as `Stress_pre`, `Stress_post`, `Engagement_pre` and `Engagement_post`\n\nAny of these tasks should be doable in one step. No need to select or pivot anything. You could just modify `dog_data_raw`.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* For the **recoding tasks**, you need to work out which function to use to recode one value as another - just plain replacing, no conditional statements\n* The **reverse-coding** might sound daunting to do in one step, but it is only a single value that needs to be recoded. Take some inspiration from Activity 5 (error mode).\n* For the **renaming tasks**, check how you would change column names without reducing the number of columns overall\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - easy**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Live_Pets\ndog_data_raw <- dog_data_raw %>%\n mutate(Live_Pets = case_match(Live_Pets,\n 1 ~ \"yes\",\n 2 ~ \"no\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Year of Study\ndog_data_raw <- dog_data_raw %>%\n mutate(Year_of_Study = case_match(Year_of_Study,\n 1 ~ \"First\",\n 2 ~ \"Second\",\n 3 ~ \"Third\",\n 4 ~ \"Fourth\",\n 5 ~ \"Fifth or above\"))\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Reverse-coding of homesickness pre and post. 
It's a 5-point scale, hence you'd calculate 6-the original response column\ndog_data_raw <- dog_data_raw %>% \n mutate(Homesick_pre = 6-HO1_1,\n Homesick_post = 6-HO2_1)\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## Renaming of Stress and Engagement\ndog_data_raw <- dog_data_raw %>% \n rename(Stress_pre = S1_1, Stress_post = S2_1, Engagement_pre = HO1_2, Engagement_post = HO2_2)\n```\n:::\n\n:::\n:::\n\n::: {.callout-warning collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: medium\n\n* reverse-code the Social connectedness scale (pre-intervention) and compute a mean score per participant\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\nThis task would take 4 steps to complete. These are the exact same steps we applied to `Loneliness_pre` in the lab activity. You would just need to figure out which items are related to the Social connectedness scale (pre-intervention) and which ones of those are reverse-coded. The codebook has all the answers.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - medium**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS pre\nscs_pre <- dog_data_raw %>% \n select(RID, starts_with(\"SC1\")) %>% \n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n mutate(Score_corrected = case_when(\n Names %in% c(\"SC1_3\", \"SC1_6\", \"SC1_7\", \"SC1_9\", \"SC1_11\", \"SC1_13\", \"SC1_15\", \"SC1_17\", \"SC1_18\", \"SC1_20\") ~ 7-Response,\n .default = Response\n )) %>% \n group_by(RID) %>% \n summarise(SCS_pre = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: hard\n\n* reverse-code the Loneliness scale (post-intervention) and compute a mean score per participant\n* reverse-code the Social connectedness scale (post-intervention) and compute a mean score per participant\n\nBoth 
activities are similar to Activity 3 from the individual walkthrough and would take about 5 steps to complete. **Start by mapping out the steps**.\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n* **Step 1**: Select all relevant columns, such as participant ID and all the items that belong to the questionnaire that participants completed after the intervention\n* **Step 2**: Pivot the data from wide format to long format so we can reverse-score and calculate the average score more easily\n* **Step 3**: Recode the initial responses so that the new column has numbers instead of labels\n* **Step 4**: Reverse-score the items that are labelled as \"Reverse\" in the codebook and then reverse-score them\n* **Step 5**: Group by and summarise to calculate the mean Score\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## loneliness post\nlonely_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"L2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Score = case_match(Response,\n \"never\" ~ 1,\n \"rarely\" ~ 2,\n \"sometimes\" ~ 3,\n \"often\" ~ 4,\n .default = NA\n ),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"L2_1\", \"L2_5\", \"L2_6\", \"L2_9\", \"L2_10\", \"L2_15\", \"L2_16\", \"L2_19\", \"L2_20\") ~ 5-Score,\n .default = Score\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(Loneliness_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n## SCS post\nscs_post <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"SC2\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Names\", values_to = \"Response\") %>% \n # Step 3\n mutate(Response = case_match(Response,\n 
\"strongly disagree\" ~ \"1\",\n \"strongly agree\" ~ \"6\",\n .default = Response),\n Response = parse_number(Response),\n # Step 4 - we are still in the same mutate function (count the brackets)\n Score_corrected = case_when(\n Names %in% c(\"SC2_3\", \"SC2_6\", \"SC2_7\", \"SC2_9\", \"SC2_11\", \"SC2_13\", \"SC2_15\", \"SC2_17\", \"SC2_18\", \"SC2_20\") ~ 7-Response,\n .default = Response\n )) %>% \n # Step 5\n group_by(RID) %>% \n summarise(SCS_post = mean(Score_corrected, na.rm = TRUE)) %>% \n ungroup()\n```\n:::\n\n:::\n:::\n\n::: {.callout-important collapse=\"true\" icon=\"false\"}\n\n## Difficulty level: extra hard\n\n* PANAS: positive and negative affect of pre- and post-intervention in a single pipe rather than in 4 different data objects (see last week's)\n\nThis task would take about 7 steps to get it from\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID| PN1_1| PN1_2| PN1_3| PN1_4| PN1_5| PN1_6| PN1_7| PN1_8| PN1_9| PN1_10| PN2_1| PN2_2| PN2_3| PN2_4| PN2_5| PN2_6| PN2_7| PN2_8| PN2_9| PN2_10|\n|---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|------:|\n| 1| 1| 1| 1| 1| 4| 1| 4| 3| 1| 4| 2| 1| 3| 1| 4| 1| 4| 4| 1| 4|\n| 2| 1| 2| 3| 2| 1| 3| 3| 4| 1| 4| 1| 1| 2| 1| 3| 1| 3| 4| 1| 4|\n| 3| 1| 1| 3| 1| 2| 4| 4| 3| 1| 2| 2| 2| 3| 1| 3| 2| 4| 3| 1| 2|\n| 4| 1| 1| 5| 1| 4| 3| 5| 5| 3| 2| 1| 1| 5| 1| 4| 3| 4| 4| 2| 2|\n| 5| 2| 3| 5| 2| 3| 2| 3| 4| 2| 2| 1| 2| 5| 2| 3| 2| 4| 5| 1| 3|\n\n
\n:::\n:::\n\n\nto\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n
\n\n| RID|Stage | PANAS_NA| PANAS_PA|\n|---:|:-----|--------:|--------:|\n| 1|post | 1.2| 3.8|\n| 1|pre | 1.0| 3.2|\n| 2|post | 1.0| 3.2|\n| 2|pre | 1.8| 3.0|\n| 3|post | 1.6| 3.0|\n\n
\n:::\n:::\n\n\n::: {.callout-note collapse=\"true\" icon=\"false\"}\n\n## Hints\n\n**Start by mapping out the steps**\n\n* **Step 1**: select all relevant columns, such as participant ID and all the items that belong to PANAs scale (pos, neg, pre, and post)\n* **Step 2**: pivot the data from wide format to long format. You want to do that for ALL columns that are not the participant id. The data object should have 3 columns and 5680 observations, i.e. each participant has 20 rows.\n* **Step 3**: All of the items will have the structure `PN1_1`. Use separate to split the information across 2 columns. First column has information about the `Stage`, second column should turn into an `Item_number` and it should convert into a numeric column in the process to save you typing quotation marks in Step 5.\\\n* **Step 4**: recode the `Stage` column you just created so that everything that starts with PN1 relates to \"pre\" and PN2 as post.\n* **Step 5**: identify the subscales positive affect (PA) and negative affect (NA) by item number and recode them. This requires a conditional statement.\n* **Step 6**: group by and summarise to calculate the mean Score\n* **Step 7**: pivot, so that you have the 2 PANAS subscales presented in separate columns (see table above). 
You might need an extra step if the columns aren't labelled exactly as shown in the table above.\n:::\n\n::: {.callout-caution collapse=\"true\" icon=\"false\"}\n\n## Solution for **Challenge yourself - extra hard**\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nPANAS <- dog_data_raw %>% \n # Step 1\n select(RID, starts_with(\"PN\")) %>% \n # Step 2\n pivot_longer(cols = -RID, names_to = \"Items\", values_to = \"Scores\") %>% \n # Step 3\n separate(Items, into = c(\"Stage\", \"Item_number\"), sep = \"_\", convert = TRUE) %>% \n # Step 4 recode Stage\n mutate(Stage = case_match(Stage,\n \"PN1\" ~ \"pre\",\n \"PN2\" ~ \"post\")) %>% \n # Step 5 identify subscales by item number\n mutate(Subscales = case_when(\n Item_number %in% c(3, 5, 7, 8, 10) ~ \"PANAS_PA\",\n .default = \"PANAS_NA\"\n )) %>% \n # Step 6 \n group_by(RID, Stage, Subscales) %>% \n summarise(Score = mean(Scores)) %>% \n ungroup() %>% \n # Step 7 - to make the data look like the data in `dog_data_clean_long.csv`\n pivot_wider(names_from = Subscales, values_from = Score)\n```\n:::\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/docs/03-wrangling2.html b/docs/03-wrangling2.html index b835d5b..5de718b 100644 --- a/docs/03-wrangling2.html +++ b/docs/03-wrangling2.html @@ -1146,7 +1146,7 @@

Note: Since I (Gaby) like my columns arranged in a meaningful way, I will use select() at the end to order them better.

data_prp_final <- data_prp %>% 
-  select(Code:Plan_prereg, Other_OS_behav_2:Time2_Understanding_OS) %>% 
+  select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% 
   full_join(qrp_t1) %>% 
   full_join(understanding_t1) %>% 
   full_join(sats_t1) %>% 
diff --git a/docs/search.json b/docs/search.json
index 45f6ced..81ebab8 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -200,7 +200,7 @@
     "href": "03-wrangling2.html#activity-6-join-everything-together-with-_join",
     "title": "3  Data wrangling II",
     "section": "\n3.6 Activity 6: Join everything together with ???_join()\n",
-    "text": "3.6 Activity 6: Join everything together with ???_join()\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: inner_join(), left_join(), right_join(), and full_join(). Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n\n\n\n\n\nInfo on mutating joins\n\n\n\nYou have 4 types of join functions you could make use of. Click on the panels to know more\n\n\ninner_join()\nleft_join()\nright_join()\nfull_join()\n\n\n\ninner_join() returns only the rows where the values in the column specified in the by = statement match in both tables.\n\n\ninner_join(): gif by Garrick Aden-Buie\n\n\n\nleft_join() retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the by = statement. Rows in the left table with no match in the right table will have missing values (NA) in the new columns.\n\n\nleft_join(): gif by Garrick Aden-Buie\n\n\n\nright_join() retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the by = statement. Rows in the right table with no match in the left table will have missing values (NA) in the new columns.\n\n\nright_join(): gif by Garrick Aden-Buie\n\n\n\nfull_join() returns all rows and all columns from both tables. NA values fill unmatched rows.\n\n\nfull_join(): gif by Garrick Aden-Buie\n\n\n\n\n\n\nFrom our original data_prp, we need to select demographics data and all summarised questionnaire data from time point 2. 
Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the Global Environment.\nWhile you may be familiar with inner_join() from last year, for this task, we want to retain all data from all the data objects. Therefore, we will use full_join(). Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use select() at the end to order them better.\n\ndata_prp_final <- data_prp %>% \n  select(Code:Plan_prereg, Other_OS_behav_2:Time2_Understanding_OS) %>% \n  full_join(qrp_t1) %>% \n  full_join(understanding_t1) %>% \n  full_join(sats_t1) %>% \n  full_join(super) %>% \n  select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n\n\n\n\n\n\n\nNo by argument in the code above?\n\n\n\nNote how I didn’t include a by argument in the code above. If you leave by = out, R will join the 2 data objects by ALL columns that have the same name.\nSpecial case 1: matching column names but different values\nIf you want more control, you should include the by argument; for example, if both data objects include a column age but data was recorded at 2 different time points. In that case, the information from both age columns should be retained and the by argument would not include age.\nSpecial case 2: different column names but matching values\nAnother special case presents when both data objects contain identical information but the variable names don’t match. Let’s say, both data objects contain gender information, but in one data object the variable is named gender and in the other one gender_label. 
In that case, your by argument needs to be modified as: by = join_by(gender == gender_label).\nMore info on joins can be found https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/\n\n\nAnd this is basically the dataset we need for Chapter 4 and ?sec-dataviz2."
+    "text": "3.6 Activity 6: Join everything together with ???_join()\n\nTime to join all the relevant data files into a single dataframe, which will be used in the next chapters on data visualization. There are four ways to join data: inner_join(), left_join(), right_join(), and full_join(). Each function behaves differently in terms of what information is retained from the two data objects. Here is a quick overview:\n\n\n\n\n\n\nInfo on mutating joins\n\n\n\nYou have 4 types of join functions you could make use of. Click on the panels to know more\n\n\ninner_join()\nleft_join()\nright_join()\nfull_join()\n\n\n\ninner_join() returns only the rows where the values in the column specified in the by = statement match in both tables.\n\n\ninner_join(): gif by Garrick Aden-Buie\n\n\n\nleft_join() retains the complete first (left) table and adds values from the second (right) table that have matching values in the column specified in the by = statement. Rows in the left table with no match in the right table will have missing values (NA) in the new columns.\n\n\nleft_join(): gif by Garrick Aden-Buie\n\n\n\nright_join() retains the complete second (right) table and adds values from the first (left) table that have matching values in the column specified in the by = statement. Rows in the right table with no match in the left table will have missing values (NA) in the new columns.\n\n\nright_join(): gif by Garrick Aden-Buie\n\n\n\nfull_join() returns all rows and all columns from both tables. NA values fill unmatched rows.\n\n\nfull_join(): gif by Garrick Aden-Buie\n\n\n\n\n\n\nFrom our original data_prp, we need to select demographics data and all summarised questionnaire data from time point 2. 
Next, we will join this with all other aggregated datasets from time point 1 which are currently stored in separate data objects in the Global Environment.\nWhile you may be familiar with inner_join() from last year, for this task, we want to retain all data from all the data objects. Therefore, we will use full_join(). Keep in mind, you can only join two data objects at a time, so the upcoming code chunk will involve a fair bit of piping and joining.\nNote: Since I (Gaby) like my columns arranged in a meaningful way, I will use select() at the end to order them better.\n\ndata_prp_final <- data_prp %>% \n  select(Code:Plan_prereg, Pre_reg_group:Time2_Understanding_OS) %>% \n  full_join(qrp_t1) %>% \n  full_join(understanding_t1) %>% \n  full_join(sats_t1) %>% \n  full_join(super) %>% \n  select(Code:Plan_prereg, Pre_reg_group, SATS28_Affect_Time1_mean, SATS28_CognitiveCompetence_Time1_mean, SATS28_Value_Time1_mean, SATS28_Difficulty_Time1_mean, QRPs_Acceptance_Time1_mean, Time1_Understanding_OS, Other_OS_behav_2:Time2_Understanding_OS, Mean_Supervisor_Support)\n\n\n\n\n\n\n\nNo by argument in the code above?\n\n\n\nNote how I didn’t include a by argument in the code above. If you leave by = out, R will join the 2 data objects by ALL columns that have the same name.\nSpecial case 1: matching column names but different values\nIf you want more control, you should include the by argument; for example, if both data objects include a column age but data was recorded at 2 different time points. In that case, the information from both age columns should be retained and the by argument would not include age.\nSpecial case 2: different column names but matching values\nAnother special case presents when both data objects contain identical information but the variable names don’t match. Let’s say, both data objects contain gender information, but in one data object the variable is named gender and in the other one gender_label. 
In that case, your by argument needs to be modified as: by = join_by(gender == gender_label).\nMore info on joins can be found https://www.tidyverse.org/blog/2023/01/dplyr-1-1-0-joins/\n\n\nAnd this is basically the dataset we need for Chapter 4 and ?sec-dataviz2."
   },
   {
     "objectID": "03-wrangling2.html#activity-7-knit-and-export",