Correlation Coefficients by Teams and Tasks.Rmd

---
title: "Correlation indices - Forecast Tournament"
author: "Sangsuk Yoon & Igor Grossmann"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: 
  html_document:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r, echo=T, warning=F, message=F}
library(tidyverse)
library(dplyr)
library(reshape2)
```

# Read the dataset
```{r, echo=T, warning=F, message=F}
dat <- read.csv("dat_for_analyses.csv", stringsAsFactors = FALSE) # Read the dataset
```

# Getting Correlation Coefficients
## Part 1: For Phase 1 - Both Academics and General Population - to allow comparison
```{r, warning=F}
# Getting the Objective Values
objective <- dat %>%
  filter(Method == "Objective", phase == 1) %>%
  dplyr::select(domain:Month.12)


# Getting Phase 1 Data both for Academics and General Population
phase1_for_correlation <- dat %>% filter(phase == 1 & Method != "Objective")

# Creating an empty dataset to store correlation coefficients
correlation_phase1 <- data.frame(matrix(NA, nrow = nrow(phase1_for_correlation), ncol = 5))
colnames(correlation_phase1) <- c("X", "ResponseId", "domain", "phase1_cor_twelve", "phase1_cor_six")

# Looping the Phase 1 data
for (i in 1:nrow(phase1_for_correlation)) {
  
  temp_dat <- phase1_for_correlation[i,] # data for each team, each domain 
  
  # key variables to connect correlation coefficients with the original dataset
  temp_X <- temp_dat$X # X value in the dataset
  temp_responseid <- temp_dat$ResponseId # ResponseId in the dataset
  temp_domain <- temp_dat$domain # Domain name in the dataset
  
  # Reshape the Objective data from wide to long
  task_actual <- subset(objective, domain == temp_domain) %>% 
    pivot_longer(!domain, names_to="month", values_to="actual")
  
  # Reshape the team and the domain data from wide to long
  team_task_estimates <- temp_dat %>% 
    dplyr::select(domain:Month.12) %>%  
    pivot_longer(!domain, names_to="month", values_to="estimate")
  
  # Combine the objective data and estimates data
  temp_combined <- left_join(team_task_estimates, task_actual, by = c("domain", "month"))
  
  # Getting correlation coefficient for 12 months data (Month1 through Month12)
  temp_sd_twelve <- with(temp_combined, sd(estimate, na.rm=T)) # Getting SD of predictions across 12 months
  if (temp_sd_twelve == 0) {
    phase1_cor_twelve <- NA
  } else {
    phase1_cor_twelve <- with(temp_combined, cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
  }
  
  # Getting correlation coefficient for last 6 months data (Month7 through Month12)
  temp_sd_six <- with(temp_combined[7:12,], sd(estimate, na.rm=T)) # Getting SD of predictions across last six months
  if (temp_sd_six == 0) {
    phase1_cor_last_six <- NA
  } else {
    phase1_cor_last_six <- with(temp_combined[7:12,], cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
  }
  
  # Adding the correlation coefficients to the empty data frame for each team and for each domain
  correlation_phase1[i,] <- c(temp_X, temp_responseid, temp_domain, phase1_cor_twelve, phase1_cor_last_six)

}

```

## Part 2: For Phase 2 - Academics
```{r, warning=F}
# Academics & Phase 2
phase2_for_correlation <- dat %>%
  filter(isExpert == 1 & !(phase == 1 & revised == 1)) #just academics, omitting original (non-revised phase 1) - got this from Igor

# Creating an empty dataset where correlation coefficients will be stored
correlation_phase2 <- data.frame(matrix(NA, nrow = nrow(phase2_for_correlation), ncol = 5))
colnames(correlation_phase2) <- c("X", "ResponseId", "domain", "phase2_cor_twelve", "phase2_cor_six")

for (i in 1:nrow(phase2_for_correlation)) {
  
  temp_dat <- phase2_for_correlation[i,] # data for each team and each domain
  
  # key variables to connect correlation coefficients with the original dataset
  temp_X <- temp_dat$X # X value in the dataset
  temp_responseid <- temp_dat$ResponseId # ResponseId in the dataset
  temp_domain <- temp_dat$domain # task domain in the dataset
  
  # Reshape the Objective data from wide to long
  task_actual <- subset(objective, domain == temp_domain) %>% 
    pivot_longer(!domain, names_to="month", values_to="actual")
  
  # Reshape the team and the domain data from wide to long
  team_task_estimates <- temp_dat %>% 
    dplyr::select(domain:Month.12) %>%  
    pivot_longer(!domain, names_to="month", values_to="estimate")
  
  # Combine the objective data and estimates data
  temp_combined <- left_join(team_task_estimates, task_actual, by = c("domain", "month"))
  
  # Getting correlation coefficient for 12 months data (Month1 through Month12)
  temp_sd_twelve <- with(temp_combined, sd(estimate, na.rm=T)) # Getting SD of predictions across 12 months
  if (temp_sd_twelve == 0) {
    phase2_cor_twelve <- NA
  } else {
    phase2_cor_twelve <- with(temp_combined, cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
  }
  
  # Getting correlation coefficient for last 6 months data (Month7 through Month12)
  temp_sd_six <- with(temp_combined[7:12,], sd(estimate, na.rm=T)) # Getting SD of predictions across last six months
  if (temp_sd_six == 0) {
    phase2_cor_last_six <- NA
  } else {
    phase2_cor_last_six <- with(temp_combined[7:12,], cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
  }
  
  # Adding the correlation coefficients to the empty data frame for each team and for each domain
  correlation_phase2[i,] <- c(temp_X, temp_responseid, temp_domain, phase2_cor_twelve, phase2_cor_last_six)

}
```


# Combining correlation tables and the original data table
```{r, warning=F}
# Merge correlation tables from both phases
correlation_table_full <- full_join(correlation_phase1, correlation_phase2, by=c("X", "ResponseId", "domain"))
correlation_table_full$X <- as.integer(correlation_table_full$X)
correlation_table_full$phase1_cor_twelve<-as.numeric(correlation_table_full$phase1_cor_twelve)
correlation_table_full$phase2_cor_six<-as.numeric(correlation_table_full$phase2_cor_six)

# Merge the combined correlation data table with the original dataset
dat <- left_join(dat, correlation_table_full, by=c("X", "ResponseId", "domain"))
```

# associations with MASE scores in each tournament for each group
```{r}
academic_only <- filter(dat, isExpert == 1 )
#datasets that are filtered by phase (1 = May submission, 2 = November submission that were updated)
phase1 <- filter(dat, phase == 1)
phase2 <- academic_only %>% 
  filter(!(phase == 1 & revised == 1))

# Phase 1 further filtered to only include academics 
phase1_exp <- filter(phase1, isExpert == 1)
```

# test average correlations by domain and when comparing groups. 
```{r}

library(correlation)

#phase 1
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, team_name, domain)  %>%  correlation(multilevel=T)
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation()
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(mean) #get the mean
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(median) #get the median


phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, ResponseId, domain)  %>%  correlation(multilevel=T)
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation()
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(mean)
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(median)

#phase 1 in total
phase1 %>%dplyr::select(phase1_cor_twelve, MASE1_w1, ResponseId,domain)  %>% correlation(multilevel=T)

#phase 2
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, team_name, domain)  %>%  correlation(multilevel=T)
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain)  %>% group_by(domain)  %>% correlation()
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(mean)
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain)  %>% group_by(domain)  %>% correlation() %>% summarize_all(median)
```

# compare effects of lay versus academics
```{r}

library(lme4)
library(emmeans)
library(jtools)
#phase 1
model.phase1.corr<-  lmer(phase1_cor_twelve~domain*isExpert.factor+(1|ResponseId), data=phase1)
car::Anova(model.phase1.corr,type="III", test.statistic="F")
emmeans(model.phase1.corr, ~isExpert.factor) #collapsed estimate across domain # it is very small

emmeans(model.phase1.corr, pairwise~isExpert.factor|domain)  #specific domains
model.phase1.corr.explaycomp<-as.data.frame(emmeans(model.phase1.corr, pairwise~isExpert.factor|domain)$contrasts) #get the estimates in a dataframe

#get FDR correction across all pairwise tests
model.phase1.corr.explaycomp$Hochberg <-p.adjust(model.phase1.corr.explaycomp$p.value,
               method = "hochberg")
model.phase1.corr.explaycomp
#neg affect is the only domain with a difference, but in the opposite direction

#phase 2
model.phase2.corr<-  lmer(phase2_cor_six~domain+(1|ResponseId), data=phase2)
car::Anova(model.phase2.corr,type="III", test.statistic="F")
summ(model.phase2.corr)
emmeans(model.phase2.corr, ~1) #collapsed estimate across domain # it is very small


```