forked from grossmania/Forecasting-Tournament
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Correlation Coefficients by Teams and Tasks.Rmd
219 lines (169 loc) · 9.3 KB
/
Correlation Coefficients by Teams and Tasks.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
---
title: "Correlation indices - Forecast Tournament"
author: "Sangsuk Yoon & Igor Grossmann"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
code_folding: hide
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r, echo=T, warning=F, message=F}
library(tidyverse)
library(dplyr)
library(reshape2)
```
# Read the dataset
```{r, echo=T, warning=F, message=F}
dat <- read.csv("dat_for_analyses.csv", stringsAsFactors = FALSE) # Read the dataset
```
# Getting Correlation Coefficients
## Part 1: For Phase 1 - Both Academics and General Population - to allow comparison
```{r, warning=F}
# Getting the Objective Values
objective <- dat %>%
filter(Method == "Objective", phase == 1) %>%
dplyr::select(domain:Month.12)
# Getting Phase 1 Data both for Academics and General Population
phase1_for_correlation <- dat %>% filter(phase == 1 & Method != "Objective")
# Creating an empty dataset to store correlation coefficients
correlation_phase1 <- data.frame(matrix(NA, nrow = nrow(phase1_for_correlation), ncol = 5))
colnames(correlation_phase1) <- c("X", "ResponseId", "domain", "phase1_cor_twelve", "phase1_cor_six")
# Looping the Phase 1 data
for (i in 1:nrow(phase1_for_correlation)) {
temp_dat <- phase1_for_correlation[i,] # data for each team, each domain
# key variables to connect correlation coefficients with the original dataset
temp_X <- temp_dat$X # X value in the dataset
temp_responseid <- temp_dat$ResponseId # ResponseId in the dataset
temp_domain <- temp_dat$domain # Domain name in the dataset
# Reshape the Objective data from wide to long
task_actual <- subset(objective, domain == temp_domain) %>%
pivot_longer(!domain, names_to="month", values_to="actual")
# Reshape the team and the domain data from wide to long
team_task_estimates <- temp_dat %>%
dplyr::select(domain:Month.12) %>%
pivot_longer(!domain, names_to="month", values_to="estimate")
# Combine the objective data and estimates data
temp_combined <- left_join(team_task_estimates, task_actual, by = c("domain", "month"))
# Getting correlation coefficient for 12 months data (Month1 through Month12)
temp_sd_twelve <- with(temp_combined, sd(estimate, na.rm=T)) # Getting SD of predictions across 12 months
if (temp_sd_twelve == 0) {
phase1_cor_twelve <- NA
} else {
phase1_cor_twelve <- with(temp_combined, cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
}
# Getting correlation coefficient for last 6 months data (Month7 through Month12)
temp_sd_six <- with(temp_combined[7:12,], sd(estimate, na.rm=T)) # Getting SD of predictions across last six months
if (temp_sd_six == 0) {
phase1_cor_last_six <- NA
} else {
phase1_cor_last_six <- with(temp_combined[7:12,], cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
}
# Adding the correlation coefficients to the empty data frame for each team and for each domain
correlation_phase1[i,] <- c(temp_X, temp_responseid, temp_domain, phase1_cor_twelve, phase1_cor_last_six)
}
```
## Part 2: For Phase 2 - Academics
```{r, warning=F}
# Academics & Phase 2
phase2_for_correlation <- dat %>%
filter(isExpert == 1 & !(phase == 1 & revised == 1)) #just academics, omitting original (non-revised phase 1) - got this from Igor
# Creating an empty dataset where correlation coefficients will be stored
correlation_phase2 <- data.frame(matrix(NA, nrow = nrow(phase2_for_correlation), ncol = 5))
colnames(correlation_phase2) <- c("X", "ResponseId", "domain", "phase2_cor_twelve", "phase2_cor_six")
for (i in 1:nrow(phase2_for_correlation)) {
temp_dat <- phase2_for_correlation[i,] # data for each team and each domain
# key variables to connect correlation coefficients with the original dataset
temp_X <- temp_dat$X # X value in the dataset
temp_responseid <- temp_dat$ResponseId # ResponseId in the dataset
temp_domain <- temp_dat$domain # task domain in the dataset
# Reshape the Objective data from wide to long
task_actual <- subset(objective, domain == temp_domain) %>%
pivot_longer(!domain, names_to="month", values_to="actual")
# Reshape the team and the domain data from wide to long
team_task_estimates <- temp_dat %>%
dplyr::select(domain:Month.12) %>%
pivot_longer(!domain, names_to="month", values_to="estimate")
# Combine the objective data and estimates data
temp_combined <- left_join(team_task_estimates, task_actual, by = c("domain", "month"))
# Getting correlation coefficient for 12 months data (Month1 through Month12)
temp_sd_twelve <- with(temp_combined, sd(estimate, na.rm=T)) # Getting SD of predictions across 12 months
if (temp_sd_twelve == 0) {
phase2_cor_twelve <- NA
} else {
phase2_cor_twelve <- with(temp_combined, cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
}
# Getting correlation coefficient for last 6 months data (Month7 through Month12)
temp_sd_six <- with(temp_combined[7:12,], sd(estimate, na.rm=T)) # Getting SD of predictions across last six months
if (temp_sd_six == 0) {
phase2_cor_last_six <- NA
} else {
phase2_cor_last_six <- with(temp_combined[7:12,], cor(estimate, actual, method = c("pearson"), use = "pairwise.complete.obs"))
}
# Adding the correlation coefficients to the empty data frame for each team and for each domain
correlation_phase2[i,] <- c(temp_X, temp_responseid, temp_domain, phase2_cor_twelve, phase2_cor_last_six)
}
```
# Combining correlation tables and the original data table
```{r, warning=F}
# Merge correlation tables from both phases
correlation_table_full <- full_join(correlation_phase1, correlation_phase2, by=c("X", "ResponseId", "domain"))
correlation_table_full$X <- as.integer(correlation_table_full$X)
correlation_table_full$phase1_cor_twelve<-as.numeric(correlation_table_full$phase1_cor_twelve)
correlation_table_full$phase2_cor_six<-as.numeric(correlation_table_full$phase2_cor_six)
# Merge the combined correlation data table with the original dataset
dat <- left_join(dat, correlation_table_full, by=c("X", "ResponseId", "domain"))
```
# associations with MASE scores in each tournament for each group
```{r}
academic_only <- filter(dat, isExpert == 1 )
#datasets that are filtered by phase (1 = May submission, 2 = November submission that were updated)
phase1 <- filter(dat, phase == 1)
phase2 <- academic_only %>%
filter(!(phase == 1 & revised == 1))
# Phase 1 further filtered to only include academics
phase1_exp <- filter(phase1, isExpert == 1)
```
# test average correlations by domain and when comparing groups.
```{r}
library(correlation)
#phase 1
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, team_name, domain) %>% correlation(multilevel=T)
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation()
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(mean) #get the mean
phase1_exp %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(median) #get the median
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, ResponseId, domain) %>% correlation(multilevel=T)
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation()
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(mean)
phase1 %>%filter(isExpert==0) %>% dplyr::select(phase1_cor_twelve, MASE1_w1, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(median)
#phase 1 in total
phase1 %>%dplyr::select(phase1_cor_twelve, MASE1_w1, ResponseId,domain) %>% correlation(multilevel=T)
#phase 2
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, team_name, domain) %>% correlation(multilevel=T)
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain) %>% group_by(domain) %>% correlation()
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(mean)
phase2 %>% dplyr::select(phase2_cor_six, MASE1_w2, domain) %>% group_by(domain) %>% correlation() %>% summarize_all(median)
```
# compare effects of lay versus academics
```{r}
library(lme4)
library(emmeans)
library(jtools)
#phase 1
model.phase1.corr<- lmer(phase1_cor_twelve~domain*isExpert.factor+(1|ResponseId), data=phase1)
car::Anova(model.phase1.corr,type="III", test.statistic="F")
emmeans(model.phase1.corr, ~isExpert.factor) #collapsed estimate across domain # it is very small
emmeans(model.phase1.corr, pairwise~isExpert.factor|domain) #specific domains
model.phase1.corr.explaycomp<-as.data.frame(emmeans(model.phase1.corr, pairwise~isExpert.factor|domain)$contrasts) #get the estimates in a dataframe
#get FDR correction across all pairwise tests
model.phase1.corr.explaycomp$Hochberg <-p.adjust(model.phase1.corr.explaycomp$p.value,
method = "hochberg")
model.phase1.corr.explaycomp
#neg affect is the only domain with a difference, but in the opposite direction
#phase 2
model.phase2.corr<- lmer(phase2_cor_six~domain+(1|ResponseId), data=phase2)
car::Anova(model.phase2.corr,type="III", test.statistic="F")
summ(model.phase2.corr)
emmeans(model.phase2.corr, ~1) #collapsed estimate across domain # it is very small
```