## flusight-seasonal-baseline.R
## The following code produces the peak week and peak intensity historic baseline estimates for the 2024/2025 season of the FluSight Challenge.
## This method and code were adapted from the historical-baseline method used in the 2015/16 through 2019/20 FluSight Challenges.
## Packages
library(lubridate)
library(readr)
library(dplyr)
library(tidyr)
library(checkmate)
# Specify output path
userid <- Sys.info()["user"]
output_dirpath <- paste0("C:/Users/",userid,"/Desktop/GitHub/Flusight-baseline/weekly-submission/forecasts/Flusight-seasonal-baseline/")
#hub_outputpath <- paste0("C:/Users/",userid,"/Desktop/Github/FluSight-forecast-hub/model-output/Flusight-seasonal-baseline/")
#########################
## Functions
# Reference Date
curr_else_next_date_with_ltwday <- function(date, ltwday) {
  assert_class(date, "Date")
  assert_integerish(ltwday, lower = 0L, upper = 7L)
  # Return `date` itself if it falls on weekday `ltwday` (0 = Sunday, 6 = Saturday);
  # otherwise return the next date with that weekday
  date + (ltwday - as.POSIXlt(date)$wday) %% 7L
}
forecast_as_of_date <- Sys.Date()
reference_date <- curr_else_next_date_with_ltwday(forecast_as_of_date, 6L) # Saturday
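# Optional sanity check (not part of the original pipeline): the helper returns
# the input date unchanged when it already falls on a Saturday, and otherwise
# snaps forward to the next Saturday.
stopifnot(
  curr_else_next_date_with_ltwday(as.Date("2024-10-05"), 6L) == as.Date("2024-10-05"),
  curr_else_next_date_with_ltwday(as.Date("2024-10-06"), 6L) == as.Date("2024-10-12")
)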
# Conversion from epi week to date (returns the Saturday ending the given epidemic week)
epiweekToDate <- function(year, weekno) {
  # Create a date object for January 1 of the given year
  first_day_of_year <- as.Date(paste(year, "-01-01", sep = ""))
  # Move forward (weekno - 1) weeks from January 1
  start_date <- first_day_of_year + weeks(weekno - 1)
  # Snap back to the Monday of that week (weeks start on Monday)
  weekday_start <- wday(start_date, week_start = 1)
  if (weekday_start != 1) {
    start_date <- start_date - days(weekday_start - 1)
  }
  # The week ends on the Saturday, 5 days after Monday
  last_saturday <- start_date + days(5)
  return(last_saturday)
}
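# Optional spot check (not part of the original pipeline): MMWR week 40 of 2024
# ends on Saturday 2024-10-05, and MMWR week 1 of 2025 ends on Saturday 2025-01-04.
stopifnot(
  epiweekToDate(2024, 40) == as.Date("2024-10-05"),
  epiweekToDate(2025, 1) == as.Date("2025-01-04")
)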
#########################
## Load Data
# FluSurv-NET data 2010/11 through 2019/20
flusurv_state <- read.csv(paste0("C:/Users/",userid,"/Desktop/Github/Flusight-baseline/seasonal-historic/FluSurv-NET_pastdata.csv"))
fsn_data <- flusurv_state %>%
  filter(AGE.CATEGORY == "Overall", SEX.CATEGORY == "Overall", RACE.CATEGORY == "Overall", VIRUS.TYPE.CATEGORY == "Overall") %>%
  mutate(weekly_rate = as.numeric(WEEKLY.RATE)) %>%
  rename("season" = "YEAR", "epidemic_week" = "WEEK", "location" = "CATCHMENT") %>%
  mutate(epidemic_week = as.numeric(epidemic_week)) %>%
  # Keep in-season weeks only, and shift weeks 1-22 past week 52 so each season is a contiguous run (weeks 35-74)
  filter(epidemic_week <= 22 | epidemic_week >= 35) %>%
  mutate(week = ifelse(epidemic_week <= 22, epidemic_week + 52, epidemic_week)) %>%
  # Exclude the 2009 pandemic season, COVID-disrupted seasons, and seasons outside the historical window
  filter(!(season %in% c("2009-10", "2020-21", "2021-22", "2023-24", "2024-25"))) %>%
  select(season, epidemic_week, week, weekly_rate, location)
# State population data
state_pop<-read.csv(paste0("C:/Users/",userid,"/Desktop/GitHub/FluSight-forecast-hub/auxiliary-data/locations.csv")) %>%
select(location, abbreviation, population)
# List of location codes
states <- state_pop %>% pull(location)
# NHSN data 2021-2024
nhsn <- read_csv("../Documents/Data/Flu Admissions 21-24.csv")
# Format NHSN data
nhsn_rates <- nhsn %>%
  mutate(
    date = mdy(date),               # Convert MM/DD/YYYY to Date
    epidemic_week = epiweek(date),  # Extract epidemic week using lubridate
    year = year(date)) %>%
  filter(epidemic_week <= 22 | epidemic_week >= 35) %>%
  # Shift weeks 1-22 past week 52 so each season is a contiguous run, matching the FluSurv-NET data
  mutate(week = ifelse(epidemic_week <= 22, epidemic_week + 52, epidemic_week)) %>%
  group_by(state, year, epidemic_week, week) %>%  # Aggregate by state and week
  summarise(
    total_flu_admissions = sum(admissions, na.rm = TRUE),
    .groups = "drop"                # Drop the grouping after summarization
  ) %>%
  mutate(
    season = case_when(
      epidemic_week >= 35 & epidemic_week <= 52 ~ paste(year, year + 1, sep = "/"),  # Weeks 35-52 open a season
      epidemic_week >= 1 & epidemic_week <= 20 ~ paste(year - 1, year, sep = "/"),   # Weeks 1-20 close the prior season
      TRUE ~ NA_character_          # Weeks 21-22 and any off-season weeks are dropped below
    )
  ) %>%
  filter(!is.na(season)) %>%
  filter(!(season %in% c("2020/2021", "2019/2020"))) %>%  # Exclude COVID-disrupted seasons
  left_join(state_pop, by = c("state" = "abbreviation")) %>%  # Attach state populations
  mutate(rate = total_flu_admissions / population * 100000) %>%  # Admissions per 100,000 population
  select(epidemic_week, week, season, rate, state) %>%
  rename("weekly_rate" = "rate", "location" = "state")
# Combine FSN and NHSN data
combo <- rbind(fsn_data, nhsn_rates) %>% filter(!is.na(weekly_rate))
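# Optional structural check (not part of the original pipeline): after the week
# shift, all season weeks should lie in the contiguous 35-74 range and all
# rates should be non-negative.
stopifnot(all(combo$week >= 35 & combo$week <= 74), all(combo$weekly_rate >= 0))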
####################
## Peak Week Function
predict_peak_week <- function(these_data) {
  # Create data table of peak weeks across years
  these_peaks <- these_data %>%
    group_by(season, location) %>%
    mutate(observation = round(weekly_rate, 1)) %>%
    filter(observation == max(observation, na.rm = TRUE)) %>%
    ungroup()
  # Generate function approximating kernel of past peak weeks
  pkwk_kernel <- these_peaks %>%
    pull(week) %>%
    density(kernel = "gaussian", bw = "sj", from = min(these_data$week)) %>%
    approxfun(rule = 1:2)
  # Kernel of past peak rates (computed here but not used below)
  pkper_kernel <- these_peaks %>%
    pull(weekly_rate) %>%
    density(kernel = "gaussian", bw = "sj", from = min(these_data$weekly_rate)) %>%
    approxfun(rule = 1:2)
  #### Peak week forecasts
  pkwk_pred <- tibble(reference_date = rep(reference_date, 35),
                      target = rep("peak week inc flu hosp", 35),
                      horizon = rep(NA, 35),
                      target_end_date = rep(NA, 35),
                      output_type = rep("pmf", 35),
                      unit = rep("week", 35),
                      bin_start_incl = seq(40, 74, 1),
                      bin_end_notincl = seq(41, 75, 1)) %>%
    rowwise() %>%
    # Probability mass in each one-week bin under the fitted kernel
    mutate(value = integrate(pkwk_kernel, bin_start_incl, bin_end_notincl)[[1]]) %>%
    ungroup() %>%
    # Map shifted weeks (53-74) back to calendar epidemic weeks (1-22)
    mutate(bin_start_incl = ifelse(bin_start_incl > 52, bin_start_incl - 52, bin_start_incl),
           bin_end_notincl = ifelse(bin_end_notincl > 53, bin_end_notincl - 52, bin_end_notincl),
           year = ifelse(bin_start_incl >= 40, 2024, 2025),  # Assign calendar year based on epidemic week (hardcoded for 2024/2025)
           output_type_id = as.Date(mapply(epiweekToDate, year, bin_start_incl), origin = "1970-01-01"),
           value = value / sum(value)) %>%  # Renormalize so the pmf sums to 1
    rename(epidemic_week = bin_start_incl) %>%
    select(-c(bin_end_notincl, year, unit)) %>%
    relocate(value, .after = last_col())
  return(pkwk_pred)
}
peak_week <- predict_peak_week(combo)
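# Optional sanity check (not part of the original pipeline): the pmf over the
# 35 candidate peak weeks should sum to 1 after renormalization.
stopifnot(abs(sum(peak_week$value) - 1) < 1e-8)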
#########################
# Redistributing Probabilities
## The distribution was calculated across weeks 40-52 and 1-22 to encompass a typical influenza season; the code below
## redistributes the probability assigned to weeks not included in the current season's FluSight Challenge across the remaining weeks.
# Define the weeks to remove (weeks 40-46)
removed_weeks <- 40:46
# Define the remaining weeks (weeks 47-52 and 1-22)
remaining_weeks <- c(47:52, 1:22)
# Function for reallocation
redistribute_probabilities <- function(data, removed_weeks, remaining_weeks) {
  # Step 1: Calculate the total probability of the removed weeks (40-46)
  removed_prob_sum <- sum(data$value[data$epidemic_week %in% removed_weeks])
  # Step 2: Calculate the total probability of the remaining weeks (47-52, 1-22)
  remaining_prob_sum <- sum(data$value[data$epidemic_week %in% remaining_weeks])
  # Step 3: Redistribute the removed probability across the remaining weeks,
  # proportionally to each remaining week's current probability
  adjusted_remaining_probs <- data$value[data$epidemic_week %in% remaining_weeks] +
    (removed_prob_sum * data$value[data$epidemic_week %in% remaining_weeks] / remaining_prob_sum)
  # Step 4: Update the probabilities for the remaining weeks
  data$value[data$epidemic_week %in% remaining_weeks] <- adjusted_remaining_probs
  # Step 5: Drop the removed weeks and the epidemic week column
  data <- data %>% filter(!(epidemic_week %in% removed_weeks)) %>%
    select(-epidemic_week)
  # Return updated data
  return(data)
}
peak_week <- redistribute_probabilities(peak_week, removed_weeks, remaining_weeks)
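# Optional sanity check (not part of the original pipeline): proportional
# redistribution should preserve the total probability, and only the 28
# remaining weeks should survive the filter.
stopifnot(abs(sum(peak_week$value) - 1) < 1e-8,
          nrow(peak_week) == length(remaining_weeks))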
# Repeat for each state
peak_week <- peak_week %>%
crossing(location = states) %>% arrange(location) %>%
mutate(output_type_id = as.character(output_type_id))
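# Optional sanity check (not part of the original pipeline): after replicating
# the national pmf across locations, each location's pmf should still sum to 1.
stopifnot(all(abs(tapply(peak_week$value, peak_week$location, sum) - 1) < 1e-8))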
#########################
## Peak Intensity
# Specify quantiles
quantiles <- c(0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4,
0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
0.95, 0.975, 0.99)
calculate_kde_quantiles <- function(data){
  # Fit a KDE for each week, skipping weeks with fewer than 2 data points
  kde_results <- vector("list", 72)  # Pre-allocate so unfitted weeks stay NULL
  for (week_num in 40:72) {
    # Filter data for the current week
    week_data <- data %>% filter(week == week_num)
    # Skip weeks with fewer than 2 data points
    if (nrow(week_data) < 2) {
      next
    }
    # Ensure that weekly_rate is numeric
    week_data$weekly_rate <- as.numeric(week_data$weekly_rate)
    # Fit the kernel density estimate (Sheather-Jones bandwidth)
    kde <- tryCatch(
      density(week_data$weekly_rate, bw = "SJ"),
      error = function(e) NULL  # Assign NULL if the bandwidth selection fails
    )
    # If the KDE was successful, store the result
    if (!is.null(kde)) {
      kde_results[[week_num]] <- kde
    }
  }
  # Create a data frame to store the quantile results
  quantile_predictions <- data.frame(
    week = rep(40:72, each = length(quantiles)),
    quantile = rep(quantiles, times = length(40:72)),
    value = NA
  )
  # For each fitted week, invert the KDE's cumulative distribution at each quantile level
  for (week_num in 40:72) {
    kde <- kde_results[[week_num]]
    if (!is.null(kde)) {
      # Approximate the CDF on the KDE grid, then interpolate its inverse
      cdf <- cumsum(kde$y) / sum(kde$y)
      quantile_predictions$value[quantile_predictions$week == week_num] <-
        approx(cdf, kde$x, xout = quantiles, ties = "ordered")$y
    }
  }
  # Calculate the peak intensity for each quantile
  peak_intensity <- quantile_predictions %>%
    group_by(quantile) %>%
    summarize(value = max(value, na.rm = TRUE)) %>%  # Take the maximum across weeks, ignoring NAs
    rename("output_type_id" = "quantile") %>%
    mutate(reference_date = reference_date,
           target = "peak inc flu hosp",
           horizon = NA,
           target_end_date = NA,
           output_type = "quantile") %>%
    # Expand the grid by repeating the quantiles for each state
    crossing(location = states) %>%
    arrange(location) %>%
    left_join(state_pop, by = "location") %>%
    # Convert rates per 100,000 back to admission counts using state populations
    mutate(value = round((value * population) / 100000),
           output_type_id = as.character(output_type_id)) %>%
    select(-c(population, abbreviation)) %>%
    select(reference_date, target, horizon, target_end_date, location, output_type, output_type_id, value)
  # Return peak intensity
  return(peak_intensity)
}
peak_intensity <- calculate_kde_quantiles(combo)
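# Optional sanity check (not part of the original pipeline): within each
# location, the quantile values should be non-decreasing in the quantile level.
peak_intensity %>%
  group_by(location) %>%
  arrange(as.numeric(output_type_id), .by_group = TRUE) %>%
  summarize(monotone = !is.unsorted(value), .groups = "drop") %>%
  with(stopifnot(all(monotone)))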
## Export CSV
full <- bind_rows(peak_intensity, peak_week) %>% mutate(location = as.character(location))
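# Optional final check (not part of the original pipeline): the combined table
# should contain the hubverse model-output columns produced above.
expected_cols <- c("reference_date", "target", "horizon", "target_end_date",
                   "location", "output_type", "output_type_id", "value")
stopifnot(all(expected_cols %in% names(full)))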
write.csv(full, file = paste0(output_dirpath, sprintf("%s-FluSight-base_seasonal.csv", reference_date)), row.names = FALSE)
#write.csv(full, file = paste0(hub_outputpath, sprintf("%s-FluSight-seasonal-baseline.csv", reference_date)), row.names = FALSE)