-
Notifications
You must be signed in to change notification settings - Fork 0
/
m7_walkthrough.Rmd
450 lines (354 loc) · 10.3 KB
/
m7_walkthrough.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
---
title: "Module 7 (SQL & GIS) in R"
author: "Andrew Fullerton"
date: "2024-11-07"
output: github_document
---
# 1) Setup
```{r}
library(sqldf)
```
```{r}
# Load the datasets prepared in earlier modules:
# - m3_mortality.rds: mortality records (used throughout this module)
# - m4dat.rds: daily data with temperature_Min/temperature_Max (used in section 4)
mortality <- readRDS("data/m3_mortality.rds")
m4dat <- readRDS("data/m4dat.rds")
```
# 2) Examining the PCCF
```{r}
# Read the postal code conversion file (PCCF); the validity window columns
# arrive as character strings, so parse them into Date objects up front
pccf <- read.csv("data/postal_code_conversion_file.csv")
pccf$start_date <- as.Date(pccf$start_date)
pccf$end_date <- as.Date(pccf$end_date)
str(pccf) # Inspect the structure after parsing
```
```{r}
# Sort by postal code (ascending); within a postal code, most recent
# end_date first (negating the numeric date flips that key's direction)
row_order <- order(pccf$postal_code, -as.numeric(pccf$end_date))
pccf <- pccf[row_order, ]
# Peek at the first 10 rows of the key columns
head(pccf[, c("postal_code", "start_date", "end_date")], n = 10)
```
### Reduce the dataset to the essential variables
```{r}
# Keep only the columns needed for the linkage and geocoding steps
pccf_short <- sqldf(
  "SELECT postal_code, start_date, end_date, longitude, latitude
   FROM pccf"
)
str(pccf_short) # Confirm the reduced structure
```
### Explore how many distinct postal code entries are in the pccf
```{r}
# Compare total rows against distinct postal codes to gauge duplication
pc_summary <- sqldf(
  "SELECT count(postal_code) AS tot_postal_code,
          count(distinct(postal_code)) AS unique_postal_code
   FROM pccf_short"
)
pc_summary
```
### Explore the number of times a given postal code appears
```{r}
# Tally how many rows each postal code contributes (first 10 codes shown)
sqldf(
  "SELECT postal_code, count(postal_code) AS pc_count
   FROM pccf_short
   GROUP BY postal_code
   LIMIT 10"
)
```
### Get the average, min, and max number of times each postal code appears
```{r}
# Summarise the per-postal-code row counts computed in the inner query:
# mean, minimum, and maximum number of appearances
sqldf(
  "SELECT AVG(pc_count) AS mean_pc_count,
          MIN(pc_count) AS min_pc_count,
          MAX(pc_count) AS max_pc_count
   FROM (SELECT COUNT(postal_code) AS pc_count
         FROM pccf_short
         GROUP BY postal_code)"
)
```
# 3) Geocoding the Mortality Dataset
```{r}
# Re-inspect the mortality data before planning the postal code linkage
str(mortality) # Remind ourselves what this dataset looks like
```
### Count null postal codes (attempt 1)
```{r}
# First attempt: count rows where postcode is SQL NULL
# (this mirrors the approach used in SAS)
sqldf(
  "SELECT COUNT(uid)
   FROM mortality
   WHERE postcode IS NULL"
)
```
Hmm ... this is exactly how we did it in *SAS*. Why the difference in output?
```{r}
# Cross-check in base R: how many postcodes are literal NA values?
sum(is.na(mortality$postcode))
```
```{r}
# Count cells coded as '.' (SAS-style missing indicator).
# Using sum() with na.rm = TRUE is NA-safe: subsetting with `==` would
# produce a phantom all-NA row for every NA postcode, inflating the
# count returned by nrow().
sum(mortality$postcode == '.', na.rm = TRUE)
```
```{r}
# Count empty string cells. As with the '.' check, sum() with
# na.rm = TRUE avoids the phantom all-NA rows that
# nrow(mortality[mortality$postcode == '', ]) would count if any
# postcode were NA.
sum(mortality$postcode == '', na.rm = TRUE)
```
### Count null postal codes (attempt 2)
```{r}
# Second attempt: character columns read into R store missing values as
# empty strings rather than SQL NULLs, so compare against '' instead
sqldf(
  "SELECT COUNT(uid)
   FROM mortality
   WHERE postcode = ''"
)
```
That looks better! Different software packages handle missing values differently. Since postcode is stored as a character variable in R, it will not convert missing cells to `NA`s by default.
### Create a version of the mortality data with no missing values
```{r}
# Drop records with an empty postal code — they cannot be geocoded
m7_mortality <- sqldf(
  "SELECT *
   FROM mortality
   WHERE postcode != ''"
)
str(m7_mortality) # 230916 observations in the new mortality dataset
```
### Test the linkage plan
```{r}
# Re-check the reduced PCCF columns before writing the linkage query
str(pccf_short) # Remind ourselves what our pccf data looks like
```
```{r}
# Dry run of the linkage: join on postal code, keeping only rows where
# the death date falls inside that postal code record's validity window
linkage_preview <- sqldf(
  "SELECT uid,
          postcode AS pc_mort,
          postal_code AS pc_pccf,
          start_date, death_date, end_date,
          longitude, latitude
   FROM m7_mortality AS a JOIN pccf_short AS b ON a.postcode=b.postal_code
   WHERE a.death_date BETWEEN b.start_date AND b.end_date"
)
head(linkage_preview, n = 10)
```
### Linking the data
```{r}
# Perform the full linkage and keep the result for later steps
mortality_comp <- sqldf(
  "SELECT uid,
          postcode AS pc_mort,
          postal_code AS pc_pccf,
          start_date, death_date, end_date,
          longitude, latitude
   FROM m7_mortality AS a JOIN pccf_short AS b ON a.postcode=b.postal_code
   WHERE a.death_date BETWEEN b.start_date AND b.end_date"
)
str(mortality_comp) # Note that the number of observations has changed
```
Look at the dropped observations. Since we know that any rows in `m7_mortality` without a corresponding entry (i.e. were dropped) will have a null id in `mortality_comp`, we can produce a list of the dropped observations.
```{r}
# Anti-join: rows of m7_mortality with no match in mortality_comp are
# exactly the records dropped during linkage
sqldf(
  "SELECT a.uid AS orig_id,
          b.uid AS unlinked_id,
          a.postcode
   FROM m7_mortality AS a LEFT JOIN mortality_comp as b ON a.uid=b.uid
   WHERE b.uid is NULL"
)
```
We lost 15 observations.
```{r}
# Let's investigate further...
# Inner query: the 15 unlinked records (same anti-join as above).
# Outer LEFT JOIN: re-attach any PCCF rows for those postal codes so we
# can compare each death_date against the candidate start/end windows.
# PCCF columns come back NULL when the postal code is absent entirely.
sqldf(
"SELECT a.uid,
postcode,
death_date,
postal_code,
start_date,
end_date
FROM (SELECT a.uid,
b.uid AS unlinked_id,
a.postcode,
a.death_date
FROM m7_mortality AS a LEFT JOIN mortality_comp AS b ON a.uid=b.uid
WHERE b.uid is NULL) AS a
LEFT JOIN pccf_short AS b ON a.postcode=b.postal_code"
)
```
From this, we can see that these mortality records were dropped during linkage either because their death dates did not fall between the PCCF start and end dates, or because they were missing the necessary data altogether.
### Create the final geocoded dataset
```{r}
# Final geocoded dataset: linked records that actually carry coordinates
mortality_geocoded <- sqldf(
  "SELECT uid,
          death_date,
          longitude,
          latitude
   FROM mortality_comp
   WHERE longitude is NOT NULL AND latitude is NOT NULL"
)
str(mortality_geocoded) # Verify that the dataset has 229557 rows as intended
```
### Save the final dataset to the m7 folder
```{r}
# Confirm where output files will land before saving anything below
getwd() # check what local directory you're in
```
```{r}
# Avoid setwd(): a hard-coded absolute path breaks on every other machine,
# and inside R Markdown a setwd() only applies to the current chunk anyway
# (knitr evaluates each chunk with the .Rmd file's directory as the working
# directory). Relative paths below resolve against the project root.
dir.create("m7", showWarnings = FALSE) # no-op if the folder already exists
saveRDS(mortality_geocoded, "m7/mortality_geocoded.rds") # save in m7 as RDS
```
# 4) Mapping
### Select only the variables we want to use going forward
```{r}
# Keep only the variables needed for mapping; drop missing LHA codes
mapping_mortality <- sqldf(
  "SELECT uid,
          death_date,
          lhacode
   FROM mortality
   WHERE lhacode IS NOT NULL"
)
```
```{r}
# Pull the daily temperature extremes from the module 4 data
temp <- sqldf(
  "SELECT date, temperature_Min, temperature_Max
   FROM m4dat"
)
```
### Testing our linkage
```{r}
# Dry run: attach the day's temperature extremes to each death record
temp_preview <- sqldf(
  "SELECT a.*,
          b.temperature_Min,
          b.temperature_Max
   FROM mapping_mortality AS a LEFT JOIN temp AS b ON a.death_date=b.date"
)
head(temp_preview, n = 10)
```
### Linking the data
```{r}
# Perform the temperature linkage for real and keep the result
mortality_lha <- sqldf(
  "SELECT a.*,
          b.temperature_Min,
          b.temperature_Max
   FROM mapping_mortality AS a LEFT JOIN temp AS b ON a.death_date=b.date"
)
str(mortality_lha) # Verify that the dataset has 231178 rows as intended
```
```{r}
# Double-check that every row still carries a non-NULL LHA code
sqldf(
  "SELECT COUNT(*)
   FROM mortality_lha
   WHERE lhacode IS NOT NULL"
) # To be extra sure, run this SQL query
```
### Create hot days and cold days datasets
### Store the 99.9th hottest temperature and the 0.1th coldest temperature
```{r}
# 99.9th percentile of daily maximum temperature (hot day threshold).
# na.rm = TRUE makes this robust to NAs that the LEFT JOIN above can
# introduce when a death date has no matching temperature record —
# quantile() errors on NA input by default.
quantile(mortality_lha$temperature_Max, probs = 0.999, na.rm = TRUE) # hottest temps
```
```{r}
# 0.1st percentile of daily minimum temperature (cold day threshold),
# NA-safe for the same reason as above
quantile(mortality_lha$temperature_Min, probs = 0.001, na.rm = TRUE) # coldest temps
```
### Create hot days dataset
```{r}
# Death counts per LHA on days hotter than 29.5 (the 99.9th percentile
# threshold computed above)
mortality_hot_days <- sqldf(
  "SELECT lhacode AS id_number,
          COUNT(uid) AS death_counts
   FROM mortality_lha
   WHERE temperature_Max > 29.5
   GROUP BY lhacode"
)
```
### Create cold days dataset
```{r}
# Death counts per LHA on days colder than -9.5 (the 0.1st percentile
# threshold computed above)
mortality_cold_days <- sqldf(
  "SELECT lhacode AS id_number,
          COUNT(uid) AS death_counts
   FROM mortality_lha
   WHERE temperature_Min < (-9.5)
   GROUP BY lhacode"
)
```
# 5) Choropleth maps
Unfortunately, base R doesn't support reading/writing shape files 'out of the box'. Technically, we could write a function to do this, but this would require dealing with regular expressions (and matrices) and is way beyond our scope.
So, below are two methods of mapping the data:
The first uses base R only and a bit of data processing magic. It falls short of a choropleth map but is a visual representation of the data. The second uses a lightweight package for handling GIS data called `sf`.
```{r}
# LHA codes making up the Greater Vancouver area. The original vector
# listed 42 twice; harmless with %in%, but the duplicate is removed here
# and the codes are sorted for readability.
gva <- c(34, 35, 37, 38, 42, 43, 44, 45, 75, 161:166, 201, 202)
mortality_hot_days_gva <- mortality_hot_days[mortality_hot_days$id_number %in% gva, ]
str(mortality_hot_days_gva) # View the new dataset
```
```{r}
# Same subset for the cold days dataset
mortality_cold_days_gva <- mortality_cold_days[mortality_cold_days$id_number %in% gva, ]
str(mortality_cold_days_gva) # View the new dataset
```
## Base R method
```{r}
# Pre-processed map data saved as an RDS so base R can use it without
# shapefile support
bcmap_baseR <- readRDS("data/bcmap_data.rds") # Note that this is only possible with some data pre-processing
str(bcmap_baseR) # View the data we've just loaded into R
```
### Create the plot for hot days
```{r}
# Attach hot day death counts to the base R map data by LHA id
gvamap_baseR_hot <- merge(bcmap_baseR, mortality_hot_days_gva,
                          by.x = "ID_NUMBER", by.y = "id_number")
str(gvamap_baseR_hot) # Verify that the merge worked
```
```{r}
par(mar = c(10, 4, 4, 2)) # Extra bottom margin for the rotated labels
barplot(
  gvamap_baseR_hot$death_counts,
  names.arg = gvamap_baseR_hot$LHA_NAME, # Region names on the x-axis
  main = "Death Counts by LHA Region",
  ylab = "Death Counts",
  ylim = c(0, 20),
  col = "lightyellow",
  las = 2,        # Perpendicular x-axis labels for readability
  cex.names = 0.7 # Smaller label text
)
```
### Create the plot for cold days
```{r}
# Attach cold day death counts to the base R map data by LHA id
gvamap_baseR_cold <- merge(bcmap_baseR, mortality_cold_days_gva,
                           by.x = "ID_NUMBER", by.y = "id_number")
str(gvamap_baseR_cold) # Verify that the merge worked
```
```{r}
par(mar = c(10, 4, 4, 2)) # Extra bottom margin for the rotated labels
barplot(
  gvamap_baseR_cold$death_counts,
  names.arg = gvamap_baseR_cold$LHA_NAME, # Region names on the x-axis
  main = "Death Counts by LHA Region",
  ylab = "Death Counts",
  ylim = c(0, 20),
  col = "lightblue",
  las = 2,        # Perpendicular x-axis labels for readability
  cex.names = 0.7 # Smaller label text
)
```
## SF (simple features) method
```{r}
library(sf) # load the 'sf' library
```
```{r}
# Read the LHA boundary shapefile into R with sf
bcmap <- st_read("shapefile/lha.shp") # Read shapefile into R
str(bcmap) # View the data we've just loaded into R
```
```{r}
# Quick visual sanity check of the basemap before merging in our counts
plot(bcmap) # View the basemap we've just loaded into R
```
### Create the map for hot days
```{r}
# Join hot day death counts onto the sf basemap by LHA id
gvamap_hot <- merge(bcmap, mortality_hot_days_gva,
                    by.x = "ID_NUMBER", by.y = "id_number")
str(gvamap_hot) # Confirm the merge worked
```
```{r}
# Choropleth: shade each region by its death count; key.pos = 1 places
# the legend below the map
plot(gvamap_hot["death_counts"],
     key.pos = 1,
     main = "Death Counts by LHA Region")
```
### Create the map for cold days
```{r}
# Join cold day death counts onto the sf basemap by LHA id
gvamap_cold <- merge(bcmap, mortality_cold_days_gva,
                     by.x = "ID_NUMBER", by.y = "id_number")
str(gvamap_cold) # Confirm the merge worked
```
```{r}
# Choropleth: shade each region by its death count; key.pos = 1 places
# the legend below the map
plot(gvamap_cold["death_counts"],
     key.pos = 1,
     main = "Death Counts by LHA Region")
```