-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path01_source_me.R
154 lines (147 loc) · 6.61 KB
/
01_source_me.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
library(tidyverse)
library(sf)
#One way to create an index based on several economic indicators is to bin the data e.g. deciles and then create the index as an average
#of the decile scores for each indicator.
bins <- 10 # decile scores.
bc_regional_districts <- read_rds("https://github.com/bcgov/bc_regional_districts_sf/raw/main/out/bc_regional_districts_sf.rds")
# #the shape data for maps---------
# #mostly regional districts---------
# regional_districts <- sf::st_read(here::here(
# "map_data",
# "Regional Districts",
# "ABMS_RD_polygon.shp"
# )) %>%
# janitor::clean_names() %>%
# select(aa_name, geometry)
# #plus one large municipality-------
# one_large_municipality <- sf::st_read(here::here(
# "map_data",
# "Municipalities",
# "ABMS_MUNI_polygon.shp"
# )) %>%
# janitor::clean_names() %>%
# filter(area_sqm == max(area_sqm)) %>%
# select(aa_name, geometry)
# #bind the regional districts and one large municipality together
# regional_districts <- bind_rows(regional_districts, one_large_municipality) %>%
# mutate(
# aa_name = janitor::make_clean_names(aa_name),
# aa_name = str_replace(aa_name, "regional_district_of_", ""),
# aa_name = str_replace(aa_name, "_regional_district", ""),
# aa_name = str_replace(aa_name, "_region_unincorporated", ""),
# aa_name = str_replace(aa_name, "_regional_municipality", "")
# )
#the excel file features non-unique column names, and missing names (for empty columns)
#create column names (this could easily break if the file structure changes)
column_names <- c(
"Region",
"District",
"Income Assistance: raw data",
"Income Assistance: per capita",
"Building Permits: raw data",
"Building Permits: per 1,000 capita",
"Job Postings: raw data",
"Job Postings: per capita",
"MPI Estimated Cost: raw data",
"MPI Estimated Cost: per 1,000 capita",
"Small Business Registrations: raw data",
"Population 2020: raw data",
"Population 2021: raw data",
"Population: growth",
"x1",
"x2",
"Income Assistance: index",
"Building Permits: index",
"Job Postings: index",
"MPI Estimated Cost: index",
"Small Business Registration: index",
"Population: index",
"Overall: index"
)
#read the excel file. This could break if structure of file changes. e.g. reordering columns, additional columns, etc.
district_conditions <- readxl::read_excel(here::here("raw_data", "2022 update data.xlsx"),
sheet = "RD Current Conditions Monitor",
range = "B6:X42",
col_names = column_names,
na = c("n/a", "n.a.", "n.a")
) %>%
select(-x1, -x2) %>%
fill(Region) %>%
filter(!is.na(District)) %>%
#modify names so they match up with the spatial data-------
mutate(
District = janitor::make_clean_names(District),
District = str_replace(District, "greater_vancouver", "metro_vancouver"),
District = str_replace(District, "powell_river", "qathet"),
District = str_replace(District, "skeena_queen_charlotte", "north_coast")
) %>%
select(Region, District, ends_with("raw data")) %>%
#calculate the normalized measures.-----------
mutate(
`Income Assistance: per capita` = `Income Assistance: raw data` / `Population 2021: raw data`,
`Building Permits: per capita` = `Building Permits: raw data` / (`Population 2021: raw data`),
`Job Postings: per capita` = `Job Postings: raw data` / `Population 2021: raw data`,
`MPI Estimated Cost: per capita` = `MPI Estimated Cost: raw data` / (`Population 2021: raw data`),
`Small Business Registrations: per capita` = `Small Business Registrations: raw data` / (`Population 2021: raw data`),
`Population: growth` = `Population 2021: raw data` / `Population 2020: raw data` - 1
)
# An alternative way of creating an index based on multiple measures is to perform principal component analysis, where the first
# principal component serves as the index.
# Here we use Nipals because it can deal with missing values.
pca <- district_conditions %>%
select(District, contains("per capita"), contains("growth")) %>%
column_to_rownames(var = "District") %>%
as.matrix() %>%
nipals::nipals(fitted = TRUE)
class(pca) <- "princomp"
pca$sdev <- sqrt(pca$eig)
pca$n.obs <- dim(pca$scores)[1]
pca_index <- as_tibble(pca$scores, rownames = "district") %>%
select(district, `PCA: index` = PC1)
district_conditions <- full_join(district_conditions, pca_index, by = c("District" = "district"))%>%
#create the decile scores-----------
mutate(
`Income Assistance: decile` = ntile(-`Income Assistance: per capita`, bins), # negative because lower is better.
`Building Permits: decile` = ntile(`Building Permits: per capita`, bins),
`Job Postings: decile` = ntile(`Job Postings: per capita`, bins),
`MPI Estimated Cost: decile` = ntile(`MPI Estimated Cost: per capita`, bins),
`Small Business Registrations: decile` = ntile(`Small Business Registrations: per capita`, bins),
`Population: decile` = ntile(`Population: growth`, bins)
) %>%
rowwise() %>%
#overall index is the mean of the AVAILABLE decile scores (implicitly assuming missing at random)
mutate(`Overall: index` = mean(c(
`Income Assistance: decile`,
`Building Permits: decile`,
`Job Postings: decile`,
`MPI Estimated Cost: decile`,
`Small Business Registrations: decile`,
`Population: decile`
), na.rm = TRUE))
district_conditions<- left_join(bc_regional_districts, district_conditions, by = c("regional_district"="District"))%>%
pivot_longer(cols = -c(id, regional_district, Region, geometry), names_to = "name", values_to = "value")%>%
mutate(name2=name,
regional_district= str_to_title(str_replace_all(regional_district, "_", " ")))%>%
separate(name2, into=c("thing", "type"), sep = ": ")%>%
mutate(type=case_when(type=="per capita"~"normalized",
type=="growth"~"normalized",
TRUE~type))
#for the district plot we need calculate the proportion of the maximal value---------
standardized_vars <- unique(district_conditions$name)[
str_detect(unique(district_conditions$name), pattern="per capita") |
str_detect(unique(district_conditions$name), pattern="growth")]
standardized <- district_conditions%>%
select(-geometry)%>%
filter(name %in% standardized_vars)%>%
group_by(name)%>%
mutate(prop_of_max = value/max(value, na.rm=TRUE))
#save the data for the app.----
saveRDS(district_conditions, here::here("processed_data", "district_conditions.rds"))
saveRDS(standardized, here::here("processed_data", "standardized.rds"))
saveRDS(pca,here::here("processed_data", "pca.rds"))
# plt <- ggplot(filter(district_conditions, thing=="Overall"),aes(fill=value))+
# geom_sf()+
# scale_fill_viridis_c()+
# ggthemes::theme_map()
#
# plotly::ggplotly(plt)