-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClass notes 9_5_24.R
96 lines (77 loc) · 3.54 KB
/
Class notes 9_5_24.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Install only the packages that are not already available. Calling
# install.packages() unconditionally re-downloads and reinstalls every package
# each time the script is run.
missing_pkgs <- Filter(
  function(pkg) !requireNamespace(pkg, quietly = TRUE),
  c("here", "sessioninfo", "tidyverse", "ggplot2")
)
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tidyverse)
library(here)
library(sessioninfo)
# Read the Chicago data set from the RDS file "chicago.rds"; the relative path
# is resolved against the current working directory.
chicago <- readRDS(file = "chicago.rds")
# Create a tibble from scratch. `c` is supplied as a single value and `z` is
# derived from the columns `a`, `b`, and `c` defined just before it.
df <- tibble(a = 1:5, b = 6:10, c = 1, z = (a + b)^2 + c)
# Show the tibble that was just built
df
# Extract one column by name; `df[["b"]]` and `df$b` return the same vector
df[["b"]]
# select() can also drop variables: the minus sign removes every column in the
# range from `city` through `dptp`.
chicago %>% select(-(city:dptp))
# Keep only the variables whose names start with "d" and inspect the result
subset <- chicago %>% select(starts_with("d"))
str(subset)
# filter() extracts the subset of rows of a data frame that satisfy a
# condition. Here we keep the rows of `chicago` where the PM2.5 level
# (pm25tmean2) is greater than 30, which is a reasonably high level.
chic.f <- chicago %>% filter(pm25tmean2 > 30)
str(chic.f)
# The class notes report 194 rows remaining after this filter (verify against
# your copy of the data). Look at the distribution of the retained values.
summary(chic.f[["pm25tmean2"]])
# Note: Project 1 will include filtering with this method.
# arrange() reorders the rows of a data frame by one of its columns.
# Sort by date so the first row is the earliest (oldest) observation and the
# last row is the latest (most recent) one.
chicago <- chicago %>% arrange(date)
# Check the first three rows of the date and PM2.5 columns
chicago %>%
  select(date, pm25tmean2) %>%
  head(3)
# Wrapping the column in desc() sorts in descending order instead
chicago <- chicago %>% arrange(desc(date))
# Renaming variables in a data frame.
# Start by looking at the first five columns (first three rows shown).
head(chicago[1:5], n = 3)
# `dptp` and `pm25tmean2` are fairly obscure names, so give them more sensible
# ones. rename() takes new_name = old_name pairs.
chicago <- chicago %>% rename(pm25 = pm25tmean2, dewpoint = dptp)
# Confirm the new names
head(chicago[1:5], n = 3)
# Mutating columns.
# With air pollution data we often want to detrend a series by subtracting its
# mean. Add a `pm25detrend` column holding `pm25` minus the mean of `pm25`
# (missing values are ignored when computing the mean).
chicago <- chicago %>%
  mutate(pm25detrend = pm25 - mean(pm25, na.rm = TRUE))
head(chicago)
# group_by() is used to compute summary statistics within strata defined by a
# variable.
# Step 1: derive a `year` column from `date`. as.POSIXlt() stores the year as
# an offset from 1900, hence the addition.
chicago <- chicago %>% mutate(year = 1900 + as.POSIXlt(date)$year)
# Step 2: build a separate, grouped copy of the data with one group per year.
years <- chicago %>% group_by(year)
# Step 3: summarise each year: mean of pm25, maximum of o3tmean2, and median of
# no2tmean2, ignoring missing values in each case.
years %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )
# The pipe operator %>% strings several dplyr calls together so they read
# left to right (top to bottom): the result of each step is passed on as the
# first argument of the next. The steps below derive the year, group by it, and
# compute the per-year summaries in a single chain.
chicago %>%
  mutate(year = 1900 + as.POSIXlt(date)$year) %>%
  group_by(year) %>%
  summarise(
    pm25 = mean(pm25, na.rm = TRUE),
    o3 = max(o3tmean2, na.rm = TRUE),
    no2 = median(no2tmean2, na.rm = TRUE)
  )
# slice_sample() from dplyr returns a random sample of rows in random order.
# It is handy for getting a feel for the range of values in the data when you
# are just exploring it. Here we draw 10 rows.
chicago %>% slice_sample(n = 10)