forked from clayford/DataWranglingInR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dwr_05_combine_merge_rehsape_data.R
532 lines (395 loc) · 20.7 KB
/
dwr_05_combine_merge_rehsape_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
#' ---
#' title: "Data Wrangling in R: Combining, Merging and Reshaping Data"
#' author: "Clay Ford"
#' date: "Spring 2016"
#' output: pdf_document
#' ---
# load data from last lecture
load("../data/datasets_L04.Rda")
# Sometimes we have multiple data frames we want to combine. There are typically
# three ways to do this: (1) stack on top of each other, (2) place side-by-side,
# or (3) merge together based on common variables.
# Stacking ----------------------------------------------------------------
# Let's generate some fake data to illustrate combining data frames by stacking.
first <- data.frame(x0=1:5,
x1=rnorm(5),
x2=c("M","F","M","F","F"))
first
second <- data.frame(x0=10:14,
x1=rnorm(5),
x2=c("M","F","M","F","F"))
second
third <- data.frame(x4=c(3,3,1,3,2),
x5=c("e","g","v","b","z"))
third
# We can use the rbind() function to stack data frames. Make sure the number of
# columns match. Also, the names and classes of values being joined must match.
# Here we stack the first, the second and then the first again:
rbind(first, second, first)
class(rbind(first, second, first)) # still a data frame
# works with vectors too:
rbind(1:3,4:6)
class(rbind(1:3,4:6)) # matrix
# Remember the allStocks data? This is a list containing 7 data frames of stock
# data for 7 different companies.
names(allStocks)
# We can use rbind to combine these into one data frame. The thing is we have to
# call rbind repeatedly since there are so many data frames.
# rbind(allStocks$bbby.csv, allStocks$flws.csv, allStocks$foxa.csv,...)
# A useful function for this type of task in R is the do.call() function. This
# function allows you to call any R function, but instead of writing out the
# arguments one by one, you can use a list to hold the arguments of the
# function. The basic syntax is do.call(what, args), where "what" is a function
# and "args" are the arguments to pass to the function IN A LIST.
# Since allStocks is a LIST of data frames, and rbind can take data frames as
# arguments, we can simply pass allStocks to rbind via the do.call function.
allStocks <- do.call(rbind, allStocks)
str(allStocks)
# Let's go ahead and fix the Date column name:
names(allStocks)[1] <- "Date"
# Let's look at the first few:
head(allStocks)
# We see that R very kindly created row names for us that identifies the source
# data frame for each row! That's because rbind() has a logical argument called
# "make.row.names" that defaults to TRUE. We can exploit that information to
# create a variable that indicates which record belongs to which company. In
# other words, we can add a column called "company" that will list "BBBY",
# "FLWS", etc. for their respective rows. We will do this in a later lecture.
# But what if you didn't want that behavior? We could set make.row.names =
# FALSE. The trick to doing that with do.call is that you need c() to add
# further arguments. So you have to do something like this:
#
# allStocks <- do.call(rbind, c(allStocks, make.row.names = FALSE))
# Recall that c() combines values into vectors OR lists.
is.vector(
c(1,2,3)
)
is.list(
c(1, 2, list(g=c("m","f"), x=2:4))
)
apply(iris[,-5], 2, function(x) c(mean(x), sd(x), length(x)))
# Side-by-side ------------------------------------------------------------
# Use the cbind function to combine data frames side-by-side:
cbind(first,third)
class(cbind(first,third))
# WARNING: cbind does not require matching heights; if one data frame is shorter
# it will recycle it. Notice below the third data frame is recycled.
cbind(rbind(first,second),third)
# However, if the number of rows of the shorter data frame does not evenly
# divide into the number of rows of the taller data frame, then R throws an
# error.
# cbind(rbind(first,second),third[-1,])
# Error in data.frame(..., check.names = FALSE) :
# arguments imply differing number of rows: 10, 4
# A note about cbind and rbind --------------------------------------------
# Using cbind and rbind on vectors does NOT produce data frames.
x <- 1:3; y <- letters[1:3]
class(cbind(x,y))
is.data.frame(cbind(x,y))
is.data.frame(rbind(x,y))
dat <- data.frame(x,y)
# However if one of the objects you're binding is a data frame, then you do end
# up with a data frame.
dat <- data.frame(x,y)
z <- c("a","a","c")
is.data.frame(cbind(dat,z))
# Merging -----------------------------------------------------------------
# When we wish to join two data sets together based on common variables, we use
# the merge() function. For example, let's say we have a data set of crime
# statistics for all 50 US states, and another data set of demographic
# statistics for all 50 US states. We may want to merge them together so we have
# one row per state that contains crime and demographic statistics. The common
# variable between the two data sets would be "state".
# The basic syntax for merge() is merge(x, y, by, by.x, by.y), where "x" and "y"
# are the respective data sets, "by" is the column(s) to merge by (assuming the
# column names match between the two data sets), and "by.x" and "by.y" are also
# columns to merge by in the event the column names do not match between the two
# data sets. You either use by or by.x and by.y, not all three. From the help
# file: "By default the data frames are merged on the columns with names they
# both have, but separate specifications of the columns can be given by by.x and
# by.y." In other words, you don't have to use the by argument if your data
# frames have matching column names that you want to merge on.
# Let's create some more fake data to illustrate:
left <- data.frame(id=c(2:7),
y2=rnorm(6,100,5))
left
right <- data.frame(id=rep(1:4,each=2),
z2=sample(letters,8, replace=TRUE))
right
# Data frames left and right have columns "id" in common. Let's merge them
# together based on id:
merge(left, right)
# Notice y2 from the left data frame is recycled to match up with multiple id in
# the right data frame. Also notice only rows with matching ids in both data
# frames are retained. In database terminology this is known as an INNER JOIN.
# Only those records with matching "by" variables are joined.
# If we wanted to merge all rows regardless of match, we use the argument
# all=TRUE. It is FALSE by default. This creates an OUTER JOIN.
merge(left, right, all=TRUE)
# If we want to retain everything in the left data frame and merge only what
# matches in the right data frame, we specify all.x=TRUE. This is known as a
# LEFT JOIN.
merge(left, right, all.x=TRUE)
# If we want to retain everything in the right data frame and merge only what
# matches in the left data frame, we specify all.y=TRUE. This is known as a
# RIGHT JOIN.
merge(left, right, all.y=TRUE)
# When merging two data frames that do not have matching column names, we can
# use the by.x and by.y arguments to specify columns to merge on.
# Let's say we want to merge the first and left data frames by x0 and id. The
# by.x and by.y arguments specify which columns to use for merging.
first
left
merge(first, left, by.x="x0", by.y="id")
# Notice the merged data frame has an "x0" column, not an "id" column. And this
# of course is an inner join.
# Let's try to merge the second and left by x0 and id:
second
left
merge(second, left, by.x="x0", by.y="id")
# There are no matches, so no merging happens. What if we don't specify columns
# to merge on?
merge(second, left)
# We get a "cartesian product"; every possible combination of rows.
# Dimension of result:
c(nrow(second)*nrow(left), ncol(second) + ncol(left))
# match(), %in%, intersect(), union(), setdiff(), setequal() --------------
# Sometimes we don't actually want to merge data but rather just find out which
# records they have in common. We can use the match() and intersect() functions
# and the %in% operator. Let's once again create some data to demonstrate:
set.seed(111) # this ensures we get the same random numbers
alot <- round(runif(100,1,1000)) # 100 numbers from interval [1,1000]
alot
few <- round(runif(10,1,1000)) # 10 numbers from interval [1,1000]
few
# Are there any values in "few" also in "alot"?
# First let's use match(). The basic syntax of match() is match(x, table) where
# x is the values to be matched and table is the values to be matched against.
# This asks the question: "do any values in 'few' match values in 'alot', and if
# so, which indices do they match?"
match(few, alot)
# this says the 1st, 6th and 8th values of the "few" vector matches the 45th,
# 69th and 77th values of the "alot" vector.
# The %in% operator is perhaps more intutive. It returns a logical vector.
few %in% alot
# This says the 1st, 6th and 8th values of the "few" vector match values of the
# "alot" vector. Notice it doesn't return the actual number. But we could do
# this:
few[few %in% alot]
# Using %in% means we can easily count the number of matches:
sum(few %in% alot)
# intersect() returns values in the first AND second vector:
intersect(few,alot)
# union() returns values in the first OR second vector:
union(few, alot)
# setdiff() returns the list of items in the first vector not in the 2nd vector:
setdiff(few, alot)
# setequal() asks if both vectors are equal and returns TRUE or FALSE:
setequal(few, alot)
# Note from documentation: "Each of union, intersect, setdiff and setequal will
# discard any duplicated values in the arguments"
# Reshaping Data ----------------------------------------------------------
# It's often helpful to think of data as "wide" or "long". When there are
# multiple occurrences of values for a single observation in one row, the data
# is said to be wide. When there are multiple occurrences of values for a single
# observation in multiple rows, the data is said to be long.
# Examples:
wide <- data.frame(name=c("Clay","Garrett","Addison"),
test1=c(78, 93, 90),
test2=c(87, 91, 97),
test3=c(88, 99, 91))
wide
long <- data.frame(name=rep(c("Clay","Garrett","Addison"),each=3),
test=rep(1:3, 3),
score=c(78, 87, 88, 93, 91, 99, 90, 97, 91))
long
##############
# wide to long
##############
# Many R functions require data in "long" format in order to perform
# calculations on, or create graphs of, the data. Therefore it's important to
# know how to reshape data from wide to long. A very popular package for this
# task is the reshape2 package. If you don't already have it, please install it:
# install.packages("reshape2)
library(reshape2)
# The star function of the reshape2 package is melt(). It basically "melts" wide
# data into long format. The basic syntax is melt(data, id.vars, measure.vars),
# where "data" is your data frame, "id.vars" are the ID variables (ie, variables
# that will still have their own column after reshaping) and "measure.vars" are
# the variables that are getting "melted". Column headers of the "measure.vars"
# become a single variable in the melted data frame as does the values under
# those column headers. This is best explained with an example.
# To make our "wide" data frame long
wide
melt(wide, id.vars = "name", measure.vars = c("test1","test2","test3"))
# Notice the "test" column headers in wide are now in a column called
# "variable", and the values under the "test" columns in wide are now in a
# single column called "value". We can provide our own names for those columns
# using the optional "variable.name" and "value.name" arguments, like so:
melt(wide, id.vars = "name", measure.vars = c("test1","test2","test3"),
variable.name = "test", value.name="score")
# It should be noted that you can melt a data frame without explicitly using the
# measure.vars argument. If you leave it blank, melt will use all the variables
# not named in the id.vars argument:
melt(wide, "name")
# As another example, consider the airquality dataset that comes with R.
head(airquality)
# Notice there are multiple measurements on each day in a single row. Hence we
# can think of this data as "wide". To make it "long" we require that each
# measurement have its own row with a column to identify which measurement it
# is. We can do this with melt as follows:
aqLong <- melt(airquality, id.vars=c("Month", "Day"),
variable.name = "Measurement", value.name="Reading")
head(aqLong)
# Data in long format makes summaries like this easy.
with(aqLong, tapply(Reading, list(Month, Measurement), mean,na.rm=TRUE))
# There is a reshape() function that comes with base R, but I find melt much
# easier to use. Here is how to use reshape() to do what we just did with
# melt(). Notice we need to use three extra arguments.
aqLong2 <- reshape(airquality, idvar=c("Month","Day"), times=names(airquality)[1:4],
timevar = "Measurement", v.names="Reading",
varying = list(names(airquality)[1:4]), direction="long")
head(aqLong2)
rm(aqLong2)
# Let's subset and reshape the election data to long format to facilitate a bar
# graph.
names(electionData)[1] <- "State"
edSub <- subset(electionData, select=c("State", "Obama Democratic", "Romney Republican", "Elec Vote D"))
edSub$Winner <- ifelse(is.na(edSub$"Elec Vote D"),"Romney","Obama")
edSub$"Elec Vote D" <- NULL
edSub <- melt(edSub, id.vars = c("State","Winner"), value.name="Votes",
variable.name="Candidate")
head(edSub)
library(ggplot2)
library(scales) # for the comma function
ggplot(edSub, aes(x=State, y=Votes, group=Candidate, fill=Candidate)) +
geom_bar(position="dodge", stat="identity") + facet_wrap(~Winner) +
scale_fill_manual(values=c("blue","red")) + scale_y_continuous(labels=comma) +
coord_flip()
##############
# long to wide
##############
# The dcast() function can reshape a long data frame to wide. First we'll
# demonstrate and then explain.
# Let's reshape our aqLong data frame back to its original wide format.
aqOrig <- dcast(aqLong, Month + Day ~ Measurement, value.var = "Reading")
head(aqOrig)
# A good way to think of the dcast() function is to imagine it creating a matrix
# from the melted (long) data. The formula specifes the rows and columns where
# the LHS is the "rows" and the RHS is the "columns"; the value.var specifies
# what goes into the interior of the matrix.
# We can also use the dcast function to calculate summaries such as means by
# supplying an aggregation function as the third argument. For example, we can
# find the mean measurement by month:
dcast(aqLong, Month ~ Measurement, mean, na.rm=TRUE,
value.var = "Reading")
# Notice the result is a data frame
# We can also make our own function to count the number of missing measurements
# by month:
dcast(aqLong, Month ~ Measurement, function(x)sum(is.na(x)),
value.var = "Reading")
# See the examples for help(cast) for more complex examples.
# tidyr -------------------------------------------------------------------
# Tidy data is a concept put forth in Hadley Wickham's 2014 paper, Tidy Data
# (http://www.jstatsoft.org/v59/i10/). To quote the abstract: "Tidy datasets are
# easy to manipulate, model and visualize, and have a specific structure: each
# variable is a column, each observation is a row, and each type of
# observational unit is a table."
# Hadley created a package called tidyr to help tidy R data frames. Among other
# things it can be used to reshape data. The two main functions are gather() and
# spread().
# install.packages("tidyr")
library(tidyr)
# gather ------------------------------------------------------------------
# gather is sort of like melt. It can make a wide data set long.
# Documentation description: Gather columns into key-value pairs.
# Syntax: gather(data, key, value, columns to gather) where data is your data
# frame, key is the name of the new key column, value is the name of the new
# value column, and the last part is names or numeric indices of columns to
# collapse (or to exclude from collapsing).
# Let's use gather() on the airquality data
aqLong2 <- gather(airquality, key = Measurement, value = Reading, -Month, -Day)
# Notice gather() handles character data differently than melt()!
str(aqLong$Measurement)
str(aqLong2$Measurement)
# Let's compare the syntax from melt and gather:
# melt(airquality, id.vars=c("Month", "Day"),
# variable.name = "Measurement", value.name="Reading")
#
# gather(airquality, key = Measurement, value = Reading, -Month, -Day)
# The big difference is that in melt you identify the id.vars, the columns that
# will remain in the long data set after conversion from wide. In the gather
# function, you indicate which columns are being "gathered", either explicitly
# or by not excluding them.
# We see that the key and value arguments in gather correspond to the
# variable.name and value.name arguments in melt. We also don't need to quote
# variable names in gather.
# spread ------------------------------------------------------------------
# spread is sort of like dcast. It can make a long data set wide.
# Documentation description: Spread a key-value pair across multiple columns.
# Basic syntax: spread(data, key, value) where data is a data frame, key is name
# of the column with the (unique) values you want turned into column headers,
# and value is the name of the column that has the values you want placed under
# your new column headers.
# Let's use spread() on the aqLong2 data
head(aqLong2)
# I want the unique values of Measurement to become column headers. I want the
# corresponding values in the Reading column to go under the new column headers.
aqOrig2 <- spread(aqLong2, key = Measurement, value = Reading)
# Does this return the same wide data frame as dcast()? Not quite.
names(aqOrig)
names(aqOrig2)
# The last two columns are swapped because dcast had to deal with Measure as a
# factor (and it's associated ordering of levels) whereas spread had to deal
# with Measure as a character vector and thus determined order of columsn
# alphabetically.
# Let's compare the syntax from dcast and spread:
# dcast(aqLong, Month + Day ~ Measurement, value.var = "Reading")
#
# spread(aqLong2, key = Measurement, value = Reading)
# The big difference is that spread() doesn't require a formula. You just
# indicate the column(s) you want to "spread" out.
# We also see that the value argument in spread() corresponds to the value.var
# argument in dcast(). And spread() doesn't require quoting variable names.
# Another major difference is that dcast() allows you to supply an aggregation
# function for generating summary statistics. spread() is just spreading out
# key-value pairs across multiple columns.
# tidyr helper functions --------------------------------------------------
# tidyr includes a few helper functions. One that I really like is
# extract_numeric().
# This uses a regular expression to strip all non-numeric characters from a
# string and then coerces the result to a number. This strips all non-numeric
# characters from a string and then coerces the result to a number.
extract_numeric("$1,200.34")
extract_numeric("-2%")
# Let's generate some dollar amounts
money <- dollar(round(runif(100,100,200),2)) # dollar() function from scales package
money
typeof(money)
extract_numeric(money)
typeof(extract_numeric(money))
# The heuristic is not perfect - it won't fail for things that clearly aren't
# numbers
extract_numeric("12abc34")
# Another helper function that may come in handy is separate().
# Given either regular expression or a vector of character positions, separate()
# turns a single character column into multiple columns. The default separation
# value is a regular expression that matches any sequence of non-alphanumeric
# values.
df <- data.frame(x = c("a.b", "a.d", "b.c"))
df
# split column x into two new columns called A and B
separate(df, x, c("A", "B"))
# Example: separate() can be useful for splitting times into components.
# Create a place holder data frame
dat <- data.frame(i=1:10, time=character(10), stringsAsFactors = F)
# loop through 10 iterations of logging the system time
for(i in 1:10){
dat[i,2] <- format(Sys.time(), format = "%H:%M:%OS3")
# %OS3 = fractional seconds to 3 places; see ?strptime
Sys.sleep(0.01) # delay 0.01 seconds
}
dat
separate(dat, time, c("H","M","S","FS"))
# save data for next set of lecture notes
save(list=c("electionData", "weather", "arrests", "allStocks"), file="../data/datasets_L05.Rda")