-
Notifications
You must be signed in to change notification settings - Fork 150
/
Copy pathrun_analysis.R
138 lines (103 loc) · 5.22 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
##############################################################################
#
# FILE
# run_analysis.R
#
# OVERVIEW
# Using data collected from the accelerometers of the Samsung Galaxy S
# smartphone, merge the training and test sets, keep the mean and standard
# deviation measurements, average them per subject and activity, and write
# the resulting tidy data set to a file named "tidy_data.txt".
# See README.md for details.
#
library(dplyr)
##############################################################################
# STEP 0A - Get data
##############################################################################
# remote location of the archive, and the local names used for the zip file
# and for the directory it unpacks into
zipUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
zipFile <- "UCI HAR Dataset.zip"
dataPath <- "UCI HAR Dataset"
# fetch the archive only when there is no local copy yet; the file is written
# in binary mode ("wb")
if (!file.exists(zipFile)) {
  download.file(url = zipUrl, destfile = zipFile, mode = "wb")
}
# unpack the archive only when the data directory is not already present
if (!file.exists(dataPath)) {
  unzip(zipfile = zipFile)
}
##############################################################################
# STEP 0B - Read data
##############################################################################
# training set: subjects, measured values and activity codes
trainingSubjects <- read.table(
  file = file.path(dataPath, "train", "subject_train.txt")
)
trainingValues <- read.table(
  file = file.path(dataPath, "train", "X_train.txt")
)
trainingActivity <- read.table(
  file = file.path(dataPath, "train", "y_train.txt")
)
# test set: same three tables, read from the "test" directory
testSubjects <- read.table(
  file = file.path(dataPath, "test", "subject_test.txt")
)
testValues <- read.table(
  file = file.path(dataPath, "test", "X_test.txt")
)
testActivity <- read.table(
  file = file.path(dataPath, "test", "y_test.txt")
)
# feature list; keep the text labels as character strings rather than factors
## note: the feature names (second column) are not unique,
## e.g. fBodyAcc-bandsEnergy()-1,8
features <- read.table(
  file = file.path(dataPath, "features.txt"),
  stringsAsFactors = FALSE
)
# activity lookup table, with its two columns given explicit names
activities <- read.table(file = file.path(dataPath, "activity_labels.txt"))
names(activities) <- c("activityId", "activityLabel")
##############################################################################
# Step 1 - Merge the training and the test sets to create one data set
##############################################################################
# stack each pair of tables (training rows first, then test rows) and place
# the three stacked tables side by side: subjects, values, activity
humanActivity <- cbind(
  rbind(trainingSubjects, testSubjects),
  rbind(trainingValues, testValues),
  rbind(trainingActivity, testActivity)
)
# the six source tables are no longer needed; release their memory
rm(list = c(
  "trainingSubjects", "trainingValues", "trainingActivity",
  "testSubjects", "testValues", "testActivity"
))
# name the columns: subject first, one column per feature, activity last
names(humanActivity) <- c("subject", features[[2]], "activity")
##############################################################################
# Step 2 - Extract only the measurements on the mean and standard deviation
# for each measurement
##############################################################################
# flag the subject and activity columns plus every column whose name
# contains "mean" or "std"
columnsToKeep <- grepl(
  pattern = "subject|activity|mean|std",
  x = names(humanActivity)
)
# discard every column that was not flagged
humanActivity <- humanActivity[, columnsToKeep]
##############################################################################
# Step 3 - Use descriptive activity names to name the activities in the data
# set
##############################################################################
# turn the activity codes into a factor whose levels are the ids from the
# activity lookup table and whose labels are the matching activity names
humanActivity[["activity"]] <- factor(
  x = humanActivity[["activity"]],
  levels = activities[["activityId"]],
  labels = activities[["activityLabel"]]
)
##############################################################################
# Step 4 - Appropriately label the data set with descriptive variable names
##############################################################################
# build the new labels by applying an ordered list of (pattern, replacement)
# rules to the current column names; the rules run top to bottom
humanActivityCols <- local({
  substitutions <- list(
    # strip parentheses and hyphens
    c("[\\(\\)-]", ""),
    # expand the leading domain prefix
    c("^f", "frequencyDomain"),
    c("^t", "timeDomain"),
    # spell out abbreviations
    c("Acc", "Accelerometer"),
    c("Gyro", "Gyroscope"),
    c("Mag", "Magnitude"),
    c("Freq", "Frequency"),
    c("mean", "Mean"),
    c("std", "StandardDeviation"),
    # collapse the doubled "BodyBody" found in some source names
    c("BodyBody", "Body")
  )
  labels <- names(humanActivity)
  for (rule in substitutions) {
    labels <- gsub(rule[1], rule[2], labels)
  }
  labels
})
# install the cleaned-up labels as the column names
names(humanActivity) <- humanActivityCols
##############################################################################
# Step 5 - Create a second, independent tidy set with the average of each
# variable for each activity and each subject
##############################################################################
# Average every measurement column within each subject/activity pair.
# summarise(across(everything(), mean)) replaces the deprecated
# summarise_each(funs(mean)); it requires dplyr >= 1.0.0. Grouping columns
# are excluded from everything() automatically. .groups = "drop_last" leaves
# the result grouped by subject, exactly as the previous call did, and
# avoids the regrouping message summarise() would otherwise print.
humanActivityMeans <- humanActivity %>%
  group_by(subject, activity) %>%
  summarise(across(everything(), mean), .groups = "drop_last")
# write the averaged table to "tidy_data.txt" without row names or quotes
write.table(humanActivityMeans, "tidy_data.txt", row.names = FALSE,
  quote = FALSE)