GCD_CourseProject/run_analysis.R at master · marioem/GCD_CourseProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#################################################################################
#
#  Filename: run_analysis.R
#   Version: 1.0.1
#      Date: 2015.08.23
#    Author: Mariusz Musiał
# Rev. Info: 1.0 - initial version of the script
#            1.0.1 - version info added. Some spelling and white space cleanup
#
# This script generates a tidy data set from the data collected during
# the project "Human Activity Recognition Using Smartphones Data Set". It forms
# part of the "Getting and Cleaning Data" course project offered by JHU through
# Coursera.com. More data about the "Human Activity Recognition Using Smartphones
# Data Set" project can be found at http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones#
#
# According to instructions set out in the course project description, this script
# does the following:
#
# 1. Merges the training and the test sets to create one data set.
# 2. Extracts only the measurements on the mean and standard deviation for each
#    measurement.
# 3. Uses descriptive activity names to name the activities in the data set
# 4. Appropriately labels the data set with descriptive variable names.
# 5. From the data set in step 4, creates a second, independent tidy data set
#    with the average of each variable for each activity and each subject.
#
#         Input: the original data structure extracted from
#                https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
#                into the working directory, where also this script should
#                be present.
#        Output: file "UCItidyDataMeans.txt" stored in "UCI HAR Dataset" directory
#     Execution: run this script from your current working directory using
#                source("run_analysis.R")
# Prerequisites: 1) this script is located in your current working directory
#                2) UCI HAR Dataset is extracted from the downloaded zip file
#                   into your current working directory. This would result in a
#                   "UCI HAR Dataset" data directory in current working directory.
#                3) The system on which this script is run does not require
#                   escaping of spaces in the file names passed to R functions
#                4) dplyr package is installed on your system
#
# The tidy data set is written using write.table function with row.names = FALSE.
# It can be read back using
#        read.table("./UCI HAR Dataset/UCItidyDataMeans.txt", header = T)
#
#################################################################################


# dplyr is mandatory for this script. requireNamespace() only probes whether
# the package can be loaded, so a missing installation produces the single
# explicit error below; library() then attaches it for the rest of the script.
if (!requireNamespace("dplyr", quietly = TRUE)) {
    stop("Please install dplyr package.")
}
library("dplyr")

# Check if the directory with data exists. It is assumed that if the top level
# directory exists, then all the corresponding directory and file structure
# exists, too.
if (!dir.exists("./UCI HAR Dataset")) {
    stop("Data directory structure \"UCI HAR Dataset\" missing")
}

cat("Reading UCI HAR Dataset. This will take a while...\n")

# Paths of the input files, all relative to the current working directory.
# file.path() joins its arguments with "/", so every value below starts with
# "./UCI HAR Dataset/".

# Feature data (test and train partitions)
testFile <- file.path(".", "UCI HAR Dataset", "test", "X_test.txt")
trainFile <- file.path(".", "UCI HAR Dataset", "train", "X_train.txt")

# Feature names, i.e. the column names for the feature data
featureNamesFile <- file.path(".", "UCI HAR Dataset", "features.txt")

# Subject IDs: which subject each row of the feature data comes from
subTestFile <- file.path(".", "UCI HAR Dataset", "test", "subject_test.txt")
subTrainFile <- file.path(".", "UCI HAR Dataset", "train", "subject_train.txt")

# Activity IDs: which activity each row of the feature data comes from
activityIdTestFile <- file.path(".", "UCI HAR Dataset", "test", "y_test.txt")
activityIdTrainFile <- file.path(".", "UCI HAR Dataset", "train", "y_train.txt")

# Activity names
activityNameFile <- file.path(".", "UCI HAR Dataset", "activity_labels.txt")

# Read the feature names and use them as column names of both feature tables.
# The logical constants are spelled out as TRUE/FALSE: T and F are ordinary
# variables that could have been rebound in the workspace this script is
# sourced into.
featureNames <- read.table(featureNamesFile, col.names = c("id", "name"),
                           stringsAsFactors = FALSE)
featureData <- read.table(testFile, col.names = featureNames$name,
                          stringsAsFactors = FALSE)
tmpData <- read.table(trainFile, col.names = featureNames$name,
                      stringsAsFactors = FALSE)

# Join the test and train feature data (in this order) by rows

cat("Merging data...\n")

featureData <- rbind(featureData, tmpData)
rm("tmpData")

# Join the subject IDs for test and train files, in the same test-then-train
# order as the feature data so that rows keep lining up
subject <- read.table(subTestFile, col.names = "SubjectID",
                      stringsAsFactors = FALSE)
subject <- rbind(subject,
                 read.table(subTrainFile, col.names = "SubjectID",
                            stringsAsFactors = FALSE))

# Join the activity IDs for test and train files, again test first
activity <- read.table(activityIdTestFile, col.names = "ActivityID",
                       stringsAsFactors = FALSE)
activity <- rbind(activity,
                  read.table(activityIdTrainFile, col.names = "ActivityID",
                             stringsAsFactors = FALSE))

# Read the activity name file (activity ID to activity name lookup)
activityName <- read.table(activityNameFile,
                           col.names = c("ActivityID", "ActivityName"),
                           stringsAsFactors = FALSE)

# Extract from featureData only measurements on the mean and standard dev on
# each measurement. As indicated in the CodeBook the angle variables are not
# considered means on other measurements, so columns starting with "angle"
# are dropped again.
cat("Extracting means and standard deviations...\n")
featureDataSlim <- select(featureData,
                          matches("mean|std", ignore.case = TRUE),
                          -starts_with("angle"))
rm(featureData) # we don't need it any more, free the memory

# Translate activity IDs into activity names. left_join() keeps the rows of
# its first argument in their original order, so the names stay aligned with
# the feature rows.
activity <- left_join(activity, activityName, by = "ActivityID")

# cbind() onto a data frame recycles a shorter vector when its length divides
# the number of rows, which would silently mislabel observations. Refuse to
# continue unless all three pieces have the same number of rows.
if (nrow(subject) != nrow(featureDataSlim) ||
    nrow(activity) != nrow(featureDataSlim)) {
    stop("Subject, activity and feature data have different numbers of rows")
}

# Add the SubjectID and ActivityName columns in front of the measurements
featureDataSlim <- cbind(SubjectID = subject$SubjectID,
                         ActivityName = activity$ActivityName,
                         featureDataSlim)

rm(activity, activityName, subject, featureNames)

# Group resulting feature data by subject and activity and calculate the mean
# of every remaining column. summarise_all() is used in place of the
# deprecated summarise_each(funs(mean)); ungroup() drops the grouping that
# summarising leaves behind so tidyData is a plain table.

cat("Assembling tidy data set...\n")
tidyData <- featureDataSlim %>%
    group_by(SubjectID, ActivityName) %>%
    summarise_all(mean) %>%
    ungroup()

# Clean up a bit the variable names

nicerNames <- names(tidyData)
# Each run of three dots becomes a single dot
nicerNames <- gsub("\\.\\.\\.", "\\.", nicerNames)
# Any remaining pair of dots is removed
nicerNames <- gsub("\\.\\.", "", nicerNames)
# Replace the doubled "BodyBody" with "Body"
nicerNames <- gsub("BodyBody", "Body", nicerNames)

colnames(tidyData) <- nicerNames

rm(featureDataSlim)

# Write the tidy data to a file

cat("Writing tidy data to a file...\n")

tidyFile <- "./UCI HAR Dataset/UCItidyDataMeans.txt"

# Remove a file left over from a previous run before writing the new one.
if (file.exists(tidyFile)) {
    # warning("Tidy data file exists. Overwriting!")
    file.remove(tidyFile) # We assume here that the file is not write-protected
}

# row.names = FALSE (spelled out, since F is an ordinary rebindable variable)
# keeps row names out of the file so that it can be read back with
# read.table(..., header = TRUE).
write.table(tidyData, tidyFile, row.names = FALSE)

cat("Done\n")