adeb09 / GettingAndCleaningProject

Getting and Cleaning Data Project

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

README

This file shows the script from run_analysis.R but with comments explaining the program and logic behind each statement or block of code.

library(hash)
library(plyr)

#Fetch the zipped dataset and store it in the working directory as data.zip
fileUrl = 'https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip'
download.file(url = fileUrl, destfile = 'data.zip')

#List the paths stored inside the archive (list = TRUE extracts nothing to disk)
files = unzip(zipfile = 'data.zip', list = TRUE)[['Name']]

#Show the listing; the positions used below index into this vector
files

#Read one archive member, chosen by its position in the listing above,
#straight from the zip into a data frame
readMember = function(position) {
        read.table(unz('data.zip', files[position]))
}

#Load only the members the analysis needs.
#NOTE(review): the positions are tied to the order of this particular archive's
#listing - check them against the printed `files` vector if the archive changes
activityLabels = readMember(1)
features = readMember(2)

#Test set: subject ids, measurements, activity ids
subjectsTest = readMember(16)
XTest = readMember(17)
YTest = readMember(18)

#Training set: subject ids, measurements, activity ids
subjectsTrain = readMember(30)
XTrain = readMember(31)
YTrain = readMember(32)

#Name the measurement columns after the second column of the features table;
#both measurement tables receive the same names
names(XTest) = as.character(features$V2)
names(XTrain) = names(XTest)

#The single-column tables get fixed, descriptive names
names(subjectsTest) = 'Subject'
names(YTest) = 'Activity'
names(subjectsTrain) = 'Subject'
names(YTrain) = 'Activity'

#Keep only the mean() and std() measurements.
#The parentheses are escaped so they match literally. Unescaped, '()' is an
#empty regex group, so 'mean()' matches any name containing 'mean' - including
#the meanFreq() measurements, which are not strictly mean or standard deviation
#measurements and would then need a second pass to remove.
subsetIndices = grep('(mean|std)\\(\\)', as.character(features$V2))

#drop = FALSE keeps the result a data frame even if only one column matches
XTest = XTest[, subsetIndices, drop = FALSE]
XTrain = XTrain[, subsetIndices, drop = FALSE]

#Create a hash that maps the position of each row of the activityLabels data
#frame ('1', '2', ...) to the label in its second column. The keys are derived
#from the number of rows instead of a hard-coded 1:6.
h = hash(seq_len(nrow(activityLabels)), as.character(activityLabels$V2))

#Translate a vector of activity ids into their labels using the hash above.
#vapply builds the whole character vector in one step, so the column is never
#coerced element by element, and empty input yields character(0) rather than
#looping over 1:0. The character(1) template means an id with no label stops
#the script instead of being passed through silently.
labelActivity = function(ids) {
        vapply(as.character(ids), function(key) h[[key]], character(1),
               USE.NAMES = FALSE)
}

#Label the Activity column of both the Test and the Train data
YTest$Activity = labelActivity(YTest$Activity)
YTrain$Activity = labelActivity(YTrain$Activity)

#Build each set column-wise: Subject first, then Activity, then the measurements
Test = cbind(subjectsTest, YTest, XTest)
Train = cbind(subjectsTrain, YTrain, XTrain)

#Stack the two sets into a single data set, Train rows first and Test rows after
dataset = rbind(Train, Test)

#Use the plyr package (ddply + numcolwise) to average every numeric measurement
#within each Subject/Activity group
tidyDataSet = ddply(.data = dataset,
                    .variables = .(Subject, Activity),
                    .fun = numcolwise(mean))

#Save the tidy data set as a text file, without row names
write.table(x = tidyDataSet, file = 'tidyDataSet.txt', row.names = FALSE)

About

Getting and Cleaning Data Project


Languages

Language:R 100.0%