In this step we will analyze the data in order to establish trends, and relationships, thus providing insights that will help answer business questions.
Key Tasks:
For Python analysis in Jupyter Notebook: https://eamoned.github.io/google-data-analytics-casestudy-python-analyse/
A full report on processing the datasets (and final transformations) can be found here: https://eamoned.github.io/google-data-analytics-casestudy-process/
library(ggplot2)
library(Hmisc)
library(dplyr)
library(patchwork)
library(reshape2)
library(gridExtra)
library(ggcorrplot)
library(reshape2)
library(ggbeeswarm)
library("tidyverse")
dailyAct <- read.csv("clean_data/dailyActivity.csv")
str(dailyAct)
## 'data.frame': 940 obs. of 14 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : chr "2016-04-12" "2016-04-13" "2016-04-14" "2016-04-15" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
dim(dailyAct)
## [1] 940 14
summary(dailyAct)
## Id ActivityDate TotalSteps TotalDistance
## Min. :1.504e+09 Length:940 Min. : 0 Min. : 0.000
## 1st Qu.:2.320e+09 Class :character 1st Qu.: 3790 1st Qu.: 2.620
## Median :4.445e+09 Mode :character Median : 7406 Median : 5.245
## Mean :4.855e+09 Mean : 7638 Mean : 5.490
## 3rd Qu.:6.962e+09 3rd Qu.:10727 3rd Qu.: 7.713
## Max. :8.878e+09 Max. :36019 Max. :28.030
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## Min. :0.0000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.0000 Median : 0.210 Median :0.2400
## Mean :0.1082 Mean : 1.503 Mean :0.5675
## 3rd Qu.:0.0000 3rd Qu.: 2.053 3rd Qu.:0.8000
## Max. :4.9421 Max. :21.920 Max. :6.4800
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## Min. : 0.000 Min. :0.000000 Min. : 0.00
## 1st Qu.: 1.945 1st Qu.:0.000000 1st Qu.: 0.00
## Median : 3.365 Median :0.000000 Median : 4.00
## Mean : 3.341 Mean :0.001606 Mean : 21.16
## 3rd Qu.: 4.782 3rd Qu.:0.000000 3rd Qu.: 32.00
## Max. :10.710 Max. :0.110000 Max. :210.00
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8 1st Qu.:1828
## Median : 6.00 Median :199.0 Median :1057.5 Median :2134
## Mean : 13.56 Mean :192.8 Mean : 991.2 Mean :2304
## 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5 3rd Qu.:2793
## Max. :143.00 Max. :518.0 Max. :1440.0 Max. :4900
print(paste0("Number of unique Ids: " ,length(unique(dailyAct$Id))))
## [1] "Number of unique Ids: 33"
print(paste0("There are " ,length(unique(dailyAct$ActivityDate)), " days of data"))
## [1] "There are 31 days of data"
print(paste0("Are there any NAs = ",any(is.na(dailyAct))))
## [1] "Are there any NAs = FALSE"
# create data frame for activity percentage (Distance)
# total distance for each level
Level <- c("sedentary", "light", "moderate", "very")
Distance <- c(sum(dailyAct$SedentaryActiveDistance),
sum(dailyAct$LightActiveDistance),
sum(dailyAct$ModeratelyActiveDistance),
sum(dailyAct$VeryActiveDistance))
# Plot Pie chart for Distance
piepercent<- round(100*(Distance/sum(Distance)), 2)
pie(Distance, radius=1, labels = piepercent, col = rainbow(length(Distance)),
main = "Activity Level: 12/04/2016-12/05/2026 (Distance)", cex.main = 1)
legend("topright", Level,fill = rainbow(length(Distance)))
# Create data frame for activity percentage (Minutes)
# Total Minutes for each Level
Level2 <- c('sedentary','fairly', 'lightly', 'very')
Minutes <- c(sum(dailyAct$SedentaryMinutes),
sum(dailyAct$LightlyActiveMinutes),
sum(dailyAct$VeryActiveMinutes),
sum(dailyAct$FairlyActiveMinutes))
# Plot Pie Chart for Minutes
piepercent<- round(100*(Minutes/sum(Minutes)), 2)
pie(Minutes, radius=1, labels = piepercent, col = rainbow(length(Minutes)),
main = "Activity Level: 12/04/2016-12/05/2026 (Minutes)", cex.main = 1)
legend("topright",Level2, cex = 0.8,fill = rainbow(length(Minutes)))
For Light, moderate and very active, zero distance is not relevant so lets remove zeros (has some impact on mean)
Removing zeros distance will make a difference to the mean
activityDist <- subset(dailyAct, select=c("ActivityDate", "LightActiveDistance","ModeratelyActiveDistance", "VeryActiveDistance"))
activityDist <- melt(activityDist, id.vars = "ActivityDate", variable.name="level")
# remove zeros
activityDist[activityDist == 0] <- NA
activityMins <- subset(dailyAct,select=c("ActivityDate","LightlyActiveMinutes",
"FairlyActiveMinutes", "VeryActiveMinutes"))
activityMins <- melt(activityMins, id.vars = "ActivityDate", variable.name="level")
# remove zeros
activityMins[activityMins == 0] <- NA
z1 <- ggplot(activityDist, aes(x=level,y=value, fill=level)) +
geom_boxplot() +
labs(title="Distance Distribution by Active Level", x="") +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
z2 <- ggplot(activityMins, aes(x=level,y=value, fill=level)) +
geom_boxplot() +
labs(title="Minutes Distribution by Active Level", x="") +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
grid.arrange(z1, z2, nrow = 1)
## [1] "Average Daily Distance = 5.48970212191541"
## [1] "Average Daily Steps = 7637.91063829787"
## [1] "Average Daily Calories = 2303.60957446809"
## [1] "Percentage light active distance: 60.8561097559676"
## [1] "Percentage moderate active distance: 10.3383123303724"
## [1] "Percentage very active distance: 27.3727210990392"
## [1] "Percentage sedentary active distance: 0.0292617508384631"
## [1] "Percentage lightly active minutes:84.7370143531722"
## [1] "Percentage fairly active minutes: 5.96147552480247"
## [1] "Percentage very active minutes: 9.30151012202534"
Activity Levels:
Just over 61% of total distance achieved is through light active activities. This is followed by 28% very active distance activities. Only 10% is achieved through moderate activities. Perhaps there’s an opportunity to encourage people to plan & set goals to achieve higher activity levels which will in turn burn more calories and improve fitness.
In percentage terms for minutes spent on activities, Lightly has almost 85%, followed by very and then fairly active.
# Calories
a1 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = Calories))+
geom_line(stat="identity") +
ggtitle("Average Calories Burned") +
labs(x="", y="Calories") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
# Distance
a21 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = LightActiveDistance))+
geom_line(stat="identity", color = "brown") +
ggtitle('Light Active Distance') +
labs(x="", y="Average Distance") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
a22 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = ModeratelyActiveDistance))+
geom_line(stat="identity", color = "blue") +
ggtitle('Moderately Active Distance') +
labs(x="", y="Average Distance") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
a23 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = VeryActiveDistance))+
geom_line(stat="identity", color = "red") +
ggtitle('Very Active Distance') +
labs(x="", y="") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
# Minutes
a24 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = LightlyActiveMinutes))+
geom_line(stat="identity", color = "brown") +
ggtitle('Lightly Active Minutes') +
labs(x="", y="Average Minutes") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
a25 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = FairlyActiveMinutes))+
geom_line(stat="identity", color = "blue") +
ggtitle('Fairly Active Minutes') +
labs(x="", y="Average Minutes") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
a26 <- ggplot(dailyAct, aes(x=as.character(ActivityDate), y = VeryActiveMinutes))+
geom_line(stat="identity", color = "red") +
ggtitle('Very Active Minutes') +
labs(x="", y="Average Minutes") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
grid.arrange(a1, a21, nrow = 1)
grid.arrange(a22, a23, nrow = 1)
grid.arrange(a24, a25, nrow = 1)
grid.arrange(a26, nrow = 1)
To support visualisations, lets format the datetime, and create a DayName column from the datetime.
# change to date format
dailyAct$ActivityDate <- as.POSIXct(dailyAct$ActivityDate, format="%Y-%m-%d")
# order the date
dailyAct <-dailyAct[order(dailyAct$ActivityDate),]
# Create a DayName column from the datetime
dailyAct$DayName <- weekdays(dailyAct$ActivityDate)
unique(dailyAct$DayName)
## [1] "Tuesday" "Wednesday" "Thursday" "Friday" "Saturday" "Sunday"
## [7] "Monday"
# change DayName to a factor, i.e. for day order - Tues, Wed, etc)
dailyAct$DayName <- factor(dailyAct$DayName, levels = c("Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday","Sunday","Monday"))
# Calculate average/median Calories for Day Name
calMean <- dailyAct %>%
group_by(DayName) %>%
summarise(meanCals = round(mean(Calories),0),
medianCals = round(median(Calories),0))
# Calculate average/median Steps for Day Name
stepsMean <- dailyAct %>%
group_by(DayName) %>%
summarise(meanSteps = round(mean(TotalSteps),0),
medianSteps = round(median(TotalSteps),0))
# Calculate Average/median Distance for Day Name
distMean <- dailyAct %>%
group_by(DayName) %>%
summarise(sumDist = round(sum(TotalDistance),0),
meanDist = round(mean(TotalDistance),0),
medianDist = round(median(TotalDistance),0))
b1 <- ggplot(calMean, aes(x=DayName, y=medianCals)) +
geom_bar(stat="identity", fill="lightblue") +
geom_line(aes(x=DayName,y=meanCals,group=1), stat="identity", color = "green") +
geom_point(color = "green", aes(x=DayName,y=meanCals)) +
theme(legend.position = "none") +
geom_text(size=3,aes(label=meanCals, x=DayName, y=meanCals), colour="black") +
ggtitle("Median/Mean(line) Calories Burned per Day")+
theme(plot.title = element_text(size = 10)) +
ylab("Calories") +
xlab("") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
b2 <- ggplot(stepsMean, aes(x=DayName, y=medianSteps)) +
geom_bar(stat="identity", fill="lightblue") +
geom_line(aes(x=DayName,y=meanSteps,group=1), stat="identity", color = "green") +
geom_point(color = "green", aes(x=DayName,y=meanSteps)) +
theme(legend.position = "none") +
geom_text(size=3,aes(label=meanSteps, x=DayName, y=meanSteps), colour="black") +
ggtitle("Median/Mean(line) Steps per Day")+
theme(plot.title = element_text(size = 10)) +
ylab("Steps") +
xlab("") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
b3 <- ggplot(distMean, aes(x=DayName, y=medianDist)) +
geom_bar(stat="identity", fill="lightblue") +
geom_line(aes(x=DayName,y=meanDist,group=1), stat="identity", color = "green") +
geom_point(color = "green", aes(x=DayName,y=meanDist)) +
theme(legend.position = "none") +
geom_text(size=4,aes(label=meanDist, x=DayName, y=meanDist), colour="black") +
ggtitle("Median/Mean(line) Distance per Day")+
theme(plot.title = element_text(size = 10)) +
ylab("Distance") +
xlab("") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
b4 <- ggplot(distMean, aes(x=DayName, y=sumDist, fill = DayName)) +
geom_bar(stat="identity") +
theme(legend.position = "none") +
ggtitle("Total Distance per Day of the Week")+
theme(plot.title = element_text(size = 10)) +
ylab("Total Distance") +
xlab("") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
grid.arrange(b1, b2, b3, b4, nrow = 2)
day_mins <- dailyAct %>%
group_by(DayName) %>%
summarise(Very = mean(VeryActiveMinutes),
Fairly = mean(FairlyActiveMinutes),
Lightly = mean(LightlyActiveMinutes),
Sedentary = mean(SedentaryMinutes))
day_m <- melt(day_mins, id.vars = "DayName")
c1 <- ggplot(day_m, aes(x=DayName, y = value, fill=variable)) +
geom_bar(stat="summary", fun=sum, position = position_dodge()) +
labs(x="Day of Week", y="Minutes") +
ggtitle("Mean Minutes for Days of the Week") +
theme(plot.title = element_text(size = 9)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
# Excluding Sedentary
day_mins <- dailyAct %>%
group_by(DayName) %>%
summarise(Very = mean(VeryActiveMinutes),
Fairly = mean(FairlyActiveMinutes),
Lightly = mean(LightlyActiveMinutes))
day_m <- melt(day_mins, id.vars = "DayName")
c2 <- ggplot(day_m, aes(x=DayName, y = value, group=variable, fill=variable)) +
geom_line(stat="summary", fun=mean, aes(colour = variable)) +
ggtitle("Mean Mins for Days of Week (Excl Sedentary") +
geom_point(stat="summary", fun=mean, aes(colour = variable)) +
labs(x="Day of Week", y="") +
theme(plot.title = element_text(size = 9)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.7))
grid.arrange(c1, c2, nrow = 1)
act_sleepDay = read.csv("clean_data/act_sleepDay_inner.csv")
str(act_sleepDay)
## 'data.frame': 410 obs. of 17 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : chr "2016-04-12" "2016-04-13" "2016-04-15" "2016-04-16" ...
## $ TotalSteps : int 13162 10735 9762 12669 9705 15506 10544 9819 14371 10039 ...
## $ TotalDistance : num 8.5 6.97 6.28 8.16 6.48 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.14 2.71 3.19 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 1.26 0.41 0.78 ...
## $ LightActiveDistance : num 6.06 4.71 2.83 5.04 2.51 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 29 36 38 50 28 19 41 39 ...
## $ FairlyActiveMinutes : int 13 19 34 10 20 31 12 8 21 5 ...
## $ LightlyActiveMinutes : int 328 217 209 221 164 264 205 211 262 238 ...
## $ SedentaryMinutes : int 728 776 726 773 539 775 818 838 732 709 ...
## $ Calories : int 1985 1797 1745 1863 1728 2035 1786 1775 1949 1788 ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep : int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
dim(act_sleepDay)
## [1] 410 17
summary(subset(act_sleepDay, select=c("TotalTimeInBed","TotalMinutesAsleep")))
## TotalTimeInBed TotalMinutesAsleep
## Min. : 61.0 Min. : 58.0
## 1st Qu.:403.8 1st Qu.:361.0
## Median :463.0 Median :432.5
## Mean :458.5 Mean :419.2
## 3rd Qu.:526.0 3rd Qu.:490.0
## Max. :961.0 Max. :796.0
print(paste0("Are there any NAs = ",any(is.na(act_sleepDay))))
## [1] "Are there any NAs = FALSE"
print(paste0("There are " ,length(unique(act_sleepDay$ActivityDate)), " days of data"))
## [1] "There are 31 days of data"
print(paste0("Number of unique Ids: " ,length(unique(act_sleepDay$Id))))
## [1] "Number of unique Ids: 24"
# Change data format & create day name
act_sleepDay$ActivityDate <- as.POSIXct(act_sleepDay$ActivityDate, format="%Y-%m-%d")
act_sleepDay <- act_sleepDay[order(act_sleepDay$ActivityDate),]
act_sleepDay$DayName <- weekdays(act_sleepDay$ActivityDate)
# Change DayName to a factor
act_sleepDay$DayName <- factor(act_sleepDay$DayName,
levels = c("Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday","Sunday","Monday"))
# Time sleeping
d1 <- ggplot(act_sleepDay, aes(x=as.character(ActivityDate), y = TotalMinutesAsleep))+
geom_line(stat="identity", color = "orange") +
ggtitle("Total Minutes Sleeping") +
labs(x="Activity Date", y="Minutes") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=5))
# Time in bed
d2 <- ggplot(act_sleepDay, aes(x=as.character(ActivityDate), y = TotalTimeInBed))+
geom_line(stat="identity", color = "blue") +
ggtitle('Total Time In Bed') +
labs(x="Activity Date", y="") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=5))
grid.arrange(d1, d2, nrow = 1)
ggplot(act_sleepDay, aes(x=as.character(ActivityDate), y = TotalSleepRecords))+
geom_line(stat="identity", color = "blue") +
ggtitle('Total Sleep Records') +
labs(x="Activity Date", y="No. of Records") +
theme(plot.title = element_text(size = 11)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55),
axis.text=element_text(size=6))
act_avg <- act_sleepDay %>%
group_by(DayName) %>%
summarise(TotalTimeInBed = mean(TotalTimeInBed),
TotalMinutesAsleep = mean(TotalMinutesAsleep))
act_avg <- melt(act_avg, id.vars = "DayName", variable.name="Mins")
ggplot(act_avg, aes(x=DayName, y = value, fill=Mins)) +
geom_bar(stat="identity", position = position_dodge()) +
labs(x="Day of Week", y="Minutes") +
theme(plot.title = element_text(size = 12)) +
ggtitle("Day of the Week with Average Time in Bed & Total Sleep")
e1 <- ggplot(act_sleepDay, aes(x=DayName,y=TotalTimeInBed, fill=DayName)) +
geom_boxplot() +
labs(title="Total Time in Bed by Days", x="", y="Minutes") +
theme(plot.title = element_text(size = 11)) +
theme(legend.position="none")+
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))+
ylim(0, 1000)
e2 <- ggplot(act_sleepDay, aes(x=DayName,y=TotalMinutesAsleep, fill=DayName)) +
geom_boxplot() +
labs(title="Total Minutes Sleep by Day", x="", y="") +
theme(plot.title = element_text(size = 11)) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))+
ylim(0, 1000)
grid.arrange(e1, e2, nrow=1, ncol=2)
Lets look at days In bed > 600 minutes and < 240 minutes
# Lets look at days In bed > 600 minutes in bed
# Count how many time in bed > 600 Minutes (> 10 hours)
topTimeInBed_n <- filter(act_sleepDay, TotalTimeInBed > 600) %>%
group_by(DayName) %>%
summarise(countTimeInBed = n())
# Total time in bed & asleep > 600 Minutes (> 10 hours)
topTimeInBed <- filter(act_sleepDay, TotalTimeInBed > 600) %>%
group_by(DayName) %>%
summarise(TotalTimeInBed = sum(TotalTimeInBed),
TotalMinutesAsleep = sum(TotalMinutesAsleep))
topTimeInBed <- melt(topTimeInBed, id.vars = "DayName", variable.name="Record")
f1 <- ggplot(topTimeInBed_n, aes(x=DayName, y = countTimeInBed)) +
geom_bar(stat="identity", width=0.6,position = position_dodge(), fill="steelblue") +
labs(x="Day of Week", y="Minutes") +
ggtitle("Count of Time in Bed > 600 minutes")+
theme(plot.title = element_text(size = 10))
f2 <- ggplot(topTimeInBed, aes(x=DayName, y = value, fill=Record)) +
geom_bar(stat="identity", width=0.6,position = position_dodge()) +
labs(x="Day of Week", y="") +
ggtitle("Total Time in Bed > 600 minutes")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))
# Lets look at days In bed < 240 minutes
# Count how many time in bed < 240 Minutes (< 4 hours)
lessTimeInBed_n <- filter(act_sleepDay, TotalTimeInBed < 240) %>%
group_by(DayName) %>%
summarise(countTimeInBed = n())
# Total time in bed & asleep < 240 Minutes (< 4 hours)
lessTimeInBed <- filter(act_sleepDay, TotalTimeInBed < 240) %>%
group_by(DayName) %>%
summarise(TotalTimeInBed = sum(TotalTimeInBed),
TotalMinutesAsleep = sum(TotalMinutesAsleep))
lessTimeInBed <- melt(lessTimeInBed, id.vars = "DayName", variable.name="Record")
f3 <- ggplot(lessTimeInBed_n, aes(x=DayName, y = countTimeInBed)) +
geom_bar(stat="identity", width=0.6,position = position_dodge(), fill="steelblue") +
labs(x="Day of Week", y="Minutes") +
ggtitle("Count of Time in Bed < 240 minutes")+
theme(plot.title = element_text(size = 10))
f4 <- ggplot(lessTimeInBed, aes(x=DayName, y = value, fill=Record)) +
geom_bar(stat="identity", width=0.6,position = position_dodge()) +
labs(x="Day of Week", y="") +
ggtitle("Total Time in Bed < 240 minutes")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))
grid.arrange(f1, f2, f3, f4, nrow=2, ncol=2)
Number of sleep records > 1
# Number of Sleep Records > 1 for Day of the Week
sleepRecords <- filter(act_sleepDay, TotalSleepRecords > 1) %>%
group_by(DayName) %>%
summarise(TotalSleepRecords = sum(TotalSleepRecords))
ggplot(sleepRecords, aes(x=DayName, y = TotalSleepRecords)) +
geom_bar(stat="identity", width=0.6,position = position_dodge(), fill="steelblue") +
labs(x="Day of Week", y="Records") +
ggtitle("Sleep Records > 1 for Day of the Week")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))
# Remove zeros
ts <- dailyAct
ts[ts == 0] <- NA
g1 <- ggplot(ts, aes(x=DayName,y=VeryActiveDistance, fill=DayName)) +
geom_boxplot() +
labs(title="Ids with Top Very Active Distance (by Day)", x="") +
geom_quasirandom(method = "smiley", cex=2) +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
g2 <- ggplot(ts, aes(x=DayName,y=VeryActiveMinutes, fill=DayName)) +
geom_boxplot() +
labs(title="Ids with Top Very Active Minutes (by Day)", x="") +
geom_quasirandom(method = "smiley", cex=2) +
theme(legend.position="none") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
grid.arrange(g1, g2,nrow=1, ncol=2)
g3 <- ggplot(ts, aes(x=DayName,y=Calories, fill=DayName)) +
geom_boxplot() +
labs(title="Ids and Calories Burned (by Day)", x="") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
g4 <- ggplot(ts, aes(x=DayName,y=TotalDistance, fill=DayName)) +
geom_boxplot() +
labs(title='Ids with Top Total Distance (by Day)', x="") +
geom_quasirandom(method = "smiley", cex=2) +
theme(legend.position="none") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 30, vjust=+0.55))
grid.arrange(g3, g4,nrow=1, ncol=2)
## [1] "Top Ids with very active distance"
## Id VeryActiveDistance DayName
## 51 1624580081 21.92 Sunday
## 928 8877689391 21.66 Saturday
## 795 8053475328 13.40 Sunday
## [1] "Top Ids with very active minutes"
## Id VeryActiveMinutes DayName
## 580 5577150313 210 Sunday
## 586 5577150313 207 Saturday
## 573 5577150313 200 Sunday
## 587 5577150313 194 Sunday
## 51 1624580081 186 Sunday
## [1] "Ids with the highest total Distance"
## Id TotalDistance DayName
## 51 1624580081 28.03 Sunday
## 928 8877689391 26.72 Saturday
## 914 8877689391 25.29 Saturday
## 925 8877689391 20.65 Wednesday
## [1] "Ids with the highest calories burned"
## Id Calories DayName
## 607 6117666160 4900 Thursday
## 573 5577150313 4552 Sunday
## 914 8877689391 4547 Saturday
## 587 5577150313 4546 Sunday
(most active Participants (Ids))
# To allow us to view the correlation of dayName we can add dayName as a number
dailyAct$DayOfWeek <- as.integer(factor(dailyAct$DayName,
levels=c("Tuesday","Wednesday","Thursday",
"Friday","Saturday","Sunday",
"Monday"),ordered=TRUE))
num.cols <- sapply(dailyAct, is.numeric) #returns numeric columns
# filter numeric columns only for correlation
numeric_dailyAct <- dailyAct[, num.cols]
# Visualise the correlations
ggcorrplot(cor(numeric_dailyAct), lab = TRUE, lab_size = 2.3, legend.title = "Correlation",
tl.cex = 8) +
labs(title='Activities Correlation')
# Visualise calories correlation
cor.data <- cor(dailyAct[, num.cols]) # correlations
cal.corr <- data.frame(var = colnames(cor.data),
calories = cor.data[,13]) # capture the 13th column only
cal.corr <- cal.corr[order(cal.corr$calories, decreasing = FALSE),]
# geom_bar plots in alphabetical order. Factorise to current order.
cal.corr$var <- factor(cal.corr$var,
levels = cal.corr$var)
cal.corr <- head(cal.corr,-1) # remove last row (calories)
ggplot(cal.corr, aes(x=var, y = calories)) +
geom_bar(stat="summary", fun=sum, width=0.6, fill="steelblue") +
labs(y="Correlation") +
ggtitle("Activities Correlation with Calories") +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55))
# Scatter plots
h1 = ggplot(ts, aes(x = TotalDistance, y = Calories)) +
geom_point(aes(color = factor(DayName))) +
labs(color = "DayName") +
theme(plot.title = element_text(size = 10)) +
ggtitle("Distance Correlation vs Calories")
h2 = ggplot(ts, aes(x = VeryActiveMinutes, y = Calories)) +
geom_point(aes(color = factor(DayName))) +
labs(color = "DayName") +
theme(plot.title = element_text(size = 10)) +
ggtitle("VeryActiveMinutes vs Calories")
grid.arrange(h1, h2, nrow=1, ncol=2)
Note, sleep data has less data points (24 participants)
# add DayOfWeek - to view correlation with sleep factors
# to allow us to view the correlation of dayName we can add dayName as a factor
act_sleepDay$DayOfWeek <- as.integer(factor(act_sleepDay$DayName,
levels=c("Tuesday","Wednesday","Thursday", "Friday",
"Saturday","Sunday","Monday"),ordered=TRUE))
num.cols_2 <- sapply(act_sleepDay, is.numeric) # returns numeric columns
# filter numeric columns only for correlation
numeric_sleep <- act_sleepDay[, num.cols_2]
# Visualise the correlations
ggcorrplot(cor(numeric_sleep), lab = TRUE, lab_size = 2.3, legend.title = "Correlation",
tl.cex = 8) +
labs(title='Sleep Correlation')
# Visualise calories correlation
cor.sleep <- cor(numeric_sleep) # correlations
cal.cor_df <- data.frame(var = colnames(cor.sleep),
calories = cor.sleep[,13],
TotalMinutesAsleep = cor.sleep[,15]) # capture the 13th column
# Order to Calories
cal.cor_sleep <- cal.cor_df[order(cal.cor_df$calories, decreasing = FALSE),]
# geom_bar plots in alphabetical order. Factorize to current order.
cal.cor_sleep$var <- factor(cal.cor_sleep$var,
levels = cal.cor_sleep$var)
# remove last row (calories)
cal.cor_sleep <- head(cal.cor_sleep,-1)
# Visualise Sleep Correlation
# Order to Sleep
tot.cor_sleep <- cal.cor_df[order(cal.cor_df$TotalMinutesAsleep, decreasing = FALSE),]
# geom_bar plots in alphabetical order. Factorize to current order.
tot.cor_sleep$var <- factor(tot.cor_sleep$var,
levels = tot.cor_sleep$var)
# remove last row (Sleep)
tot.cor_sleep <- head(tot.cor_sleep,-1)
i1 <- ggplot(cal.cor_sleep, aes(x=var, y = calories)) +
geom_bar(stat="summary", fun=sum, width=0.6, fill="steelblue") +
labs(y="Correlation", x ="") +
ggtitle("Correlation with Calories") +
theme(plot.title = element_text(size = 10)) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))
i2 <- ggplot(tot.cor_sleep, aes(x=var, y = TotalMinutesAsleep)) +
geom_bar(stat="summary", fun=sum, width=0.6, fill="steelblue") +
labs(y="Correlation", x="") +
ggtitle("Correlation with Total Minutes Asleep") +
theme(plot.title = element_text(size = 10)) +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55))
grid.arrange(i1, i2, nrow=2, ncol=1)
Sleep data has less data points (24 participants) than the daily activity data set
There’s a little correlation (-0.25) between levels of activity and time spent sleeping but it’s not significant. So users that stayed in bed less are not particularly more active during the day. Although there is no significant correlation between time spent in bed and calories burned throughout the day, it is higher than minutes spent sleeping and calories burned.
dat_records_2 <- filter(act_sleepDay, TotalSleepRecords > 1) %>%
group_by(Id) %>%
summarise(TotalSleepRecords = sum(TotalSleepRecords))
bedTime <- act_sleepDay %>% group_by(Id) %>%
summarise(TotalTimeInBed = median(TotalTimeInBed))
sleepTime <- act_sleepDay %>% group_by(Id) %>%
summarise(TotalMinutesAsleep = median(TotalMinutesAsleep))
topsleepers <- act_sleepDay %>% group_by(Id) %>%
summarise(TotalTimeInBed = sum(TotalTimeInBed),
TotalMinutesAsleep = sum(TotalMinutesAsleep))
topsleepers <- melt(topsleepers, id.vars = "Id", variable.name="var")
ggplot(dat_records_2, aes(x=reorder(factor(Id), -TotalSleepRecords), y = TotalSleepRecords)) +
geom_bar(fill="steelblue",stat="identity", width=0.6,position = position_dodge()) +
labs(x="", y="Sleep Records") +
ggtitle("Ids with Total Sleep Records > 1") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55),
axis.text=element_text(size=6))
ggplot(bedTime, aes(x=reorder(factor(Id), -TotalTimeInBed), y = TotalTimeInBed)) +
geom_bar(stat="identity", width=0.6,position = position_dodge(), fill="steelblue") +
labs(x="", y="Minutes") +
ggtitle("Top Ids for Median Total Time in Bed") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55),
axis.text=element_text(size=6))
ggplot(sleepTime, aes(x=reorder(factor(Id), -TotalMinutesAsleep), y = TotalMinutesAsleep)) +
geom_bar(stat="identity", width=0.6,position = position_dodge(), fill="steelblue") +
labs(x="", y="Minutes") +
ggtitle("Top Ids for Median Total Minutes Sleeping") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55),
axis.text=element_text(size=6))
ggplot(topsleepers, aes(reorder(x=factor(Id), -value), y = value, fill=var)) +
geom_bar(stat="identity", width=0.6,position = position_dodge()) +
labs(x="", y="") +
ggtitle("Ids with Most Time in Bed") +
theme(plot.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55),
axis.text=element_text(size=6))
# Very Active Minutes
dailyAct_order1 <- dailyAct %>% group_by(Id) %>%
summarise(VeryActiveMinutes = sum(VeryActiveMinutes))
# Very Active Distance
dailyAct_order2 <- dailyAct %>% group_by(Id) %>%
summarise(VeryActiveDistance = sum(VeryActiveDistance))
# Total Distance
dailyAct_order3 <- dailyAct %>% group_by(Id) %>%
summarise(TotalDistance = sum(TotalDistance))
# Total Steps
dailyAct_order4 <- dailyAct %>% group_by(Id) %>%
summarise(TotalSteps = sum(TotalSteps))
ggplot(dailyAct_order1, aes(reorder(x=factor(Id), -VeryActiveMinutes), y = VeryActiveMinutes))+
geom_bar(stat="identity", width=0.6, fill="steelblue") +
labs(y="Minutes", x="") +
ggtitle("Top Ids with Very Active Minutes") +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55))
ggplot(dailyAct_order2, aes(reorder(x=factor(Id), -VeryActiveDistance), y = VeryActiveDistance)) +
geom_bar(stat="identity", width=0.6, fill="steelblue") +
labs(y="Distance", x="") +
ggtitle("Top Ids with Very Active Distance") +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55))
ggplot(dailyAct_order3, aes(reorder(x=factor(Id), -TotalDistance), y = TotalDistance)) +
geom_bar(stat="identity", width=0.6, fill="steelblue") +
labs(y="Total Distance", x="Id") +
ggtitle("Top Ids - Total Distance") +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55))
ggplot(dailyAct_order4, aes(reorder(x=factor(Id), -TotalSteps), y = TotalSteps)) +
geom_bar(stat="identity", width=0.6, fill="steelblue") +
labs(y="Total Steps", x="Id") +
ggtitle("Top Ids - Total Steps") +
theme(legend.position="none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55))
# Import & describe the data
hours = read.csv("clean_data/hr_merge.csv")
hours <- subset(hours, select = -X)
str(hours)
## 'data.frame': 22099 obs. of 6 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour : chr "2016-04-12 00:00:00" "2016-04-12 01:00:00" "2016-04-12 02:00:00" "2016-04-12 03:00:00" ...
## $ Calories : int 81 61 59 47 48 48 48 47 68 141 ...
## $ TotalIntensity : int 20 8 7 0 0 0 0 0 13 30 ...
## $ AverageIntensity: num 0.333 0.133 0.117 0 0 ...
## $ StepTotal : int 373 160 151 0 0 0 0 0 250 1864 ...
dim(hours)
## [1] 22099 6
summary(hours)
## Id ActivityHour Calories TotalIntensity
## Min. :1.504e+09 Length:22099 Min. : 42.00 Min. : 0.00
## 1st Qu.:2.320e+09 Class :character 1st Qu.: 63.00 1st Qu.: 0.00
## Median :4.445e+09 Mode :character Median : 83.00 Median : 3.00
## Mean :4.848e+09 Mean : 97.39 Mean : 12.04
## 3rd Qu.:6.962e+09 3rd Qu.:108.00 3rd Qu.: 16.00
## Max. :8.878e+09 Max. :948.00 Max. :180.00
## AverageIntensity StepTotal
## Min. :0.0000 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.: 0.0
## Median :0.0500 Median : 40.0
## Mean :0.2006 Mean : 320.2
## 3rd Qu.:0.2667 3rd Qu.: 357.0
## Max. :3.0000 Max. :10554.0
print(paste0("Are there any NAs = ",any(is.na(hours))))
## [1] "Are there any NAs = FALSE"
print(paste0("Number of unique Ids: " ,length(unique(hours$Id))))
## [1] "Number of unique Ids: 33"
# format date
hours$ActivityHour <- as.POSIXct(hours$ActivityHour, format= "%Y-%m-%d %H:%M:%S")
hours <- hours[order(hours$ActivityHour),]
# Extract hour from date/time
hours$Hour <- as.numeric(strftime(hours$ActivityHour, format="%H"))
chart <- hours %>%
select(ActivityHour, Calories, TotalIntensity) %>%
gather(key = "variable", value = "value", -ActivityHour)
ggplot(chart, aes(x = ActivityHour, y = value)) +
geom_line(aes(color = variable, linetype = variable)) +
labs(y="") +
ggtitle("Hourly Intensity & Calories") +
scale_color_manual(values = c("darkorange", "steelblue"))
# Extract numeric variables for correlation
hours_num <- hours[,sapply(hours, is.numeric)]
# Visualise the correlations
ggcorrplot(cor(hours_num), lab = TRUE) +
labs(title='Correlations') +
scale_fill_gradient2(limit = c(0,1), low = "blue", high = "yellow", mid = "green", midpoint = 0.5)
# Visualise calories correlation
cor_hours <- cor(hours_num)
hrs_df <- data.frame(var = colnames(cor_hours),
calories = cor_hours[,2])
hrs_df <- hrs_df[order(hrs_df$calories, decreasing = FALSE),]
# geom_bar plots in alphabetical order. Factorise to current order.
hrs_df$var <- factor(hrs_df$var,
levels = hrs_df$var)
hrs_df <- head(hrs_df,-1) # remove last row (calories)
ggplot(hrs_df, aes(x=var, y = calories)) +
geom_bar(stat="summary", fun=sum, width=0.6,fill="steelblue") +
labs(y="Correlation", x="") +
ggtitle("Correlation with Calories") +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55)) +
ylim(0, 1)
# Group "Hours" into parts of the day
hours$PartOfDay <- with(hours,
ifelse((Hour >= 5 & Hour < 12), "morning",
ifelse((Hour >= 12 & Hour < 17), "afternoon",
ifelse((Hour >=17 & Hour < 21), "evening",
"night"))))
j1 <- ggplot(hours, aes(x=PartOfDay,y=TotalIntensity, fill=PartOfDay)) +
geom_boxplot() +
labs(title="Part of the Day & Total Intensity", x="") +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10))
j2 <- ggplot(hours, aes(x=PartOfDay,y=Calories, fill=PartOfDay)) +
geom_boxplot() +
labs(title="Part of the Day & Calories", x="") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 10))
grid.arrange(j1, j2, nrow=1, ncol=2)
# Scatter plot - intensity v calories
k1 <- ggplot(hours, aes(x = TotalIntensity, y = Calories)) +
geom_point(aes(color = factor(PartOfDay))) +
labs(color = "Part Of Day") +
ggtitle("Part of the Day & Total Intensity") +
theme(plot.title = element_text(size = 10))
# Boxplot - hours of day v calories
k2 <- ggplot(hours, aes(x = TotalIntensity, y = Calories)) +
geom_point(alpha = 0.4,aes(color = Hour)) +
labs(color = "Hour", y="") +
ggtitle("Hour of the Day & Total Intensity") +
theme(plot.title = element_text(size = 10))
grid.arrange(k1, k2, nrow=1, ncol=2)
ggplot(hours, aes(x=factor(Hour),y=TotalIntensity, fill=factor(Hour))) +
geom_boxplot() +
labs(title="Hours of the Day & Total Intensity", x="Hour") +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10))
Metabolic Equivalents
1 x MET = Energy you use when resting
4 x METs - exerting 4 times the energy than you would if you were sitting still
To get accurate MET values, divide by 10.
# Import the data & Describe
mins = read.csv("clean_data/min_merge.csv")
# To get accurate MET values, divide by 10.
mins$METs <- mins$METs / 10
# Group "Hours" into parts of the day
mins$PartOfDay <- with(mins,
ifelse((Hour >= 5 & Hour < 12), "morning",
ifelse((Hour >= 12 & Hour < 17), "afternoon",
ifelse((Hour >=17 & Hour < 21), "evening",
"night"))))
str(mins)
## 'data.frame': 1325580 obs. of 11 variables:
## $ ActivityMinute: chr "2016-04-12 00:00:00" "2016-04-12 00:01:00" "2016-04-12 00:02:00" "2016-04-12 00:03:00" ...
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ Calories : num 0.786 0.786 0.786 0.786 0.786 ...
## $ Intensity : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Steps : int 0 0 0 0 0 0 0 0 0 0 ...
## $ METs : num 1 1 1 1 1 1.2 1.2 1.2 1.2 1.2 ...
## $ Month : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Day : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Hour : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Min : int 0 1 2 3 4 5 6 7 8 9 ...
## $ PartOfDay : chr "night" "night" "night" "night" ...
dim(mins)
## [1] 1325580 11
summary(mins)
## ActivityMinute Id Calories Intensity
## Length:1325580 Min. :1.504e+09 Min. : 0.0000 Min. :0.0000
## Class :character 1st Qu.:2.320e+09 1st Qu.: 0.9357 1st Qu.:0.0000
## Mode :character Median :4.445e+09 Median : 1.2176 Median :0.0000
## Mean :4.848e+09 Mean : 1.6231 Mean :0.2006
## 3rd Qu.:6.962e+09 3rd Qu.: 1.4327 3rd Qu.:0.0000
## Max. :8.878e+09 Max. :19.7499 Max. :3.0000
## Steps METs Month Day
## Min. : 0.000 Min. : 0.000 Min. :4.000 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:4.000 1st Qu.: 9.00
## Median : 0.000 Median : 1.000 Median :4.000 Median :16.00
## Mean : 5.336 Mean : 1.469 Mean :4.339 Mean :15.85
## 3rd Qu.: 0.000 3rd Qu.: 1.100 3rd Qu.:5.000 3rd Qu.:23.00
## Max. :220.000 Max. :15.700 Max. :5.000 Max. :30.00
## Hour Min PartOfDay
## Min. : 0.00 Min. : 0.00 Length:1325580
## 1st Qu.: 5.00 1st Qu.:14.75 Class :character
## Median :11.00 Median :29.50 Mode :character
## Mean :11.42 Mean :29.50
## 3rd Qu.:17.00 3rd Qu.:44.25
## Max. :23.00 Max. :59.00
print(paste0("Are there any NAs: " ,any(is.na(mins))))
## [1] "Are there any NAs: FALSE"
print(paste0("Number of unique Ids: " ,length(unique(mins$Id))))
## [1] "Number of unique Ids: 33"
# change to date format
mins$ActivityMinute <- as.POSIXct(mins$ActivityMinute, format="%Y-%m-%d")
# order the date
mins <-mins[order(mins$ActivityMinute),]
# Create a DayName column from the datetime
mins$DayName <- weekdays(mins$ActivityMinute)
# to allow us to view the correlation of dayName we can add dayName as a number
mins$DayOfWeek <- as.integer(factor(mins$DayName,levels=c("Tuesday","Wednesday","Thursday",
"Friday","Saturday","Sunday",
"Monday"),ordered=TRUE))
# METs Distribution
ggplot(mins, aes(x=METs))+
geom_histogram(color="darkblue", fill="lightblue", bins=5) +
labs(title="METs Distribution", x="METs", y="")
# Calculate the mean & plot part of day & hour for METs
meanMET <- mins %>% group_by(Min, Hour) %>%
summarise(METs = mean(METs))
meanMETs <- mins %>% group_by(PartOfDay) %>%
summarise(METs = mean(METs))
l1 <- ggplot(mins, aes(x=factor(PartOfDay),y=METs, fill=factor(PartOfDay))) +
geom_boxplot() +
labs(title="PartOfDay & Intensity/minute", x="Part of Day") +
theme(legend.position="none")+
theme(plot.title = element_text(size = 10))
l2 <- ggplot(meanMETs, aes(x=PartOfDay, y = METs)) +
geom_bar(stat="identity", fill="steelblue") +
#geom_bar(stat="summary", fun=sum, fill="blue") +
ggtitle("Part Of Day & mean METs per Minute") +
labs(x="Part of the Day", y="METs(mean)") +
theme(legend.position = "none") +
theme(plot.title = element_text(size = 10))
grid.arrange(l1, l2, nrow=1, ncol=2)
ggplot(meanMET, aes(x=factor(Hour),y=METs, fill=factor(Hour))) +
geom_boxplot() +
labs(title="Mean METs for Each Hour of the Day", x="Hour", y="Variables") +
theme(legend.position="none")
# Minutes Correlation
mins_num <- mins[,sapply(mins, is.numeric)]
ggcorrplot(cor(mins_num), lab = TRUE) +
labs(title='Variable Correlation') +
theme(plot.title = element_text(size = 10)) +
scale_fill_gradient2(limit = c(0,1), low = "steelblue", high = "yellow", mid = "green", midpoint = 0.5)
# Correlaton with METs
cor_mins <- cor(mins_num)
mins_df <- data.frame(var = colnames(cor_mins),
METs = cor_mins[,5])
mins_df <- mins_df[order(mins_df$METs, decreasing = FALSE),]
# geom_bar plots in alphabetical order. Factorise to current order.
mins_df$var <- factor(mins_df$var,
levels = mins_df$var)
mins_df <- head(mins_df,-1) # remove last row (METs)
ggplot(mins_df, aes(x=var, y = METs)) +
geom_bar(stat="summary", fun=sum, width=0.6,fill="steelblue") +
labs(y="Correlation") +
ggtitle("Correlation with METs") +
theme(legend.position="none") +
theme(axis.text.x = element_text(angle = 45, vjust=+0.55)) +
ylim(0, 1)
# User & METs
meanIds <- mins %>% group_by(Id) %>%
summarise(METs = mean(METs))
# order largest to smallest
meanIds <- meanIds[order(meanIds$METs, decreasing = TRUE),]
# top ten
meanIds <- head(meanIds, 10)
m1 <- ggplot(meanIds, aes(x=reorder(Id, -METs), y = METs, fill=factor(Id))) +
geom_bar(stat="identity") +
ggtitle("Ids with top mean METs per Minute") +
labs(x="", y="METs(mean)") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
sortMETs <- mins[order(mins$METs, decreasing = TRUE),]
sortMETs <- head(sortMETs,10)
# ggplot does not normally work well with duplicates because it cannot plot duplicate values
# along the x-axis without deploying some hack.
# To make duplicate factor levels unique in an R data frame, we can use make.unique function.
sortMETs <- within(sortMETs,Id<- ave(as.character(Id),FUN=make.unique))
m2 <- ggplot(sortMETs, aes(x=reorder(Id, -METs), y = METs, fill=DayName)) +
geom_bar(stat="identity") +
ggtitle("Ids with top METs per Minute by Day of Week") +
labs(x="", y="METs") +
#theme(legend.position = "none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
m3 <- ggplot(sortMETs, aes(x=reorder(Id, -METs), y = METs, fill=PartOfDay)) +
geom_bar(stat="identity") +
ggtitle("Ids with top METs per Minute by Day of Week") +
labs(x="", y="METs") +
#theme(legend.position = "none") +
theme(axis.text.x = element_text(size = 7,angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
m4 <-ggplot(sortMETs, aes(x=reorder(Id, -METs), y = METs, fill=factor(Hour))) +
geom_bar(stat="identity") +
ggtitle("Ids with top METs per Minute by Hour of Day") +
labs(x="", y="METs") +
#theme(legend.position = "none") +
theme(axis.text.x = element_text(size = 7, angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
grid.arrange(m1, m2, nrow=1, ncol=2)
grid.arrange(m3, m4, nrow=1, ncol=2)
If you compare previous top Ids for very active minutes, distance and steps, and top Ids for METs, you will see similar Ids appearing, so the correlation between these factors are strong
Highest mean METs per minute was on a Saturday in the morning at 7am. Users with the highest METs occur on a Saturday, Sunday and a Thursday. And this usually occurs in the morning and afternoon, around 7am to 8am and 4pm
# import the data
sleepHR = read.csv("clean_data/min_sleep_HR_inner.csv")
sleepHR <- subset(sleepHR, select = -X)
str(sleepHR)
## 'data.frame': 76663 obs. of 12 variables:
## $ ActivityMinute: chr "2016-04-12 22:05:00" "2016-04-12 22:06:00" "2016-04-12 22:07:00" "2016-04-12 22:08:00" ...
## $ Id : num 2.35e+09 2.35e+09 2.35e+09 2.35e+09 2.35e+09 ...
## $ Calories : num 0.941 1.035 0.941 0.941 0.941 ...
## $ Intensity : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Steps : int 0 0 0 0 0 0 0 0 0 0 ...
## $ METs : int 10 11 10 10 10 10 10 10 10 10 ...
## $ Month : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Day : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Hour : int 22 22 22 22 22 22 22 22 22 22 ...
## $ Min : int 5 6 7 8 9 10 11 12 13 14 ...
## $ State : int 3 3 2 1 1 2 1 1 1 1 ...
## $ meanHR : num 76.7 76.7 80.2 80.4 79.7 ...
dim(sleepHR)
## [1] 76663 12
summary(sleepHR)
## ActivityMinute Id Calories Intensity
## Length:76663 Min. :2.347e+09 Min. : 0.8939 Min. :0.00000
## Class :character 1st Qu.:4.388e+09 1st Qu.: 0.9160 1st Qu.:0.00000
## Mode :character Median :5.577e+09 Median : 1.0393 Median :0.00000
## Mean :5.673e+09 Mean : 1.1124 Mean :0.01276
## 3rd Qu.:6.962e+09 3rd Qu.: 1.2637 3rd Qu.:0.00000
## Max. :8.792e+09 Max. :12.3786 Max. :3.00000
## Steps METs Month Day
## Min. : 0.0000 Min. :10.00 Min. :4.000 Min. : 1.00
## 1st Qu.: 0.0000 1st Qu.:10.00 1st Qu.:4.000 1st Qu.: 9.00
## Median : 0.0000 Median :10.00 Median :4.000 Median :16.00
## Mean : 0.1269 Mean :10.33 Mean :4.328 Mean :16.07
## 3rd Qu.: 0.0000 3rd Qu.:10.00 3rd Qu.:5.000 3rd Qu.:23.00
## Max. :54.0000 Max. :90.00 Max. :5.000 Max. :30.00
## Hour Min State meanHR
## Min. : 0.00 Min. : 0.00 Min. :1.000 Min. : 39.40
## 1st Qu.: 2.00 1st Qu.:15.00 1st Qu.:1.000 1st Qu.: 55.80
## Median : 4.00 Median :30.00 Median :1.000 Median : 60.83
## Mean : 6.15 Mean :29.52 Mean :1.071 Mean : 61.15
## 3rd Qu.: 6.00 3rd Qu.:44.00 3rd Qu.:1.000 3rd Qu.: 65.50
## Max. :23.00 Max. :59.00 Max. :3.000 Max. :138.00
print(paste0("Are there any NAs: " ,any(is.na(sleepHR))))
## [1] "Are there any NAs: FALSE"
print(paste0("Number of unique Ids: " ,length(unique(sleepHR$Id))))
## [1] "Number of unique Ids: 11"
# change to date format
sleepHR$ActivityMinute <- as.POSIXct(sleepHR$ActivityMinute, format="%Y-%m-%d")
# order the date
sleepHR <-sleepHR[order(sleepHR$ActivityMinute),]
# Create a DayName column from the datetime
sleepHR$DayName <- weekdays(sleepHR$ActivityMinute)
# to allow us to view the correlation of dayName we can add dayName as a number
sleepHR$DayOfWeek <- as.integer(factor(sleepHR$DayName,
levels=c("Tuesday","Wednesday",
"Thursday",
"Friday","Saturday","Sunday",
"Monday"),ordered=TRUE))
# To get accurate MET values, divide by 10.
sleepHR$METs <- sleepHR$METs / 10
# Group "Hours" into parts of the day
sleepHR$PartOfDay <- with(sleepHR,
ifelse((Hour >= 5 & Hour < 12), "morning",
ifelse((Hour >= 12 & Hour < 17), "afternoon",
ifelse((Hour >=17 & Hour < 21), "evening",
"night"))))
# Rename "State" to "SleepLevel"
sleepHR <- sleepHR %>%
rename(SleepLevel = State)
print(paste0("Intensity Levels: " ,unique(sleepHR$Intensity)))
## [1] "Intensity Levels: 0" "Intensity Levels: 1" "Intensity Levels: 2"
## [4] "Intensity Levels: 3"
print(paste0("Sleep Levels: " ,unique(sleepHR$SleepLevel)))
## [1] "Sleep Levels: 3" "Sleep Levels: 2" "Sleep Levels: 1"
Note:
Intensity = 0:Sedentary, 1:Light, 2:Moderate, 3:Very Active
Sleep Level = 1:asleep, 2:restless, 3:awake
# plot HR distribution
n1 <- ggplot(sleepHR)+
geom_histogram(aes(x=meanHR), color="darkblue", fill="lightblue", bins=15) +
labs(title="Average HR Distribution (per minute)", x="meanHR") +
theme(plot.title = element_text(size = 10))
# plot Sleep Level distribution
n2 <- ggplot(sleepHR)+
geom_histogram(aes(x=SleepLevel), color="darkblue", fill="lightblue", bins=10) +
labs(title="SleepLevel Distribution (per minute)", x="SleepLevel") +
theme(plot.title = element_text(size = 10))
grid.arrange(n1, n2, nrow=1, ncol=2)
sleepHR_num <- sleepHR[,sapply(sleepHR, is.numeric)]
ggcorrplot(cor(sleepHR_num), lab = TRUE, tl.cex = 7) +
labs(title='Sleep Correlation') +
scale_fill_gradient2(limit = c(0,1), low = "steelblue", high = "yellow", mid = "green", midpoint = 0.5)
sleep_corr <- cor(sleepHR_num)
# Correlation with METs
sleep_cor_df <- data.frame(var = colnames(sleep_corr), METs = sleep_corr[,5])
sleep_cor_df <- sleep_cor_df[order(sleep_cor_df$METs, decreasing=FALSE),]
# geom_bar plots in alphabetical order so we factorise to current order
sleep_cor_df$var <- factor(sleep_cor_df$var, levels = sleep_cor_df$var)
sleep_cor_df <- head(sleep_cor_df,-1)
# Correlation with Sleep Level
sleep_cor_df2 <- data.frame(var = colnames(sleep_corr), SleepLevel = sleep_corr[,10])
sleep_cor_df2 <- sleep_cor_df2[order(sleep_cor_df2$SleepLevel, decreasing=FALSE),]
# geom_bar plots in alphabetical order so we factorise to current order
sleep_cor_df2$var <- factor(sleep_cor_df2$var, levels = sleep_cor_df2$var)
sleep_cor_df2 <- head(sleep_cor_df2,-1)
# Correlation with Heart Rate
sleep_cor_df3 <- data.frame(var = colnames(sleep_corr), meanHR = sleep_corr[,11])
sleep_cor_df3 <- sleep_cor_df3[order(sleep_cor_df3$meanHR, decreasing=FALSE),]
# geom_bar plots in alphabetical order so we factorise to current order
sleep_cor_df3$var <- factor(sleep_cor_df3$var, levels = sleep_cor_df3$var)
sleep_cor_df3 <- head(sleep_cor_df3,-1)
O1 = ggplot(sleep_cor_df, aes(x=var, y = METs)) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="correlation", x="") +
ggtitle("Variable Correlation with METs") +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
O2 <- ggplot(sleep_cor_df2, aes(x=var, y = SleepLevel)) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="", x="") +
ggtitle("Variable Correlation with Sleep Level") +
ylim(-0.1,1) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
O3 <- ggplot(sleep_cor_df3, aes(x=var, y = meanHR)) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="correlation", x="") +
ggtitle("Variable Correlation with Heart Rate") +
ylim(-0.1,1) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
O4 <- ggplot(sleepHR, aes(x = Intensity, y = METs)) +
geom_point(alpha = 0.4,aes()) +
labs(y="METs") +
ggtitle("Intensity Level & METs Correlation") +
theme(plot.title = element_text(size = 10))
grid.arrange(O1, O2, nrow=1, ncol=2)
grid.arrange(O3, O4, nrow=1, ncol=2)
p1 <- ggplot(sleepHR, aes(x=factor(DayName),y=meanHR, fill=factor(DayName))) +
geom_boxplot() +
labs(title="Average Heart Rate by Day of the Week", x="Day of Week") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 10))
p2 <- ggplot(sleepHR, aes(x=factor(DayName),y=meanHR, fill=factor(PartOfDay))) +
geom_boxplot() +
labs(title="Average Heart Rate by PartOfDay", x="Day of Week") +
theme(plot.title = element_text(size = 10))
p3 <- ggplot(sleepHR, aes(x = Intensity, y = meanHR)) +
geom_point(alpha = 0.4,aes()) +
labs(y="Mean Heart Rate") +
ggtitle("Correlation for Average Heart Rate and Intensity") +
geom_smooth(method='lm', formula= y~x) +
theme(plot.title = element_text(size = 10))
grid.arrange(p1, nrow=1, ncol=1)
grid.arrange(p2, nrow=1, ncol=1)
grid.arrange(p3, nrow=1, ncol=1)
Note:
Intensity = 0:Sedentary, 1:Light, 2:Moderate, 3:Very Active
Sleep Level = 1:asleep, 2:restless, 3:awake
# Which day has the largest restless percentage of sleep
dayList <- unique(sleepHR$DayName)
# dayList <- "Tuesday"
ls <- list()
for (day in dayList) {
x <- sleepHR[sleepHR$DayName == day,]
y <- x[x$SleepLevel == 2,]
z <- nrow(y)/nrow(x)
ls <- append(ls,z)
}
ls <- matrix(unlist(ls))
ls_df <- data.frame(dayList)
ls_df$Level2 <- ls
ls_df <- ls_df[order(ls_df$Level2, decreasing=FALSE),]
ls_df$dayList <- factor(ls_df$dayList, levels = ls_df$dayList)
q1 <- ggplot(ls_df, aes(x=dayList, y = (Level2*100))) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="Percentage Level 2", , x="Day") +
ggtitle("Percentage Restless for Day of the Week") +
ylim(0,10) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
# Which day has the largest awake level percentage of sleep periods
ls2 <- list()
for (day in dayList) {
x <- sleepHR[sleepHR$DayName == day,]
y <- x[x$SleepLevel == 3,]
z <- nrow(y)/nrow(x)
ls2 <- append(ls2,z)
}
ls2 <- matrix(unlist(ls2))
ls2_df <- data.frame(dayList)
ls2_df$Level <- ls2
ls2_df <- ls2_df[order(ls2_df$Level, decreasing=FALSE),]
ls2_df$dayList <- factor(ls2_df$dayList, levels = ls2_df$dayList)
q2 <- ggplot(ls2_df, aes(x=dayList, y = (Level*100))) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="Percentage Level 3", , x="Day") +
ggtitle("Percentage Awake for Day of the Week") +
ylim(0,5) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
# Which hour of the day has the largest restless percentage of sleep
hourList <- unique(sleepHR$Hour)
# dayList <- "Tuesday"
ls3 <- list()
for (hour in hourList) {
x <- sleepHR[sleepHR$Hour == hour,]
y <- x[x$SleepLevel == 2,]
z <- nrow(y)/nrow(x)
ls3 <- append(ls3,z)
}
ls3 <- matrix(unlist(ls3))
ls3_df <- data.frame(hourList)
ls3_df$Level <- ls3
ls3_df <- ls3_df[order(ls3_df$Level, decreasing=FALSE),]
ls3_df$hourList <- factor(ls3_df$hourList, levels = ls3_df$hourList)
q3 <- ggplot(ls3_df, aes(x=hourList, y = (Level*100))) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="Percentage Level 2", x="Hour") +
ggtitle("Percentage Restless for Hour of Day") +
ylim(0,20) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
# Which hour of the day has the largest awake percentage of sleep
ls4 <- list()
for (hour in hourList) {
x <- sleepHR[sleepHR$Hour == hour,]
y <- x[x$SleepLevel == 3,]
z <- nrow(y)/nrow(x)
ls4 <- append(ls4,z)
}
ls4 <- matrix(unlist(ls4))
ls4_df <- data.frame(hourList)
ls4_df$Level <- ls4
ls4_df <- ls4_df[order(ls4_df$Level, decreasing=FALSE),]
ls4_df$hourList <- factor(ls4_df$hourList, levels = ls4_df$hourList)
q4 <- ggplot(ls4_df, aes(x=hourList, y = (Level*100))) +
geom_bar(stat="identity", fill="steelblue") +
labs(y="Percentage Level 3", x="Hour") +
ggtitle("Percentage Awake for Hour of Day") +
ylim(0,10) +
theme(axis.text.x = element_text(angle = 35, vjust=+0.55)) +
theme(plot.title = element_text(size = 10))
grid.arrange(q1, q2, nrow=1, ncol=2)
grid.arrange(q3, q4, nrow=1, ncol=2)