6 runs using 3, 6 and 9 executors for the B1, B2 and B3 Berlin datasets, with epsilon values of 90, 100 and 110 m…
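
For reference, the executor counts and epsilon values listed above form the following grid. This is only a sketch of the configuration space; which dataset/epsilon pairs were actually run is described above, and the rest of this report drills into epsilon = 110 m.

# All combinations of the executor counts and epsilon values mentioned above
# (purely illustrative; only epsilon = 110 m is analysed below)
library(tidyr)
configs = crossing(Executors = c(3, 6, 9), Epsilon = c(90, 100, 110))
configs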

library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Reading the data and plotting the scale-up of the Apache Spark stages…

source("/home/and/Documents/PhD/Research/Scripts/R/R13/tasksMetrics.R")

nohup = "/home/and/Documents/PhD/Research/Scripts/R/R14/nohupAWS012.txt"
cores = 6
epsilon = 110.0
stages1 = bind_rows(getAppIDs(nohup, cores, executors = 3, epsilon) %>% map(getStages))
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
stages2 = bind_rows(getAppIDs(nohup, cores, executors = 6, epsilon) %>% map(getStages))
stages3 = bind_rows(getAppIDs(nohup, cores, executors = 9, epsilon) %>% map(getStages))

# Average stage duration over the runs for each stage and executor count
stages = rbind(stages1, stages2, stages3) %>% select(StageId, Stage, Executors, Duration) %>%
  group_by(StageId, Stage, Executors) %>% summarise(Duration = mean(Duration))

# Prefix each stage name with its padded id so the x-axis keeps the stage order
data = stages %>% ungroup() %>% mutate(Stage = paste(str_pad(StageId, 5, "left"), Stage))
p = ggplot(data = data, aes(x = Stage, y = Duration, fill = Executors)) +
  geom_bar(stat="identity", position=position_dodge(width = 0.75), width = 0.7) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x="Stages", y="Duration(s)")
ggplotly(p)
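
The three per-executor reads above follow the same pattern, so they could also be collapsed into a single map_dfr() call; a sketch assuming getAppIDs() and getStages() behave exactly as in the calls above:

# Builds the same rows as stages1, stages2 and stages3 combined
# (stagesAll is a hypothetical name, not used elsewhere in this report)
stagesAll = c(3, 6, 9) %>%
  map_dfr(~ bind_rows(getAppIDs(nohup, cores, executors = .x, epsilon) %>% map(getStages)))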

Overall scale-up…

data = stages %>% ungroup() %>% 
  select(Executors, Duration) %>% group_by(Executors) %>% summarise(Duration = sum(Duration))
head(data)
data$Epsilon = as.factor(epsilon)
p = ggplot(data = data, aes(x = Epsilon, y = Duration, fill = Executors)) +
  geom_bar(stat="identity", position=position_dodge(width = 0.75), width = 0.7) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x="Epsilon", y="Duration(s)")
plot(p)
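
Beyond the raw totals, the relative speed-up can be read off the same summary; a minimal sketch, assuming Executors holds the plain counts 3, 6 and 9 (possibly as a factor) and that data is the per-executor summary built above:

# Speed-up and parallel efficiency relative to the 3-executor configuration
# (one row per executor count is assumed, as produced by the summarise above)
scaleup = data %>%
  mutate(Speedup = Duration[Executors == 3] / Duration,
         Efficiency = Speedup / (as.numeric(as.character(Executors)) / 3))
scaleup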

Collecting the stages with the greatest difference in time…

stagesByDiff = stages %>% group_by(StageId, Stage) %>% 
  summarise(min=min(Duration), max=max(Duration), mean=mean(Duration), sd=sd(Duration)) %>%
  mutate(diff = max - min) %>%
  arrange(desc(diff))
stagesByDiff

Reading task information for those stages…

source("/home/and/Documents/PhD/Research/Scripts/R/R13/tasksMetrics.R")
tasks1 = getTasksStats(nohup, cores, executors = 3, epsilon)
tasks2 = getTasksStats(nohup, cores, executors = 6, epsilon)
tasks3 = getTasksStats(nohup, cores, executors = 9, epsilon)
tasks = rbind(tasks1, tasks2, tasks3) 
tasks
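
The rest of the analysis assumes getTasksStats() returns, per executor/host and stage, at least the columns StageId, Stage, Executors, N (number of tasks) and Duration. That schema is inferred from the joins and summaries below, not from tasksMetrics.R itself; a quick guard makes the assumption explicit:

# Fail early if the columns this report relies on are missing
# (column list inferred from the code below, not from getTasksStats itself)
stopifnot(all(c("StageId", "Stage", "Executors", "N", "Duration") %in% names(tasks)))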

Selecting the top 10 most unbalanced stages and their tasks…

topN = 10
stagesByHost = stagesByDiff[1:topN,] %>% select(StageId) %>% inner_join(tasks, by = c("StageId")) 
stagesByHost
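
Note that stagesByDiff is still grouped by StageId, so the positional slice above relies on the earlier arrange() having ordered the whole table (which arrange() does by default, ignoring groups). A hedged alternative that makes the top-N selection explicit, using top_n() from the dplyr version loaded above (ties in diff may return a few extra rows):

# Alternative top-N selection; stagesByHostAlt is a hypothetical name and
# should contain the same stages as stagesByHost above (modulo ties in diff)
stagesByHostAlt = stagesByDiff %>% ungroup() %>% top_n(topN, diff) %>%
  select(StageId) %>% inner_join(tasks, by = "StageId")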

Collecting info about how the tasks are distributed among the executors, both by the number of tasks (a bad approach in this case) and by the time an executor takes to complete its tasks…

tasksByHost = stagesByHost %>% group_by(Executors, StageId, Stage) %>% 
  summarise(minN=min(N), maxN=max(N), sdN=sd(N), 
            minD=min(Duration), maxD=max(Duration), sdD=sd(Duration)) %>%
  mutate(diffN = maxN - minN, diffD = maxD - minD) %>%
  select(Executors, StageId, Stage, sdN, diffN, sdD, diffD) %>%
  arrange(StageId)
tasksByHost

Plotting the results…

data = tasksByHost %>% mutate(Stage = paste(str_pad(StageId,pad = "0", width = 5), str_trim(Stage, side = "both"))) %>%
  select(Executors,Stage, sdN, sdD)
## Adding missing grouping variables: `StageId`
p = ggplot(data = data, aes(x = Stage, y = sdN, fill = Executors)) +
  geom_bar(stat="identity", position=position_dodge(width = 0.75), width = 0.7) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x="Stage", y="Std dev tasks/executor", title="Variability in the distribution of tasks per executor") 
plot(p)

p = ggplot(data = data, aes(x = Stage, y = sdD, fill = Executors)) +
  geom_bar(stat="identity", position=position_dodge(width = 0.75), width = 0.7) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x="Stage", y="Std dev time/executor", title="Variability in the time used per executor to run their tasks") 
plot(p)
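
The two variability plots could also be combined into one faceted figure; a minimal sketch using gather() from the tidyr version loaded above, reusing the axis titles as facet labels:

# One row per (Stage, Executors, metric), then one facet per metric
dataLong = data %>% ungroup() %>% gather(key = "Metric", value = "Value", sdN, sdD)
p = ggplot(data = dataLong, aes(x = Stage, y = Value, fill = Executors)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.75), width = 0.7) +
  facet_wrap(~ Metric, ncol = 1, scales = "free_y",
             labeller = as_labeller(c(sdN = "Std dev tasks/executor", sdD = "Std dev time/executor"))) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Stage", y = NULL)
plot(p)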