Counting tasks by locality type for the runs with 3 nodes.

library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read the monitor log and count, for each application ID, how many distinct
# tasks were recorded at each locality level (only rows whose Nodes is "3").
RESEARCH_HOME <- "/home/and/Documents/PhD/Research"
lines <- readLines(
  file.path(
    RESEARCH_HOME,
    "Scripts/R/Benchmarks/MultiAndSingleNode/R12/dblab/monitor.log"
  )
)
# Keep only the lines carrying the literal "|TASKS|" tag.
lines <- lines[grepl("|TASKS|", lines, fixed = TRUE)]
# Column names assigned, in order, to the pipe-delimited pieces of each line.
fields <- c(
  "Timestamp", "Title", "Time", "appID", "Nodes", "nodeID", "nodeIP",
  "StageID", "Stage", "taskID", "Locality", "Launch", "Duration", "Input",
  "Status"
)
# enframe(name = NULL) builds the one-column tibble (column `value`) without
# the deprecation warning that as_tibble() raises on a plain vector.
tasksInfo <- enframe(lines, name = NULL) %>%
  separate(value, into = fields, sep = "\\|") %>%
  # Keep only the third dash-separated piece of appID; the NA entries drop
  # the first two pieces.
  separate(appID, into = c(NA, NA, "ID"), sep = "-") %>%
  select(ID, Nodes, taskID, Locality) %>%
  # A task may appear on several log lines; count it once.
  distinct() %>%
  # separate() leaves the columns as character, hence the string "3".
  filter(Nodes == "3") %>%
  # count() yields the same ID/Locality/n rows as group_by() + tally() but
  # returns an ungrouped tibble, so no grouping leaks into later steps.
  count(ID, Locality)
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
# Show the per-application locality counts without limiting the row count.
tasksInfo %>% head(n = Inf)

PROCESS_LOCAL and NODE_LOCAL mean that a task runs in the same Java Virtual Machine as its data or, at least, on the same node (1). ANY means the data can be in another rack, so there will be a communication cost.

# Stacked bar chart: one bar per application ID, bar height taken directly
# from the precomputed count `n`, segments coloured by locality level.
p <- ggplot(tasksInfo, aes(x = ID, y = n, fill = Locality)) +
  geom_col(width = 0.6) +
  labs(x = "application ID", y = "Count")
print(p)