library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Location of the benchmark log analysed by this script.
RESEARCH_HOME <- "/home/and/Documents/PhD/Research"
PATH_FILES <- "/Scripts/R/Benchmarks/MultiAndSingleNode/R12/dblab/"
NOHUP_FILE <- "nohup10.tsv"

# Read the log and keep only the pipe-delimited records tagged |STAGES|.
lines <- readLines(paste0(RESEARCH_HOME, PATH_FILES, NOHUP_FILE))
lines <- lines[grepl("\\|STAGES\\|", lines)]

# Parse every STAGES record into one row: start time, node count, a
# zero-padded "<StageID>_<Stage>" label and the stage duration in seconds.
# tibble(value = ...) replaces the deprecated as_tibble() call on a bare
# vector while producing the same single `value` column.
stages0 <- tibble(value = lines) %>%
  separate(
    value,
    into = c(
      "Timestamp", "Title", "Nodes", "Cores", "StageID", "Stage",
      "Start", "End", "nTasks", "Runtime", "CPUTime", "InputBytes",
      "InputRecords", "ShuffleBytes", "ShuffleRecords", "ID"
    ),
    sep = "\\|"
  ) %>%
  # ID is split on "-" and only its third piece is kept.
  separate(ID, into = c(NA, NA, "ID"), sep = "-") %>%
  mutate(ID = as.numeric(ID), Stage = str_trim(Stage)) %>%
  # Start/End carry a literal "GMT" marker that is removed before parsing.
  mutate(
    uStart = parse_datetime(str_replace(str_trim(Start), "GMT", "")),
    uEnd = parse_datetime(str_replace(str_trim(End), "GMT", ""))
  ) %>%
  mutate(
    # Pin the unit: a plain `uEnd - uStart` lets difftime choose secs, mins
    # or hours automatically, so the numeric value would not always be
    # seconds.
    Duration = as.numeric(difftime(uEnd, uStart, units = "secs")),
    # NOTE(review): the 7-hour shift presumably aligns these GMT times with
    # the local-time log timestamps parsed elsewhere in this script; confirm.
    uStart = uStart - hours(7),
    Stage = paste0(
      str_pad(str_trim(StageID), pad = "0", side = "left", width = 5),
      "_",
      Stage
    )
  ) %>%
  select(uStart, Nodes, Stage, Duration) %>%
  arrange(uStart)
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
# Add the three columns that the stage records lack but the records they are
# later row-bound with provide; each is filled with a constant placeholder.
stages0[["Interval"]] <- -2
stages0[["FFStage"]] <- ""
stages0[["Status"]] <- ""
head(stages0, n = 15)
# Re-read the log, this time keeping the records in which some pipe-delimited
# field starts with a digit 1-6 followed by a dot.
lines <- readLines(paste0(RESEARCH_HOME, PATH_FILES, NOHUP_FILE))
lines <- lines[grepl("\\|[1-6]\\.", lines)]

# Parse those records into the same column layout as stages0, plus the
# Interval / FFStage / Status columns.
# tibble(value = ...) replaces the deprecated as_tibble() call on a bare
# vector while producing the same single `value` column.
ff <- tibble(value = lines) %>%
  separate(
    value,
    into = c(
      "Timestamp", "Title", "ID", "Nodes", "Cores", "Status",
      "Time", "Stage", "Duration", "Load", "Interval"
    ),
    sep = "\\|"
  ) %>%
  # ID is split on "-" and only its third piece is kept.
  separate(ID, into = c(NA, NA, "ID"), sep = "-") %>%
  mutate(
    ID = as.numeric(ID),
    Stage = str_trim(Stage),
    Duration = as.numeric(Duration)
  ) %>%
  mutate(
    # The first "," in the timestamp is turned into "." before parsing
    # (presumably a millisecond separator; confirm against the log format).
    uStart = parse_datetime(str_replace(Timestamp, ",", ".")),
    FFStage = Stage,
    Status = str_trim(Status)
  ) %>%
  # Interval stays a character column here (separate() does not convert it).
  select(uStart, Nodes, Stage, Duration, Interval, FFStage, Status)

head(ff)
# Stack both record tables and order the result chronologically.
# NOTE(review): Interval is numeric in stages0 (-2) but character in ff;
# rbind() is kept because it coerces the column (presumably to character)
# where a stricter row-bind could reject the mismatch; confirm.
stages <- arrange(rbind(stages0, ff), uStart)

head(stages, n = 100)
# Walk the chronologically ordered records and stamp every row that is neither
# a START nor an END marker with the FFStage / Interval of the most recent
# START marker. Rows seen before any START, or after an END, get "".
n <- nrow(stages)
ffstage <- ""
interval <- ""
# seq_len() yields an empty sequence for an empty table, whereas 1:n would
# iterate over c(1, 0).
for (row in seq_len(n)) {
  # [[ ]] extracts plain scalars; stages[row, "Status"] on a tibble would be
  # a 1x1 tibble instead.
  status <- stages[["Status"]][row]
  if (status == "START") {
    # Remember the phase opened by this marker.
    ffstage <- stages[["FFStage"]][row]
    interval <- stages[["Interval"]][row]
  } else if (status == "END") {
    # Phase closed: following rows belong to no phase until the next START.
    ffstage <- ""
    interval <- ""
  } else {
    stages[["FFStage"]][row] <- ffstage
    stages[["Interval"]][row] <- interval
  }
}

head(stages, n = 100)
# Keep the rows that were assigned to an interval and whose Stage label
# contains an underscore (the "<StageID>_<Stage>" labels do), then prefix
# FFStage with its interval.
s <- stages %>%
  filter(Interval != "", grepl("_", Stage)) %>%
  mutate(FFStage = paste0(Interval, ".", FFStage)) %>%
  select(Nodes, Stage, Duration, FFStage, Interval)

head(s)
# Build the dodged bar chart of stage durations for one time interval.
#
# data:            table with Stage, Duration, Nodes and Interval columns.
# target_interval: Interval value whose rows are plotted; also shown in the
#                  plot title.
# Returns the ggplot object (not yet converted to plotly).
plot_interval_stages <- function(data, target_interval) {
  ggplot(
    data = data %>% filter(Interval == target_interval),
    aes(x = Stage, y = Duration, fill = Nodes)
  ) +
    geom_bar(
      stat = "identity",
      position = position_dodge(width = 0.75),
      width = 0.7
    ) +
    # Rotate the stage labels so they do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      title = paste("Time interval", target_interval),
      x = "Stages",
      y = "Duration(s)"
    )
}

# One interactive chart per interval. Each ggplotly() call stays a top-level
# expression so that it is printed/rendered on its own.
interval <- 0
p <- plot_interval_stages(s, interval)
ggplotly(p)
interval <- 1
p <- plot_interval_stages(s, interval)
ggplotly(p)
interval <- 2
p <- plot_interval_stages(s, interval)
ggplotly(p)
interval <- 3
p <- plot_interval_stages(s, interval)
ggplotly(p)
interval <- 4
p <- plot_interval_stages(s, interval)
ggplotly(p)