knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  autodep = TRUE,
  cache.lazy = FALSE,
  cache.comments = FALSE
)

options(scipen = 999)

cbpalette <- multi.utils::cbpalette()

library(tidyverse)

Data

We use the counts data for this task.

counts <- read.table("data/counts-1pct-sample.tsv", header = TRUE)
subset_length <- 100
subset_counts <- head(counts, n = subset_length)
head(subset_counts)

For development of the initial script, we subset counts data, which has 1000000 rows and is of size 68.7 Mb, to just the first 100 rows of size format(object.size(subset_counts), units = "auto").

Developing an initial script

profvis

possibly_glm <- possibly(.f = glm, otherwise = NULL)
possibly_tidy <- possibly(.f = broom::tidy, otherwise = NULL)

To begin with, we will use tidyr::pivot_longer and stats::glm to create a function f which we can input into profvis::profvis to produce a HTML output.

f <- function(df) {
  df_long <- df %>%
  pivot_longer(
    cols = starts_with("day"),
    names_to = "day",
    names_prefix = "day",
    values_to = "count",
    names_transform = list(count = as.integer, day = as.integer)
  )

  fits <- tibble(ec = df$ec)
  fits <- mutate(fits, fit = map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
  fits <- mutate(fits, fit = map(fit, possibly_tidy, conf.int = TRUE))
  fits <- unnest(fits, fit)
  return(fits)
}

profvis::profvis(f(subset_counts))

bench

What about with bench::mark instead:

Benchmark a list of quoted expressions. Each expression will always run at least twice, once to measure the memory allocation and store results and one or more times to measure timing.

The number of times that each expression is run is determined by… [TODO].

Pivoting

nrows_experiment <- 10^{1:4}

pivot_bm <- bench::press(
  nrows = nrows_experiment,
  {
    df <- head(counts, n = nrows)
    bench::mark(
      pivot = pivot_longer(
        df,
        cols = starts_with("day"),
        names_to = "day",
        names_prefix = "day",
        values_to = "count",
        names_transform = list(count = as.integer, day = as.integer)
      )
    )
  }
)
## Running with:
##   nrows
## 1    10
## 2   100
## 3  1000
## 4 10000
pivot_bm
## # A tibble: 4 × 7
##   expression nrows      min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 pivot         10   2.47ms   2.61ms      371.   32.09KB     2.13
## 2 pivot        100   2.02ms   2.17ms      442.    92.3KB     2.08
## 3 pivot       1000   2.09ms   2.43ms      409.  693.47KB     2.07
## 4 pivot      10000    3.1ms   3.88ms      244.    6.55MB     4.31
ggplot2::autoplot(pivot_bm)

Running the generalised linear model

glm_bm <- bench::press(
  nrows = nrows_experiment,
  {
    df <- head(counts, n = nrows)
    df_long <- pivot_longer(
        df,
        cols = starts_with("day"),
        names_to = "day",
        names_prefix = "day",
        values_to = "count",
        names_transform = list(count = as.integer, day = as.integer)
      )
    
    bench::mark(
      glm = {
        fits <- tibble(ec = df$ec)
        fits <- mutate(fits, fit = map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
      }
    )
  }
)
## Running with:
##   nrows
## 1    10
## 2   100
## 3  1000
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
## 4 10000
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
glm_bm
## # A tibble: 4 × 7
##   expression nrows      min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 glm           10   47.4ms   51.5ms   16.8      418.8KB    2.10 
## 2 glm          100  360.6ms  360.6ms    2.77       6.3MB    2.77 
## 3 glm         1000     4.3s     4.3s    0.233    296.1MB    3.03 
## 4 glm        10000    49.6s    49.6s    0.0202    25.7GB    0.988
ggplot2::autoplot(glm_bm)
## Warning in f(...): The default behavior of beeswarm has changed in version 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis. Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis choice.
## Warning in f(...): The default behavior of beeswarm has changed in version 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis. Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis choice.

Tidying the generalised linear model

before_tidy <- function(nrows) {
  df <- head(counts, n = nrows)
  df_long <- pivot_longer(
    df,
    cols = starts_with("day"),
    names_to = "day",
    names_prefix = "day",
    values_to = "count",
    names_transform = list(count = as.integer, day = as.integer)
  )
  fits <- tibble(ec = df$ec)
  fits <- mutate(fits, fit = purrr::map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
  return(fits)
}

tidy_bm <- bench::press(
  nrows = nrows_experiment,
  {
    fits <- before_tidy(nrows = nrows)
    
    bench::mark(
      tidy = {
        mutate(fits, fit = purrr::map(fit, ~possibly_tidy))
      }
    )
  }
)
## Running with:
##   nrows
## 1    10
## 2   100
## 3  1000
## 4 10000
tidy_conf_bm <- bench::press(
  nrows = nrows_experiment,
  {
    fits <- before_tidy(nrows = nrows)
    
    bench::mark(
      tidy_conf = {
        mutate(fits, fit = purrr::map(fit, ~possibly_tidy, conf.int = TRUE))
      }
    )
  }
)
## Running with:
##   nrows
## 1    10
## 2   100
## 3  1000
## 4 10000
tidy_bm
## # A tibble: 4 × 7
##   expression nrows      min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 tidy          10   1.68ms   2.07ms     453.       18KB     0   
## 2 tidy         100   1.57ms   1.87ms     497.     18.9KB     0   
## 3 tidy        1000   1.64ms   1.89ms     515.     25.9KB     2.05
## 4 tidy       10000  10.13ms  10.98ms      90.1    96.2KB     2.44
tidy_conf_bm
## # A tibble: 4 × 7
##   expression nrows      min   median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
## 1 tidy_conf     10   1.42ms   1.72ms     568.       18KB     0   
## 2 tidy_conf    100   1.13ms   1.19ms     813.     18.9KB     2.10
## 3 tidy_conf   1000   1.68ms   1.73ms     571.     25.9KB     2.49
## 4 tidy_conf  10000  14.19ms  15.82ms      60.2    96.2KB     0
ggplot2::autoplot(tidy_bm)

ggplot2::autoplot(tidy_conf_bm)

Ideas and notes for continuation of this document

  • Use ?glm to check what the possible input formats are
    • Could use filter (or something more computationally efficient like []) to pick out rows then put them into glm
    • Could “hard-code” in the time covariate, as it is the same for every time-series
  • Use data.table to do the pivot, how much quicker is that?
  • On the order of GB memory is starting to be prohibitive
  • What exactly does mem_alloc refer to?
  • Try forecasting the requirements for each step at different numbers of \(k\)-mers (just by extrapolation)
  • Is tidy the slow running thing? Try it without conf.int = TRUE
  • Is there a way to specify one call to glm with inputs ec within the formula and gets what we’re aiming for?
  • Test running furrr