Abstract
Background Poisson regression is a candidate method for exponential growth detection. The number of k-mers a realistic Nucleic Acid Observatory would process is large. For this reason, it’s important that any candidate methods are assessed for their computational feasibility at scale.
Task We will write an R script for performing Poisson regression exponential growth detection on real data. Using code profiling and benchmarking tools, we will iterate on the code to improve its computational performance. For the final code, we will produce a summary of the runtime for each line. We will repeat the runtime analysis for varying total numbers of k-mers, and output an estimated time per (for example) one million k-mers.
Findings
Next steps
knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  autodep = TRUE,
  cache.lazy = FALSE,
  cache.comments = FALSE
)
options(scipen = 999)
cbpalette <- multi.utils::cbpalette()
library(tidyverse)
We use the counts data for this task.
counts <- read.table("data/counts-1pct-sample.tsv", header = TRUE)
subset_length <- 100
subset_counts <- head(counts, n = subset_length)
head(subset_counts)
For development of the initial script, we subset the counts data, which has 1,000,000 rows and is 68.7 Mb in size, to just the first 100 rows, of size format(object.size(subset_counts), units = "auto").
profvis
possibly_glm <- possibly(.f = glm, otherwise = NULL)
possibly_tidy <- possibly(.f = broom::tidy, otherwise = NULL)
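possibly() from purrr wraps a function so that, if a call errors (for example, a glm fit that fails on a degenerate series), the wrapped version returns the otherwise value instead of halting the whole pipeline. A minimal illustration, using log() as a stand-in:
# possibly() turns errors into the `otherwise` value instead of stopping
possibly_log <- possibly(.f = log, otherwise = NA_real_)
possibly_log(10)    # usual result, log(10)
possibly_log("ten") # returns NA_real_ because log("ten") would error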
To begin with, we will use tidyr::pivot_longer and stats::glm to create a function f which we can pass to profvis::profvis to produce an HTML output.
f <- function(df) {
  # Reshape from wide (one count column per day) to long (one row per k-mer per day)
  df_long <- df %>%
    pivot_longer(
      cols = starts_with("day"),
      names_to = "day",
      names_prefix = "day",
      values_to = "count",
      names_transform = list(count = as.integer, day = as.integer)
    )
  # Fit a Poisson regression of count on day separately for each k-mer (ec);
  # possibly_glm() returns NULL rather than erroring if a fit fails
  fits <- tibble(ec = df$ec)
  fits <- mutate(fits, fit = map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
  # Tidy each fit (with confidence intervals) and expand to one row per model term
  fits <- mutate(fits, fit = map(fit, possibly_tidy, conf.int = TRUE))
  fits <- unnest(fits, fit)
  return(fits)
}
profvis::profvis(f(subset_counts))
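If the interactive profile needs to be shared, the profvis result is an htmlwidget and can be captured and written to a standalone file (a sketch; the output file name is illustrative):
# Optionally capture the profile and save it as a standalone HTML file
p <- profvis::profvis(f(subset_counts))
htmlwidgets::saveWidget(p, "profvis-f-subset.html")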
bench
What about with bench::mark instead? From its documentation:
Benchmark a list of quoted expressions. Each expression will always run at least twice, once to measure the memory allocation and store results and one or more times to measure timing.
The number of times that each expression is run is determined by the min_time, min_iterations and max_iterations arguments.
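For example, the number of timing iterations can be pinned down explicitly rather than left to min_time; the values below are illustrative only:
# Run the pivot exactly 20 times by setting matching iteration bounds
bench::mark(
  pivot_longer(head(counts, 10), cols = starts_with("day")),
  min_iterations = 20,
  max_iterations = 20
)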
nrows_experiment <- 10^(1:4)
pivot_bm <- bench::press(
  nrows = nrows_experiment,
  {
    df <- head(counts, n = nrows)
    bench::mark(
      pivot = pivot_longer(
        df,
        cols = starts_with("day"),
        names_to = "day",
        names_prefix = "day",
        values_to = "count",
        names_transform = list(count = as.integer, day = as.integer)
      )
    )
  }
)
## Running with:
## nrows
## 1 10
## 2 100
## 3 1000
## 4 10000
pivot_bm
## # A tibble: 4 × 7
## expression nrows min median `itr/sec` mem_alloc `gc/sec`
## <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
## 1 pivot 10 2.47ms 2.61ms 371. 32.09KB 2.13
## 2 pivot 100 2.02ms 2.17ms 442. 92.3KB 2.08
## 3 pivot 1000 2.09ms 2.43ms 409. 693.47KB 2.07
## 4 pivot 10000 3.1ms 3.88ms 244. 6.55MB 4.31
ggplot2::autoplot(pivot_bm)
glm_bm <- bench::press(
  nrows = nrows_experiment,
  {
    df <- head(counts, n = nrows)
    df_long <- pivot_longer(
      df,
      cols = starts_with("day"),
      names_to = "day",
      names_prefix = "day",
      values_to = "count",
      names_transform = list(count = as.integer, day = as.integer)
    )
    bench::mark(
      glm = {
        fits <- tibble(ec = df$ec)
        fits <- mutate(fits, fit = map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
      }
    )
  }
)
## Running with:
## nrows
## 1 10
## 2 100
## 3 1000
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
## 4 10000
## Warning: Some expressions had a GC in every iteration; so filtering is disabled.
glm_bm
## # A tibble: 4 × 7
## expression nrows min median `itr/sec` mem_alloc `gc/sec`
## <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
## 1 glm 10 47.4ms 51.5ms 16.8 418.8KB 2.10
## 2 glm 100 360.6ms 360.6ms 2.77 6.3MB 2.77
## 3 glm 1000 4.3s 4.3s 0.233 296.1MB 3.03
## 4 glm 10000 49.6s 49.6s 0.0202 25.7GB 0.988
ggplot2::autoplot(glm_bm)
## Warning in f(...): The default behavior of beeswarm has changed in version 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis. Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis choice.
## Warning in f(...): The default behavior of beeswarm has changed in version 0.6.0. In versions <0.6.0, this plot would have been dodged on the y-axis. In
## versions >=0.6.0, grouponX=FALSE must be explicitly set to group on y-axis. Please set grouponX=TRUE/FALSE to avoid this warning and ensure proper axis choice.
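The Task calls for an estimated time per (say) one million k-mers. A rough sketch of how the median glm timings could be extrapolated, assuming approximately linear scaling in the number of rows (k-mers), which the benchmark above only loosely supports:
# Rough extrapolation of median glm time to one million rows; linear scaling is an assumption
scaling <- glm_bm %>%
  mutate(median_s = as.numeric(median)) %>% # bench times are stored in seconds
  select(nrows, median_s)
scaling_fit <- lm(median_s ~ nrows, data = scaling)
predict(scaling_fit, newdata = data.frame(nrows = 10^6))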
tidy.glm is given here: https://github.com/tidymodels/broom/blob/520a933cd84974c42ec3ba8df9a37c59634ce211/R/stats-glm-tidiers.R
before_tidy <- function(nrows) {
  df <- head(counts, n = nrows)
  df_long <- pivot_longer(
    df,
    cols = starts_with("day"),
    names_to = "day",
    names_prefix = "day",
    values_to = "count",
    names_transform = list(count = as.integer, day = as.integer)
  )
  fits <- tibble(ec = df$ec)
  fits <- mutate(fits, fit = purrr::map(ec, ~possibly_glm(count ~ 1 + day, family = "poisson", data = filter(df_long, ec == .))))
  return(fits)
}
tidy_bm <- bench::press(
  nrows = nrows_experiment,
  {
    fits <- before_tidy(nrows = nrows)
    bench::mark(
      tidy = {
        mutate(fits, fit = purrr::map(fit, possibly_tidy))
      }
    )
  }
)
## Running with:
## nrows
## 1 10
## 2 100
## 3 1000
## 4 10000
tidy_conf_bm <- bench::press(
  nrows = nrows_experiment,
  {
    fits <- before_tidy(nrows = nrows)
    bench::mark(
      tidy_conf = {
        mutate(fits, fit = purrr::map(fit, possibly_tidy, conf.int = TRUE))
      }
    )
  }
)
## Running with:
## nrows
## 1 10
## 2 100
## 3 1000
## 4 10000
tidy_bm
## # A tibble: 4 × 7
## expression nrows min median `itr/sec` mem_alloc `gc/sec`
## <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
## 1 tidy 10 1.68ms 2.07ms 453. 18KB 0
## 2 tidy 100 1.57ms 1.87ms 497. 18.9KB 0
## 3 tidy 1000 1.64ms 1.89ms 515. 25.9KB 2.05
## 4 tidy 10000 10.13ms 10.98ms 90.1 96.2KB 2.44
tidy_conf_bm
## # A tibble: 4 × 7
## expression nrows min median `itr/sec` mem_alloc `gc/sec`
## <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
## 1 tidy_conf 10 1.42ms 1.72ms 568. 18KB 0
## 2 tidy_conf 100 1.13ms 1.19ms 813. 18.9KB 2.10
## 3 tidy_conf 1000 1.68ms 1.73ms 571. 25.9KB 2.49
## 4 tidy_conf 10000 14.19ms 15.82ms 60.2 96.2KB 0
ggplot2::autoplot(tidy_bm)
ggplot2::autoplot(tidy_conf_bm)
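One of the open questions below is whether conf.int = TRUE is the expensive part of tidy. broom::tidy() only computes confidence intervals when asked and, for a glm, it obtains them from confint(), which profiles the likelihood and so does extra work per fit. A single-fit sketch on toy data to compare the two (the toy series is illustrative only):
# Toy single-series fit: compare tidy() with and without confidence intervals
toy <- data.frame(day = 0:9, count = rpois(10, lambda = exp(0.1 * (0:9))))
toy_fit <- glm(count ~ 1 + day, family = "poisson", data = toy)
bench::mark(
  tidy_only = broom::tidy(toy_fit),
  tidy_ci = broom::tidy(toy_fit, conf.int = TRUE),
  check = FALSE # outputs differ by the conf.low and conf.high columns
)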
Next steps
- Use ?glm to check what the possible input formats are.
- Use filter (or something more computationally efficient, like []) to pick out rows and then put them into glm; see the sketch below.
- What does mem_alloc refer to?
- Is tidy the slow running thing? Try it without conf.int = TRUE.
- Is there a way to call glm with inputs that put ec within the formula and get what we're aiming for?
- Try furrr (a parallel drop-in for purrr).
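As a starting point for the filter/[] item above, one possibility (a sketch only, not benchmarked here; fit_with_split is an illustrative name) is to split the long data by ec once and reuse the pieces, rather than filtering the full df_long inside every map() iteration:
# Sketch: pre-split df_long by ec once, then fit each piece,
# avoiding a full filter() pass over df_long for every k-mer
fit_with_split <- function(df_long) {
  pieces <- split(df_long, df_long$ec) # named list, one data frame per ec
  fits <- purrr::map(pieces, ~possibly_glm(count ~ 1 + day, family = "poisson", data = .))
  tibble(ec = names(pieces), fit = fits)
}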