speed_tests.Rmd
The functions included in staticimports are meant to be fast. For many of them, several implementations were considered. This document contains performance tests of the candidate implementations.
walk
# Reference implementation: purrr's own walk(), benchmarked against the
# hand-written variants.
walk_purrr <- purrr::walk
# walk() built on lapply(): `.f` is called on every element of `.x`, with
# `...` forwarded to each call. The list that lapply() builds is discarded
# and NULL is returned instead.
walk_lapply <- function(.x, .f, ...) {
  lapply(.x, .f, ...)
  NULL
}
# walk() built on a plain for loop: each element of `.x` is extracted with
# `[[` and handed to `.f`, along with `...`. Return values of `.f` are
# ignored; the function itself returns NULL.
walk_for <- function(.x, .f, ...) {
  for (idx in seq_along(.x)) {
    .f(.x[[idx]], ...)
  }
  NULL
}
# Benchmark input: 100 integers and a callback that returns its argument.
x <- 1:100
f <- function(a) a
# The outer parentheses print the result while also storing it in `times`.
# check = FALSE is presumably needed because the return values differ:
# walk_purrr() returns `.x`, the other two return NULL.
(times <- bench::mark(
walk_purrr(x, f),
walk_lapply(x, f),
walk_for(x, f),
check = FALSE
))
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 walk_purrr(x, f) 113µs 120.8µs 8180. 125.6KB 12.4
#> 2 walk_lapply(x, f) 65.4µs 72.2µs 13547. 848B 14.9
#> 3 walk_for(x, f) 47.9µs 51.9µs 18928. 17.9KB 18.8
Of the three implementations, walk_for()
is the fastest. Comparing median times, it is about 57% faster than walk_purrr()
, and 28% faster than walk_lapply()
.
Note that walk_purrr()
returns the input .x
object, whereas the other two implementations return NULL
.
map
# Reference implementation: purrr's own map(), benchmarked against the
# hand-written variants.
map_purrr <- purrr::map
# map() as a thin wrapper around lapply(): applies `.f` to each element of
# `.x` (forwarding `...`) and returns whatever list lapply() produces.
map_lapply <- function(.x, .f, ...) {
  lapply(.x, .f, ...)
}
# map() built on a preallocated list and a for loop.
#
# .x   A vector or list to iterate over.
# .f   Function called once per element of `.x`.
# ...  Extra arguments forwarded to every call of `.f`.
#
# Returns a list with one slot per element of `.x`, carrying the names of
# `.x` (if any).
map_for <- function(.x, .f, ...) {
  res <- vector("list", length(.x))
  for (i in seq_along(.x)) {
    val <- .f(.x[[i]], ...)
    # `res[[i]] <- NULL` would delete slot i and shorten the list. The slot
    # already holds NULL from preallocation, so only non-NULL values are
    # stored.
    if (!is.null(val)) {
      res[[i]] <- val
    }
  }
  names(res) <- names(.x)
  res
}
# Benchmark input: 100 integers and a callback that returns its argument.
x <- 1:100
f <- function(a) a
# Results are not stored here; the table is only printed.
bench::mark(
map_purrr(x, f),
lapply(x, f), # Bare lapply() for comparison
map_lapply(x, f),
map_for(x, f),
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map_purrr(x, f) 111.2µs 119.7µs 8220. 3.69KB 14.6
#> 2 lapply(x, f) 63.8µs 71.9µs 13801. 848B 19.1
#> 3 map_lapply(x, f) 65.1µs 71.7µs 13785. 848B 18.9
#> 4 map_for(x, f) 68.6µs 73.1µs 13498. 32.77KB 12.6
All of the base R implementations are within a pretty close range. map_for()
has no meaningful speed advantage, so we’ll just use map_lapply()
for simplicity.
map2
# Reference implementation: purrr's own map2(), benchmarked against the
# hand-written variants.
map2_purrr <- purrr::map2
# map2() built on mapply().
#
# .x, .y  Vectors or lists iterated over in parallel.
# .f      Function called with one element of `.x` and one of `.y`.
# ...     Extra arguments forwarded to every call of `.f` via MoreArgs.
#
# Returns a list (SIMPLIFY = FALSE) whose names are exactly `names(.x)`.
map2_mapply <- function(.x, .y, .f, ...) {
  # USE.NAMES = FALSE: by default mapply() names the result after the
  # *values* of an unnamed character `.x`. Names should come only from
  # `names(.x)`, as in map2_for(), so they are assigned explicitly below.
  res <- mapply(
    .f, .x, .y,
    MoreArgs = list(...),
    SIMPLIFY = FALSE,
    USE.NAMES = FALSE
  )
  names(res) <- names(.x)
  res
}
# map2() built on a preallocated list and a for loop.
#
# .x, .y  Vectors or lists iterated over in parallel; `.y` is indexed with
#         the positions of `.x`.
# .f      Function called with one element of `.x` and one of `.y`.
# ...     Extra arguments forwarded to every call of `.f`.
#
# Returns a list with one slot per element of `.x`, carrying the names of
# `.x` (if any).
map2_for <- function(.x, .y, .f, ...) {
  res <- vector("list", length(.x))
  for (i in seq_along(.x)) {
    val <- .f(.x[[i]], .y[[i]], ...)
    # `res[[i]] <- NULL` would delete slot i and shorten the list. The slot
    # already holds NULL from preallocation, so only non-NULL values are
    # stored.
    if (!is.null(val)) {
      res[[i]] <- val
    }
  }
  names(res) <- names(.x)
  res
}
# Benchmark input: two unnamed vectors of length 100 and a callback that adds
# its two arguments.
x <- 1:100
y <- x * 1000
f <- function(a, b) a+b
bench::mark(
map2_purrr(x, y, f),
map2_mapply(x, y, f),
map2_for(x, y, f),
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map2_purrr(x, y, f) 168µs 180µs 5483. 26.3KB 16.8
#> 2 map2_mapply(x, y, f) 124µs 135µs 7288. 848B 23.5
#> 3 map2_for(x, y, f) 100µs 107µs 9195. 31.8KB 16.4
# With named vector: names are set on `x` only (`y` stays unnamed), and the
# same three expressions are timed again. The outer parentheses print the
# result while also storing it in `times`.
names(x) <- as.character(x)
(times <- bench::mark(
map2_purrr(x, y, f),
map2_mapply(x, y, f),
map2_for(x, y, f),
))
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map2_purrr(x, y, f) 167µs 173µs 5712. 1.09KB 16.7
#> 2 map2_mapply(x, y, f) 126µs 132µs 7382. 848B 23.6
#> 3 map2_for(x, y, f) 101µs 110µs 8648. 848B 16.8
map2_for
is about 17% faster than map2_mapply
, for both named and unnamed inputs, so we’ll use map2_for()
.
map2_lgl
, map2_int
, map2_dbl
, map2_chr
The map2*
functions return an atomic vector of the specified type.
# Use map2_for implementation from previous section
map2 <- map2_for
# purrr's typed variant; used below as the reference for the expected output
# and for the behavior on a wrongly-typed result.
map2_dbl_purrr <- purrr::map2_dbl
# map2_dbl() variant: gather the per-element results with map2(), then turn
# the resulting list into a double vector by assigning its mode.
map2_dbl_mode <- function(.x, .y, .f, ...) {
  out <- map2(.x, .y, .f, ...)
  mode(out) <- "double"
  out
}
# map2_dbl() variant: gather the per-element results with map2(), then
# convert the resulting list by assigning its storage mode to "double".
# The local name `res` is visible to users: the deparsed assignment
# `storage.mode(res) <- "double"` is what R reports as the source of the
# coercion warning.
map2_dbl_storagemode <- function(.x, .y, .f, ...) {
res <- map2(.x, .y, .f, ...)
storage.mode(res) <- "double"
res
}
# map2_dbl() variant: gather the per-element results with map2() and convert
# the list with as.numeric(). Unlike the mode-based variants, the conversion
# itself drops names, so they are copied back from `.x` afterwards.
map2_dbl_asnumeric <- function(.x, .y, .f, ...) {
  collected <- map2(.x, .y, .f, ...)
  out <- as.numeric(collected)
  names(out) <- names(.x)
  out
}
# map2_dbl() variant that skips the intermediate list: results are written
# straight into a preallocated double vector, and the names of `.x` are
# copied onto it at the end.
# Nothing checks what `.f` returns: a value of another type (for example a
# string) silently converts the whole vector to that type.
map2_dbl_for <- function(.x, .y, .f, ...) {
  out <- numeric(length(.x))
  for (idx in seq_along(.x)) {
    out[[idx]] <- .f(.x[[idx]], .y[[idx]], ...)
  }
  names(out) <- names(.x)
  out
}
# Benchmark input: two unnamed lists of 1000 numbers each, and a callback
# that adds its two arguments.
x <- 1:1000
y <- x * 10000
x <- as.list(x)
y <- as.list(y)
f <- function(a, b) a+b
# This is what the output should look like
map2_dbl_purrr(1:3, 101:103, f)
#> [1] 102 104 106
# Time all five implementations on the unnamed input.
bench::mark(
map2_dbl_purrr(x, y, f),
map2_dbl_mode(x, y, f),
map2_dbl_storagemode(x, y, f),
map2_dbl_asnumeric(x, y, f),
map2_dbl_for(x, y, f),
)
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map2_dbl_purrr(x, y, f) 1.04ms 1.11ms 887. 8.12KB 21.6
#> 2 map2_dbl_mode(x, y, f) 924.89µs 982.86µs 1011. 31.05KB 14.7
#> 3 map2_dbl_storagemode(x, y, f) 907.21µs 956.48µs 1038. 15.72KB 14.7
#> 4 map2_dbl_asnumeric(x, y, f) 910.46µs 955.38µs 1035. 15.72KB 14.9
#> 5 map2_dbl_for(x, y, f) 858.79µs 906.78µs 1099. 38.85KB 16.9
# Same test, with names: names are set on `x` only (`y` stays unnamed).
# The outer parentheses print the result while also storing it in `times`.
names(x) <- as.character(x)
(times <- bench::mark(
map2_dbl_purrr(x, y, f),
map2_dbl_mode(x, y, f),
map2_dbl_storagemode(x, y, f),
map2_dbl_asnumeric(x, y, f),
map2_dbl_for(x, y, f),
))
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 map2_dbl_purrr(x, y, f) 1.05ms 1.14ms 875. 8.12KB 19.3
#> 2 map2_dbl_mode(x, y, f) 925.98µs 988.46µs 998. 15.72KB 14.8
#> 3 map2_dbl_storagemode(x, y, f) 905.77µs 954.97µs 1044. 15.72KB 17.0
#> 4 map2_dbl_asnumeric(x, y, f) 910.38µs 965.19µs 1028. 15.72KB 14.8
#> 5 map2_dbl_for(x, y, f) 863.83µs 929.21µs 1054. 7.86KB 15.0
map2_dbl_for()
is the fastest by a bit. However, one drawback is that if .f()
returns a value of the incorrect type, it simply promotes the result vector to that type, and emits no warnings; the returned vector is not guaranteed to be of the specified type. This is not acceptable behavior.
# Wrong-type demo: the callback returns a string where a double is expected.
x <- c(1, 2)
map2_dbl_for(x, x, function(a, b) "test")
#> [1] "test" "test"
The ideal behavior in this situation is for the function to throw an error when an incorrect type is returned. This is what map2_dbl_purrr()
does.
# Same string-returning callback, passed to the purrr reference.
map2_dbl_purrr(x, x, function(a, b) "test")
#> Error in `map2_dbl_purrr()`:
#> ℹ In index: 1.
#> Caused by error:
#> ! Can't coerce from a string to a double.
The other three versions, map2_dbl_mode()
, map2_dbl_storagemode()
, and map2_dbl_asnumeric()
emit warnings, which isn’t ideal but it is acceptable.
# Same string-returning callback, passed to the mode<- variant.
map2_dbl_mode(x, x, function(a, b) "test")
#> Warning in mde(x): NAs introduced by coercion
#> Warning in mde(x): NAs introduced by coercion
#> [1] NA NA
# Same string-returning callback, passed to the storage.mode<- variant.
map2_dbl_storagemode(x, x, function(a, b) "test")
#> Warning in storage.mode(res) <- "double": NAs introduced by coercion
#> Warning in storage.mode(res) <- "double": NAs introduced by coercion
#> [1] NA NA
# Same string-returning callback, passed to the as.numeric() variant.
map2_dbl_asnumeric(x, x, function(a, b) "test")
#> Warning in map2_dbl_asnumeric(x, x, function(a, b) "test"): NAs introduced by
#> coercion
#> Warning in map2_dbl_asnumeric(x, x, function(a, b) "test"): NAs introduced by
#> coercion
#> [1] NA NA
map2_dbl_asnumeric()
has the best balance of speed, understandability, and warning/error behavior when the function returns the wrong type.