The goal of dchunkr is to provide basic tools for computing on data bigger than you can fit in memory. Use it to chunk your data, run multiple chunks in parallel, and cache the result of each chunk. This allows you to use your computer resources more efficiently, and to resume after interruptions. It’s designed to be minimal and to play well with the tidyverse and friends. For a mature, complete alternative see the wonderful targets package.
You can install the development version of dchunkr from GitHub with:
# install.packages("pak")
pak::pak("2DegreesInvesting/dchunkr")
library(dplyr, warn.conflicts = FALSE)
library(readr)
library(future)
library(furrr)
library(fs)
library(dchunkr)
set.seed(123)
# Enable computing over multiple workers in parallel
plan(multisession)
data <- tibble(id = c(1, 1, 1, 2, 3))
job <- data |>
# Each chunk can run parallel to other chunks
nest_chunk(.by = "id", chunks = 3) |>
# Set where to cache the result of each chunk
add_file(parent = cache_path("demo"), ext = ".csv") |>
# Don't recompute what's already cached, so you can resume after interruptions
pick_undone() |>
# You may complete the job twice faster if two computers run the same code and
# feed the same cache but work through chunks in reverse order, or even faster
# if multiple computers work through chunks in random order.
order_rows("sample")
job
#> # A tibble: 3 × 4
#> chunk data file done
#> <int> <list> <fs::path> <lgl>
#> 1 3 <tibble [1 × 1]> ~/.cache/dchunkr/demo/3.csv FALSE
#> 2 1 <tibble [3 × 1]> ~/.cache/dchunkr/demo/1.csv FALSE
#> 3 2 <tibble [1 × 1]> ~/.cache/dchunkr/demo/2.csv FALSE
# Here is the important function you want to run for each chunk of data
important <- function(data) mutate(data, x2 = id * 2)
job |>
# Select the columns that match the signature of the function passed to pmap
select(data, file) |>
# Map your important fuction to each chunk and write the result to the cache
future_pwalk(\(data, file) important(data) |> write_csv(file))
# See cached files
dir_tree(cache_path("demo"))
#> ~/.cache/dchunkr/demo
#> ├── 1.csv
#> ├── 2.csv
#> └── 3.csv
# Read all cached files at once
read_csv(dir_ls(cache_path("demo")))
#> Rows: 5 Columns: 2
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (2): id, x2
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 5 × 2
#> id x2
#> <dbl> <dbl>
#> 1 1 2
#> 2 1 2
#> 3 1 2
#> 4 2 4
#> 5 3 6
# Cleanup before the next run
cache_path("demo") |> dir_delete()