From b49e42beb63aff37c6eb85d29bd29505accb9e5e Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Sun, 6 Sep 2020 18:42:49 +0200
Subject: [PATCH 01/23] Increment version number

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e453c04..ccfccb4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: git2rdata
 Title: Store and Retrieve Data.frames in a Git Repository
-Version: 0.2.2
+Version: 0.2.3
 Authors@R: 
     c(person(given = "Thierry",
              family = "Onkelinx",
diff --git a/NEWS.md b/NEWS.md
index 8fee349..dbb9c03 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+# git2rdata 0.2.3
+
 # git2rdata 0.2.2
 
 * Use the [checklist](https://inbo.github.io/checklist) package for CI.

From 5b82717de4a34740a08a97046161f8ecae2f03e3 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 15 Sep 2020 10:11:42 +0200
Subject: [PATCH 02/23] write_vc() gains a split_by argument

---
 DESCRIPTION                    |  1 +
 NAMESPACE                      |  1 +
 NEWS.md                        |  2 ++
 R/meta.R                       | 18 +++++++++-
 R/write_vc.R                   | 66 +++++++++++++++++++++++++---------
 codemeta.json                  | 54 ++++++++++++----------------
 man/meta.Rd                    | 24 +++++++++----
 man/rm_data.Rd                 |  2 +-
 man/write_vc.Rd                | 30 ++++++++++++----
 tests/testthat/test_a_basics.R | 12 ++++---
 10 files changed, 144 insertions(+), 66 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index ccfccb4..11ae337 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,6 +35,7 @@ Depends:
     R (>= 3.5.0)
 Imports:
     assertthat,
+    digest,
     git2r (>= 0.23.0),
     methods,
     yaml
diff --git a/NAMESPACE b/NAMESPACE
index a11c0fe..19abdd3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -64,6 +64,7 @@ importFrom(assertthat,has_name)
 importFrom(assertthat,is.flag)
 importFrom(assertthat,is.string)
 importFrom(assertthat,noNA)
+importFrom(digest,sha1)
 importFrom(git2r,add)
 importFrom(git2r,commit)
 importFrom(git2r,hash)
diff --git a/NEWS.md b/NEWS.md
index dbb9c03..c8e1f38 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # git2rdata 0.2.3
 
+* `write_vc()` gains an optional `split_by` argument.
+
 # git2rdata 0.2.2
 
 * Use the [checklist](https://inbo.github.io/checklist) package for CI.
diff --git a/R/meta.R b/R/meta.R
index ebee960..df56f6d 100644
--- a/R/meta.R
+++ b/R/meta.R
@@ -211,12 +211,24 @@ meta.Date <- function(x, optimize = TRUE, ...) {
 #' @rdname meta
 #' @inheritParams write_vc
 meta.data.frame <- function(# nolint
-  x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...
+  x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
+  split_by = character(0), ...
 ) {
   assert_that(
     !has_name(x, "..generic"),
     msg = "'..generic' is a reserved name and not allowed as column name")
+  assert_that(
+    !has_name(x, "..hash"),
+    msg = "'..hash' is a reserved name and not allowed as column name")
   generic <- list(optimize = optimize, "NA string" = na)
+  assert_that(is.character(split_by))
+  assert_that(
+    all(split_by %in% colnames(x)),
+    msg = "All split_by variables must be available in the data.frame")
+  assert_that(
+    any(!colnames(x) %in% split_by),
+    msg = "No remaining variables after splitting"
+  )
 
   dots <- list(...)
   if (has_name(dots, "old")) {
@@ -236,6 +248,7 @@ Sorting is strongly recommended in combination with version control.")
     assert_that(
       all(sorting %in% colnames(x)),
       msg = "All sorting variables must be available in the data.frame")
+    sorting <- unique(c(split_by, sorting))
     if (nrow(x) > 1) {
       old_locale <- set_c_locale()
       x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint
@@ -248,6 +261,9 @@ Add extra sorting variables to ensure small diffs.", sorted)
       }
     }
     generic <- c(generic, sorting = list(sorting))
+    if (length(split_by)) {
+      generic <- c(generic, split_by = list(split_by))
+    }
   }
   # calculate meta for each column
   if (has_name(dots, "old")) {
diff --git a/R/write_vc.R b/R/write_vc.R
index 47f987e..43f7fdc 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -13,11 +13,12 @@
 #' @param root The root of a project. Can be a file path or a `git-repository`.
 #' Defaults to the current working directory (`"."`).
 #' @param sorting an optional vector of column names defining which columns to
-#' use for sorting `x` and in what order to use them. Omitting `sorting` yields
-#' a warning. Add `sorting` to avoid this warning. Strongly recommended
-#' in combination with version control. See
-#' `vignette("efficiency", package = "git2rdata")` for an illustration of the
-#' importance of sorting.
+#' use for sorting `x` and in what order to use them.
+#' The default empty `sorting` yields a warning.
+#' Add `sorting` to avoid this warning.
+#' Strongly recommended in combination with version control.
+#' See `vignette("efficiency", package = "git2rdata")` for an illustration of
+#' the importance of sorting.
 #' @param strict What to do when the metadata changes. `strict = FALSE`
 #' overwrites the data and the metadata with a warning listing the changes,
 #' `strict = TRUE` returns an error and leaves the data and metadata as is.
@@ -33,8 +34,8 @@
 #' @note `..generic` is a reserved name for the metadata and is a forbidden
 #' column name in a `data.frame`.
 write_vc <- function(
-  x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA",
-  ...
+  x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ...,
+  split_by
 ) {
   UseMethod("write_vc", root)
 }
@@ -46,14 +47,18 @@ write_vc.default <- function(
   stop("a 'root' of class ", class(root), " is not supported", call. = FALSE)
 }
 
+#' @rdname write_vc
+#' @param split_by An optional vector of variable names to split the text files.
+#' This creates a separate file for every combination.
 #' @export
 #' @importFrom assertthat assert_that is.string is.flag
+#' @importFrom digest sha1
 #' @importFrom yaml read_yaml write_yaml
 #' @importFrom utils write.table
 #' @importFrom git2r hashfile
 write_vc.character <- function(
-  x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA",
-  ...
+  x, file, root = ".", sorting, strict = TRUE, optimize = TRUE,
+  na = "NA", ..., split_by = character(0)
 ) {
   assert_that(
     inherits(x, "data.frame"), is.string(file), is.string(root),  is.string(na),
@@ -66,7 +71,9 @@ write_vc.character <- function(
   }
 
   if (!file.exists(file["meta_file"])) {
-    raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting)
+    raw_data <- meta(
+      x, optimize = optimize, na = na, sorting = sorting, split_by = split_by
+    )
   } else {
     tryCatch(
       is_git2rmeta(file = remove_root(file = file["meta_file"], root = root),
@@ -79,7 +86,7 @@ write_vc.character <- function(
     old <- read_yaml(file["meta_file"])
     class(old) <- "meta_list"
     raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting,
-                     old = old, strict = strict)
+                     old = old, strict = strict, split_by = split_by)
     problems <- compare_meta(attr(raw_data, "meta"), old)
     if (length(problems)) {
       problems <- c(
@@ -99,11 +106,38 @@ write_vc.character <- function(
       }
     }
   }
-  write.table(
-    x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE,
-    sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE,
-    col.names = TRUE, fileEncoding = "UTF-8"
-  )
+  if (length(split_by) == 0) {
+    write.table(
+      x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE,
+      sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE,
+      col.names = TRUE, fileEncoding = "UTF-8"
+    )
+  } else {
+    index <- unique(raw_data[split_by])
+    index[["..hash"]] <- apply(index, 1, sha1)
+    dir.create(file["raw_file"], showWarnings = FALSE)
+    write.table(
+      x = index, file = file.path(file["raw_file"], "index.tsv"),
+      append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".",
+      row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
+    )
+    detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by]
+    for (i in seq_len(nrow(index))) {
+      matching <- vapply(
+        split_by,
+        function(split) {
+          raw_data[[split]] == index[[split]][i]
+        },
+        logical(nrow(raw_data))
+      )
+      write.table(
+        x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE],
+        file = file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")),
+        append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na,
+        dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
+      )
+    }
+  }
   meta_data <- attr(raw_data, "meta")
   meta_data[["..generic"]][["git2rdata"]] <- as.character(
     packageVersion("git2rdata")
diff --git a/codemeta.json b/codemeta.json
index 71ff4c5..4f43830 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -1,26 +1,26 @@
 {
-  "@context": [
-    "https://doi.org/10.5063/schema/codemeta-2.0",
-    "http://schema.org"
-  ],
+  "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
   "@type": "SoftwareSourceCode",
   "identifier": "git2rdata",
   "description": "Make versioning of data.frame easy and efficient using git\n    repositories.",
   "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository",
   "codeRepository": "https://github.com/ropensci/git2rdata",
-  "relatedLink": [
-    "https://doi.org/10.5281/zenodo.1485309",
-    "https://CRAN.R-project.org/package=git2rdata"
-  ],
+  "relatedLink": "https://doi.org/10.5281/zenodo.1485309",
   "issueTracker": "https://github.com/ropensci/git2rdata/issues",
   "license": "https://spdx.org/licenses/GPL-3.0",
-  "version": "0.2.2",
+  "version": "0.2.3",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",
     "url": "https://r-project.org"
   },
   "runtimePlatform": "R version 4.0.2 (2020-06-22)",
+  "provider": {
+    "@id": "https://cran.r-project.org",
+    "@type": "Organization",
+    "name": "Comprehensive R Archive Network (CRAN)",
+    "url": "https://cran.r-project.org"
+  },
   "author": [
     {
       "@type": "Person",
@@ -169,6 +169,18 @@
       },
       "sameAs": "https://CRAN.R-project.org/package=assertthat"
     },
+    {
+      "@type": "SoftwareApplication",
+      "identifier": "digest",
+      "name": "digest",
+      "provider": {
+        "@id": "https://cran.r-project.org",
+        "@type": "Organization",
+        "name": "Comprehensive R Archive Network (CRAN)",
+        "url": "https://cran.r-project.org"
+      },
+      "sameAs": "https://CRAN.R-project.org/package=digest"
+    },
     {
       "@type": "SoftwareApplication",
       "identifier": "git2r",
@@ -200,27 +212,5 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
-  "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
-  "fileSize": "578.382KB",
-  "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
-  "developmentStatus": ["https://www.repostatus.org/#active", "https://www.tidyverse.org/lifecycle/#maturing"],
-  "keywords": [
-    "r",
-    "rstats",
-    "r-package",
-    "version-control",
-    "reproducible-research"
-  ],
-  "provider": {
-    "@id": "https://cran.r-project.org",
-    "@type": "Organization",
-    "name": "Comprehensive R Archive Network (CRAN)",
-    "url": "https://cran.r-project.org"
-  },
-  "review": {
-    "@type": "Review",
-    "url": "https://github.com/ropensci/software-review/issues/263",
-    "provider": "https://ropensci.org"
-  }
+  "fileSize": "1765.055KB"
 }
diff --git a/man/meta.Rd b/man/meta.Rd
index d1f617b..7be6e6f 100644
--- a/man/meta.Rd
+++ b/man/meta.Rd
@@ -23,7 +23,15 @@ meta(x, ...)
 
 \method{meta}{Date}(x, optimize = TRUE, ...)
 
-\method{meta}{data.frame}(x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...)
+\method{meta}{data.frame}(
+  x,
+  optimize = TRUE,
+  na = "NA",
+  sorting,
+  strict = TRUE,
+  split_by = character(0),
+  ...
+)
 }
 \arguments{
 \item{x}{the vector.}
@@ -46,11 +54,15 @@ overwrites the data and the metadata with a warning listing the changes,
 Defaults to \code{TRUE}.}
 
 \item{sorting}{an optional vector of column names defining which columns to
-use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields
-a warning. Add \code{sorting} to avoid this warning. Strongly recommended
-in combination with version control. See
-\code{vignette("efficiency", package = "git2rdata")} for an illustration of the
-importance of sorting.}
+use for sorting \code{x} and in what order to use them.
+The default empty \code{sorting} yields a warning.
+Add \code{sorting} to avoid this warning.
+Strongly recommended in combination with version control.
+See \code{vignette("efficiency", package = "git2rdata")} for an illustration of
+the importance of sorting.}
+
+\item{split_by}{An optional vector of variable names to split the text files.
+This creates a separate file for every combination.}
 }
 \value{
 the optimized vector \code{x} with \code{meta} attribute.
diff --git a/man/rm_data.Rd b/man/rm_data.Rd
index 7c31dd1..6478e66 100644
--- a/man/rm_data.Rd
+++ b/man/rm_data.Rd
@@ -33,7 +33,7 @@ is relative to \code{root}.}
 the git history and unchanged since the last commit. \code{modified} are files in
 the git history and changed since the last commit. \code{ignored} refers to file
 listed in a \code{.gitignore} file. Selecting \code{modified} will remove both
-\code{unmodified} and \code{modified} data files. Selecting \verb{ìgnored} will remove
+\code{unmodified} and \code{modified} data files. Selecting \code{ignored} will remove
 \code{unmodified}, \code{modified} and \code{ignored} data files. \code{all} refers to all
 visible data files, including \code{untracked} files.}
 }
diff --git a/man/write_vc.Rd b/man/write_vc.Rd
index 1385b35..819b8ca 100644
--- a/man/write_vc.Rd
+++ b/man/write_vc.Rd
@@ -2,6 +2,7 @@
 % Please edit documentation in R/write_vc.R
 \name{write_vc}
 \alias{write_vc}
+\alias{write_vc.character}
 \alias{write_vc.git_repository}
 \title{Store a Data.Frame as a Git2rdata Object on Disk}
 \usage{
@@ -13,7 +14,20 @@ write_vc(
   strict = TRUE,
   optimize = TRUE,
   na = "NA",
-  ...
+  ...,
+  split_by
+)
+
+\method{write_vc}{character}(
+  x,
+  file,
+  root = ".",
+  sorting,
+  strict = TRUE,
+  optimize = TRUE,
+  na = "NA",
+  ...,
+  split_by = character(0)
 )
 
 \method{write_vc}{git_repository}(
@@ -41,11 +55,12 @@ Note that \code{file} must point to a location within \code{root}.}
 Defaults to the current working directory (\code{"."}).}
 
 \item{sorting}{an optional vector of column names defining which columns to
-use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields
-a warning. Add \code{sorting} to avoid this warning. Strongly recommended
-in combination with version control. See
-\code{vignette("efficiency", package = "git2rdata")} for an illustration of the
-importance of sorting.}
+use for sorting \code{x} and in what order to use them.
+The default empty \code{sorting} yields a warning.
+Add \code{sorting} to avoid this warning.
+Strongly recommended in combination with version control.
+See \code{vignette("efficiency", package = "git2rdata")} for an illustration of
+the importance of sorting.}
 
 \item{strict}{What to do when the metadata changes. \code{strict = FALSE}
 overwrites the data and the metadata with a warning listing the changes,
@@ -60,6 +75,9 @@ Defaults to \code{TRUE}.}
 
 \item{...}{parameters used in some methods}
 
+\item{split_by}{An optional vector of variable names to split the text files.
+This creates a separate file for every combination.}
+
 \item{stage}{Logical value indicating whether to stage the changes after
 writing the data. Defaults to \code{FALSE}.}
 
diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R
index 9e4e78a..1df75e3 100644
--- a/tests/testthat/test_a_basics.R
+++ b/tests/testthat/test_a_basics.R
@@ -21,8 +21,10 @@ expect_error(
   "file should not contain '..'"
 )
 expect_is(
-  output <- write_vc(
-    x = test_data, file = "test.txt", root = root, sorting = "test_Date"
+  suppressWarnings(
+    output <- write_vc(
+      x = test_data, file = "test.txt", root = root, sorting = "test_Date"
+    )
   ),
   "character"
 )
@@ -43,7 +45,7 @@ for (i in colnames(stored)) {
   )
 }
 expect_identical(
-  write_vc(x = test_data, file = "test.xls", root = root),
+  suppressWarnings(write_vc(x = test_data, file = "test.xls", root = root)),
   output
 )
 expect_error(
@@ -51,7 +53,9 @@ expect_error(
   "The data was not overwritten because of the issues below."
 )
 expect_error(
-  write_vc(x = test_data, file = "test", root = root, optimize = FALSE),
+  suppressWarnings(
+    write_vc(x = test_data, file = "test", root = root, optimize = FALSE)
+  ),
   "New data is verbose, whereas old data was optimized"
 )
 expect_warning(

From 58d52d0b8e62c7183bce90182f62e888f8bdff0d Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 15 Sep 2020 11:10:21 +0200
Subject: [PATCH 03/23] Improve message when data file is missing.

The old implementation yielded a "Metadata file missing." error when reading a non-existent object.
The new implementation yields a "`git2rdata` object not found." error when both the data and the metadata are missing.
---
 NEWS.md                              |  2 ++
 R/is_git2rmeta.R                     |  6 +++++-
 tests/testthat/test_b_is_git2rmeta.R | 14 +++++++++++---
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c8e1f38..a454c1e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,8 @@
 # git2rdata 0.2.3
 
 * `write_vc()` gains an optional `split_by` argument.
+* `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
+  when both the data and metadata are missing.
 
 # git2rdata 0.2.2
 
diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R
index d350e82..f4ed0b2 100644
--- a/R/is_git2rmeta.R
+++ b/R/is_git2rmeta.R
@@ -38,7 +38,11 @@ is_git2rmeta.character <- function(file, root = ".",
   file <- clean_data_path(root = root, file = file)
 
   if (!file.exists(file["meta_file"])) {
-    msg <- "Metadata file missing."
+    msg <- ifelse(
+      file.exists(file["raw_file"]),
+      "Metadata file missing.",
+      "`git2rdata` object not found."
+    )
     switch(message, error = stop(msg, call. = FALSE),
            warning = warning(msg, call. = FALSE))
     return(FALSE)
diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R
index 88186f7..99eaa4f 100644
--- a/tests/testthat/test_b_is_git2rmeta.R
+++ b/tests/testthat/test_b_is_git2rmeta.R
@@ -12,16 +12,16 @@ test_that("is_git2rmeta checks metadata", {
   expect_false(is_git2rmeta(file = "junk", root = root))
   expect_false(is_git2rdata(file = "junk", root = root))
   expect_error(is_git2rmeta(file = "junk", root = root, message = "error"),
-               "Metadata file missing.")
+               "`git2rdata` object not found.")
   expect_warning(is_git2rmeta(file = "junk", root = root, message = "warning"),
-               "Metadata file missing.")
+                 "`git2rdata` object not found.")
   expect_false(
     suppressWarnings(
       is_git2rmeta(file = "junk", root = root, message = "warning")
     )
   )
   expect_warning(is_git2rdata(file = "junk", root = root, message = "warning"),
-               "Metadata file missing.")
+                 "`git2rdata` object not found.")
   expect_false(
     suppressWarnings(
       is_git2rdata(file = "junk", root = root, message = "warning")
@@ -32,6 +32,14 @@ test_that("is_git2rmeta checks metadata", {
   junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date")
   correct_yaml <- yaml::read_yaml(file.path(root, junk[2]))
 
+  file.remove(file.path(root, junk[2]))
+  expect_error(is_git2rmeta(file = file, root = root, message = "error"),
+               "Metadata file missing.")
+  expect_warning(is_git2rmeta(file = file, root = root, message = "warning"),
+                 "Metadata file missing.")
+  expect_false(is_git2rmeta(file = file, root = root))
+
+
   junk_yaml <- correct_yaml
   junk_yaml[["..generic"]] <- NULL
   yaml::write_yaml(junk_yaml, file.path(root, junk[2]))

From 784841a0830a18eef3f0e42d8fdffa637a6a6e87 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 15 Sep 2020 13:25:45 +0200
Subject: [PATCH 04/23] bugfix: calculate data hash when using split_by

---
 R/write_vc.R | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/R/write_vc.R b/R/write_vc.R
index 43f7fdc..6c949e4 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -112,37 +112,45 @@ write_vc.character <- function(
       sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE,
       col.names = TRUE, fileEncoding = "UTF-8"
     )
+    data_hash <- datahash(file["raw_file"])
   } else {
     index <- unique(raw_data[split_by])
     index[["..hash"]] <- apply(index, 1, sha1)
-    dir.create(file["raw_file"], showWarnings = FALSE)
+    dir.create(file["raw_file"], showWarnings = FALSE, recursive = TRUE)
     write.table(
       x = index, file = file.path(file["raw_file"], "index.tsv"),
       append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".",
       row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
     )
     detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by]
-    for (i in seq_len(nrow(index))) {
-      matching <- vapply(
-        split_by,
-        function(split) {
-          raw_data[[split]] == index[[split]][i]
-        },
-        logical(nrow(raw_data))
-      )
-      write.table(
-        x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE],
-        file = file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")),
-        append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na,
-        dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
-      )
-    }
+    data_hash <- vapply(
+      seq_len(nrow(index)),
+      function(i) {
+        matching <- vapply(
+          split_by,
+          function(split) {
+            raw_data[[split]] == index[[split]][i]
+          },
+          logical(nrow(raw_data))
+        )
+        rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
+        write.table(
+          x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE],
+          file = rf,
+          append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na,
+          dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
+        )
+        datahash(rf)
+      },
+      character(1)
+    )
+    data_hash <- sha1(data_hash)
   }
   meta_data <- attr(raw_data, "meta")
   meta_data[["..generic"]][["git2rdata"]] <- as.character(
     packageVersion("git2rdata")
   )
-  meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"])
+  meta_data[["..generic"]][["data_hash"]] <- data_hash
   write_yaml(meta_data, file["meta_file"],
              fileEncoding = "UTF-8")
 

From 87e4dc8e1cf6965fa76af45b12e05409054e5353 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 15 Sep 2020 14:35:21 +0200
Subject: [PATCH 05/23] read_vc() handles split_by data files

---
 R/is_git2rdata.R | 54 +++++++++++++++++++++++++++++++++++++---------
 R/meta.R         |  6 +++---
 R/read_vc.R      | 56 ++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 96 insertions(+), 20 deletions(-)

diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R
index d505796..15f9f23 100644
--- a/R/is_git2rdata.R
+++ b/R/is_git2rdata.R
@@ -43,17 +43,51 @@ is_git2rdata.character <- function(file, root = ".",
 
   # read the metadata
   meta_data <- read_yaml(file["meta_file"])
-
-  correct <- names(meta_data)
-  correct <- paste(correct[correct != "..generic"], collapse = "\t")
-  header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
-  if (correct != header) {
-    msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
-    switch(message, error = stop(msg, call. = FALSE),
-           warning = warning(msg, call. = FALSE))
-    return(FALSE)
+  if (has_name(meta_data[["..generic"]], "split_by")) {
+    header <- readLines(
+      file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8"
+    )
+    correct <- paste(
+      c(meta_data[["..generic"]][["split_by"]], "..hash"),
+      collapse = "\t"
+    )
+    if (correct != header) {
+      msg <- paste(
+        "Corrupt data, incorrect header in index.tsv. Expecting:", correct
+      )
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
+    correct <- names(meta_data)
+    keep <- !correct %in% c("..generic", meta_data[["..generic"]][["split_by"]])
+    correct <- paste(correct[keep], collapse = "\t")
+    header <- vapply(
+      list.files(file["raw_file"], pattern = "[[:xdigit:]]{20}\\.tsv"),
+      function(z) {
+        readLines(
+          file.path(file["raw_file"], z), n = 1, encoding = "UTF-8"
+        )
+      },
+      character(1)
+    )
+    if (any(header != correct)) {
+      msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
+  } else {
+    correct <- names(meta_data)
+    correct <- paste(correct[correct != "..generic"], collapse = "\t")
+    header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
+    if (correct != header) {
+      msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
   }
-
   return(TRUE)
 }
 
diff --git a/R/meta.R b/R/meta.R
index df56f6d..8da3c0f 100644
--- a/R/meta.R
+++ b/R/meta.R
@@ -261,9 +261,9 @@ Add extra sorting variables to ensure small diffs.", sorted)
       }
     }
     generic <- c(generic, sorting = list(sorting))
-    if (length(split_by)) {
-      generic <- c(generic, split_by = list(split_by))
-    }
+  }
+  if (length(split_by)) {
+    generic <- c(generic, split_by = list(split_by))
   }
   # calculate meta for each column
   if (has_name(dots, "old")) {
diff --git a/R/read_vc.R b/R/read_vc.R
index aac44b6..ec050c4 100644
--- a/R/read_vc.R
+++ b/R/read_vc.R
@@ -69,14 +69,56 @@ read_vc.character <- function(file, root = ".") {
   col_classes <- vapply(details, "[[", character(1), "class")
 
   # read the raw data and check the data hash
-  raw_data <- read.table(
-    file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
-    dec = ".", numerals = "warn.loss", na.strings = na_string,
-    colClasses = setNames(col_type[col_classes], col_names), comment.char = "",
-    stringsAsFactors = FALSE, fileEncoding = "UTF-8"
-  )
+  if (has_name(meta_data[["..generic"]], "split_by")) {
+    split_by <- meta_data[["..generic"]][["split_by"]]
+    which_split_by <- col_names %in% split_by
+    index <- read.table(
+      file = file.path(file["raw_file"], "index.tsv"),
+      header = TRUE, sep = "\t", quote = "\"",
+      dec = ".", numerals = "warn.loss", na.strings = na_string,
+      colClasses = setNames(
+        col_type[col_classes[which_split_by]],
+        col_names[which_split_by]
+      ),
+      comment.char = "",
+      stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+    )
+    raw_data <- vapply(
+      seq_len(nrow(index)),
+      function(i) {
+        rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
+        raw_data <- read.table(
+          file = rf, header = TRUE, sep = "\t", quote = "\"",
+          dec = ".", numerals = "warn.loss", na.strings = na_string,
+          colClasses = setNames(
+            col_type[col_classes[!which_split_by]],
+            col_names[!which_split_by]
+          ),
+          comment.char = "",
+          stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+        )
+        raw_data <- cbind(
+          index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
+          raw_data
+        )
+        attr(raw_data, "hash") <- datahash(rf)
+        return(list(raw_data))
+      },
+      vector(mode = "list", length = 1)
+    )
+    dh <- sha1(vapply(raw_data, attr, character(1), "hash"))
+    raw_data <- do.call(rbind, raw_data)[, col_names]
+  } else {
+    raw_data <- read.table(
+      file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
+      dec = ".", numerals = "warn.loss", na.strings = na_string,
+      colClasses = setNames(col_type[col_classes], col_names),
+      comment.char = "",
+      stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+    )
+    dh <- datahash(file["raw_file"])
+  }
 
-  dh <- datahash(file["raw_file"])
   if (meta_data[["..generic"]][["data_hash"]] != dh) {
     meta_data[["..generic"]][["data_hash"]] <- dh
     warning("Mismatching data hash. Data altered outside of git2rdata.",

From 0a5024e61a30d7c11d053ebcc0e5f041f0182454 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 15 Sep 2020 15:27:28 +0200
Subject: [PATCH 06/23] Add unit tests.

---
 codemeta.json                    | 28 +++++++++--
 tests/testthat/test_f_split_by.R | 79 ++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 tests/testthat/test_f_split_by.R

diff --git a/codemeta.json b/codemeta.json
index 4f43830..ba94061 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -1,11 +1,17 @@
 {
-  "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+  "@context": [
+    "https://doi.org/10.5063/schema/codemeta-2.0",
+    "http://schema.org"
+  ],
   "@type": "SoftwareSourceCode",
   "identifier": "git2rdata",
   "description": "Make versioning of data.frame easy and efficient using git\n    repositories.",
   "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository",
   "codeRepository": "https://github.com/ropensci/git2rdata",
-  "relatedLink": "https://doi.org/10.5281/zenodo.1485309",
+  "relatedLink": [
+    "https://doi.org/10.5281/zenodo.1485309",
+    "https://CRAN.R-project.org/package=git2rdata"
+  ],
   "issueTracker": "https://github.com/ropensci/git2rdata/issues",
   "license": "https://spdx.org/licenses/GPL-3.0",
   "version": "0.2.3",
@@ -212,5 +218,21 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "1765.055KB"
+  "fileSize": "586.414KB",
+  "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
+  "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
+  "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
+  "developmentStatus": ["https://www.repostatus.org/#active", "https://www.tidyverse.org/lifecycle/#maturing"],
+  "review": {
+    "@type": "Review",
+    "url": "https://github.com/ropensci/software-review/issues/263",
+    "provider": "https://ropensci.org"
+  },
+  "keywords": [
+    "r",
+    "rstats",
+    "r-package",
+    "version-control",
+    "reproducible-research"
+  ]
 }
diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R
new file mode 100644
index 0000000..657e540
--- /dev/null
+++ b/tests/testthat/test_f_split_by.R
@@ -0,0 +1,79 @@
+test_that("write_vc() handles the split_by argument", {
+  root <- tempfile(pattern = "git2rdata-split-by")
+  dir.create(root)
+  # writing without `sorting` must warn; the data must still round-trip
+  expect_warning(
+    write_vc(
+      test_data, file = "unsorted", root = root, split_by = "test_factor"
+    ),
+    "No sorting applied."
+  )
+  expect_is({
+    z <- read_vc("unsorted", root)
+    },
+    "data.frame"
+  )
+  expect_equal(
+    z[order(z$test_numeric), ],
+    test_data[order(test_data$test_numeric), ],
+    check.attributes = FALSE
+  )
+  # with `sorting`, write_vc() must return a character vector
+  expect_is({
+    sorted_file <- write_vc(
+      test_data, file = "sorted", root = root,
+      sorting = "test_Date", split_by = "test_factor"
+    )
+  },
+    "character"
+  )
+  # reading back yields rows ordered by split_by first, then by sorting
+  expect_is({
+    z <- read_vc(sorted_file[1], root)
+    },
+    "data.frame"
+  )
+  expect_equal(
+    z,
+    test_data[order(test_data$test_factor, test_data$test_Date), ],
+    check.attributes = FALSE
+  )
+  # corrupt one randomly chosen hex-named file by dropping its first line
+  data_file <- list.files(
+    file.path(root, sorted_file[1]), pattern = "[[:xdigit:]]{20}",
+    full.names = TRUE
+  )
+  data_file <- sample(data_file, 1)
+  raw_data <- readLines(data_file)
+  writeLines(raw_data[-1], data_file)
+  expect_warning(
+    is_git2rdata("sorted", root, "warning"),
+    "Corrupt data, incorrect header"
+  )
+  expect_error(
+    is_git2rdata("sorted", root, "error"),
+    "Corrupt data, incorrect header"
+  )
+  expect_false(
+    suppressWarnings(is_git2rdata("sorted", root, "warning")),
+    "Corrupt data, incorrect header"
+  )
+  # corrupt index.tsv the same way, by dropping its first line
+  index_file <- file.path(root, sorted_file[1], "index.tsv")
+  index <- readLines(index_file)
+  writeLines(index[-1], index_file)
+  expect_warning(
+    is_git2rdata("sorted", root, "warning"),
+    "Corrupt data, incorrect header in index.tsv"
+  )
+  expect_error(
+    is_git2rdata("sorted", root, "error"),
+    "Corrupt data, incorrect header in index.tsv"
+  )
+  expect_false(
+    suppressWarnings(is_git2rdata("sorted", root, "warning")),
+    "Corrupt data, incorrect header in index.tsv"
+  )
+  # clean up: remove every file written below the temporary root
+  file.remove(list.files(root, recursive = TRUE, full.names = TRUE))
+})

From 35f41cc23218c4637756bd7d79c5614722c8351f Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Thu, 17 Sep 2020 17:12:56 +0200
Subject: [PATCH 07/23] add vignette on split_by

---
 _pkgdown.yml                    |   2 +
 codemeta.json                   |   2 +-
 inst/split_by/read_timings.rds  | Bin 0 -> 7810 bytes
 inst/split_by/write_timings.rds | Bin 0 -> 7867 bytes
 vignettes/split_by.Rmd          | 319 ++++++++++++++++++++++++++++++++
 5 files changed, 322 insertions(+), 1 deletion(-)
 create mode 100644 inst/split_by/read_timings.rds
 create mode 100644 inst/split_by/write_timings.rds
 create mode 100644 vignettes/split_by.Rmd

diff --git a/_pkgdown.yml b/_pkgdown.yml
index f81a74f..bd478a9 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -15,6 +15,8 @@ navbar:
       href: articles/workflow.html
     - text: Efficiency
       href: articles/efficiency.html
+    - text: Large dataframes
+      href: articles/split_by.html
   - text: Functions
     href: reference/index.html
   - text: Contributing
diff --git a/codemeta.json b/codemeta.json
index ba94061..08f8dd0 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -218,7 +218,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "586.414KB",
+  "fileSize": "614.847KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
diff --git a/inst/split_by/read_timings.rds b/inst/split_by/read_timings.rds
new file mode 100644
index 0000000000000000000000000000000000000000..c7b63a4366a003610aa09f47029913dde454601f
GIT binary patch
literal 7810
zcmV-|9(~~-iwFP!0000019f?KJXQZ6zdchTrIOH)%19`Yo+|BK6j33{y7%0DU6j2^
z!<LeUG*uM&SSg95p`oHZtxBQ6@1)M}aq|2A{y6S=pYwjd_Urk2y^akEVq#*FVp3uf
zVv^G2t(dg!x&`_($!DtMLW2BNBCpW?O=4p5<R$+1Ui{B{F<IL4-<RBPBDA&m-#wCl
ze}A8m`FrN~CH41L`tKg;zwhb%sQ1#ed-1<#<^Eo>G@5e%gQo0Xy3~H^p3*_$??vsE
z`^!UyCeJ_~lpfSCrN7hwdh&norD$@?{(tyL53q?ijSgLI>bX4aqIglX#c6A48Xrn;
z>X+h8`9qv`mU=Jy_Z?+Z3EHpp-(SktGGhPgO!22|Pwk^<P<tu<a{q%iWqUaqFNziw
zlQOi6q9a8+PuYYnlMIa>-9}WrNe%D^rN1nV4`nOK0leh~(4*Uw;!VY>l-R%WNYU;o
zdq~n~QT7@b|8&2|iT!I!Y9HMWRBTgvQ2VGnp!QID4AhtMD?R2Z9?~@aRDMu&DH}+M
z{cB??)+ib1ex>%&<r(Nh$}W^1RGbWqUpiLN^J<_UB?qi2-6>v_+?1{9XDGhZS-Omr
zO(@+4;v(H=Qeyw+p(Kru+yI)C?dUl|ogb(_rI#G-j3n(WJ?157=LYho=N%P?biAkh
zCPCv*$0<5S(QQcK8I>ROc$60V*Vghh9&|aWc%b5BAXd}yi0*&7jp=wz$8%ZQMaLUD
zPEq!x<ddPTDH)~3{$HQU)96yNQSn6arhG`RUjt)FnnsgOgC6&kymWg~dQo!G^OMRO
z`dK>O(B+_GIAuF28gF`?lA!UI96<Yj#S~ozx}Nm<LhYerH-#T`A5u6dO?yv|&w;g^
z9;>pnGjyL)m@i4AN9jK>&**#TcuB|bfw)P@O2rAC|3J*5c+>j}dX7;sF%a`*X>_G&
z?*{fivSR;lK2r20#QxQJAf^w*e7X+@`i&l+R9sLp(&L++L)1MT2k89@J-)?hYsz<0
zwDa`3OZP3^&h)w`N!v3}MtbZ}u}j&4vJ)M9DZBqKETs2p)IGJP_dt~1^jM?oNq<I<
zH+nxdP&ay?O0OFfEh?7ieUuc97QH8@_t*3sqU=J&@xY#h-d_)__tcuM6TO~Nyy>$C
z-4;~5&~t}cQ+m>|h?0Yf@qsu`pO5IhGrb?9;|V=K>2uJ)nn0bW&uMaE|K^w!jUFY-
zz`j*{0DVet8kS4_ky+}GFTCCOZr&X7PPzZ&-=~B#o4a}1KP%&ZR@47knf$Xd{byzN
zcP0CWrs+RR6KW~<hsO+~0rE+^dOLF{@&2#S?#`}UCY!wLC9lE1$6PQL=d~!>@sQjr
zE`C;WJD4w*$zQ3{1}kJ|iqXqDFkd}>`}vAE*j10h3$IAS_p57H%zrbHvZFKIrHTjc
zGL7l%G3y|#%1IcsMiW6xR(iL$x`J@LeRxLjaxjjvR@QNSz`S#2^+Z=|uzM4uJPw`)
zceBX)bwDOqZu;eiMVa80`VX-g;SBDf)R}6d=0SKzd#^sD4w;s7ri@E7N20@sr}I^9
z!6-ZWcJns{B+q@NzG_1ggzoQ#t!jS?LXBvh9kw6L`wOy;h}{Qq`s_%bU8_J$)wAIo
zOF@d_wT4@{q&|(W{BqNmAnBKC=ke$qaAYMVE)3a*AeATjx`9F@h9%BDWGMz><~-w$
z5w>8et?0dPVhf(=`J$qbuLwJ1vnoP-D8fHVO>R})2Ue4;;q)8ENZPmW$Iy@hBr8^%
z{<K($$lK0gF;_E?)Ue2LTk}h>ls)1bb-J*FoAo$CbRJyAcU_MetFVJTe$~mc5(I6T
z*PSx;9AfTG9_{G=2+_vpu7uxd1vjkY@Jru|2>DvI$|9%$Z1rmzzUP;~FRp&Z%5HZs
zUdn8Az55#8Rm*S08%zbkzEksIMG1J?9aol(ib3S!d9!YA+z!q|51FX&U*M})s(8BH
z!j8OSmlXEDhnF4e$JM3`u$LoLvq=R>`Lzmh+QmqoS#?u#EgR82r&f>@L5!bVm-}TC
zGD^MKj<egqdn6dwHu??@-YjhI>D~#SskeN7IzB~sMSjP*uyGL1UUKfahb2Vu3XZQY
zj0bm@!@4;dd<gXNOPn&TaU}L-QvL1#_!{f#+J%n=ztX8{!j)!l=S3NI7Oes2>OoJ2
z@hkY>+?z9|>@h@%4C9x<qY*DHo}Ta_6U-t1<sH4B3f5cS*^hRT@n$qZ$MxX|uyy<7
z6LY)4O|c8^nLi$Bw^t48&W{4)CL7-~RN*!0jjq2<Er^)5y&HFTL0BR=(cx(Zg4!1u
z9n32Rab|_JqN^f8Z+FOPZT?K!S83e6Pn*CUob)E?5Cdserbp+bk?~_HI9b^qg%l^h
zmfTB6!O~_e4tbD(<U%K%xl44xU7E6;|GXKZ=9U6$LJ3R>_p-f@+>vBBXqvs%7H|rO
z#*W-!2X;MYqhjf5aMIM4#<a(Qb;?7l%;O!x*<V^}7bGG;_GD7+<}w7<>{|NHv>A*D
zXQPary<oEh^`E}_fxRo)x<2$BQVrct&zcnv&mpVhCbZUrvD%LeM@e}0JW>6nB!lFn
zcd6#u(clgLG~XuI0IX}rGyH1x5afFL`(eH}Jg#lAj+(Iygu3xTcS{~vyi?0D{ybQV
zbs2|s$a6oZj#mD)1BCH~%_IJc1#9id>V~XHu+HAuUp1i;#FA2jMemLwNOov-m!SY&
zl5#m`<L)4N@LXj-;Q|md7G7WKMB07pr>YI=*C2AQxV5HG0itQ4#y8gIAZXK^RxLd*
z1aDXK66<ROJ6y}pbaE&uZ^NB>4JGiTOh&accY%K}r{7O0y&<FSsBVJy)3DAgU40}~
zXqH<XPyu_xzR2K?=LnU)lJQW5v`^{B7cAd0aLfiL%c|>u_rP|8ww?_FWR8#G-5dn=
zFRO*Aqpu>#+%Vv~*b9WOf0Ef!ng=3|G*?s!h{4M|3>hJa3yFT1HQXIuR-fAyUYn8n
zAIP50UkRf4ezxe!NATyJeh|{!36AtWL+93w;7mGpg16NOM9QQO=2tZlYt}xrK;b=@
zVWQk}`6Xa6vz@jTk$IBz=C;FaS#XNOgs&?<!e2#c`Yzuvu)CLDFRR-F|2ZyA5%OI4
zKG}JmYc?JHAYbRe<tgCKUoqb(&ljSQ52EX1EWmRpi7EG}0NYy9-)_$Z@W=H{c<ucF
z{L=WI6q(Q9G+Ve>&%OlV_7gE4wg%u94(YPrz8Z;vLqm7kX_I=4-YXeE<`Lt2eP98X
z#5L>tx55TPFvfS#ZrLB;NG;Zo<IV%C+_j{|Tm<g0Detd`4@auC<GozvX9#%Vpm9=e
z8a%I^kQlVM2;L&g1gm4x2)X94N@-IZLXX63em(a!_>bJzUs_%Srle2n>B_@kAOAii
zA&Jbp@9ObpyX26rt=(UT_+v!9+06{F4*PPqjMoR-<64-lRTsiCt;Yoojzro;!DPRC
zaq!jiakW`ofWT5GxqC$q!P|CK>661!2sHWz74^7--Kq6m?Xd}V=vHa((dq;Dn&yrp
z-OCaA={k3A)lcw$<jjxlssUSS&C$tO`@o(ScdK?|JosN4lO4WygRNnq8~Ioo`<e1B
zQ(w6uSg*&Vy`}-dMP;wEowh^p=4FKa1XH962$kW%J@6^H@or;gEWFnqpV{Qo3})w^
zGUw2p2%YQJW3O=skvGpBU)do6VXu0Q59<qjIl1ytY9)xSn;ra%_X5Jl^Uhp+uMC0y
zBog*?A&44AY;E}fPeX2xMcF|R*{f3S2}pa{DHzs$KMdYur{mRo!of-D*d~2F8Ge#6
zK^$XUMAhw9%UCRc&zaUM|Jm;WM_<K0^Fb9j%G-V#e|!U$tf$8ImbLJGze`%{_B8k_
zYV)6-Y=>uktK;{RjbIua8<GC28!0KpgWhZG0;?xu&(r@3ATSdwyF0=Y(IGF)w{V)^
z;c~9Emsbp?vZ&jB?rtzLB(GHm#DTMKam|FO&mc66=+Ap${GPt0M@Av=>yfj4%B^6`
z^w5YuDTd?+2KQc7&jVql?^Ax*53IzYo-w=B;CmxGZ%M*J@aCs6czw>KURlYTrxw7Q
z-D`Gd`~-Mrs1|<G{RWon_Mua<7)aMDpI%<)4L(Oc?fv-A2syU>*MYlC#GY`PHO3g=
zcRucqZ@~s-&%6~C;Fx-NkN^PgbseWT-`n5}TJuV7O+I|rb<2gxm%(#;V}fMcA^3#3
z`!2Bj0j934<<tAmKzypN-xezd!O`KX&(;=$*Q!~X)AbnMnv<SYKe`Lf{VDzNpYE(C
zy`&-ozPV2hrPw@0Fk?dGC^cJny<K!p%U1*8x%x-eT`Gmh0g90~tRUEycA$h`4Zo1f
z{z9#6c(k3ZCp`xS@nBeC&31&JQJr=$<|hOzb-iQeB_X`+zVfM?`Uok_zA~oE1!2Z}
zJereF;K21XZP#D-A!^_E+u#u9^oqW-b>M3>d%}4CiwO2p{AI9@%rEAs$;<P$!q3QX
zS?3XZFlH3E{Q5Zx%%93KvVpxwsqtyA%1(h#!a>cphY4WkYUyWunhoxyvFE@0bRc<1
zoW`u`$q2W(IOpP#PDF(*(K#Pw2=BDn$LBdzgI_+pqWzvZ7)QQlk6QW!VKN+PWd(Ea
zX722-&jJ&3rxm--g27FQons{lpVn{7x*v=Hp>iNO_vBCzwrrN!VY2UB>K;2$H32Ex
z?gt3IlYLZrExDZq?zR@WX%pXrwPx0^x=Ky3Ke|;(nRXz>Ts7Ka{Z*uIRFq4T;KAEX
zc6Vi=1ei~UYJ9P+Ls+?Kd2q`a5I0VA&VTk8ME&!JE>ms5HF@#uz`N}T-gSMMr$rOI
zf)20P`}_u&#eee3TYUr3=yV3*o7>lS&1eK6(-+~<q7S}*B5P^JMey|x4qa}z34wWU
zo~zj#Aasj;zU#PRFjR)-J(>~-#>1qOAI_K|?DYMNaTgQ8l8U(!oMlAzJM(90yY_-r
z7%{7O9|2)$dU5AF9vCh<ABNmL4=?92yp6S_yy8Fmw%&>Zu`)Yjbz=#_8`NJJ&(}hv
zS>gyyy^HW3dD2M!yex=4?kCd6Jq6<eZ<bY=Az1zk55=6#Mw-IxU=y7M;3xjL8)##S
zz=<QS%UQP}=uXnt8=oJ-YwUz8O>cw2d~;!&pSdG^izH6pHlGJx)DZt%UNmA_OYFUa
z$$VaUE1=-D6u1Uw)t$%Qhi}I;ix&o);IqtUPNrWFI38iHtwUUp^rEdu_jEdVUA?U6
zaIy|v8T0noyrW>xxIVjh+%)jQ2!`XmR<J#@eyMqU1#g~4LS<700-M)lE_Vuo_mJ0L
zbvJ!LQr@#=G1WK0ds81FuABwVm*P1=V^<=r<MFulW5$3J8`~cbtOdDIeABlG9asPD
z6!8|k>Pj`n#1&u-*VJBX)=bvZdzVVOZy~KDbr@sWU?h%OH+fuX4;bHH<cMv}CHst&
zemrJ6YkC%*vjS&BL$jOhRkCk?#rO7k3%0<Wd4F{tc$a!tbWG9!;p`P3pPhk}{|t6c
zHBtaOZb;2a&81*{^YwUH?E&_g#@MEVCSZxnKfHEdmBgL)N!HtrgSd0mF-dX|QbsV&
zv@!GH-{6*efvXJ9y$Y*ViM+vb-_>N$`3&jBW{0NC+)MTk-1vodVu*g1CzidJ^w*<p
zcb9jp0O$O~T8-=@5U5{zVk@SEP~OQ!r`j$fb;k=qrDHKzchp`)AD&L)(Ut`LXG##>
zn<&{JNFs5wXRhMxr69gJI7Ah$fp3PDgYdHh0<^}MY`CcnpGPE$%3EQ7)2QK^EBO%m
z|Ke0n5F%)IQQ4et3Gm+;({%MqB1Bh}I|n<MK`0wquX=nk;;tO5Q`~I;VsEd$%gjb3
zD`bVIm<~t8^rkR=Em?0|x0Tq=x=Qvl&Xv43X7G&IJi|D{1WAvc+-sd$3Z}^)eB+E7
zysqo`NbvXFzQ>8SAm_|w<q?`b<XmGu`1qT4u=8$-wW}@$H{GCc^c-!7)+eO*T$>D^
zuCoCOt!ePKl+meNrHkOcA5T2GE0FpszgRq*0bX#cwrS%ju%9pT4g9(Z!V6KI3X_Bg
zTeqiux5gJl1i$f28NZU8d#i^1e7g{Y|GF{p;Q+hibF}jaHdv!)J5`GDAozUq?T5`_
z5SAOvP=h2yr=$Aojn|&9$G5(_1Y#T~V{pkc_*m`|{}Frz%s5qYKqYZ$8d1GAJqiTZ
zF)QZ23y7o*&SM)Q5k5Xz-|*!ZgszOQKd-nEQSY=f3(S?kcQ~ql?FBiX#>c;Zx}z2`
zb*KC1P$7|4zRX<#>4RcR$-x*Qh7n{#PWGt{dp2GyxD3IA1-oviUWV7_`$?~(r-S`0
zSYS2251f<3-<=6^0rT9oXS=jnVE%aOaaV!`#=YYowYVRV@_o;u2hS#gc$TC6Idupk
zr(9lMSK5N0k=uHg-e?3Xzi-jbnmtGz-7L3Y>;Z)S@btZWsvO*oPYS68-w-lomeU#8
zGDOy{N`3a$3|>=bWv$i}Bl`lA#7q}E5Mye7sO59PjLJz|vNI6erw@Yfh6+F!elG8v
zLe8}tq>S=}=aJS%EU=#X2|E)s3XeD1Bg!JeqFV9;saK7Kv9uX>=(v7mH_Zo6fBVGo
zc3n8=@1x&x$QFzrUYCzJk$K5#{F1e-1w^Oa*p07AoSqQXS{g&<srx2{s>ZPhc{j%P
zO|c7j)`O<!n;RqTgdOpsd<sM|TQ%1|dIA<F`|`Dqitw&8$b98!21c!{(vijv@J@RC
zZYUwNeCfDx$#G=<oc-CoXfZg>&u*!U<&d~*vv%T$3PdOOei9rL2UBxl`v?(%z@0Vg
z9EHaa?0k9avvnTub~zPtt$GK9_WLhq^c*Dn+wD4ia#7$nPGyhOQ3f|@c34L2UU2J4
zZ>l6f_<8YdW_2<6>(dSF&m188Cyv>MpezXH+X)EYH)K9p_vQ?p3eWbmq2p$M0c%}0
zIj#nP=ozG09C94Y8I!JtM=&7L%JkiK;{({QV*BTrwBV0>7o^Sw*U)gt@XnbKO&DLL
z_jNzg9Ja1D?)HPv`v4^wVHwz>mV8sOy$IhH%lLBl5C}!9{{Abl{mGv7VLkAyIMy0^
z*BsuBgUaVikn`uw>i&FS){&?(K^d&`2dew_kaB%}|H*4;6@+JOy_J~j;JN<KJi^#x
zK5xXytKghDZ>02!oWl;x`aS4_u+VpSe)A1@DT|wjvfDxY7(VgbXL6qM(jqS1DnoLq
ze40zEDcQGu;pkmi1RtNA1J@dD5VSBQq{w7GJeTVC%!q%9<SOZhgBLSF1UUqZS64#R
z;@lTQ#m^$tDWX5F1sB-eqI$A!g)pbM7m)pj%dFvVvsJ*d97V)8*+O)*Y&AL1g7Kww
zsNb3haN`oCtk&v)v)O1_-RqO!#x!(=*pq$W>LIrRpG=4U;I&PM-Hw1~+%$3bR%@_-
z=GXKs<Ulw$`{e<F9z+G}o0Qx?f!$^!cx20haQDOhzGDB;?B$-rH4$hWn7g&28;ttj
zW?F`%X%k+b`J4szn)M(n8vMSD8lteN4{6^IsA=-d;CuSdyvWy`lJ8Ik1iWGPPPY62
z-txS{0_I-$N7$}y_i_T$F+`y5wuY?3Do_8*e*{9SR_@{^vL27=Twj)F2F_r8b5e4!
z6n|Ct)Xf6B&?H(dMI8~6_w3hh@PQBK%EOFTIY>&+^v!)!4Pv(MMSqF25M5um_O0x5
z5DE{JBb5TldX=N_T#B4G0u%Lm#g-!V;rN>8ALqhzm8OyS$`}wct`Ik`^FWwxof%(T
z1){3&S#<k(@P1x4Gy8H5#JT6e?gD)fqwS6!RB-_3!dHX*ub;q~sQ2yo96bmp3w6Jb
zQUY^l5$WJv;6$vp-Sa3G5vRHLvz%W;_;6cAc=stpj(y~3)O`&eOB@wbcbkEoSsXj)
z{TL8oBL>;6OaWWDYDG@tPIAtFlJnw4A^0}7?zgNzll@X#|92+B?7zDvR~yOxXNBdX
zBRXK$W>q-J&P7IIkj>y<HxM?ZaQkW<W3VSj7KL=SB3hImK7Y*|#F*y3nOmBIpm84$
ziQb<;_@}^qoA-Z3+G<S?T}4^=E60luyYmeZ%db_=*}NNR)AQTxqE{hl*uIRZqshMa
zNaE3}cgQ|&L!6!7Lu>Ft#?_Dm9ohFaOMe(&i$LX53b_HT@bI4-Gudk}yrQaVh?8WU
zTp3)qU~vMNV;c+%OMdqqp>so<>%iYyZA*4^q`o>8GZrR-ucSz}DK=m^4hq|8<bpK)
zyn8+6-AI4WoS-q%0s<A2v(GPFM&`K3cM3kGB6jT(g(1uR;3YprJIFx}{EL?&+=`g+
zoFsoxcg;F5*F-((-unaI9@WKyVI~j;nnu3d{t2u{Z6R6m^$6G9rpI(s1v_10m*b>w
z;8ZSeP>AdRVM25jK63`KWb%A#ApsxZ<KY%3au8<I%hlT-iAbWvw(KW4k9j!ES>WCZ
zPK|GsS@Z)0ELuP5=uiQeH!8kqHk<~_sj*?^$&uu|w`!t9wH$~SDu3qdr7Nv!D?Px<
z*vR6=>wq=w&ez9<_27jw*o`ViV9#Ooe+T1eHud+1BCSiwT62>S+9AEpIKmR_n{)HN
zMdU$L)H=HB)KZ9S_*)gc_Q5M{NcgBN_dxjC8{cnvj#QOnwPiENKJDV#@<k@gz{|Kk
zyhA1)Nv`F`9P2NGuov{oo&EtrX^Yy9)&*cquGO2_!~nbH^jbIZ7hr{JY%SVH;=Ag|
zV-|C3;rrk3GOh~GMfHc~yUKuhOv~Mp`vbh<z=n4X)?l3P){Hi|0+!fI`OsHlAeeUw
z6@+?Vc{OY;aOwU1{lF(jBuma0_MAs81enrJQHxYc;D2&se?9ljjt#bd-U;@>b&u`W
zM}sFTWyhNEl<aT1hE3ml0-Srpyr&zABlbX@`pE+%-X7G6{ZGY)%*Wo~)jNdn`Fi5l
zh1v<=N1FD}b?k8S=5_IP;Fg~{alkem%#zs+%Maf|@|yM2-6BVUoxOEWu_1~335Bd#
z4W-~OUfX=OK?^}%Wi_AA&4MR!`^f{DK@i-2e?D%@Q6$UzPO11w`mNSHbM>w{AcEB`
zy^|k-_fY-!hhBuX{4Q%gh%}8Sb8rQAwP6miVFPxwdf(75&_MdDpR(gGGLatdC;k0n
zB@(hewtk=23*JQS8)YFX;5cq-Y*}Ir*6nW&X@QHuR}7H7zH}2p%9lmYm?(?Dq;26A
zZI#Fx6FWgxD;)k)FBv%{J_XZict0NTv*jlj#I}RWX_u>A?+8JU%Q^p22ADejq4`g)
zfs;79>z!mYLb?rqCj|UM&F|J!?t*uIg~6+lqu?XRj?1o4LFmM<dtd)xLuBFCGv_lo
zKPhlaLY&&Lqg&g`O3w)1CV%qI`|`!#A9~68L%#p}o6yeqy4Cf)h**tRNv&<9|IUB2
zoYoAM$MZ48$4R?&=?`(?knxyscl{RgiQu^uv9-fu!FM{YH}3T%_=acCGkP!+48F##
z(|wNcKE)lk=Hq6t^ptGJY_9@4#8mFvqAH}c4_hRoHw4TKsffb;Hef64>;Dc#q~@tx
zY?%b!8TY}f?`#B5eP7<T79ofxj$hmybtJ3Z&8xpo&WDHBtDf4}23EjrnKq*ua8-^(
zO$@z7&bhO$eDuEz*7Ag<mR3B71a|%3F+2+d=MyV+;FB^s;X@`9%#Ne|bF?6Hzp0yL
zIt2E!`|A>K(|fY4?1V5qe{-*oJ@}K)|Ce!E8{EFbj;7J%{NT?#?K}Pqcs=J{#YeNj
zYyMs4c7i=xoMSw(60Db`8|@F^U{TZgr$b8+Ss?vx&xC2<4%ZtUY#EN7hr{JlpX7tR
zY}$R9X>6o^e!V+d>;S@Tr-yVDy+;J=-V?VfDN-JT-wiD|b{)M^0ka{r`29II03yR_
zY1W$;AT8<Yp$CS)z}UwWDQ<cMj)?d;{wRrWlkN^`U9lX(=F=4<fP-^<nPZyC4TL{#
zPj0Ph2Y*teoOSDS9K7XTBKppT-!J$c3Rw>J{Jlv_?8_h^Z0@Sg_z2Eg@5wb`M<6uK
z^ORn{0o)B5{d2|cktY3iU{C+~lN^+hHR`98vWy(UyNJt-`vOGlXFZSqkq4%#U0=Cc
zGnkq6Qa^@<lKAyG+|WK6BHM*0ex$d;D^>O4!^H#t?J3Q0_Hm>Bttj#Lp_H4DU{m*8
z4<EMyhveBzfdO?;>G#1v@=ssiPyas~CHe*s9PH!a%4WK_F<iI%IJ3R}K2_N2%yl+!
UXFLC9AlCbT0F#CzoX9Hx03a#hxBvhE

literal 0
HcmV?d00001

diff --git a/inst/split_by/write_timings.rds b/inst/split_by/write_timings.rds
new file mode 100644
index 0000000000000000000000000000000000000000..4166fe667c5ac4424e5ea9c4050407ce59d9efd6
GIT binary patch
literal 7867
zcmV;s9z@|EiwFP!000001BJPHJXPHnKYq<agOcVb&82!GRFd8_NhL)JWeAlfM2QU7
zov(SGl4K@RgOpNIt|4VAM3W{ohX~<&9#7}@JNbTIukRnn*?XV8_gZ`H_j<3juD#MU
z8cl>IN)x7uh!IVi*lwduOIHzN6Nrv5@g74wtl>#CniTN}{dvUxJpYbK5Rc@ax7eTW
zqW`TQ?u-8UCh}*V$e(eEKkG&Q%n_r^A@`G@j7$BQM?U|KiT{}+^ye*3SuaJICq|J$
z$|Q9XrnJeuBuD5ZOwnDKvacv*Z_+*zl=D+{pxRC7&v)_vVK36gR2|4~l9V%3?IT86
zL&_DR{e5QAZq#|?H&Pa9L-O20w7>Neq1b@5m*j{!LbU%M7b3L3@0GM2DU*7yq<w@a
zW8~iAl=b8tk@}P0#3}oUkLU|g+EOEIO6pDSPqxJ<dy(^m{zspRHxk#RJQBy$^Gl78
zCrMdH+EkpP-$+c7cOXRj+mFaM6(i(2QaAFB$vIR^l6V%UjFIa|dy{fV?21#y$%piT
zkvJ!Hr=E*y8&Y>F=13o>`Wp32RJ@8&+T@*xQ1&6uO~ojAek!jBQ`S*AMVQhjaY3Gk
zJf9F{9@SRlzEu25(*DLhl}pL}sP{nHM2aGx+>`V(5=*2^az9DRd?8Akv@g|PNn4GS
zOZ9USJER^|pBAUgk)*UqIn*^&-zLvM>MuljNZqJ7r*bCM_o&z=&qL}@-iI*lZ|)(-
zsCP}`Mw0T7+%8J{+s{b<5&Iu|lQt&xp~g@uFOm00exq^#mFK89pmGRlQ!4hT_MpZP
z3EJNnBe6`{iW=|8d!oi0DtA!zqWU#87L)U+F+rTNujGg}RW3PB`VQ4EMsg>~3nbR4
z+(eF%`;U|*Mv*~{C!)0f+xMxt0hv#b=OX2ix>C<AMOjacAJqFH$Ekcr^-J;$r0(J)
z<OoyNh|&J`wUPPBNd2j?jq1OoJkkzS{7`+Bie++4iXualGKac;BsWuinj9l#lIuyk
z{V(pM+JSnv)U!}+NsSp)?xe;V@(k46lA8CCZEBuG+MgO5Njp)wi0ZGRlyB6vl9auv
z+$%);`z}eYrRq)P_>u9K%KgG4_8@Ud;$UQcH<AyixFO|G^IFLfZ7Ro)%-5(fgX;g(
z9G=Qu)b~i))O(@6sqvSZ|4@C5nlDR?kTH_isWlCWSJECNo<#p2b4T(piDyxYT=E{M
zzDL!YI-i>3QZYxKe`J0xMv+V92}+C+{ZFpwe^@zYd)oGx6VVzstihjA2dk6L7RrD1
zRQ~F%_^YS-S5NJ)p8B7j#D9CL{nb?^yORIOSh;+Jeqz?gtei;o{y(J;T3I{WpCnp?
z#54BaG8dlKzF~%vw^8hSV%&l4b)c`mTNId@24<*Ne3(TXSSJqGZ}*Y`>mNstqqZ4%
z@QwSZJL(Lg9wf$n|6z}-nqs^_&NL)D?kSIHpNH5nY@1%s=LosqmzXuB7Ga;Z(r({5
zfM9KY!vMnt=T}=lb&1eGxZ?Evo0g~|ULY~#TDBhCS(OF7wpWnj`O2Hws{poRVro;;
z0DSD$MP_7<M?|CI!%YQMs4(&nnQ~JVd|kP8QHM16wrm(9o#Bb#0t;hB=Msd?6}s|1
zrVp8ie~sO8;uW$)x;vGgFF?3y{_*i=4H16rNQJA~Y50V_xBen?0Nm_*4x<~5QFeGj
z_=I20;9U$9n^mI>?z4UeJB2iGo{bBe!I%s_&FSqvv#sE0bT<8Z<%aM{ZIRaI&R~11
zN{B3?BiwS+LSYX+f_OtGmrW}{X5{&LQ@Jg;q}XNeHp&8J2{F}Q?ia#W>MXA6yMQy=
zO4xOeD1256g%wQ_1K)(@kS19MUc2z*TGn}#OQ@-hHz@&oXJq%~gk(e=5j4zHpAR3y
zqmS7uE`wcia<q6H9TDNn=x1*o0Oxdw?{Usz?X_5J1##wBINOIw|g9q@c(#xhNW
zBz-d4cP15~hvv`LnsNl8o1=4Dn$Ciw&~ZlMObo7S4eimoq>jLz)_r=+E8xX{`ee3r
zJNR`C_X?9`;d3?XNUZT_@V6vx9Xov+eC-yP%v0NdK!%={$Aq1zNHLuJYHB^0yFIF9
zT#g~){Ql)xo4Vm!IGxoT>jduCmWm>#EFLOmxmKr~hVR?@J^gl3;F;Xh2uTzJFQGHp
zGh;D)mfg;|n!gLadRkv9XFUb8*zX)`w=KB&eb<(Eg~OlrkMiYRU0_?r?JoMA2fsR%
zy!VT4!6$Ueo}7<b@CnX~9JS>&m`?{vFZIWQd8Oj4zR@pacDE$7cK?LWv{yfw$(r!j
z)EG1Hjg6p}nVv`2eMDH|+0_^J4<Td6t|vhsj>G2>=lW#N1VoNKeBftP93sS9Bl!x7
zh}3;6ku&Qfc;8-2>=|uL=-r-wS0Mn5N{h3mAC-~88{mIyJPMA{#u|FiHIz?nk`-hs
zf}<`S`+UP{_{b}WI6NwYUwB8y^`llO&0BSX7H|&S(@%90G-=>^T2xmb@&iLpg@BPK
zGA>;;kXlj&mRn-ftQqz2O@VmzlW3I2RX#|PvOs#@hE3rzr3l`nqUKoh1m|}SbY9;`
z=*_Q*OwbKSSkapvrHu;_ylnQDGi!&yKiut8&szz9=}~iMIoyO_XhoY@oFDifl!LZv
zY9VBP+o2;TZ4tJ}(@&Ir5y1mU-)rcJuvZZ;PwTiNxnlZ7scLIni+`>yJQN7uG3gKI
zEqRTQrrtr?RxfZRr3>z4?Eq`(72<ejP&w&J)Sw;Vf3^8`v!87NZ`>8ztY^jW>62T2
zV&7A6dvix^%(g_<Z@<+qb!UR%HBM-UuL-y<dj<8frKnhQbgNi?H8Q>wo5VGgg5_R1
zW@h3F2$l;b86@oo$0RPY`sf|7r{p`*<oAK=?vUGY_A0zq6^Eyk)S}F!aD()ZO0d2=
zxkqP*Ak8ex<Z87CIN#&VR=;sWVBWQrC3Z^4F!oW9S9Ap{bJMie(Gw8icWJ__+^68J
zzw$1?=Oh?ceVPR}Pryknz0-V>0l%O3xr75CGwHIH)}rO`m#_0kQ5a3&W`q8ij4^m{
zT}$SjYX~^at?ayf3HUB4{-QQp5zN&WKI|~6MTwr5wzbOzc!Vj(Xj^Q8_r0ge%^NIG
zk}{v85E4T0?3<V?aofRroy9hB$OFS{8oerR75s7@E^It#4(4E3WWKf@*e8yR{~R2E
ztcDg<UGYU=PXEY#8;}d{#=(;j-8+$*{M`0{>0z)#Hx}KDo`5pDpIUM&OW;3SYf}C5
zY2c}Ar-XXU0$)}r^}^>EFy?lc8$UXR;OXntU*0rFAa~(RLxwpxv&I@ZI_?9rZNWfE
zs65iwh}x*`8$w!3USEa!G&~e##j1MxgY`-(BQa|#LayjMw3#dfrjb^}{ofnFus%|w
zCoO~2FN5a%*#r+?IdW=KT^|^Q&DoD6*hp0#*QVn<72Z|HCxj`Vg<#3+^M1#z;Qd>!
z<H%<Tq;5K2Aic{DjI2}7A+ZM;wGr_HYZim^kH_4&4Sfi?J$-EeO%m*tmwq<zWD%Yg
zpH@5l9(d8SBd)4#1ApV~dat@}aMQ*m?bq%@sE}{xS^srlb{s!4Nnr;v9~&pS-Twr3
zMw|ho$`Api{zubFxL|xe^P$GH226#}ofBiXfU`bAtGasu!cHmoG-!rEkhNMl?b{iI
zlok#8T<b&0z9E_A3Yka?YOycB?LzP#PkDa3E5iChhW&xLLuq5SMIX`)4@c>*8$wBx
zs)(g{4cPuGe^q!#gX{EmVQ<%VgzS6VpZop;QWR+wmZL<#v;6XThWv5R6^aAjSY8G9
zo=(qY_xIq>%`6SLK=4hNg=UYC5t#m4-+HubfD?awO0D?|aK~jRw?2yo=Y`)XEeB6L
zsLF7BX_p07f264EC^kHO&V8CP-k#vE%l?`3{86GEUjFrAKM^N1%U&+N5B6j2UAZa0
z;gcVrp%LH+KepSX)<jvb7y^5Psa+5#hAQtKodCg_oDY`MD^R*ONGbu!pnu*oU%XF%
zu={ZW_1p*q^d*$+ix2_3uxTYh1YkX`d3HHJ4neM8?+}iRAff2kRk>TiteCUnH>V1W
zTOWq~hsV&)<;N5DUZywAV2lOA-3l%?>@I}g8fI;y7Y`Y!bJOPyMnZ7iS2)Dm1($x*
zd@*f#4EDXbr;i<P1@GauVuO>CV4YJr1bQD4uZ3Jb&ai`zxr#q+*&GBuSY=XHt^!Vf
zTSoaY0r-n)Vh?O*f-SFb*?ZDwq%YBNzk6aG1pQ_6oR?N1^n+=T&wLB;=rcuM<n01q
z=0}bGnn&<EAzUHnn+Bf^+`jk?Iq;ePXwH)#=E(Wz)YP!k7wL)a@79l+0`6k%!L;#9
z;4>w2^-#SG=$&r0(cy;?(mv%fK?F!qt#;Rr9t7jacP>}g9IRd4p&Fu+5J;ch73t0g
z{aj9g(ATfvPS8)9&5r`RLG_s+*ATpeSGdKrdtm(@+gW$94$SgVeSseN;8b%ih<%&~
zUdh_k5q(TVI(3O%javxrbyKC+d(R=Y-Dv0dBYt3-Z@KL3eFfo<Z{1c3dj?;VypD}a
zOd-gVD=-ydfWP_rfq~!Cz<TP!K6!2-_#&438+(<&n|XbOu2MUKTCHg>x9tQgN8FIM
zcOn8V1YR<%*G9yxk1h$1A3+e)_K#Sq6hdRW1Gicc=Mz}mSfbPjp9;y+smqeVoV;(I
zo~8{Lnhmr^Y!ig0N5#HN%>#W@qMBs)7Ni+JUscnX4Bop`)*P8?&@JyeybL@J?le($
z{GFW;=-W?eDmFowu=Eu(WiO<yGJ5chcN6~Zem6{a2q7RnA!m;x4ZJq_-5+irfbXvH
zPW^5cC}p#YX+_`Rl}$erF@dn#2P<EL#kU}kXyf=^c#e{v&nt$GuK-U>wMG7RI+*vd
z>H_l)82oFK*SFh%oAsOX_*MZp52afB=er{<e&Wgt>7L;BtT*qoG6Q4Jvi&XlQo;S$
z<6SB@fXK^+E^+5&ko>W7Mc!%^B=>EX-uPq|xczCa^q&N;cjUfnlB$Gva-hx`y)XpP
zYjwjO=z^}>m2k7J5)7e-hl)A9;HrI=%r{R#>Dwt!O*JGDI?q^e&eR4h6^n%|CJ($U
zjg8WkqY-*)PSeTMLioI$@VRnRI0TH_vbqhgLBHC&jT<Ecw&YQp^Q(N38of?S#Nja*
zAB_pRB=SAWo;io!O$292tG2(LG`tmegx;2xK$-mb+ST%V!EWBJoo{*yEY&2r)~~@3
z?4B^mvy_8$`Om|7ysvA^!^#dn@M;}?{~B{2T$f9}cDt7$ZE2bO)QM-|`|Wv=TXrWn
zpGRkZv?g#`w_e5WRTL37qXdbCukm2%#vAsVuY>*W8z(+07;I@>)om+V;U%_bQf<aQ
z@Os03yt}4{<hY=`xvS5Cv3r~Gj>xaz*v*?p+qVfkjoArnhI|P>T4Qt}j1AVurqhP&
zo#D~4@y(+mS$KAD5DwUJ6Tw^~8`-LHV6DhGm~6ns1I6Igiu^kW+cWI|5wIcnWw+?d
zY9nxd7@7t5j0Mx|bxTjfTJSQ;=#4_xAUNYvP;fRFT(2$K+U+?AE!=5Sx~LoJvkj&w
zi1dJ)xqI&2+IG+%-cnMvQU<5vOThZMIs}f&tF29Sz_ZylAo=7L(kf33=Vf8e8rm}!
z3xic8Q@=C!5xDBua`3DL*qd!%NSSt`)TwZBTT4BWXDv>%9ZSTMQ-$xmf35=CB7#VV
zbl~GCR;gpX3u%sbpO{*d!TXoUjf=78!I8_&HnbH6TZwsNy3K2(g`6MGgVL%Xe_{J7
zq@3Hn$fYa*p@ypQ@m-Bzn+rYbs&fZ7Pl;cj)q~XC>|b3@qTo)H436vFjc|*hN;Cfw
z@DC*n=gYqLMSmwP{sKRNX^PZ@WcZG*S#|r>cOqZQy>v}~CrZ*qxA~So0`u~F{VtOx
zu&jflZ!O>>z)s`26#WV~Hs?;%yEuS7;Z4EoHS6F_)9q<KVF1rvW{ajX)`I;@&41$j
zXJFR4#OHnS2XDf(;dO%f45rHzTkwjKG&OWOkltV`bvM=;-hl>s*Trvu|D#t}E=wJL
zn=+POpPmawiK}dLQUiQD4feihO$E>Q(Rw|hOz`FOl^bL)BHc4xsB`9KFxnD6MvQWR
z53^iaZOlTH^f)CI7Dyv~XO_YG_bOl+*R8bqF&$|W^R*Y=)da7GIQ@h$qONQBen_kw
zVG9ilRM$U&K-;Qt=dx0SI!C@&++&B($@{alj%`Dls}k>1<uPz3HEE_AXo8WueS@*%
zeE8$usCFHFq{ZB|oE%w+umYJ=hIT}~J5Wocb4o<Lx_oZ8NdSE0Z1Ri)7~pL&>Y3e=
z308|t{((kKu)B8GpG{Z=&f$)+Gd-Na>9(|8F6NDtrADKogVn+4el(o#u#?NSz7tgh
zy(#9Vy<!id>|d~B{X!Ax+5BR~HgoVbuVi^=eS)9rt5QpM9e6F3x&LeYPjEJ8H8=0)
z6MVRZ2)lYnopkW?YS%b;XPdu$I`A00wFNmt>X`@`(mfYGV;@2<a?|R!d%=HJzSi8q
zLZsLW&&ZTiCV1fPRyYv6DnG8orq>Jn9;X^!*K(Xw`B=R+CLD~dD+d@ZTHqdbY<(6*
z+;{YP+lOjh1b&j`*7Phu=*%a_*TuOby`;JDRRs;6=fhodrf?DZC_boXMLN<HJ7q6l
zJqA|Y6N|}(e()`5HEfk=fk)%=4M**o;W=Pt=3qAyAv)D-UtCrML(U`r=$acuelfL9
z*rAK4BmHg77iNI3bhJ(F<ty;DeyK{$ki?~a8{gRyAqcXx{USd23+LW-GN$#^!8fq^
zNYYIrA6c5}e3+Y#@?Y6c9>!IH$B&Llk&;76*M!NL559wO!hMD%uK{IScZlxsAo$p2
ziTWq~C~(X~WUA;yyjdO9=3Gug@;JXUvl0ou(TF_qkM4bh<P*2JMH?yA=TC?hnSc`u
z`}qs`U_7e2@7P4#_tWfCt_xe>y{S*HN?8eEKKk1Fp6y7NEciDe2XFDLyKi%~2zzK)
zy)H=ucZOfod#!8W%p5JdV9O7L?euzia@<|`(8{+r%sGtGu{4Wnz2)G&x_7B2gQ#~R
zT6FANr-H@%He64YO;mq(VW%C~g9$`{+fU$WfrWi7QAad7^>r;^qg20hXh(e`1oCQg
zI~G(UoR!GgmUj!h-+SNPIQs&8ja>mDPfZae&uaHx<pxen{X;_oDU_PUZ;+Y8N4Zhx
z8NO#U_@1AMFl7urecU0jvPdv*E*k#b;!@+T9UG)O5b*ktgL?TrF!Q3l4qVTM&+A5E
zjbm>KeoVSnZCnD@)JP%2!b#vWk~5ZR8iJppC*h!80nYp5Q};&1fUW#nj=6!zb8Y1&
zt#ti}$fEJzmM6U>>ZH?Se)>t_($ja_ZQohql8obU-pJd#^W@>Wg$UTg|1`__6Zkse
z3jN0`;5XVcuya8MQFjaKmWRoM-ISO=_UA#c)-E<bV`c_l-Bm<7E{4FDkKWhNgAqsz
zS+Td{H3Dy5VM>ph0q@7tnN>@OxNh`OMC;5OMARv#)JF}1SJ!3#GH)q3(Ru%9F3m>J
zc9CKGR;cB#l*}k6^8b^z3lhY@tA3l9)+z`8G~H*tVjA$z{ng%`z79SrIWumYKZ-!v
zz{ze7QpgINVSQA}8vN^9H(dNqM{dQ~$+_KG$m!9_HY@W4OV7<*ZJh_A7i%*Px-3O_
zX~jWVW)K2(&pJt}nZQTs+HfAmx|lk+wW0}(el?8;J)gmBa*XRWBJA-!T~F!!T9n_=
z5bobk)GcYNg5OMB0j9mTPPalm*u8^o{+&)J>tal?npuyU?ZwX<#7842cFcP5_m2>s
z>i<*s^D~5`oQU+_@d`BoC%neiH6n1~V_`)d2ly7XcoBpIp5tA*?5hSu7VC4j1^&im
z;h`02iG=;;Waru6_z9-XP`Rs_Is*5N6F463MP<rch9<ihJ{|G8rla12OD`I(|5>V4
z+8wgQJ_jOyIGtLLfYnnK`GqX_MtuM0*XMDlP?Y`9(N=`aS<n9^6!^9rRd<sKM!=Ha
zi3%Pus2uC$y~_0g!nspL9nz$bp|)&_=ra}qf`YoU99j`0p_gyGUIadY|J5t3FT2*L
zt9OBa*gZzIf;jh?Pq!~uMu1W8DQsh|4{odC-czGzAghKsXOFx*(%uEgnHcUx*qVgZ
zA1<9kX+(>W#>X2-TeFlSq;3Vik-p>RBfUu9*O=s4^%1Er7j2#7Sq5fO%|VlzrQjI{
zpIA8&gk4wKcV@i-qx6oHno1ke+e=5Q-b+X6)FT_O1mA!lX{z^uqyK=_d0A#><~gKq
z*gH$hx*Y<~qiz=3IpAc7oO_rk1fP{U1T4%^7Ex{bbX6u%ufLH9esB{$heC$yeSuZ%
zCeuAO@P779Bw$e=xJ5O?`4D4&#|4whesCm8w*~h5f_ucPH_v)51d>8~b@J6va^ka2
zwZ$UPEoLe%by0!0b<bJ%Gm2oo-7K7_Yfr>egQo}1v%t7{ZcYE*8n6rw+l$6Tqa>p7
z>Y?~vgpSiA)R~G9$BBon+82S{${z4(DnOWrfn&)!4X|gO=dDTH08aifB3$i4N!vd^
zt_LfkMDv1T4C^dX=N?jdWw0Mi(TUpOm27a^mp}hqa+9dHG{^7n-i?yoN&e;Dcfl=>
z^GyD=9qbZYq8!`=Z~1dQSz8x?;Tx*-eEthOa2YlHdpEb)r}};^Q6H*$%(zEuMasNG
zJ#O8=gF?+SQ`Uvx{$yw83uE=cO8m*K?>&v;D21)u!YTwgBrR7$H|SGicUk^?2aa@e
z%<Frn5$s%Lb!g5!gsg}U;WxcTah80>j*9gNmAZY%<kE3?%bkBxdVD<SHg&_l+uRp!
zrm1t1!EGqH&23f!OEXNrejp9Zh6W;Sv4YoZ*=vmj+DP&$U8xeQ3Z_-R{hJS;zz{oj
zUE5&{JVzZ5aelKH#jhQWnzpqeXs@j7&NcfG#LGRitLG=!w22RgHame^yD<OVOHH^v
zwBIp?wH*v4i|-p<i2B~^+AVV#I+C93?R==U2Mp!vNsrIOf$!V-xc!9}JRWY^qcU*-
zOit2pJ?j2QZKr`zCX%9cR-Fnl1hcMUPb#Mv9xgZc>j&l{d5M%<;IjdoZF`^j$WRRK
zof~&+bzg_~mHE9l!`^_McMR1TY4BElbGCb05{kcko;~V56CSLrZ>5Puotz#t&2^CA
z{bMVMYp_9ao5N@Sea;998yYLkx&!(hi{X4L*oosk$6E$0jT5&DcGsdfz%0eUN*msz
z6Ovom*TGuTRF-(s671NQyEPvC0!!_8t^TC3V9lyoCb-=O`j?<3M8E*c0;6~pTEsni
z#G9n90cUb>(n`j5(C5-WJF5tRHA&_6KY~aUfBTTOFXK17(u>AiOkV+dG(GRoZa2`U
z)yNN(=)!x{A)5uICGcK3IFHp;3P$S8w~NJzI(h5e<13_ez({e_FBa_tJ#+I*14b1X
z?U_3~?-TV3=+=|x`{RD9-qWg#$4E9Wt^1kRhcuVPn-2<J!dvuw<;R7CNRG@9%xk{@
zI&=G?Xk$-=JZI^Y_Gf{y;{5nk>O`J!OM3YCBJZHhZdd2BiMmqrsgsR6JVr}|)GTxd
zBl6k7RqNNn>sg-%0Z4>|nl}dRIDz8+0}dRwXW-dsuk7?QLQL5TOY!Qb@W1cTEGzXD
zJX0Uf-sU+-owwXL(d_|3%k_qNgZ^}h=fb(8z>awOAZ?92JZ^7KEFRB8RPewjnXFEf
z&0HGKj7ovG%Fu!3r;8Bc_EbzFr4X#Q1^RCJ#$dfyxSQ6~4DXDt-&Qg*V9@POiY_uG
z^5avnRoO#eWq6(stA2qH+TCo|xjBe@mHf`~&=znseJn=_hk|p8#xpqA2fEMx>hK(b
zk0LZy6vk!~`EQG->dlQ{=nXP<dJ=V4>{p^(af6$RrAo8H1+d)GIj2+SqxfgayG~Ug
zb8XD&2#Z|^6aQTp<!(*ni_XS9`(wb#O(e>dU0|hHJi0Vh0j##|W?$D@f?hJ<e&v}s
zf-~s-pW?QFt|vKhzr`NVUE@z&_+ClW>xI2;8AP5V*L>1@AQtS@?7cI3cwkh-Cyu$E
zfTWuTUu1r50V82!PRB(jBF{T_>wq?azb*bUZI=wdVICb^Bk~5UUH*1~A41^0+5PRr
zG1*{rBsNH|bwu($t%`ZO;=y~cNz1=!3w$Nj%Zt)#NB&#!gw=6d^52TWe+ETu&p4bU
z+s<~!ZAUDUI%)5+j9fV8--UGI&(P3s_x~me4~^Jx>~TBmllBK}Pgoy5ZgukLpHXQW
ZD`%@^2Txl4D}Xlme*iH+Zgj{i001#Fv~B<Z

literal 0
HcmV?d00001

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
new file mode 100644
index 0000000..100a6fb
--- /dev/null
+++ b/vignettes/split_by.Rmd
@@ -0,0 +1,319 @@
+---
+title: "Storing Large Dataframes"
+author: "Thierry Onkelinx"
+output: 
+  rmarkdown::html_vignette:
+        fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{Storing Large Dataframes}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+  %\VignetteDepends{git2r}
+  %\VignetteDepends{microbenchmark}
+  %\VignetteDepends{ggplot2}
+---
+
+```{r setup, include = FALSE}
+library(knitr)
+opts_chunk$set(
+  fig.height = 4, fig.width = 6,
+  collapse = TRUE,
+  comment = "#>"
+)
+library(ggplot2)
+inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517",
+                  "#843860", "#C04384", "#C2C444", "#685457")
+theme_inbo <- function(base_size = 12, base_family = "") {
+  rect_bg <- "white"  # fill of the generic `rect` element
+  legend_bg <- "white"
+  panel_bg <- "#F3F3F3"  # shared by the panel and the legend keys
+  panel_grid <- "white"
+  plot_bg <- "white"
+  half_line <- base_size / 2  # spacing unit reused by the margins below
+  theme(
+    line = element_line(colour = "black", size = 0.5, linetype = 1,
+                        lineend = "butt"),
+    rect = element_rect(fill = rect_bg, colour = "black", size = 0.5,
+                        linetype = 1),
+    text = element_text(family = base_family, face = "plain",
+                        colour = "#843860", size = base_size, hjust = 0.5,
+                        vjust = 0.5, angle = 0, lineheight = 0.9,
+                        margin = margin(), debug = FALSE),
+    axis.line = element_blank(),
+    axis.line.x = element_blank(),
+    axis.line.y = element_blank(),
+    axis.text = element_text(size = rel(0.8)),
+    axis.text.x = element_text(margin = margin(t = 0.8 * half_line / 2),
+                               vjust = 1),
+    axis.text.x.top = NULL,
+    axis.text.y = element_text(margin = margin(r = 0.8 * half_line / 2),
+                               hjust = 1),
+    axis.text.y.right = NULL,
+    axis.ticks = element_line(),
+    axis.ticks.length = unit(0.15, "cm"),
+    axis.title = element_text(colour = "black"),
+    axis.title.x = element_text(
+      margin = margin(t = 0.8 * half_line, b = 0.8 * half_line / 2)
+    ),
+    axis.title.x.top = NULL,
+    axis.title.y = element_text(
+      margin = margin(r = 0.8 * half_line, l = 0.8 * half_line / 2),
+      angle = 90
+    ),
+    axis.title.y.right = NULL,
+    legend.background = element_rect(colour = NA, fill = legend_bg),
+    legend.key = element_rect(fill = panel_bg, colour = NA),
+    legend.key.size = unit(1.2, "lines"),
+    legend.key.height = NULL,
+    legend.key.width = NULL,
+    legend.margin = NULL,
+    legend.spacing = unit(0.2, "cm"),
+    legend.spacing.x = NULL,
+    legend.spacing.y = NULL,
+    legend.text = element_text(size = rel(0.8)),
+    legend.text.align = NULL,
+    legend.title = element_text(size = rel(0.8), face = "bold", hjust = 0,
+                                colour = "black"),
+    legend.title.align = NULL,
+    legend.position = "right",
+    legend.direction = NULL,
+    legend.justification = "center",
+    legend.box = NULL,
+    legend.box.margin = margin(t = half_line, r = half_line, b = half_line,
+                               l = half_line),
+    legend.box.background = element_rect(colour = NA, fill = legend_bg),
+    legend.box.spacing = unit(0.2, "cm"),
+    panel.background = element_rect(fill = panel_bg, colour = NA),
+    panel.border = element_blank(),
+    panel.grid = element_line(colour = panel_grid),
+    panel.grid.minor = element_line(colour = panel_grid, size = 0.25),
+    panel.spacing = unit(half_line, "pt"),
+    panel.spacing.x = NULL,
+    panel.spacing.y = NULL,
+    panel.ontop = FALSE,
+    strip.background = element_rect(fill = "#8E9DA7", colour = NA),
+    strip.text = element_text(size = rel(0.8), colour = "#F3F3F3"),
+    strip.text.x = element_text(margin = margin(t = half_line, b = half_line)),
+    strip.text.y = element_text(margin = margin(r = half_line, l = half_line),
+                                angle = -90),
+    strip.switch.pad.grid = unit(0.1, "cm"),
+    strip.switch.pad.wrap = unit(0.1, "cm"),
+    strip.placement = "outside",
+    plot.background = element_rect(colour = NA, fill = plot_bg),
+    plot.title = element_text(size = rel(1.2),
+                              margin = margin(0, 0, half_line, 0)),
+    plot.subtitle = element_text(size = rel(1),
+                                 margin = margin(0, 0, half_line, 0)),
+    plot.caption = element_text(size = rel(0.6),
+                                margin = margin(0, 0, half_line, 0)),
+    plot.margin = margin(t = half_line, r = half_line, b = half_line,
+                         l = half_line),
+    plot.tag = element_text(size = rel(1.2), hjust = 0.5, vjust = 0.5),
+    plot.tag.position = "topleft",
+    complete = TRUE  # passed to theme(); every element is set explicitly above
+  )
+}
+theme_set(theme_inbo())
+update_geom_defaults("line", list(colour = "#356196"))
+update_geom_defaults("hline", list(colour = "#356196"))
+update_geom_defaults("boxplot", list(colour = "#356196"))
+update_geom_defaults("smooth", list(colour = "#356196"))
+```
+
+## Introduction
+
+Sometimes, a large dataframe has one or more variables with a small number of unique combinations.
+E.g. a dataframe with factor variables.
+
+In such a case we can use the `split_by` argument of `write_vc()`.
+This will store the large dataframe over a set of tab separated files.
+One file for every combination of the variables defined by `split_by`.
+Every partial data file holds one combination of `split_by`.
+We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash.
+This hash becomes the base name of the partial data files.
+The combination of the hash in the `index.tsv` and the base name of the partial data files makes the information of `split_by` in the partial data file redundant.
+We remove the `split_by` variables from the partial data files, reducing their size.
+
+## When to Split the Dataframe
+
+Let's set the following variables:
+
+-   $s$: the average number of bytes to store a single line of the `split_by` variables.
+
+-   $r$: the average number of bytes to store a single line of the remaining variables.
+
+-   $h_s$: the number of bytes to store the header of the `split_by` variables.
+
+-   $h_r$: the number of bytes to store the header of the remaining variables.
+
+-   $N$: the number of rows in the dataframe.
+
+-   $N_s$: the number of unique combinations of the `split_by` variables.
+
+Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation.
+The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$.
+The `+ 1` originates from the tab character to separate the `split_by` variables from the remaining variables.
+
+Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables.
+The index file requires $h_s$ bytes for the header and $N_s s$ bytes for the data.
+The headers of the partial data files require $N_s h_r$ bytes ($N_s$ files and $h_r$ bytes per file).
+The data in the partial data files require $N r$ bytes.
+The total number of bytes is $T_s = h_s + N_s s + N_s h_r + N r$.
+
+We can look at the ratio of $T_s$ over $T_0$.
+
+$$\frac{T_s}{T_0} = \frac{h_s + N_s s + N_s h_r + N r}{h_s + h_r + 1 + N (s + r + 1)}$$
+
+Let's simplify the equation by assuming that we need an equal number of characters for the headers and the data ($h_s = s$ and $h_r = r$).
+
+$$\frac{T_s}{T_0} = \frac{s + N_s s + N_s r + N r}{s + r + 1 + N (s + r + 1)}$$
+
+$$\frac{T_s}{T_0} = \frac{s + N_s s + N_s r + N r}{s + r + 1 + N s + N r + N}$$
+
+Let's assume that $s = a r$ with $0 < a$ and $N_s = b N$ with $0 < b < 1$.
+
+$$\frac{T_s}{T_0} = \frac{a r + N a b r + N b r + N r}{a r + r + 1 + N a r + N r + N}$$
+
+$$\frac{T_s}{T_0} = \frac{(a + N a b + N b + N) r}{(N + 1) (a r + r + 1)}$$
+
+$$\frac{T_s}{T_0} = \frac{a + N a b + N b + N}{(N + 1) (a + 1 + 1 / r)}$$ $$\frac{T_s}{T_0} = \frac{a + (a b + b + 1) N }{(N + 1) (a + 1 + 1 / r)}$$
+
+When $N$ is large, we can state that $a \lll N$ and $N / (N + 1) \approx 1$.
+
+$$\frac{T_s}{T_0} \approx \frac{a b + b + 1}{a + 1 + 1 / r}$$
+
+```{r ratio, fig.cap = "Storage space required using `split_by` relative to storing a single file.", echo = FALSE}
+combinations <- expand.grid(
+  a = c(0.25, 0.5, 1, 2, 4),
+  b = seq(0, 1, length = 41),
+  r = c(10, 100, 1000)
+)
+combinations$ratio <- with(
+  combinations,
+  (a * b + b + 1) / (a + 1 + 1 / r)
+)
+ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) +
+  geom_hline(yintercept = 1, linetype = 2) +
+  geom_line() +
+  facet_wrap(~ paste("r =", r)) +
+  scale_x_continuous(
+    "b = N_s / N",
+    labels = function(x) {
+      paste0(100 * x, "%")
+    }
+  ) +
+  scale_y_continuous(
+    "Relative amount of disc space",
+    labels = function(x) {
+      paste0(100 * x, "%")
+    }
+  ) +
+  scale_colour_manual(
+    "a = s / r",
+    values = inbo_colours,
+    labels = c("1/4", "1/2", "1", "2", "4")
+  )
+```
+
+The figure illustrates that using `split_by` is more efficient when the number of unique combinations ($N_s$) of the `split_by` variables is much smaller than the number of rows in the dataframe ($N$).
+The efficiency also increases when the storage for a single combination of `split_by` variables ($s$) is larger than the storage needed for a single line of the remaining variables ($r$).
+The storage needed for a single line of the remaining variables ($r$) hardly influences the efficiency: it only enters the ratio through the small $1 / r$ term.
+
+## Benchmarking
+
+```{r load_data, echo = FALSE}
+airbag <- readRDS(
+  system.file("efficiency", "airbag.rds", package = "git2rdata")
+)
+```
+
+```{r set_tmp_dir}
+library(git2rdata)
+root <- tempfile("git2rdata-split-by")
+dir.create(root)
+```
+
+```{r get_write_timings, eval = system.file("split_by", "write_timings.rds", package = "git2rdata") == ""}
+library(microbenchmark)
+mb <- microbenchmark(
+  part_1 = write_vc(airbag, "part_1", root, sorting = "X"),
+  part_2 = write_vc(airbag, "part_2", root, sorting = "X", split_by = "airbag"),
+  part_3 = write_vc(airbag, "part_3", root, sorting = "X", split_by = "abcat"),
+  part_4 = write_vc(
+    airbag, "part_4", root, sorting = "X", split_by = c("airbag", "sex")
+  ),
+  part_5 = write_vc(airbag, "part_5", root, sorting = "X", split_by = "dvcat"),
+  part_6 = write_vc(
+    airbag, "part_6", root, sorting = "X", split_by = "yearacc"
+  ),
+  part_15 = write_vc(
+    airbag, "part_15", root, sorting = "X", split_by = c("dvcat", "abcat")
+  ),
+  part_45 = write_vc(
+    airbag, "part_45", root, sorting = "X", split_by = "yearVeh"
+  ),
+  part_270 = write_vc(
+    airbag, "part_270", root, sorting = "X", split_by = c("yearacc", "yearVeh")
+  )
+)
+mb$time <- mb$time / 1e6
+```
+
+```{r store_write_timings, echo = FALSE}
+if (system.file("split_by", "write_timings.rds", package = "git2rdata") == "") {
+  dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE)
+  saveRDS(mb, file.path("..", "inst", "split_by", "write_timings.rds"))
+} else {
+  mb <- readRDS(
+    system.file("split_by", "write_timings.rds", package = "git2rdata")
+  )
+}
+```
+
+Splitting the dataframe over more than one file takes more time to write the data.
+The log of the time seems to increase quadratically with the log of the number of parts.
+
+```{r plot_write_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for different number of parts."}
+mb$combinations <- as.integer(gsub("part_", "", levels(mb$expr)))[mb$expr]
+ggplot(mb, aes(x = combinations, y = time)) +
+  geom_boxplot(aes(group = combinations)) +
+  scale_x_log10("Number of parts") +
+  scale_y_log10("Time (in milliseconds)")
+```
+
+```{r get_read_timings, eval = system.file("split_by", "read_timings.rds", package = "git2rdata") == ""}
+mb_r <- microbenchmark(
+  part_1 = read_vc("part_1", root),
+  part_2 = read_vc("part_2", root),
+  part_3 = read_vc("part_3", root),
+  part_4 = read_vc("part_4", root),
+  part_5 = read_vc("part_5", root),
+  part_6 = read_vc("part_6", root),
+  part_15 = read_vc("part_15", root),
+  part_45 = read_vc("part_45", root),
+  part_270 = read_vc("part_270", root)
+)
+mb_r$time <- mb_r$time / 1e6
+```
+
+```{r store_read_timings, echo = FALSE}
+if (system.file("split_by", "read_timings.rds", package = "git2rdata") == "") {
+  saveRDS(mb_r, file.path("..", "inst", "split_by", "read_timings.rds"))
+} else {
+  mb_r <- readRDS(
+    system.file("split_by", "read_timings.rds", package = "git2rdata")
+  )
+}
+```
+
+A small number of parts does not seem to affect the read timings much.
+Above ten parts, the required time for reading seems to increase.
+The log of the time seems to increase quadratically with the log of the number of parts.
+
+```{r plot_read_timings, echo = FALSE, fig.cap = "Boxplot of the read timings for the different number of parts."}
+mb_r$combinations <- as.integer(gsub("part_", "", levels(mb_r$expr)))[mb_r$expr]
+ggplot(mb_r, aes(x = combinations, y = time)) +
+  geom_boxplot(aes(group = combinations)) +
+  scale_x_log10("Number of parts") +
+  scale_y_log10("Time (in milliseconds)")
+```

From 8d74022c49eba206a86ba052970975ffbda389db Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Fri, 18 Sep 2020 14:39:14 +0200
Subject: [PATCH 08/23] Change split_by storage

Handle the case where a file stored without split_by is replaced with a version with split_by and vice versa.
Also check changes in split_by variables.
---
 R/write_vc.R                     | 17 +++++++++++++++++
 codemeta.json                    |  2 +-
 tests/testthat/test_f_split_by.R | 27 +++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/R/write_vc.R b/R/write_vc.R
index 6c949e4..4acbdd0 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -106,6 +106,10 @@ write_vc.character <- function(
       }
     }
   }
+  assert_that(
+    unlink(file["raw_file"], recursive = TRUE) == 0,
+    msg = "Failed to remove existing files."
+  )
   if (length(split_by) == 0) {
     write.table(
       x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE,
@@ -232,6 +236,19 @@ compare_meta <- function(new, old) {
     ) -> extra
     problems <- c(problems, extra)
   }
+  new_split_by <- new[["..generic"]][["split_by"]]
+  old_split_by <- old[["..generic"]][["split_by"]]
+  if (!isTRUE(all.equal(new_split_by, old_split_by))) {
+    sprintf(
+      "- The split_by variables changed.
+    - Split_by for the new data: %s.
+    - Split_by for the old data: %s.",
+      paste(sprintf("'%s'", new_split_by), collapse = ", "),
+      paste(sprintf("'%s'", old_split_by), collapse = ", ")
+    ) -> extra
+    problems <- c(problems, extra)
+  }
+
 
   new <- new[names(new) != "..generic"]
   old <- old[names(old) != "..generic"]
diff --git a/codemeta.json b/codemeta.json
index 08f8dd0..8ad6d43 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -218,7 +218,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "614.847KB",
+  "fileSize": "616.079KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R
index 657e540..789770f 100644
--- a/tests/testthat/test_f_split_by.R
+++ b/tests/testthat/test_f_split_by.R
@@ -39,6 +39,33 @@ test_that("write_vc() handles the split_by argument", {
     check.attributes = FALSE
   )
 
+  expect_error(
+    write_vc(
+      test_data, file = "sorted", root = root, split_by = character(0)
+    ),
+    "The split_by variables changed."
+  )
+  expect_warning(
+    write_vc(
+      test_data, file = "sorted", root = root, split_by = character(0),
+      strict = FALSE
+    ),
+    "The split_by variables changed."
+  )
+  expect_error(
+    write_vc(
+      test_data, file = "sorted", root = root, split_by = "test_factor"
+    ),
+    "The split_by variables changed."
+  )
+  expect_warning(
+    write_vc(
+      test_data, file = "sorted", root = root, split_by = "test_factor",
+      strict = FALSE
+    ),
+    "The split_by variables changed."
+  )
+
   data_file <- list.files(
     file.path(root, sorted_file[1]), pattern = "[[:xdigit:]]{20}",
     full.names = TRUE

From b5a68690b745c649a9ee1e214d6e023034460bf1 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 22 Sep 2020 16:18:34 +0200
Subject: [PATCH 09/23] datahash() handles split_by objects

This simplifies the logic in write_vc() and read_vc()
---
 NAMESPACE    |  1 +
 R/datahash.R | 42 ++++++++++++++++++++++++++----------------
 R/meta.R     |  2 +-
 R/read_vc.R  |  4 +---
 R/write_vc.R | 10 ++++------
 5 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 19abdd3..061a942 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -78,6 +78,7 @@ importFrom(git2r,status)
 importFrom(git2r,workdir)
 importFrom(methods,setOldClass)
 importFrom(stats,setNames)
+importFrom(utils,file_test)
 importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,write.table)
diff --git a/R/datahash.R b/R/datahash.R
index 3c13188..0512a55 100644
--- a/R/datahash.R
+++ b/R/datahash.R
@@ -8,23 +8,12 @@
 #' @family internal
 #' @importFrom assertthat assert_that
 #' @importFrom git2r hash
+#' @importFrom utils file_test
 datahash <- function(file) {
-  chunk_size <- 1e4
-  hashes <- character(chunk_size + 1)
-  i <- 0
-  rawdata <- scan(
-    file = file, what = character(), nmax = -1, sep = "\n", quote = "",
-    skip = i * chunk_size, nlines = chunk_size, na.strings = "",
-    flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
-    blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
-    encoding = "UTF-8", skipNul = FALSE
-  )
-  while (length(rawdata)) {
-    hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
-    i <- i + 1
-    if (i  %% chunk_size == 0) {
-      hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
-    }
+  if (file_test("-f", file)) {
+    chunk_size <- 1e4
+    hashes <- character(chunk_size + 1)
+    i <- 0
     rawdata <- scan(
       file = file, what = character(), nmax = -1, sep = "\n", quote = "",
       skip = i * chunk_size, nlines = chunk_size, na.strings = "",
@@ -32,6 +21,27 @@ datahash <- function(file) {
       blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
       encoding = "UTF-8", skipNul = FALSE
     )
+    while (length(rawdata)) {
+      hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
+      i <- i + 1
+      if (i  %% chunk_size == 0) {
+        hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
+      }
+      rawdata <- scan(
+        file = file, what = character(), nmax = -1, sep = "\n", quote = "",
+        skip = i * chunk_size, nlines = chunk_size, na.strings = "",
+        flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
+        blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
+        encoding = "UTF-8", skipNul = FALSE
+      )
+    }
+  } else {
+    hashes <- sapply(
+      list.files(
+        file, pattern = "(index|[[:xdigit:]]{20}\\.tsv$)", full.names = TRUE
+      ),
+      datahash
+    )
   }
   hash(paste(hashes, collapse = ""))
 }
diff --git a/R/meta.R b/R/meta.R
index 8da3c0f..b213e1c 100644
--- a/R/meta.R
+++ b/R/meta.R
@@ -262,7 +262,7 @@ Add extra sorting variables to ensure small diffs.", sorted)
     }
     generic <- c(generic, sorting = list(sorting))
   }
-  if (length(split_by)) {
+  if (length(split_by) > 0) {
     generic <- c(generic, split_by = list(split_by))
   }
   # calculate meta for each column
diff --git a/R/read_vc.R b/R/read_vc.R
index ec050c4..4b6bdde 100644
--- a/R/read_vc.R
+++ b/R/read_vc.R
@@ -101,12 +101,10 @@ read_vc.character <- function(file, root = ".") {
           index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
           raw_data
         )
-        attr(raw_data, "hash") <- datahash(rf)
         return(list(raw_data))
       },
       vector(mode = "list", length = 1)
     )
-    dh <- sha1(vapply(raw_data, attr, character(1), "hash"))
     raw_data <- do.call(rbind, raw_data)[, col_names]
   } else {
     raw_data <- read.table(
@@ -116,8 +114,8 @@ read_vc.character <- function(file, root = ".") {
       comment.char = "",
       stringsAsFactors = FALSE, fileEncoding = "UTF-8"
     )
-    dh <- datahash(file["raw_file"])
   }
+  dh <- datahash(file["raw_file"])
 
   if (meta_data[["..generic"]][["data_hash"]] != dh) {
     meta_data[["..generic"]][["data_hash"]] <- dh
diff --git a/R/write_vc.R b/R/write_vc.R
index 4acbdd0..745187f 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -116,7 +116,6 @@ write_vc.character <- function(
       sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE,
       col.names = TRUE, fileEncoding = "UTF-8"
     )
-    data_hash <- datahash(file["raw_file"])
   } else {
     index <- unique(raw_data[split_by])
     index[["..hash"]] <- apply(index, 1, sha1)
@@ -127,7 +126,7 @@ write_vc.character <- function(
       row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
     )
     detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by]
-    data_hash <- vapply(
+    vapply(
       seq_len(nrow(index)),
       function(i) {
         matching <- vapply(
@@ -144,17 +143,16 @@ write_vc.character <- function(
           append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na,
           dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8"
         )
-        datahash(rf)
+        return(TRUE)
       },
-      character(1)
+      logical(1)
     )
-    data_hash <- sha1(data_hash)
   }
   meta_data <- attr(raw_data, "meta")
   meta_data[["..generic"]][["git2rdata"]] <- as.character(
     packageVersion("git2rdata")
   )
-  meta_data[["..generic"]][["data_hash"]] <- data_hash
+  meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"])
   write_yaml(meta_data, file["meta_file"],
              fileEncoding = "UTF-8")
 

From 44868e4e5020dc9a038481f08580cd5306209a4a Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 22 Sep 2020 18:31:35 +0200
Subject: [PATCH 10/23] Add rename_variable()

---
 .Rbuildignore                           |   2 +
 .gitignore                              |   2 +
 DESCRIPTION                             |   1 +
 NAMESPACE                               |   4 +
 NEWS.md                                 |   9 ++
 R/rename_variable.R                     | 160 +++++++++++++++++++
 man/list_data.Rd                        |   1 +
 man/prune_meta.Rd                       |   1 +
 man/read_vc.Rd                          |   1 +
 man/relabel.Rd                          |   1 +
 man/rename_variable.Rd                  |  97 ++++++++++++
 man/rm_data.Rd                          |   1 +
 man/write_vc.Rd                         |   1 +
 tests/testthat/test_g_rename_variable.R | 202 ++++++++++++++++++++++++
 14 files changed, 483 insertions(+)
 create mode 100644 R/rename_variable.R
 create mode 100644 man/rename_variable.Rd
 create mode 100644 tests/testthat/test_g_rename_variable.R

diff --git a/.Rbuildignore b/.Rbuildignore
index af21982..b9532f6 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -13,3 +13,5 @@
 ^codecov.yml$
 ^LICENSE.md$
 ^\.httr-oauth$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
index 155cda3..cde4424 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@
 inst/doc
 docs
 .httr-oauth
+doc
+Meta
diff --git a/DESCRIPTION b/DESCRIPTION
index 11ae337..015f956 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -67,5 +67,6 @@ Collate:
     'recent_commit.R'
     'reexport.R'
     'relabel.R'
+    'rename_variable.R'
     'upgrade_data.R'
     'utils.R'
diff --git a/NAMESPACE b/NAMESPACE
index 061a942..1606515 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -33,6 +33,9 @@ S3method(recent_commit,git_repository)
 S3method(relabel,data.frame)
 S3method(relabel,default)
 S3method(relabel,list)
+S3method(rename_variable,character)
+S3method(rename_variable,default)
+S3method(rename_variable,git_repository)
 S3method(rm_data,character)
 S3method(rm_data,default)
 S3method(rm_data,git_repository)
@@ -53,6 +56,7 @@ export(push)
 export(read_vc)
 export(recent_commit)
 export(relabel)
+export(rename_variable)
 export(repository)
 export(rm_data)
 export(status)
diff --git a/NEWS.md b/NEWS.md
index a454c1e..624b130 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,9 +1,18 @@
 # git2rdata 0.2.3
 
+## New features
+
 * `write_vc()` gains an optional `split_by` argument.
+  See `vignette("split_by")` for more details.
+* `rename_variable()` efficiently renames variables in a stored `git2rdata`
+  object.
+
+## Bugfixes
+
 * `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
   when both the data and metadata are missing.
 
+
 # git2rdata 0.2.2
 
 * Use the [checklist](https://inbo.github.io/checklist) package for CI.
diff --git a/R/rename_variable.R b/R/rename_variable.R
new file mode 100644
index 0000000..76f829a
--- /dev/null
+++ b/R/rename_variable.R
@@ -0,0 +1,160 @@
+#' Rename a Variable
+#'
+#' The raw data file contains a header with the variable names.
+#' The metadata list the variable names and their type.
+#' Changing a variable name and overwriting the `git2rdata` object will result
+#' in an error, because it looks like removing an existing variable and adding
+#' a new one.
+#' Overwriting the object with `strict = FALSE` potentially changes the order of
+#' the variables, leading to a large diff.
+#'
+#' This function solves this by only updating the raw data header and the
+#' metadata.
+#' @inheritParams write_vc
+#' @param change A named vector with the old names as values and the new names
+#' as names.
+#' @return A named vector of file paths, named by the data and metadata hashes.
+#' @export
+#' @examples
+#'
+#' # initialise a git repo using git2r
+#' repo_path <- tempfile("git2rdata-repo-")
+#' dir.create(repo_path)
+#' repo <- git2r::init(repo_path)
+#' git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")
+#'
+#' # Create a dataframe and store it as an optimized git2rdata object.
+#' # Note that write_vc() uses optimization by default.
+#' # Stage and commit the git2rdata object.
+#' ds <- data.frame(
+#'   a = c("a1", "a2"),
+#'   b = c("b2", "b1"),
+#'   stringsAsFactors = TRUE
+#' )
+#' junk <- write_vc(ds, "rename", repo, sorting = "b", stage = TRUE)
+#' cm <- commit(repo, "initial commit")
+#' # check that the workspace is clean
+#' status(repo)
+#'
+#' # Define change.
+#' change <- c(new_name = "a")
+#' rename_variable(file = "rename", change = change, root = repo)
+#' # check the changes
+#' read_vc("rename", repo)
+#' status(repo)
+#' cm <- commit(repo, "relabel using a list")
+#'
+#' # clean up
+#' junk <- file.remove(
+#'   rev(list.files(repo_path, full.names = TRUE, recursive = TRUE,
+#'                  include.dirs = TRUE, all.files = TRUE)),
+#'   repo_path)
+#' @family storage
+rename_variable <- function(file, change, root = ".", ...) {
+  UseMethod("rename_variable", root)
+}
+
+#' @rdname rename_variable
+#' @export
+#' @importFrom assertthat assert_that noNA
+#' @importFrom yaml read_yaml write_yaml
+#' @importFrom utils file_test
+rename_variable.character <- function(file, change, root = ".", ...) {
+  assert_that(is.character(change), noNA(change), length(change) > 0)
+  assert_that(length(names(change)) > 0, msg = "`change` must have names.")
+  assert_that(
+    length(unique(change)) == length(change),
+    length(unique(names(change))) == length(names(change)),
+    msg = "The names and values in `change` are not unique."
+  )
+  is_git2rdata(file = file, root = root, message = "error")
+  file <- clean_data_path(root = root, file = file)
+  yaml <- read_yaml(file[["meta_file"]])
+  assert_that(
+    all(change %in% names(yaml)),
+    msg = "Not every old name in `change` present in the `git2rdata` object."
+  )
+  assert_that(
+    !any(names(change) %in% names(yaml)),
+    msg = "New name in `change` present in the existing `git2rdata` object."
+  )
+  names(yaml) <- replace_vector(names(yaml), change)
+  yaml[["..generic"]][["sorting"]] <- replace_vector(
+    yaml[["..generic"]][["sorting"]], change
+  )
+  if (file_test("-f", file["raw_file"])) {
+    replace_header(file["raw_file"], change)
+  } else {
+    vapply(
+      c(
+        file.path(file["raw_file"], "index.tsv"),
+        list.files(
+          file["raw_file"], pattern = "[[:xdigit:]]{20}.tsv", full.names = TRUE
+        )
+      ),
+      replace_header, change = change, logical(1)
+    )
+    yaml[["..generic"]][["split_by"]] <- replace_vector(
+      yaml[["..generic"]][["split_by"]], change
+    )
+  }
+  yaml[["..generic"]][["hash"]] <- metadata_hash(yaml)
+  yaml[["..generic"]][["data_hash"]] <- datahash(file["raw_file"])
+  write_yaml(yaml, file["meta_file"], fileEncoding = "UTF-8")
+
+  hashes <- remove_root(file = file, root = root)
+  names(hashes) <-
+    c(
+      yaml[["..generic"]][["data_hash"]],
+      yaml[["..generic"]][["hash"]]
+    )
+
+  return(hashes)
+}
+
+replace_vector <- function(x, change) {
+  if (!any(change %in% x)) {
+    return(x)
+  }
+  for (i in seq_along(change)) {
+    x[x == change[i]] <- names(change[i])
+  }
+  return(x)
+}
+
+replace_header <- function(x, change) {
+  raw_data <- readLines(x)
+  header <- strsplit(raw_data[1], "\t")[[1]]
+  for (i in seq_along(change)) {
+    header[header == change[i]] <- names(change)[i]
+  }
+  raw_data[1] <- paste0(header, collapse = "\t")
+  writeLines(text = raw_data, con = x)
+  return(TRUE)
+}
+
+#' @rdname rename_variable
+#' @export
+rename_variable.default <- function(file, change, root, ...) {
+  stop("a 'root' of class ", class(root), " is not supported",
+       call. = FALSE)
+}
+
+#' @rdname rename_variable
+#' @export
+#' @inheritParams write_vc
+#' @inheritParams git2r::add
+#' @importFrom assertthat assert_that is.flag noNA
+#' @importFrom git2r add workdir
+rename_variable.git_repository <- function(
+  file, change, root, ..., stage = FALSE, force = FALSE
+) {
+  assert_that(is.flag(stage), noNA(stage), is.flag(force), noNA(force))
+  hashes <- rename_variable(file = file, root = workdir(root), change = change)
+  if (!stage) {
+    return(hashes)
+  }
+
+  add(root, path = hashes, force = force)
+  return(hashes)
+}
diff --git a/man/list_data.Rd b/man/list_data.Rd
index 96953a3..435ddd3 100644
--- a/man/list_data.Rd
+++ b/man/list_data.Rd
@@ -103,6 +103,7 @@ Other storage:
 \code{\link{prune_meta}()},
 \code{\link{read_vc}()},
 \code{\link{relabel}()},
+\code{\link{rename_variable}()},
 \code{\link{rm_data}()},
 \code{\link{write_vc}()}
 }
diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd
index 9a0b6fd..7d4a6de 100644
--- a/man/prune_meta.Rd
+++ b/man/prune_meta.Rd
@@ -117,6 +117,7 @@ Other storage:
 \code{\link{list_data}()},
 \code{\link{read_vc}()},
 \code{\link{relabel}()},
+\code{\link{rename_variable}()},
 \code{\link{rm_data}()},
 \code{\link{write_vc}()}
 }
diff --git a/man/read_vc.Rd b/man/read_vc.Rd
index f2431e5..6976451 100644
--- a/man/read_vc.Rd
+++ b/man/read_vc.Rd
@@ -100,6 +100,7 @@ Other storage:
 \code{\link{list_data}()},
 \code{\link{prune_meta}()},
 \code{\link{relabel}()},
+\code{\link{rename_variable}()},
 \code{\link{rm_data}()},
 \code{\link{write_vc}()}
 }
diff --git a/man/relabel.Rd b/man/relabel.Rd
index 0a08c79..5914631 100644
--- a/man/relabel.Rd
+++ b/man/relabel.Rd
@@ -91,6 +91,7 @@ Other storage:
 \code{\link{list_data}()},
 \code{\link{prune_meta}()},
 \code{\link{read_vc}()},
+\code{\link{rename_variable}()},
 \code{\link{rm_data}()},
 \code{\link{write_vc}()}
 }
diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd
new file mode 100644
index 0000000..c24c94b
--- /dev/null
+++ b/man/rename_variable.Rd
@@ -0,0 +1,97 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rename_variable.R
+\name{rename_variable}
+\alias{rename_variable}
+\alias{rename_variable.character}
+\alias{rename_variable.default}
+\alias{rename_variable.git_repository}
+\title{Rename a Variable}
+\usage{
+rename_variable(file, change, root = ".", ...)
+
+\method{rename_variable}{character}(file, change, root = ".", ...)
+
+\method{rename_variable}{default}(file, change, root, ...)
+
+\method{rename_variable}{git_repository}(file, change, root, ..., stage = FALSE, force = FALSE)
+}
+\arguments{
+\item{file}{the name of the git2rdata object. Git2rdata objects cannot
+have dots in their name. The name may include a relative path. \code{file} is a
+path relative to the \code{root}.
+Note that \code{file} must point to a location within \code{root}.}
+
+\item{change}{A named vector with the old names as values and the new names
+as names.}
+
+\item{root}{The root of a project. Can be a file path or a \code{git-repository}.
+Defaults to the current working directory (\code{"."}).}
+
+\item{...}{parameters used in some methods}
+
+\item{stage}{Logical value indicating whether to stage the changes after
+writing the data. Defaults to \code{FALSE}.}
+
+\item{force}{Add ignored files. Default is FALSE.}
+}
+\value{
+A named vector of file paths, named by the data and metadata hashes.
+}
+\description{
+The raw data file contains a header with the variable names.
+The metadata list the variable names and their type.
+Changing a variable name and overwriting the \code{git2rdata} object will result
+in an error, because it looks like removing an existing variable and adding
+a new one.
+Overwriting the object with \code{strict = FALSE} potentially changes the order of
+the variables, leading to a large diff.
+}
+\details{
+This function solves this by only updating the raw data header and the
+metadata.
+}
+\examples{
+
+# initialise a git repo using git2r
+repo_path <- tempfile("git2rdata-repo-")
+dir.create(repo_path)
+repo <- git2r::init(repo_path)
+git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")
+
+# Create a dataframe and store it as an optimized git2rdata object.
+# Note that write_vc() uses optimization by default.
+# Stage and commit the git2rdata object.
+ds <- data.frame(
+  a = c("a1", "a2"),
+  b = c("b2", "b1"),
+  stringsAsFactors = TRUE
+)
+junk <- write_vc(ds, "rename", repo, sorting = "b", stage = TRUE)
+cm <- commit(repo, "initial commit")
+# check that the workspace is clean
+status(repo)
+
+# Define change.
+change <- c(new_name = "a")
+rename_variable(file = "rename", change = change, root = repo)
+# check the changes
+read_vc("rename", repo)
+status(repo)
+cm <- commit(repo, "relabel using a list")
+
+# clean up
+junk <- file.remove(
+  rev(list.files(repo_path, full.names = TRUE, recursive = TRUE,
+                 include.dirs = TRUE, all.files = TRUE)),
+  repo_path)
+}
+\seealso{
+Other storage: 
+\code{\link{list_data}()},
+\code{\link{prune_meta}()},
+\code{\link{read_vc}()},
+\code{\link{relabel}()},
+\code{\link{rm_data}()},
+\code{\link{write_vc}()}
+}
+\concept{storage}
diff --git a/man/rm_data.Rd b/man/rm_data.Rd
index 6478e66..31d4052 100644
--- a/man/rm_data.Rd
+++ b/man/rm_data.Rd
@@ -134,6 +134,7 @@ Other storage:
 \code{\link{prune_meta}()},
 \code{\link{read_vc}()},
 \code{\link{relabel}()},
+\code{\link{rename_variable}()},
 \code{\link{write_vc}()}
 }
 \concept{storage}
diff --git a/man/write_vc.Rd b/man/write_vc.Rd
index 819b8ca..b9aa4e9 100644
--- a/man/write_vc.Rd
+++ b/man/write_vc.Rd
@@ -171,6 +171,7 @@ Other storage:
 \code{\link{prune_meta}()},
 \code{\link{read_vc}()},
 \code{\link{relabel}()},
+\code{\link{rename_variable}()},
 \code{\link{rm_data}()}
 }
 \concept{storage}
diff --git a/tests/testthat/test_g_rename_variable.R b/tests/testthat/test_g_rename_variable.R
new file mode 100644
index 0000000..4d85bd0
--- /dev/null
+++ b/tests/testthat/test_g_rename_variable.R
@@ -0,0 +1,202 @@
+test_that("rename_variable() handles single files", {
+  root <- tempfile(pattern = "git2rdata-rename")
+  dir.create(root)
+  repo <- git2r::init(root)
+  git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")
+  files <- suppressWarnings(
+    write_vc(test_data, file = "unsorted", root = repo, stage = TRUE)
+  )
+  cm <- commit(repo, "initial commit")
+
+  # unsorted data: rename a variable without staging the result
+  change <- c("new_var" = "test_Date")
+  expect_silent({
+    rf <- rename_variable(file = files[1], change = change, root = repo)
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["unstaged"]]), 0)
+  expect_length(git2r::status(repo)[["staged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_identical(test_data[, change], changed_df[, names(change)])
+  git2r::reset(cm, "hard")
+
+  files <- write_vc(
+    test_data, file = "sorted", root = repo, sorting = "test_Date", stage = TRUE
+  )
+  cm <- commit(repo, "sorted")
+  # sorted data: staged rename of the sorting variable itself
+  change <- c("new_var" = "test_Date")
+  expect_silent({
+    rf <- rename_variable(
+      file = files[1], change = change, root = repo, stage = TRUE
+    )
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["staged"]]), 0)
+  expect_length(git2r::status(repo)[["unstaged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_equivalent(sorted_test_data[, change], changed_df[, names(change)])
+  git2r::reset(cm, "hard")
+
+  # sorted data: staged rename of a variable that is not used for sorting
+  change <- c("new_var" = "test_numeric")
+  expect_silent({
+    rf <- rename_variable(
+      file = files[1], change = change, root = repo, stage = TRUE
+    )
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["staged"]]), 0)
+  expect_length(git2r::status(repo)[["unstaged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_equivalent(sorted_test_data[, change], changed_df[, names(change)])
+  git2r::reset(cm, "hard")
+
+  unlink(root, recursive = TRUE, force = TRUE) # also removes hidden .git
+})
+
+test_that("rename_variable() handles split_by files", {
+  root <- tempfile(pattern = "git2rdata-rename")
+  dir.create(root)
+  repo <- git2r::init(root)
+  git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")
+  files <- suppressWarnings(
+    write_vc(
+      test_data, file = "unsorted", split_by = "test_factor", root = repo,
+      stage = TRUE
+    )
+  )
+  cm <- commit(repo, "initial commit")
+
+  # unsorted data: rename a variable without staging the result
+  change <- c("new_var" = "test_Date")
+  expect_silent({
+    rf <- rename_variable(file = files[1], change = change, root = repo)
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["unstaged"]]), 0)
+  expect_length(git2r::status(repo)[["staged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  git2r::reset(cm, "hard")
+
+  files <- write_vc(
+    test_data, file = "sorted", root = repo, sorting = "test_Date",
+    split_by = "test_factor", stage = TRUE
+  )
+  cm <- commit(repo, "sorted")
+  # sorted data: staged rename of the sorting variable itself
+  change <- c("new_var" = "test_Date")
+  expect_silent({
+    rf <- rename_variable(
+      file = files[1], change = change, root = repo, stage = TRUE
+    )
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["staged"]]), 0)
+  expect_length(git2r::status(repo)[["unstaged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_equivalent(
+    test_data[order(test_data$test_factor, test_data$test_Date), change],
+    changed_df[, names(change)]
+  )
+  git2r::reset(cm, "hard")
+
+  # sorted data: staged rename of the split_by variable
+  change <- c("new_var" = "test_factor")
+  expect_silent({
+    rf <- rename_variable(
+      file = files[1], change = change, root = repo, stage = TRUE
+    )
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["staged"]]), 0)
+  expect_length(git2r::status(repo)[["unstaged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_equivalent(
+    test_data[order(test_data$test_factor, test_data$test_Date), change],
+    changed_df[, names(change)]
+  )
+  git2r::reset(cm, "hard")
+
+  # sorted data: staged rename of a variable used for neither sort nor split
+  change <- c("new_var" = "test_numeric")
+  expect_silent({
+    rf <- rename_variable(
+      file = files[1], change = change, root = repo, stage = TRUE
+    )
+  })
+  expect_identical(unname(files), unname(rf))
+  expect_gt(length(git2r::status(repo)[["staged"]]), 0)
+  expect_length(git2r::status(repo)[["unstaged"]], 0)
+  expect_length(git2r::status(repo)[["untracked"]], 0)
+  expect_silent({
+    changed_df <- read_vc(rf[1], root = repo)
+  })
+  expect_identical(ncol(test_data), ncol(changed_df))
+  updated <- which(colnames(test_data) != colnames(changed_df))
+  expect_identical(length(updated), length(change))
+  expect_identical(colnames(test_data)[updated], unname(change))
+  expect_identical(colnames(changed_df)[updated], names(change))
+  expect_equivalent(
+    test_data[order(test_data$test_factor, test_data$test_Date), change],
+    changed_df[, names(change)]
+  )
+  git2r::reset(cm, "hard")
+
+  unlink(root, recursive = TRUE, force = TRUE) # also removes hidden .git
+})
+
+test_that("rename_variable() handles wrong type of root", {
+  # a numeric root is not a supported class and must raise an error
+  expect_error(
+    rename_variable(root = 1), "a 'root' of class numeric is not supported"
+  )
+})

From 30f8c32b32eac222bf53aa57752aae8fd80a33fe Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Tue, 22 Sep 2020 18:33:31 +0200
Subject: [PATCH 11/23] Bump package version

---
 DESCRIPTION   | 2 +-
 NEWS.md       | 2 +-
 codemeta.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 015f956..636ebe6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: git2rdata
 Title: Store and Retrieve Data.frames in a Git Repository
-Version: 0.2.3
+Version: 0.3.0
 Authors@R: 
     c(person(given = "Thierry",
              family = "Onkelinx",
diff --git a/NEWS.md b/NEWS.md
index 624b130..8adc6a5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# git2rdata 0.2.3
+# git2rdata 0.3.0
 
 ## New features
 
diff --git a/codemeta.json b/codemeta.json
index 8ad6d43..4dfdc0c 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -14,7 +14,7 @@
   ],
   "issueTracker": "https://github.com/ropensci/git2rdata/issues",
   "license": "https://spdx.org/licenses/GPL-3.0",
-  "version": "0.2.3",
+  "version": "0.3.0",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",
@@ -218,7 +218,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "616.079KB",
+  "fileSize": "632.425KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",

From 2a981d28e298b8c7ba9e2aa82db53892d42cf788 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 23 Sep 2020 00:43:52 +0200
Subject: [PATCH 12/23] fix example

---
 NEWS.md                | 1 -
 R/rename_variable.R    | 1 -
 codemeta.json          | 2 +-
 man/rename_variable.Rd | 1 -
 4 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 8adc6a5..540f473 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -12,7 +12,6 @@
 * `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
   when both the data and metadata are missing.
 
-
 # git2rdata 0.2.2
 
 * Use the [checklist](https://inbo.github.io/checklist) package for CI.
diff --git a/R/rename_variable.R b/R/rename_variable.R
index 76f829a..b4eb121 100644
--- a/R/rename_variable.R
+++ b/R/rename_variable.R
@@ -42,7 +42,6 @@
 #' # check the changes
 #' read_vc("rename", repo)
 #' status(repo)
-#' cm <- commit(repo, "relabel using a list")
 #'
 #' # clean up
 #' junk <- file.remove(
diff --git a/codemeta.json b/codemeta.json
index 4dfdc0c..7f15db2 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -218,7 +218,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "632.425KB",
+  "fileSize": "632.335KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd
index c24c94b..4d720cd 100644
--- a/man/rename_variable.Rd
+++ b/man/rename_variable.Rd
@@ -77,7 +77,6 @@ rename_variable(file = "rename", change = change, root = repo)
 # check the changes
 read_vc("rename", repo)
 status(repo)
-cm <- commit(repo, "relabel using a list")
 
 # clean up
 junk <- file.remove(

From 0f4f10743b2e1cbd30bbe8f8c2ef70d2f34aed59 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 23 Sep 2020 10:15:53 +0200
Subject: [PATCH 13/23] Run test with R devel on Ubuntu 20.04

---
 .github/workflows/check_on_different_r_os.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check_on_different_r_os.yml b/.github/workflows/check_on_different_r_os.yml
index 01cfc5a..aa2d2be 100644
--- a/.github/workflows/check_on_different_r_os.yml
+++ b/.github/workflows/check_on_different_r_os.yml
@@ -18,9 +18,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macOS-latest,   r: 'devel'}
           - {os: macOS-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-20.04,   r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
           - {os: ubuntu-16.04,   r: 'oldrel',  rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 
     env:

From e60f10a8603ad4c52d6a3e616c20060a2face947 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 23 Sep 2020 10:16:54 +0200
Subject: [PATCH 14/23] Use git2r::hash() instead of digest::sha1()

This removes the digest dependency
---
 DESCRIPTION  | 1 -
 NAMESPACE    | 2 --
 R/read_vc.R  | 1 -
 R/write_vc.R | 5 ++---
 4 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 636ebe6..05cae3a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,7 +35,6 @@ Depends:
     R (>= 3.5.0)
 Imports:
     assertthat,
-    digest,
     git2r (>= 0.23.0),
     methods,
     yaml
diff --git a/NAMESPACE b/NAMESPACE
index 1606515..df4c980 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -68,11 +68,9 @@ importFrom(assertthat,has_name)
 importFrom(assertthat,is.flag)
 importFrom(assertthat,is.string)
 importFrom(assertthat,noNA)
-importFrom(digest,sha1)
 importFrom(git2r,add)
 importFrom(git2r,commit)
 importFrom(git2r,hash)
-importFrom(git2r,hashfile)
 importFrom(git2r,last_commit)
 importFrom(git2r,odb_blobs)
 importFrom(git2r,pull)
diff --git a/R/read_vc.R b/R/read_vc.R
index 4b6bdde..657a5d0 100644
--- a/R/read_vc.R
+++ b/R/read_vc.R
@@ -30,7 +30,6 @@ read_vc.default <- function(file, root) {
 #' @importFrom yaml read_yaml
 #' @importFrom utils read.table
 #' @importFrom stats setNames
-#' @importFrom git2r hashfile
 read_vc.character <- function(file, root = ".") {
   assert_that(is.string(file), is.string(root))
   root <- normalizePath(root, winslash = "/", mustWork = TRUE)
diff --git a/R/write_vc.R b/R/write_vc.R
index 745187f..54faf18 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -52,10 +52,9 @@ write_vc.default <- function(
 #' This creates a separate file for every combination.
 #' @export
 #' @importFrom assertthat assert_that is.string is.flag
-#' @importFrom digest sha1
 #' @importFrom yaml read_yaml write_yaml
 #' @importFrom utils write.table
-#' @importFrom git2r hashfile
+#' @importFrom git2r hash
 write_vc.character <- function(
   x, file, root = ".", sorting, strict = TRUE, optimize = TRUE,
   na = "NA", ..., split_by = character(0)
@@ -118,7 +117,7 @@ write_vc.character <- function(
     )
   } else {
     index <- unique(raw_data[split_by])
-    index[["..hash"]] <- apply(index, 1, sha1)
+    index[["..hash"]] <- hash(apply(index, 1, paste, collapse = "\t"))
     dir.create(file["raw_file"], showWarnings = FALSE, recursive = TRUE)
     write.table(
       x = index, file = file.path(file["raw_file"], "index.tsv"),

From d403e1a7b6d99b60cb2eeafeee86ccef818bfc4d Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 23 Sep 2020 10:23:50 +0200
Subject: [PATCH 15/23] Improve unit test on subsecond commits

---
 codemeta.json                         | 14 +-------------
 tests/testthat/test_d_recent_commit.R |  4 ++--
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/codemeta.json b/codemeta.json
index 7f15db2..74bcd9c 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -175,18 +175,6 @@
       },
       "sameAs": "https://CRAN.R-project.org/package=assertthat"
     },
-    {
-      "@type": "SoftwareApplication",
-      "identifier": "digest",
-      "name": "digest",
-      "provider": {
-        "@id": "https://cran.r-project.org",
-        "@type": "Organization",
-        "name": "Comprehensive R Archive Network (CRAN)",
-        "url": "https://cran.r-project.org"
-      },
-      "sameAs": "https://CRAN.R-project.org/package=digest"
-    },
     {
       "@type": "SoftwareApplication",
       "identifier": "git2r",
@@ -218,7 +206,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "632.335KB",
+  "fileSize": "632.219KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
diff --git a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R
index 52dcfc2..e5fe909 100644
--- a/tests/testthat/test_d_recent_commit.R
+++ b/tests/testthat/test_d_recent_commit.R
@@ -66,10 +66,10 @@ expect_identical(
 
 target <- file.path(git2r::workdir(root), "subsecond.txt")
 while (TRUE) {
-  writeLines(sample(letters), con = target)
+  writeLines(letters, con = target)
   git2r::add(root, target)
   cm_1 <- commit(root, "first subsecond")
-  writeLines(sample(letters), con = target)
+  writeLines(LETTERS, con = target)
   git2r::add(root, target)
   cm_2 <- commit(root, "second subsecond")
   output <- suppressWarnings(

From ede240d48c5d58bd5f21132a69cc44e995b0dee8 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 23 Sep 2020 11:38:20 +0200
Subject: [PATCH 16/23] Set explicit timezone in unit tests.

Required to get tests running under R-devel.
---
 codemeta.json                    | 2 +-
 tests/testthat/setup_test_data.R | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/codemeta.json b/codemeta.json
index 74bcd9c..de04c34 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -206,7 +206,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "632.219KB",
+  "fileSize": "632.073KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",
diff --git a/tests/testthat/setup_test_data.R b/tests/testthat/setup_test_data.R
index 9c6820f..fd47a4e 100644
--- a/tests/testthat/setup_test_data.R
+++ b/tests/testthat/setup_test_data.R
@@ -17,7 +17,8 @@ test_data <- data.frame(
   test_logical = sample(c(TRUE, FALSE), size = test_n, replace = TRUE),
   test_POSIXct = as.POSIXct(
     sample(.Machine$integer.max, size = test_n, replace = TRUE),
-    origin = "1970-01-01"
+    origin = "1970-01-01",
+    tz = "UTC"
   ),
   test_Date = as.Date(
     c(sample(1e5, size = test_n - 1, replace = TRUE), 16000),
@@ -31,13 +32,11 @@ sorted_test_data <- test_data[order(test_data$test_Date), ]
 git2rdata:::set_local_locale(old_locale)
 sorted_test_data$test_character <- enc2utf8(sorted_test_data$test_character)
 rownames(sorted_test_data) <- NULL
-attr(sorted_test_data$test_POSIXct, "tzone") <- "UTC"
 
 test_subset <- head(test_data, ceiling(test_n / 2))
 
 sorted_test_subset <- test_subset[order(test_subset$test_Date), ]
 rownames(sorted_test_subset) <- NULL
-attr(sorted_test_subset$test_POSIXct, "tzone") <- "UTC"
 
 test_na <- test_data
 for (i in seq_along(test_na)) {
@@ -49,4 +48,3 @@ sorted_test_na <- test_na[
 ]
 git2rdata:::set_local_locale(old_locale)
 rownames(sorted_test_na) <- NULL
-attr(sorted_test_na$test_POSIXct, "tzone") <- "UTC"

From a442314b6851beb9ebea7ffc257e7100561dfe18 Mon Sep 17 00:00:00 2001
From: florisvdh <floris.vanderhaeghe@inbo.be>
Date: Wed, 23 Sep 2020 13:56:39 +0200
Subject: [PATCH 17/23] vignette split_by: simplify intro

---
 vignettes/split_by.Rmd | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
index 100a6fb..159115e 100644
--- a/vignettes/split_by.Rmd
+++ b/vignettes/split_by.Rmd
@@ -129,10 +129,9 @@ In such a case we can use the `split_by` argument of `write_vc()`.
 This will store the large dataframe over a set of tab separated files.
 One file for every combination of the variables defined by `split_by`.
 Every partial data file holds one combination of `split_by`.
-We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash.
-This hash becomes the base name of the partial data files.
-The combination of the hash in the `index.tsv` and the base name of the partial data files makes the information of `split_by` in the partial data file redundant.
 We remove the `split_by` variables from the partial data files, reducing their size.
+We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash for each combination.
+This hash becomes the base name of the partial data files.
 
 ## When to Split the Dataframe
 

From cd20c8477f5f1112245a0ecf988f0491de00fb04 Mon Sep 17 00:00:00 2001
From: florisvdh <floris.vanderhaeghe@inbo.be>
Date: Wed, 23 Sep 2020 13:57:33 +0200
Subject: [PATCH 18/23] vignette split_by: use $...$

---
 vignettes/split_by.Rmd | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
index 159115e..b0fea12 100644
--- a/vignettes/split_by.Rmd
+++ b/vignettes/split_by.Rmd
@@ -150,14 +150,14 @@ Let's set the following variables:
 -   $N_s$: the number of unique combinations of the `split_by` variables.
 
 Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation.
-The total number of bytes is `T_0 = h_s + h_r + 1 + N (s + r + 1)`.
-The `+ 1` originates from the tab character to separate the `split_by` variables from the remaining variables.
+The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$.
+The $+ 1$ originates from the tab character to separate the `split_by` variables from the remaining variables.
 
 Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables.
-`h_s` bytes for the header and `N_s s` for the data.
+It will use $h_s$ bytes for the header and $N_s s$ for the data.
 The headers of the partial data files require $N_s h_r$ bytes ($N_s$ files and $h_r$ byte per file).
 The data in the partial data files require $N r$ bytes.
-The total number of bytes is `T_s = h_s + N_s s + N_s h_r + N r`.
+The total number of bytes is $T_s = h_s + N_s s + N_s h_r + N r$.
 
 We can look at the ratio of $T_s$ over $T_0$.
 

From bbdc9f467413b76287c700561c8c4acf2a675b24 Mon Sep 17 00:00:00 2001
From: florisvdh <floris.vanderhaeghe@inbo.be>
Date: Wed, 23 Sep 2020 13:57:49 +0200
Subject: [PATCH 19/23] vignette split_by: subscript in axis label

---
 vignettes/split_by.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
index b0fea12..847679f 100644
--- a/vignettes/split_by.Rmd
+++ b/vignettes/split_by.Rmd
@@ -196,7 +196,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) +
   geom_line() +
   facet_wrap(~ paste("r =", r)) +
   scale_x_continuous(
-    "b = N_s / N",
+    expression(b~{"="}~N[s]~{"/"}~N),
     labels = function(x) {
       paste0(100 * x, "%")
     }

From 96b1a5e9b5439c404950eceb39ec87323394316d Mon Sep 17 00:00:00 2001
From: florisvdh <floris.vanderhaeghe@inbo.be>
Date: Wed, 23 Sep 2020 13:58:16 +0200
Subject: [PATCH 20/23] vignette split_by: fix typo y axis label

---
 vignettes/split_by.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
index 847679f..8c691d8 100644
--- a/vignettes/split_by.Rmd
+++ b/vignettes/split_by.Rmd
@@ -202,7 +202,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) +
     }
   ) +
   scale_y_continuous(
-    "Relative amount of disc space",
+    "Relative amount of disk space",
     labels = function(x) {
       paste0(100 * x, "%")
     }

From f89d5d82f6a0dc4d446e1024314396f820b1ba66 Mon Sep 17 00:00:00 2001
From: florisvdh <floris.vanderhaeghe@inbo.be>
Date: Wed, 23 Sep 2020 14:08:43 +0200
Subject: [PATCH 21/23] Update codemeta.json

---
 codemeta.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codemeta.json b/codemeta.json
index de04c34..e5d42f6 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -206,7 +206,7 @@
       "sameAs": "https://CRAN.R-project.org/package=yaml"
     }
   ],
-  "fileSize": "632.073KB",
+  "fileSize": "762.31KB",
   "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md",
   "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md",
   "contIntegration": "https://codecov.io/gh/ropensci/git2rdata",

From 3f44c55d7b76705339dbd078e3e3ccb8d00fd8be Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 13 Jan 2021 15:28:09 +0100
Subject: [PATCH 22/23] Mention that we also use the split_by variables for
 sorting

---
 R/write_vc.R    | 1 +
 man/meta.Rd     | 3 ++-
 man/write_vc.Rd | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/R/write_vc.R b/R/write_vc.R
index 54faf18..146e800 100644
--- a/R/write_vc.R
+++ b/R/write_vc.R
@@ -50,6 +50,7 @@ write_vc.default <- function(
 #' @rdname write_vc
 #' @param split_by An optional vector of variables name to split the text files.
 #' This creates a separate file for every combination.
+#' We prepend these variables to the vector of `sorting` variables.
 #' @export
 #' @importFrom assertthat assert_that is.string is.flag
 #' @importFrom yaml read_yaml write_yaml
diff --git a/man/meta.Rd b/man/meta.Rd
index 7be6e6f..c7190b9 100644
--- a/man/meta.Rd
+++ b/man/meta.Rd
@@ -62,7 +62,8 @@ See \code{vignette("efficiency", package = "git2rdata")} for an illustration of
 the importance of sorting.}
 
 \item{split_by}{An optional vector of variables name to split the text files.
-This creates a separate file for every combination.}
+This creates a separate file for every combination.
+We prepend these variables to the vector of \code{sorting} variables.}
 }
 \value{
 the optimized vector \code{x} with \code{meta} attribute.
diff --git a/man/write_vc.Rd b/man/write_vc.Rd
index b9aa4e9..ed92e31 100644
--- a/man/write_vc.Rd
+++ b/man/write_vc.Rd
@@ -76,7 +76,8 @@ Defaults to \code{TRUE}.}
 \item{...}{parameters used in some methods}
 
 \item{split_by}{An optional vector of variables name to split the text files.
-This creates a separate file for every combination.}
+This creates a separate file for every combination.
+We prepend these variables to the vector of \code{sorting} variables.}
 
 \item{stage}{Logical value indicating whether to stage the changes after
 writing the data. Defaults to \code{FALSE}.}

From 2ed454e07bf92ea3ee1c8fe2cca1364d675d3c21 Mon Sep 17 00:00:00 2001
From: Thierry Onkelinx <thierry.onkelinx@inbo.be>
Date: Wed, 13 Jan 2021 15:56:22 +0100
Subject: [PATCH 23/23] Tweak the split_by vignette

---
 vignettes/split_by.Rmd | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd
index 8c691d8..490a08a 100644
--- a/vignettes/split_by.Rmd
+++ b/vignettes/split_by.Rmd
@@ -123,16 +123,22 @@ update_geom_defaults("smooth", list(colour = "#356196"))
 ## Introduction
 
 Sometimes, a large dataframe has one or more variables with a small number of unique combinations.
-E.g. a dataframe with factor variables.
+E.g. a dataframe with one or more factor variables.
+Storing the entire dataframe as a single text file requires storing lots of replicated data.
+Each row stores the information for every variable, even if a subset of these variables remains constant over a subset of the data.
 
 In such a case we can use the `split_by` argument of `write_vc()`.
 This will store the large dataframe over a set of tab separated files.
 One file for every combination of the variables defined by `split_by`.
-Every partial data file holds one combination of `split_by`.
+Every partial data file holds the other variables for one combination of `split_by`.
 We remove the `split_by` variables from the partial data files, reducing their size.
 We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash for each combination.
 This hash becomes the base name of the partial data files.
 
+Splitting the dataframe into smaller files makes them easier to handle in a version control system.
+The overall size depends on the amount of replication in the dataframe.
+More on that in the next section.
+
 ## When to Split the Dataframe
 
 Let's set the following variables:
@@ -151,7 +157,7 @@ Let's set the following variables:
 
 Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation.
 The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$.
-The $+ 1$ originates from the tab character to separate the `split_by` variables from the remaining variables.
+Both $+ 1$ terms originate from the tab character that separates the `split_by` variables from the remaining variables.
 
 Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables.
 It will use $h_s$ bytes for the header and $N_s s$ for the data.