From b49e42beb63aff37c6eb85d29bd29505accb9e5e Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Sun, 6 Sep 2020 18:42:49 +0200 Subject: [PATCH 01/23] Increment version number --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e453c04..ccfccb4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.2.2 +Version: 0.2.3 Authors@R: c(person(given = "Thierry", family = "Onkelinx", diff --git a/NEWS.md b/NEWS.md index 8fee349..dbb9c03 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# git2rdata 0.2.3 + # git2rdata 0.2.2 * Use the [checklist](https://inbo.github.io/checklist) package for CI. From 5b82717de4a34740a08a97046161f8ecae2f03e3 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 15 Sep 2020 10:11:42 +0200 Subject: [PATCH 02/23] write_vc() gains a split_by argument --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 2 ++ R/meta.R | 18 +++++++++- R/write_vc.R | 66 +++++++++++++++++++++++++--------- codemeta.json | 54 ++++++++++++---------------- man/meta.Rd | 24 +++++++++---- man/rm_data.Rd | 2 +- man/write_vc.Rd | 30 ++++++++++++---- tests/testthat/test_a_basics.R | 12 ++++--- 10 files changed, 144 insertions(+), 66 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ccfccb4..11ae337 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Depends: R (>= 3.5.0) Imports: assertthat, + digest, git2r (>= 0.23.0), methods, yaml diff --git a/NAMESPACE b/NAMESPACE index a11c0fe..19abdd3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,6 +64,7 @@ importFrom(assertthat,has_name) importFrom(assertthat,is.flag) importFrom(assertthat,is.string) importFrom(assertthat,noNA) +importFrom(digest,sha1) importFrom(git2r,add) importFrom(git2r,commit) importFrom(git2r,hash) diff --git a/NEWS.md b/NEWS.md index dbb9c03..c8e1f38 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # git2rdata 0.2.3 +* 
`write_vc()` gains an optional `split_by` argument. + # git2rdata 0.2.2 * Use the [checklist](https://inbo.github.io/checklist) package for CI. diff --git a/R/meta.R b/R/meta.R index ebee960..df56f6d 100644 --- a/R/meta.R +++ b/R/meta.R @@ -211,12 +211,24 @@ meta.Date <- function(x, optimize = TRUE, ...) { #' @rdname meta #' @inheritParams write_vc meta.data.frame <- function(# nolint - x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ... + x, optimize = TRUE, na = "NA", sorting, strict = TRUE, + split_by = character(0), ... ) { assert_that( !has_name(x, "..generic"), msg = "'..generic' is a reserved name and not allowed as column name") + assert_that( + !has_name(x, "..hash"), + msg = "'..hash' is a reserved name and not allowed as column name") generic <- list(optimize = optimize, "NA string" = na) + assert_that(is.character(split_by)) + assert_that( + all(split_by %in% colnames(x)), + msg = "All split_by variables must be available in the data.frame") + assert_that( + any(!colnames(x) %in% split_by), + msg = "No remaining variables after splitting" + ) dots <- list(...) if (has_name(dots, "old")) { @@ -236,6 +248,7 @@ Sorting is strongly recommended in combination with version control.") assert_that( all(sorting %in% colnames(x)), msg = "All sorting variables must be available in the data.frame") + sorting <- unique(c(split_by, sorting)) if (nrow(x) > 1) { old_locale <- set_c_locale() x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint @@ -248,6 +261,9 @@ Add extra sorting variables to ensure small diffs.", sorted) } } generic <- c(generic, sorting = list(sorting)) + if (length(split_by)) { + generic <- c(generic, split_by = list(split_by)) + } } # calculate meta for each column if (has_name(dots, "old")) { diff --git a/R/write_vc.R b/R/write_vc.R index 47f987e..43f7fdc 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -13,11 +13,12 @@ #' @param root The root of a project. Can be a file path or a `git-repository`. 
#' Defaults to the current working directory (`"."`). #' @param sorting an optional vector of column names defining which columns to -#' use for sorting `x` and in what order to use them. Omitting `sorting` yields -#' a warning. Add `sorting` to avoid this warning. Strongly recommended -#' in combination with version control. See -#' `vignette("efficiency", package = "git2rdata")` for an illustration of the -#' importance of sorting. +#' use for sorting `x` and in what order to use them. +#' The default empty `sorting` yields a warning. +#' Add `sorting` to avoid this warning. +#' Strongly recommended in combination with version control. +#' See `vignette("efficiency", package = "git2rdata")` for an illustration of +#' the importance of sorting. #' @param strict What to do when the metadata changes. `strict = FALSE` #' overwrites the data and the metadata with a warning listing the changes, #' `strict = TRUE` returns an error and leaves the data and metadata as is. @@ -33,8 +34,8 @@ #' @note `..generic` is a reserved name for the metadata and is a forbidden #' column name in a `data.frame`. write_vc <- function( - x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", - ... + x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ..., + split_by ) { UseMethod("write_vc", root) } @@ -46,14 +47,18 @@ write_vc.default <- function( stop("a 'root' of class ", class(root), " is not supported", call. = FALSE) } +#' @rdname write_vc +#' @param split_by An optional vector of variables name to split the text files. +#' This creates a separate file for every combination. #' @export #' @importFrom assertthat assert_that is.string is.flag +#' @importFrom digest sha1 #' @importFrom yaml read_yaml write_yaml #' @importFrom utils write.table #' @importFrom git2r hashfile write_vc.character <- function( - x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", - ... 
+ x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, + na = "NA", ..., split_by = character(0) ) { assert_that( inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na), @@ -66,7 +71,9 @@ write_vc.character <- function( } if (!file.exists(file["meta_file"])) { - raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting) + raw_data <- meta( + x, optimize = optimize, na = na, sorting = sorting, split_by = split_by + ) } else { tryCatch( is_git2rmeta(file = remove_root(file = file["meta_file"], root = root), @@ -79,7 +86,7 @@ write_vc.character <- function( old <- read_yaml(file["meta_file"]) class(old) <- "meta_list" raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting, - old = old, strict = strict) + old = old, strict = strict, split_by = split_by) problems <- compare_meta(attr(raw_data, "meta"), old) if (length(problems)) { problems <- c( @@ -99,11 +106,38 @@ write_vc.character <- function( } } } - write.table( - x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, - sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, - col.names = TRUE, fileEncoding = "UTF-8" - ) + if (length(split_by) == 0) { + write.table( + x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, + sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, + col.names = TRUE, fileEncoding = "UTF-8" + ) + } else { + index <- unique(raw_data[split_by]) + index[["..hash"]] <- apply(index, 1, sha1) + dir.create(file["raw_file"], showWarnings = FALSE) + write.table( + x = index, file = file.path(file["raw_file"], "index.tsv"), + append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".", + row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" + ) + detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by] + for (i in seq_len(nrow(index))) { + matching <- vapply( + split_by, + function(split) { + raw_data[[split]] == index[[split]][i] + }, + logical(nrow(raw_data)) + ) + 
write.table( + x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE], + file = file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")), + append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, + dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" + ) + } + } meta_data <- attr(raw_data, "meta") meta_data[["..generic"]][["git2rdata"]] <- as.character( packageVersion("git2rdata") diff --git a/codemeta.json b/codemeta.json index 71ff4c5..4f43830 100644 --- a/codemeta.json +++ b/codemeta.json @@ -1,26 +1,26 @@ { - "@context": [ - "https://doi.org/10.5063/schema/codemeta-2.0", - "http://schema.org" - ], + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "git2rdata", "description": "Make versioning of data.frame easy and efficient using git\n repositories.", "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository", "codeRepository": "https://github.com/ropensci/git2rdata", - "relatedLink": [ - "https://doi.org/10.5281/zenodo.1485309", - "https://CRAN.R-project.org/package=git2rdata" - ], + "relatedLink": "https://doi.org/10.5281/zenodo.1485309", "issueTracker": "https://github.com/ropensci/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.2.2", + "version": "0.2.3", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, "runtimePlatform": "R version 4.0.2 (2020-06-22)", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, "author": [ { "@type": "Person", @@ -169,6 +169,18 @@ }, "sameAs": "https://CRAN.R-project.org/package=assertthat" }, + { + "@type": "SoftwareApplication", + "identifier": "digest", + "name": "digest", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + 
"url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=digest" + }, { "@type": "SoftwareApplication", "identifier": "git2r", @@ -200,27 +212,5 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", - "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", - "fileSize": "578.382KB", - "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", - "developmentStatus": ["https://www.repostatus.org/#active", "https://www.tidyverse.org/lifecycle/#maturing"], - "keywords": [ - "r", - "rstats", - "r-package", - "version-control", - "reproducible-research" - ], - "provider": { - "@id": "https://cran.r-project.org", - "@type": "Organization", - "name": "Comprehensive R Archive Network (CRAN)", - "url": "https://cran.r-project.org" - }, - "review": { - "@type": "Review", - "url": "https://github.com/ropensci/software-review/issues/263", - "provider": "https://ropensci.org" - } + "fileSize": "1765.055KB" } diff --git a/man/meta.Rd b/man/meta.Rd index d1f617b..7be6e6f 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -23,7 +23,15 @@ meta(x, ...) \method{meta}{Date}(x, optimize = TRUE, ...) -\method{meta}{data.frame}(x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...) +\method{meta}{data.frame}( + x, + optimize = TRUE, + na = "NA", + sorting, + strict = TRUE, + split_by = character(0), + ... +) } \arguments{ \item{x}{the vector.} @@ -46,11 +54,15 @@ overwrites the data and the metadata with a warning listing the changes, Defaults to \code{TRUE}.} \item{sorting}{an optional vector of column names defining which columns to -use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields -a warning. Add \code{sorting} to avoid this warning. Strongly recommended -in combination with version control. 
See -\code{vignette("efficiency", package = "git2rdata")} for an illustration of the -importance of sorting.} +use for sorting \code{x} and in what order to use them. +The default empty \code{sorting} yields a warning. +Add \code{sorting} to avoid this warning. +Strongly recommended in combination with version control. +See \code{vignette("efficiency", package = "git2rdata")} for an illustration of +the importance of sorting.} + +\item{split_by}{An optional vector of variables name to split the text files. +This creates a separate file for every combination.} } \value{ the optimized vector \code{x} with \code{meta} attribute. diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 7c31dd1..6478e66 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -33,7 +33,7 @@ is relative to \code{root}.} the git history and unchanged since the last commit. \code{modified} are files in the git history and changed since the last commit. \code{ignored} refers to file listed in a \code{.gitignore} file. Selecting \code{modified} will remove both -\code{unmodified} and \code{modified} data files. Selecting \verb{ìgnored} will remove +\code{unmodified} and \code{modified} data files. Selecting \code{ìgnored} will remove \code{unmodified}, \code{modified} and \code{ignored} data files. \code{all} refers to all visible data files, including \code{untracked} files.} } diff --git a/man/write_vc.Rd b/man/write_vc.Rd index 1385b35..819b8ca 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/write_vc.R \name{write_vc} \alias{write_vc} +\alias{write_vc.character} \alias{write_vc.git_repository} \title{Store a Data.Frame as a Git2rdata Object on Disk} \usage{ @@ -13,7 +14,20 @@ write_vc( strict = TRUE, optimize = TRUE, na = "NA", - ... 
+ ..., + split_by +) + +\method{write_vc}{character}( + x, + file, + root = ".", + sorting, + strict = TRUE, + optimize = TRUE, + na = "NA", + ..., + split_by = character(0) ) \method{write_vc}{git_repository}( @@ -41,11 +55,12 @@ Note that \code{file} must point to a location within \code{root}.} Defaults to the current working directory (\code{"."}).} \item{sorting}{an optional vector of column names defining which columns to -use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields -a warning. Add \code{sorting} to avoid this warning. Strongly recommended -in combination with version control. See -\code{vignette("efficiency", package = "git2rdata")} for an illustration of the -importance of sorting.} +use for sorting \code{x} and in what order to use them. +The default empty \code{sorting} yields a warning. +Add \code{sorting} to avoid this warning. +Strongly recommended in combination with version control. +See \code{vignette("efficiency", package = "git2rdata")} for an illustration of +the importance of sorting.} \item{strict}{What to do when the metadata changes. \code{strict = FALSE} overwrites the data and the metadata with a warning listing the changes, @@ -60,6 +75,9 @@ Defaults to \code{TRUE}.} \item{...}{parameters used in some methods} +\item{split_by}{An optional vector of variables name to split the text files. +This creates a separate file for every combination.} + \item{stage}{Logical value indicating whether to stage the changes after writing the data. 
Defaults to \code{FALSE}.} diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index 9e4e78a..1df75e3 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -21,8 +21,10 @@ expect_error( "file should not contain '..'" ) expect_is( - output <- write_vc( - x = test_data, file = "test.txt", root = root, sorting = "test_Date" + suppressWarnings( + output <- write_vc( + x = test_data, file = "test.txt", root = root, sorting = "test_Date" + ) ), "character" ) @@ -43,7 +45,7 @@ for (i in colnames(stored)) { ) } expect_identical( - write_vc(x = test_data, file = "test.xls", root = root), + suppressWarnings(write_vc(x = test_data, file = "test.xls", root = root)), output ) expect_error( @@ -51,7 +53,9 @@ expect_error( "The data was not overwritten because of the issues below." ) expect_error( - write_vc(x = test_data, file = "test", root = root, optimize = FALSE), + suppressWarnings( + write_vc(x = test_data, file = "test", root = root, optimize = FALSE) + ), "New data is verbose, whereas old data was optimized" ) expect_warning( From 58d52d0b8e62c7183bce90182f62e888f8bdff0d Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 15 Sep 2020 11:10:21 +0200 Subject: [PATCH 03/23] Improve message when data file is missing. The old implementation yielded a "missing metadata" error when reading a non existing object. The new implementation yields a "missing object" error. --- NEWS.md | 2 ++ R/is_git2rmeta.R | 6 +++++- tests/testthat/test_b_is_git2rmeta.R | 14 +++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index c8e1f38..a454c1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # git2rdata 0.2.3 * `write_vc()` gains an optional `split_by` argument. +* `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message + when both the data and metadata are missing. 
# git2rdata 0.2.2 diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R index d350e82..f4ed0b2 100644 --- a/R/is_git2rmeta.R +++ b/R/is_git2rmeta.R @@ -38,7 +38,11 @@ is_git2rmeta.character <- function(file, root = ".", file <- clean_data_path(root = root, file = file) if (!file.exists(file["meta_file"])) { - msg <- "Metadata file missing." + msg <- ifelse( + file.exists(file["raw_file"]), + "Metadata file missing.", + "`git2rdata` object not found." + ) switch(message, error = stop(msg, call. = FALSE), warning = warning(msg, call. = FALSE)) return(FALSE) diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R index 88186f7..99eaa4f 100644 --- a/tests/testthat/test_b_is_git2rmeta.R +++ b/tests/testthat/test_b_is_git2rmeta.R @@ -12,16 +12,16 @@ test_that("is_git2rmeta checks metadata", { expect_false(is_git2rmeta(file = "junk", root = root)) expect_false(is_git2rdata(file = "junk", root = root)) expect_error(is_git2rmeta(file = "junk", root = root, message = "error"), - "Metadata file missing.") + "`git2rdata` object not found.") expect_warning(is_git2rmeta(file = "junk", root = root, message = "warning"), - "Metadata file missing.") + "`git2rdata` object not found.") expect_false( suppressWarnings( is_git2rmeta(file = "junk", root = root, message = "warning") ) ) expect_warning(is_git2rdata(file = "junk", root = root, message = "warning"), - "Metadata file missing.") + "`git2rdata` object not found.") expect_false( suppressWarnings( is_git2rdata(file = "junk", root = root, message = "warning") @@ -32,6 +32,14 @@ test_that("is_git2rmeta checks metadata", { junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + file.remove(file.path(root, junk[2])) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Metadata file missing.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Metadata file missing.") + 
expect_false(is_git2rmeta(file = file, root = root)) + + junk_yaml <- correct_yaml junk_yaml[["..generic"]] <- NULL yaml::write_yaml(junk_yaml, file.path(root, junk[2])) From 784841a0830a18eef3f0e42d8fdffa637a6a6e87 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 15 Sep 2020 13:25:45 +0200 Subject: [PATCH 04/23] bugfix: calculate data hash when using split_by --- R/write_vc.R | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/R/write_vc.R b/R/write_vc.R index 43f7fdc..6c949e4 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -112,37 +112,45 @@ write_vc.character <- function( sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) + data_hash <- datahash(file["raw_file"]) } else { index <- unique(raw_data[split_by]) index[["..hash"]] <- apply(index, 1, sha1) - dir.create(file["raw_file"], showWarnings = FALSE) + dir.create(file["raw_file"], showWarnings = FALSE, recursive = TRUE) write.table( x = index, file = file.path(file["raw_file"], "index.tsv"), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by] - for (i in seq_len(nrow(index))) { - matching <- vapply( - split_by, - function(split) { - raw_data[[split]] == index[[split]][i] - }, - logical(nrow(raw_data)) - ) - write.table( - x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE], - file = file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")), - append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, - dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" - ) - } + data_hash <- vapply( + seq_len(nrow(index)), + function(i) { + matching <- vapply( + split_by, + function(split) { + raw_data[[split]] == index[[split]][i] + }, + logical(nrow(raw_data)) + ) + rf <- file.path(file["raw_file"], 
paste0(index[i, "..hash"], ".tsv")) + write.table( + x = raw_data[apply(matching, 1, all), detail_names, drop = FALSE], + file = rf, + append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, + dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" + ) + datahash(rf) + }, + character(1) + ) + data_hash <- sha1(data_hash) } meta_data <- attr(raw_data, "meta") meta_data[["..generic"]][["git2rdata"]] <- as.character( packageVersion("git2rdata") ) - meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) + meta_data[["..generic"]][["data_hash"]] <- data_hash write_yaml(meta_data, file["meta_file"], fileEncoding = "UTF-8") From 87e4dc8e1cf6965fa76af45b12e05409054e5353 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 15 Sep 2020 14:35:21 +0200 Subject: [PATCH 05/23] read_vc() handles split_by data files --- R/is_git2rdata.R | 54 +++++++++++++++++++++++++++++++++++++--------- R/meta.R | 6 +++--- R/read_vc.R | 56 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 96 insertions(+), 20 deletions(-) diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R index d505796..15f9f23 100644 --- a/R/is_git2rdata.R +++ b/R/is_git2rdata.R @@ -43,17 +43,51 @@ is_git2rdata.character <- function(file, root = ".", # read the metadata meta_data <- read_yaml(file["meta_file"]) - - correct <- names(meta_data) - correct <- paste(correct[correct != "..generic"], collapse = "\t") - header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8") - if (correct != header) { - msg <- paste("Corrupt data, incorrect header. Expecting:", correct) - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. 
= FALSE)) - return(FALSE) + if (has_name(meta_data[["..generic"]], "split_by")) { + header <- readLines( + file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8" + ) + correct <- paste( + c(meta_data[["..generic"]][["split_by"]], "..hash"), + collapse = "\t" + ) + if (correct != header) { + msg <- paste( + "Corrupt data, incorrect header in index.tsv. Expecting:", correct + ) + switch(message, error = stop(msg, call. = FALSE), + warning = warning(msg, call. = FALSE)) + return(FALSE) + } + correct <- names(meta_data) + keep <- !correct %in% c("..generic", meta_data[["..generic"]][["split_by"]]) + correct <- paste(correct[keep], collapse = "\t") + header <- vapply( + list.files(file["raw_file"], pattern = "[[:xdigit:]]{20}\\.tsv"), + function(z) { + readLines( + file.path(file["raw_file"], z), n = 1, encoding = "UTF-8" + ) + }, + character(1) + ) + if (any(header != correct)) { + msg <- paste("Corrupt data, incorrect header. Expecting:", correct) + switch(message, error = stop(msg, call. = FALSE), + warning = warning(msg, call. = FALSE)) + return(FALSE) + } + } else { + correct <- names(meta_data) + correct <- paste(correct[correct != "..generic"], collapse = "\t") + header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8") + if (correct != header) { + msg <- paste("Corrupt data, incorrect header. Expecting:", correct) + switch(message, error = stop(msg, call. = FALSE), + warning = warning(msg, call. 
= FALSE)) + return(FALSE) + } } - return(TRUE) } diff --git a/R/meta.R b/R/meta.R index df56f6d..8da3c0f 100644 --- a/R/meta.R +++ b/R/meta.R @@ -261,9 +261,9 @@ Add extra sorting variables to ensure small diffs.", sorted) } } generic <- c(generic, sorting = list(sorting)) - if (length(split_by)) { - generic <- c(generic, split_by = list(split_by)) - } + } + if (length(split_by)) { + generic <- c(generic, split_by = list(split_by)) } # calculate meta for each column if (has_name(dots, "old")) { diff --git a/R/read_vc.R b/R/read_vc.R index aac44b6..ec050c4 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -69,14 +69,56 @@ read_vc.character <- function(file, root = ".") { col_classes <- vapply(details, "[[", character(1), "class") # read the raw data and check the data hash - raw_data <- read.table( - file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", - dec = ".", numerals = "warn.loss", na.strings = na_string, - colClasses = setNames(col_type[col_classes], col_names), comment.char = "", - stringsAsFactors = FALSE, fileEncoding = "UTF-8" - ) + if (has_name(meta_data[["..generic"]], "split_by")) { + split_by <- meta_data[["..generic"]][["split_by"]] + which_split_by <- col_names %in% split_by + index <- read.table( + file = file.path(file["raw_file"], "index.tsv"), + header = TRUE, sep = "\t", quote = "\"", + dec = ".", numerals = "warn.loss", na.strings = na_string, + colClasses = setNames( + col_type[col_classes[which_split_by]], + col_names[which_split_by] + ), + comment.char = "", + stringsAsFactors = FALSE, fileEncoding = "UTF-8" + ) + raw_data <- vapply( + seq_len(nrow(index)), + function(i) { + rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")) + raw_data <- read.table( + file = rf, header = TRUE, sep = "\t", quote = "\"", + dec = ".", numerals = "warn.loss", na.strings = na_string, + colClasses = setNames( + col_type[col_classes[!which_split_by]], + col_names[!which_split_by] + ), + comment.char = "", + stringsAsFactors = FALSE, 
fileEncoding = "UTF-8" + ) + raw_data <- cbind( + index[rep(i, nrow(raw_data)), split_by, drop = FALSE], + raw_data + ) + attr(raw_data, "hash") <- datahash(rf) + return(list(raw_data)) + }, + vector(mode = "list", length = 1) + ) + dh <- sha1(vapply(raw_data, attr, character(1), "hash")) + raw_data <- do.call(rbind, raw_data)[, col_names] + } else { + raw_data <- read.table( + file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", + dec = ".", numerals = "warn.loss", na.strings = na_string, + colClasses = setNames(col_type[col_classes], col_names), + comment.char = "", + stringsAsFactors = FALSE, fileEncoding = "UTF-8" + ) + dh <- datahash(file["raw_file"]) + } - dh <- datahash(file["raw_file"]) if (meta_data[["..generic"]][["data_hash"]] != dh) { meta_data[["..generic"]][["data_hash"]] <- dh warning("Mismatching data hash. Data altered outside of git2rdata.", From 0a5024e61a30d7c11d053ebcc0e5f041f0182454 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 15 Sep 2020 15:27:28 +0200 Subject: [PATCH 06/23] Add unit tests. 
--- codemeta.json | 28 +++++++++-- tests/testthat/test_f_split_by.R | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test_f_split_by.R diff --git a/codemeta.json b/codemeta.json index 4f43830..ba94061 100644 --- a/codemeta.json +++ b/codemeta.json @@ -1,11 +1,17 @@ { - "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "@context": [ + "https://doi.org/10.5063/schema/codemeta-2.0", + "http://schema.org" + ], "@type": "SoftwareSourceCode", "identifier": "git2rdata", "description": "Make versioning of data.frame easy and efficient using git\n repositories.", "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository", "codeRepository": "https://github.com/ropensci/git2rdata", - "relatedLink": "https://doi.org/10.5281/zenodo.1485309", + "relatedLink": [ + "https://doi.org/10.5281/zenodo.1485309", + "https://CRAN.R-project.org/package=git2rdata" + ], "issueTracker": "https://github.com/ropensci/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", "version": "0.2.3", @@ -212,5 +218,21 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "1765.055KB" + "fileSize": "586.414KB", + "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", + "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", + "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", + "developmentStatus": ["https://www.repostatus.org/#active", "https://www.tidyverse.org/lifecycle/#maturing"], + "review": { + "@type": "Review", + "url": "https://github.com/ropensci/software-review/issues/263", + "provider": "https://ropensci.org" + }, + "keywords": [ + "r", + "rstats", + "r-package", + "version-control", + "reproducible-research" + ] } diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R new file mode 100644 index 0000000..657e540 --- /dev/null +++ b/tests/testthat/test_f_split_by.R @@ -0,0 +1,79 @@ 
+test_that("write_vc() handles the split_by argument", { + root <- tempfile(pattern = "git2rdata-split-by") + dir.create(root) + + expect_warning( + write_vc( + test_data, file = "unsorted", root = root, split_by = "test_factor" + ), + "No sorting applied." + ) + expect_is({ + z <- read_vc("unsorted", root) + }, + "data.frame" + ) + expect_equal( + z[order(z$test_numeric), ], + test_data[order(test_data$test_numeric), ], + check.attributes = FALSE + ) + + expect_is({ + sorted_file <- write_vc( + test_data, file = "sorted", root = root, + sorting = "test_Date", split_by = "test_factor" + ) + }, + "character" + ) + + expect_is({ + z <- read_vc(sorted_file[1], root) + }, + "data.frame" + ) + expect_equal( + z, + test_data[order(test_data$test_factor, test_data$test_Date), ], + check.attributes = FALSE + ) + + data_file <- list.files( + file.path(root, sorted_file[1]), pattern = "[[:xdigit:]]{20}", + full.names = TRUE + ) + data_file <- sample(data_file, 1) + raw_data <- readLines(data_file) + writeLines(raw_data[-1], data_file) + expect_warning( + is_git2rdata("sorted", root, "warning"), + "Corrupt data, incorrect header" + ) + expect_error( + is_git2rdata("sorted", root, "error"), + "Corrupt data, incorrect header" + ) + expect_false( + suppressWarnings(is_git2rdata("sorted", root, "warning")), + "Corrupt data, incorrect header" + ) + + index_file <- file.path(root, sorted_file[1], "index.tsv") + index <- readLines(index_file) + writeLines(index[-1], index_file) + expect_warning( + is_git2rdata("sorted", root, "warning"), + "Corrupt data, incorrect header in index.tsv" + ) + expect_error( + is_git2rdata("sorted", root, "error"), + "Corrupt data, incorrect header in index.tsv" + ) + expect_false( + suppressWarnings(is_git2rdata("sorted", root, "warning")), + "Corrupt data, incorrect header in index.tsv" + ) + + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) +}) From 35f41cc23218c4637756bd7d79c5614722c8351f Mon Sep 17 00:00:00 2001 From: Thierry 
Onkelinx Date: Thu, 17 Sep 2020 17:12:56 +0200 Subject: [PATCH 07/23] add vignette on split_by --- _pkgdown.yml | 2 + codemeta.json | 2 +- inst/split_by/read_timings.rds | Bin 0 -> 7810 bytes inst/split_by/write_timings.rds | Bin 0 -> 7867 bytes vignettes/split_by.Rmd | 319 ++++++++++++++++++++++++++++++++ 5 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 inst/split_by/read_timings.rds create mode 100644 inst/split_by/write_timings.rds create mode 100644 vignettes/split_by.Rmd diff --git a/_pkgdown.yml b/_pkgdown.yml index f81a74f..bd478a9 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -15,6 +15,8 @@ navbar: href: articles/workflow.html - text: Efficiency href: articles/efficiency.html + - text: Large dataframes + href: articles/split_by.html - text: Functions href: reference/index.html - text: Contributing diff --git a/codemeta.json b/codemeta.json index ba94061..08f8dd0 100644 --- a/codemeta.json +++ b/codemeta.json @@ -218,7 +218,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "586.414KB", + "fileSize": "614.847KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", diff --git a/inst/split_by/read_timings.rds b/inst/split_by/read_timings.rds new file mode 100644 index 0000000000000000000000000000000000000000..c7b63a4366a003610aa09f47029913dde454601f GIT binary patch literal 7810 zcmV-|9(~~-iwFP!0000019f?KJXQZ6zdchTrIOH)%19`Yo+|BK6j33{y7%0DU6j2^ z!G@5e%gQo0Xy3~H^p3*_$??vsE z`^!UyCeJ_~lpfSCrN7hwdh&norD$@?{(tyL53q?ijSgLI>bX4aqIglX#c6A48Xrn; z>X+h8`9qv`mU=Jy_Z?+Z3EHpp-(SktGGhPgO!22|Pwk^-9}WrNe%D^rN1nV4`nOK0leh~(4*Uw;!VY>l-R%WNYU;o zdq~n~QT7@b|8&2|iT!I!Y9HMWRBTgvQ2VGnp!QID4AhtMD?R2Z9?~@aRDMu&DH}+M z{cB??)+ib1ex>%&v_+?1{9XDGhZS-Omr zO(@+4;v(H=Qeyw+p(Kru+yI)C?dUl|ogb(_rI#G-j3n(WJ?157=LYho=N%P?biAkh 
zCPCv*$0<5S(QQcK8I>ROc$60V*Vghh9&|aWc%b5BAXd}yi0*&7jp=wz$8%ZQMaLUD zPEq!xO(B+_GIAuF28gF`?lA!UI96pnGjyL)m@i4AN9jK>&**#TcuB|bfw)P@O2rAC|3J*5c+>j}dX7;sF%a`*X>_G& z?*{fivSR;lK2r20#QxQJAf^w*e7X+@`i&l+R9sLp(&L++L)1MT2k89@J-)?hYsz<0 zwDa`3OZP3^&h)w`N!v3}MtbZ}u}j&4vJ)M9DZBqKETs2p)IGJP_dt~1^jM?oNq$C z-4;~5&~t}cQ+m>|h?0Yf@qsu`pO5IhGrb?9;|V=K>2uJ)nn0bW&uMaE|K^w!jUFY- zz`j*{0DVet8kS4_ky+}GFTCCOZr&X7PPzZ&-=~B#o4a}1KP%&ZR@47knf$Xd{byzN zcP0CWrs+RR6KW~@sQjr zE`C;WJD4w*$zQ3{1}kJ|iqXqDFkd}>`}vAE*j10h3$IAS_p57H%zrbHvZFKIrHTjc zGL7l%G3y|#%1IcsMiW6xR(iL$x`J@LeRxLjaxjjvR@QNSz`S#2^+Z=|uzM4uJPw`) zceBX)bwDOqZu;eiMVa80`VX-g;SBDf)R}6d=0SKzd#^sD4w;s7ri@E7N20@sr}I^9 z!6-ZWcJns{B+q@NzG_1ggzoQ#t!jS?LXBvh9kw6L`wOy;h}{Qq`s_%bU8_J$)wAIo zOF@d_wT4@{q&|(W{BqNmAnBKC=ke$qaAYMVE)3a*AeATjx`9F@h9%BDWGMz><~-w$ z5w>8et?0dPVhf(=`J$qbuLwJ1vnoP-D8fHVO>R})2Ue4;;q)8ENZPmW$Iy@hBr8^% z{ls)1bb-J*FoAo$CbRJyAcU_MetFVJTe$~mc5(I6T z*PSx;9AfTG9_{G=2+_vpu7uxd1vjkY@Jru|2>DvI$|9%$Z1rmzzUP;~FRp&Z%5HZs zUdn8Az55#8Rm*S08%zbkzEksIMG1J?9aol(ib3S!d9!YA+z!q|51FX&U*M})s(8BH z!j8OSmlXEDhnF4e$JM3`u$LoLvq=R>`Lzmh+QmqoS#?u#EgR82r&f>@L5!bVm-}TC zGD^MKjD~#SskeN7IzB~sMSjP*uyGL1UUKfahb2Vu3XZQY zj0bm@!@4;ddOoJ2 z@hkY>+?z9|>@h@%4C9x_p-f@+>vBBXqvs%7H|rO z#*W-!2X;MYqhjf5aMIM4#{|NHv>A*D zXQPary6nB!lFn zcd6#u(clgLG~XuI0IX}rGyH1x5afFL`(eH}Jg#lAj+(Iygu3xTcS{~vyi?0D{ybQV zbs2|s$a6oZj#mD)1BCH~%_IJc1#9id>V~XHu+HAuUp1i;#FA2jMemLwNOov-m!SY& zl5#m`YI=*C2AQxV5HG0itQ4#y8gIAZXK^RxLd* z1aDXK664JGiTOh&accY%K}r{7O0y&u_rP|8ww?_FWR8#G-5dn= zFRO*Aqpu>#+%Vv~*b9WOf0Ef!ng=3|G*?s!h{4M|3>hJa3yFT1HQXIuR-fAyUYn8n zAIP50UkRf4ezxe!NATyJeh|{!36AtWL+93w;7mGpg16NOM9QQO=2tZlYt}xrK;b=@ zVWQk}`6Xa6vz@jTk$IBz=C;FaS#XNOgs&?&%OlV_7gE4wg%u94(YPrz8Z;vLqm7kX_I=4-YXeE<`Lt2eP98X z#5L>tx55TPFvfS#ZrLB;NG;ZobJzUs_%Srle2n>B_@kAOAii zA&Jbp@9ObpyX26rt=(UT_+v!9+06{F4*PPqjMoR-<64-lRTsiCt;Yoojzro;!DPRC zaq!jiakW`ofWT5GxqC$q!P|CK>661!2sHWz74^7--Kq6m?Xd}V=vHa((dq;Dn&yrp z-OCaA={k3A)lcw$uktK;{RjbIua8yfj4%B^6` z^w5YuDTd?+2KQc7&jVql?^Ax*53IzYo-w=B;CmxGZ%M*J@aCs6czw>KURlYTrxw7Q 
z-D`Gd`~-Mrs1|*MYlC#GY`PHO3g= zcRucqZ@~s-&%6~C;Fx-NkN^PgbseWT-`n5}TJuV7O+I|rb<2gxm%(#;V}fMcA^3#3 z`!2Bj0j934<M3>d%}4CiwO2p{AI9@%rEAs$;cphY4WkYUyWunhoxyvFE@0bRc<1 zoW`u`$q2W(IOpP#PDF(*(K#Pw2=BDn$LBdzgI_+pqWzvZ7)QQlk6QW!VKN+PWd(Ea zX722-&jJ&3rxm--g27FQons{lpVn{7x*v=Hp>iNO_vBCzwrrN!VY2UB>K;2$H32Ex z?gt3IlYLZrExDZq?zR@WX%pXrwPx0^x=Ky3Ke|;(nRXz>Ts7Ka{Z*uIRFq4T;KAEX zc6Vi=1ei~UYJ9P+Ls+?Kd2q`a5I0VA&VTk8ME&!JE>ms5HF@#uz`N}T-gSMMr$rOI zf)20P`}_u&#eee3TYUr3=yV3*o7>lS&1eK6(-+~~-#>1qOAI_K|?DYMNaTgQ8l8U(!oMlAzJM(90yY_-r z7%{7O9|2)$dU5AF9vChZu`)Yjbz=#_8`NJJ&(}hv zS>gyyy^HW3dD2M!yex=4?kCd6Jq6cw2d~;!&pSdG^izH6pHlGJx)DZt%UNmA_OYFUa z$$VaUE1=-D6u1Uw)t$%Qhi}I;ix&o);IqtUPNrWFI38iHtwUUp^rEdu_jEdVUA?U6 zaIy|v8T0noyrW>xxIVjh+%)jQ2!`XmRPj`n#1&u-*VJBX)=bvZdzVVOZy~KDbr@sWU?h%OH+fuX4;bHHN$`3&jBW{0NC+)MTk-1vodVu*g1CzidJ^w*

zn<&{JNFs5wXRhMxr69gJI7Ah$fp3PDgYdHh0<^}MY`CcnpGPE$%3EQ7)2QK^EBO%m z|Ke0n5F%)IQQ4et3Gm+;({%MqB1Bh}I|n8SAm_|wD^ zuCoCOt!ePKl+meNrHkOcA5T2GE0FpszgRq*0bX#cwrS%ju%9pT4g9(Z!V6KI3X_Bg zTeqiux5gJl1i$f28NZU8d#i^1e7g{Y|GF{p;Q+hibF}jaHdv!)J5`GDAozUq?T5`_ z5SAOvP=h2yr=$Aojn|&9$G5(_1Y#T~V{pkc_*m`|{}Frz%s5qYKqYZ$8d1GAJqiTZ zF)QZ23y7o*&SM)Q5k5Xz-|*!ZgszOQKd-nEQSY=f3(S?kcQ~ql?FBiX#>c;Zx}z2` zb*KC1P$7|4zRX<#>4RcR$-x*Qh7n{#PWGt{dp2GyxD3IA1-oviUWV7_`$?~(r-S`0 zSYS2251f<3-<=6^0rT9oXS=jnVE%aOaaV!`#=YYowYVRV@_o;u2hS#gc$TC6Idupk zr(9lMSK5N0k=uHg-e?3Xzi-jbnmtGz-7L3Y>;Z)S@btZWsvO*oPYS68-w-lomeU#8 zGDOy{N`3a$3|>=bWv$i}Bl`lA#7q}E5Mye7sO59PjLJz|vNI6erw@Yfh6+F!elG8v zLe8}tq>S=}=aJS%EU=#X2|E)s3XeD1Bg!JeqFV9;saK7Kv9uX>=(v7mH_Zo6fBVGo zc3n8=@1x&x$QFzrUYCzJk$K5#{F1e-1w^Oa*p07AoSqQXS{g&ZPhc{j%P zO|c7j)`O&u*!U<&d~*vv%T$3PdOOei9rL2UBxl`v?(%z@0Vg z9EHaa?0k9avvnTub~zPtt$GK9_WLhq^c*Dn+wD4ia#7$nPGyhOQ3f|@c34L2UU2J4 zZ>l6f_<8YdW_2<6>(dSF&m188Cyv>MpezXH+X)EYH)K9p_vQ?p3eWbmq2p$M0c%}0 zIj#nP=ozG09C94Y8I!JtM=&7L%JkiK;{({QV*BTrwBV0>7o^Sw*U)gt@XnbKO&DLL z_jNzg9Ja1D?)HPv`v4^wVHwz>mV8sOy$IhH%lLBl5C}!9{{Abl{mGv7VLkAyIMy0^ z*BsuBgUaVikn`uw>i&FS){&?(K^d&`2dew_kaB%}|H*4;6@+JOy_J~j;JN02!oWl;x`aS4_u+VpSe)A1@DT|wjvfDxY7(VgbXL6qM(jqS1DnoLq ze40zEDcQGu;pkmi1RtNA1J@dD5VSBQq{w7GJeTVC%!q%9LIrRpG=4U;I&PM-Hw1~+%$3bR%@_- z=GXKs&+^v!)!4Pv(MMSqF25M5um_O0x5 z5DE{JBb5TldX=N_T#B4G0u%Lm#g-!V;rN>8ALqhzm8OyS$`}wct`Ik`^FWwxof%(T z1){3&S#;6OaWWDYDG@tPIAtFlJnw4A^0}7?zgNzll@X#|92+B?7zDvR~yOxXNBdX zBRXK$W>q-J&P7IIkj>yFt#?_Dm9ohFaOMe(&i$LX53b_HT@bI4-Gudk}yrQaVh?8WU zTp3)qU~vMNV;c+%OMdqqp>so<>%iYyZA*4^q`o>8GZrR-ucSz}DK=m^4hq|8w z;8ZSeP>AdRVM25jK63`KWb%A#ApsxZWCZ zPK|GsS@Z)0ELuP5=uiQeH!8kqHk<~_sj*?^$&uu|w`!t9wH$~SDu3qdr7Nv!D?Px< z*vR6=>wq=w&ez9<_27jw*o`ViV9#Ooe+T1eHud+1BCSiwT62>S+9AEpIKmR_n{)HN zMdU$L)H=HB)KZ9S_*)gc_Q5M{NcgBN_dxjC8{cnvj#QOnwPiENKJDV#@b&u`W zM}sFTWyhNElw{AcEB` zy^|k-_fY-!hhBuX{4Q%gh%}8Sb8rQAwP6miVFPxwdf(75&_MdDpR(gGGLatdC;k0n zB@(hewtk=23*JQS8)YFX;5cq-Y*}Ir*6nW&X@QHuR}7H7zH}2p%9lmYm?(?Dq;26A 
zZI#Fx6FWgxD;)k)FBv%{J_XZict0NTv*jlj#I}RWX_u>A?+8JU%Q^p22ADejq4`g) zfs;79>z!mYLb?rqCj|UM&F|J!?t*uIg~6+lqu?XRj?1o4LFmM;Dc#q~@tx zY?%b!8TY}f?`#B5eP7K(|fY4?1V5qe{-*oJ@}K)|Ce!E8{EFbj;7J%{NT?#?K}Pqcs=J{#YeNj zYyMs4c7i=xoMSw(60Db`8|@F^U{TZgr$b8+Ss?vx&xC2<4%ZtUY#EN7hr{JlpX7tR zY}$R9X>6o^e!V+d>;S@Tr-yVDy+;J=-V?VfDN-JT-wiD|b{)M^0ka{r`29II03yR_ zY1W$;AT8J{Jlv_?8_h^Z0@Sg_z2Eg@5wb`M<6uK z^ORn{0o)B5{d2|cktY3iU{C+~lN^+hHR`98vWy(UyNJt-`vOGlXFZSqkq4%#U0=Cc zGnkq6Qa^@O4!^H#t?J3Q0_Hm>Bttj#Lp_H4DU{m*8 z4G#1v@=ssiPyas~CHe*s9PH!a%4WK_FIN)x7uh!IVi*lwduOIHzN6Nrv5@g74wtl>#CniTN}{dvUxJpYbK5Rc@ax7eTW zqW`TQ?u-8UCh}*V$e(eEKkG&Q%n_r^A@`G@j7$BQM?U|KiT{}+^ye*3SuaJICq|J$ z$|Q9XrnJeuBuD5ZOwnDKvacv*Z_+*zl=D+{pxRC7&v)_vVK36gR2|4~l9V%3?IT86 zL&_DR{e5QAZq#|?H&Pa9L-O20w7>Neq1b@5m*j{!LbU%M7b3L3@0GM2DU*7yqF=13o>`Wp32RJ@8&+T@*xQ1&6uO~ojAek!jBQ`S*AMVQhjaY3Gk zJf9F{9@SRlzEu25(*DLhl}pL}sP{nHM2aGx+>`V(5=*2^az9DRd?8Akv@g|PNn4GS zOZ9USJER^|pBAUgk)*UqIn*^&-zLvM>MuljNZqJ7r*bCM_o&z=&qL}@-iI*lZ|)(- zsCP}`Mw0T7+%8J{+s{b<5&Iu|lQt&xp~g@uFOm00exq^#mFK89pmGRlQ!4hT_MpZP z3EJNnBe6`{iW=|8d!oi0DtA!zqWU#87L)U+F+rTNujGg}RW3PB`VQ4EMsg>~3nbR4 z+(eF%`;U|*Mv*~{C!)0f+xMxt0hv#b=OX2ix>Cu9K%KgG4_8@Ud;$UQcH3QZYxKe`J0xMv+V92}+C+{ZFpwe^@zYd)oGx6VVzstihjA2dk6L7RrD1 zRQ~F%_^YS-S5NJ)p8B7j#D9CL{nb?^yORIOSh;+Jeqz?gtei;o{y(J;T3I{WpCnp? 
z#54BaG8dlKzF~%vw^8hSV%&l4b)c`mTNId@24<*Ne3(TXSSJqGZ}*Y`>mNstqqZ4% z@QwSZJL(Lg9wf$n|6z}-nqs^_&NL)D?kSIHpNH5nY@1%s=LosqmzXuB7Ga;Z(r({5 zfM9KY!vMnt=T}=lb&1eGxZ?Evo0g~|ULY~#TDBhCS(OF7wpWnj`O2Hws{poRVro;; z0DSD$MP_7?z4UeJB2iGo{bBe!I%s_&FSqvv#sE0bT<8Z<%aM{ZIRaI&R~11 zN{B3?BiwS+LSYX+f_OtGmrW}{X5{&LQ@Jg;q}XNeHp&8J2{F}Q?ia#W>MXA6yMQy= zO4xOeD1256g%wQ_1K)(@kS19MUc2z*TGn}#OQ@-hHz@&oXJq%~gk(e=5j4zHpAR3y zqmS7uE`wciajLz)_r=+E8xX{`ee3r zJNR`C_X?9`;d3?XNUZT_@V6vx9Xov+eC-yP%v0NdK!%={$Aq1zNHLuJYHB^0yFIF9 zT#g~){Ql)xo4Vm!IGxoT>jduCmWm>#EFLOmxmKr~hVR?@J^gl3;F;Xh2uTzJFQGHp zGh;D)mfg;|n!gLadRkv9XFUb8*zX)`w=KB&eb<(Eg~OlrkMiYRU0_?r?JoMA2fsR% zy!VT4!6$Ueo}7&m`?{vFZIWQd8Oj4zR@pacDE$7cK?LWv{yfw$(r!j z)EG1Hjg6p}nVv`2eMDH|+0_^J4G2>=lW#N1VoNKeBftP93sS9Bl!x7 zh}3;6ku&Qfc;8-2>=|uL=-r-wS0Mn5N{h3mAC-~88{mIyJPMA{#u|FiHIz?nk`-hs zf}<`S`+UP{_{b}WI6NwYUwB8y^`llO&0BSX7H|&S(@%90G-=>^T2xmb@&iLpg@BPK zGA>;;kXlj&mRn-ftQqz2O@VmzlW3I2RX#|PvOs#@hE3rzr3l`nqUKoh1m|}SbY9;` z=*_Q*OwbKSSkapvrHu;_ylnQDGi!&yKiut8&szz9=}~iMIoyO_XhoY@oFDifl!LZv zY9VBP+o2;TZ4tJ}(@&Ir5y1mU-)rcJuvZZ;PwTiNxnlZ7scLIni+`>yJQN7uG3gKI zEqRTQrrtr?RxfZRr3>z4?Eq`(728ztY^jW>62T2 zV&7A6dvix^%(g_wo5VGgg5_R1 zW@h3F2$l;b86@oo$0RPY`sf|7r{p`*0l%O3xr75CGwHIH)}rO`m#_0kQ5a3&W`q8ij4^m{ zT}$SjYX~^at?ayf3HUB4{-QQp5zN&WKI|~6MTwr5wzbOzc!Vj(Xj^Q8_r0ge%^NIG zk}{v85E4T0?30z)#Hx}KDo`5pDpIUM&OW;3SYf}C5 zY2c}Ar-XXU0$)}r^}^>EFy?lc8$UXR;OXntU*0rFAa~(RLxwpxv&I@ZI_?9rZNWfE zs65iwh}x*`8$w!3USEa!G&~e##j1MxgY`-(BQa|#LayjMw3#dfrjb^}{ofnFus%|w zCoO~2FN5a%*#r+?IdW=KT^|^Q&DoD6*hp0#*QVn<72Z|HCxj`Vg<#3+^M1#z;Qd>! 
z*8$wBx zs)(g{4cPuGe^q!#gX{EmVQ<%VgzS6VpZop;QWR+wmZL<#v;6XThWv5R6^aAjSY8G9 zo=(qY_xIq>%`6SLK=4hNg=UYC5t#m4-+HubfD?awO0D?|aK~jRw?2yo=Y`)XEeB6L zsLF7BX_p07f264EC^kHO&V8CP-k#vE%l?`3{86GEUjFrAKM^N1%U&+N5B6j2UAZa0 z;gcVrp%LH+KepSX)TiteCUnH>V1W zTOWq~hsV&)<;N5DUZywAV2lOA-3l%?>@I}g8fI;y7Y`Y!bJOPyMnZ7iS2)Dm1($x* zd@*f#4EDXbr;iCV4YJr1bQD4uZ3Jb&ai`zxr#q+*&GBuSY=XHt^!Vf zTSoaY0r-n)Vh?O*f-SFb*?ZDwq%YBNzk6aG1pQ_6oR?N1^n+=T&wLB;=rcuMw2^-#SG=$&r0(cy;?(mv%fK?F!qt#;Rr9t7jacP>}g9IRd4p&Fu+5J;ch73t0g z{aj9g(ATfvPS8)9&5r`RLG_s+*ATpeSGdKrdtm(@+gW$94$SgVeSseN;8b%ih<%&~ zUdh_k5q(TVI(3O%javxrbyKC+d(R=Y-Dv0dBYt3-Z@KL3eFfox9tQgN8FIM zcOn8V1YR<%*G9yxk1h$1A3+e)_K#Sq6hdRW1Gicc=Mz}mSfbPjp9;y+smqeVoV;(I zo~8{Lnhmr^Y!ig0N5#HN%>#W@qMBs)7Ni+JUscnX4Bop`)*P8?&@JyeybL@J?le($ z{GFW;=-W?eDmFowu=Eu(WiOwMG7RI+*vd z>H_l)82oFK*SFh%oAsOX_*MZp52afB=er{7L;BtT*qoG6Q4Jvi&XlQo;S$ z<6SB@fXK^+E^+5&ko>W7Mc!%^B=>EX-uPq|xczCa^q&N;cjUfnlB$Gva-hx`y)XpP zYjwjO=z^}>m2k7J5)7e-hl)A9;HrI=%r{R#>Dwt!O*JGDI?q^e&eR4h6^n%|CJ($U zjg8WkqY-*)PSeTMLioI$@VRnRI0TH_vbqhgLBHC&jT*W8z(+07;I@>)om+V;U%_bQfxaz*v*?p+qVfkjoArnhI|P>T4Qt}j1AVurqhP& zo#D~4@y(+mS$KAD5DwUJ6Tw^~8`-LHV6DhGm~6ns1I6Igiu^kW+cWI|5wIcnWw+?d zY9nxd7@7t5j0Mx|bxTjfTJSQ;=#4_xAUNYvP;fRFT(2$K+U+?AE!=5Sx~LoJvkj&w zi1dJ)xqI&2+IG+%-cnMvQU<5vOThZMIs}f&tF29Sz_ZylAo=7L(kf33=Vf8e8rm}! 
z3xic8Q@=C!5xDBua`3DL*qd!%NSSt`)TwZBTT4BWXDv>%9ZSTMQ-$xmf35=CB7#VV zbl~GCR;gpX3u%sbpO{*d!TXoUjf=78!I8_&HnbH6TZwsNy3K2(g`6MGgVL%Xe_{J7 zq@3Hn$fYa*p@ypQ@m-Bzn+rYbs&fZ7Pl;cj)q~XC>|b3@qTo)H436vFjc|*hN;Cfw z@DC*n=gYqLMSmwP{sKRNX^PZ@WcZG*S#|r>cOqZQy>v}~CrZ*qxA~So0`u~F{VtOx zu&jflZ!O>>z)s`26#WV~Hs?;%yEuS7;Z4EoHS6F_)9qs*Trvu|D#t}E=wJL zn=+POpPmawiK}dLQUiQD4feihO$E>Q(Rw|hOz`FOl^bL)BHc4xsB`9KFxnD6MvQWR z53^iaZOlTH^f)CI7Dyv~XO_YG_bOl+*R8bqF&$|W^R*Y=)da7GIQ@h$qONQBen_kw zVG9ilRM$U&K-;Qt=dx0SI!C@&++&B($@{alj%`Dls}k>1Y3e= z308|t{((kKu)B8GpG{Z=&f$)+Gd-Na>9(|8F6NDtrADKogVn+4el(o#u#?NSz7tgh zy(#9Vy|d~B{X!Ax+5BR~HgoVbuVi^=eS)9rt5QpM9e6F3x&LeYPjEJ8H8=0) z6MVRZ2)lYnopkW?YS%b;XPdu$I`A00wFNmt>X`@`(mfYGV;@2Jn9;X^!*K(Xw`B=R+CLD~dD+d@ZTHqdbY<(6* z+;{YP+lOjh1b&j`*7Phu=*%a_*TuOby`;JDRRs;6=fhodrf?DZC_boXMLN!IH$B&Llk&;76*M!NL559wO!hMD%uK{IScZlxsAo$p2 ziTWq~C~(X~WUA;yyjdO9=3Gug@;JXUvl0ou(TF_qkM4bhy{S*HN?8eEKKk1Fp6y7NEciDe2XFDLyKi%~2zzK) zy)H=ucZOfod#!8W%p5JdV9O7L?euzia@<|`(8{+r%sGtGu{4Wnz2)G&x_7B2gQ#~R zT6FANr-H@%He64YO;mq(VW%C~g9$`{+fU$WfrWi7QAad7^>r;^qg20hXh(e`1oCQg zI~G(UoR!GgmUj!h-+SNPIQs&8ja>mDPfZae&uaHxKeoVSnZCnD@)JP%2!b#vWk~5ZR8iJppC*h!80nYp5Q};&1fUW#nj=6!zb8Y1& zt#ti}$fEJzmM6U>>ZH?Se)>t_($ja_ZQohql8obU-pJd#^W@>Wg$UTg|1`__6Zkse z3jN0`;5XVcuya8MQFjaKmWRoM-ISO=_UA#c)-Eph0q@7tnN>@OxNh`OMC;5OMARv#)JF}1SJ!3#GH)q3(Ru%9F3m>J zc9CKGR;cB#l*}k6^8b^z3lhY@tA3l9)+z`8G~H*tVjA$z{ng%`z79SrIWumYKZ-!v zz{ze7QpgINVSQA}8vN^9H(dNqM{dQ~$+_KG$m!9_HY@W4OV7<*ZJh_A7i%*Px-3O_ zX~jWVW)K2(&pJt}nZQTs+HfAmx|lk+wW0}(el?8;J)gmBa*XRWBJA-!T~F!!T9n_= z5bobk)GcYNg5OMB0j9mTPPalm*u8^o{+&)J>tal?npuyU?ZwX<#7842cFcP5_m2>s z>i<*s^D~5`oQU+_@d`BoC%neiH6n1~V_`)d2ly7XcoBpIp5tA*?5hSu7VC4j1^&im z;h`02iG=;;Waru6_z9-XP`Rs_Is*5N6F463MPV4 z+8wgQJ_jOyIGtLLfYnnK`GqX_MtuM0*XMDlP?Y`9(N=`aSW#>X2-TeFlSq;3Vik-p>RBfUu9*O=s4^%1Er7j2#7Sq5fO%|VlzrQjI{ zpIA8&gk4wKcV@i-qx6oHno1ke+e=5Q-b+X6)FT_O1mA!lX{z^uqyK=_d0A#><~gKq z*gH$hx*Y<~qiz=3IpAc7oO_rk1fP{U1T4%^7Ex{bbX6u%ufLH9esB{$heC$yeSuZ% zCeuAO@P779Bw$e=xJ5O?`4D4&#|4whesCm8w*~h5f_ucPH_v)51d>8~b@J6va^ka2 
zwZ$UPEoLe%by0!0begQo}1v%t7{ZcYE*8n6rw+l$6Tqa>p7 z>Y?~vgpSiA)R~G9$BBon+82S{${z4(DnOWrfn&)!4X|gO=dDTH08aifB3$i4N!vd^ zt_LfkMDv1T4C^dX=N?jdWw0Mi(TUpOm27a^mp}hqa+9dHG{^7n-i?yoN&e;Dcfl=> z^GyD=9qbZYq8!`=Z~1dQSz8x?;Tx*-eEthOa2YlHdpEb)r}};^Q6H*$%(zEuMasNG zJ#O8=gF?+SQ`Uvx{$yw83uE=cO8m*K?>&v;D21)u!YTwgBrR7$H|SGicUk^?2aa@e z%j*9gNmAZY%j&l{d5M%<;IjdoZF`^j$WRRK zof~&+bzg_~mHE9l!`^_McMR1TY4BElbGCb05{kcko;~V56CSLrZ>5Puotz#t&2^CA z{bMVMYp_9ao5N@Sea;998yYLkx&!(hi{X4L*oosk$6E$0jT5&DcGsdfz%0eUN*msz z6Ovom*TGuTRF-(s671NQyEPvC0!!_8t^TC3V9lyoCb-=O`j?<3M8E*c0;6~pTEsni z#G9n90cUb>(n`j5(C5-WJF5tRHA&_6KY~aUfBTTOFXK17(u>AiOkV+dG(GRoZa2`U z)yNN(=)!x{A)5uICGcK3IFHp;3P$S8w~NJzI(h5e<13_ez({e_FBa_tJ#+I*14b1X z?U_3~?-TV3=+=|x`{RD9-qWg#$4E9Wt^1kRhcuVPn-2O`J!OM3YCBJZHhZdd2BiMmqrsgsR6JVr}|)GTxd zBl6k7RqNNn>sg-%0Z4>|nl}dRIDz8+0}dRwXW-dsuk7?QLQL5TOY!Qb@W1cTEGzXD zJX0Uf-sU+-owwXL(d_|3%k_qNgZ^}h=fb(8z>awOAZ?92JZ^7KEFRB8RPewjnXFEf z&0HGKj7ovG%Fu!3r;8Bc_EbzFr4X#Q1^RCJ#$dfyxSQ6~4DXDt-&Qg*V9@POiY_uG z^5avnRoO#eWq6(stA2qH+TCo|xjBe@mHf`~&=znseJn=_hk|p8#xpqA2fEMx>hK(b zk0LZy6vk!~`EQG->dlQ{=nXP{p^(af6$RrAo8H1+d)GIj2+SqxfgayG~Ug zb8XD&2#Z|^6aQTpdU0|hHJi0Vh0j##|W?$D@f?hJxI2;8AP5V*L>1@AQtS@?7cI3cwkh-Cyu$E zfTWuTUu1r50V82!PRB(jBF{T_>wq?azb*bUZI=wdVICb^Bk~5UUH*1~A41^0+5PRr zG1*{rBsNH|bwu($t%`ZO;=y~cNz1=!3w$Nj%Zt)#NB&#!gw=6d^52TWe+ETu&p4bU z+s<~!ZAUDUI%)5+j9fV8--UGI&(P3s_x~me4~^Jx>~TBmllBK}Pgoy5ZgukLpHXQW ZD`%@^2Txl4D}Xlme*iH+Zgj{i001#Fv~B + %\VignetteIndexEntry{Storing Large Dataframes} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteDepends{git2r} + %\VignetteDepends{microbenchmark} + %\VignetteDepends{ggplot2} +--- + +```{r setup, include = FALSE} +library(knitr) +opts_chunk$set( + fig.height = 4, fig.width = 6, + collapse = TRUE, + comment = "#>" +) +library(ggplot2) +inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517", + "#843860", "#C04384", "#C2C444", "#685457") +theme_inbo <- function(base_size = 12, base_family = "") { + rect_bg <- "white" + legend_bg <- 
"white" + panel_bg <- "#F3F3F3" + panel_grid <- "white" + plot_bg <- "white" + half_line <- base_size / 2 + theme( + line = element_line(colour = "black", size = 0.5, linetype = 1, + lineend = "butt"), + rect = element_rect(fill = rect_bg, colour = "black", size = 0.5, + linetype = 1), + text = element_text(family = base_family, face = "plain", + colour = "#843860", size = base_size, hjust = 0.5, + vjust = 0.5, angle = 0, lineheight = 0.9, + margin = margin(), debug = FALSE), + axis.line = element_blank(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text = element_text(size = rel(0.8)), + axis.text.x = element_text(margin = margin(t = 0.8 * half_line / 2), + vjust = 1), + axis.text.x.top = NULL, + axis.text.y = element_text(margin = margin(r = 0.8 * half_line / 2), + hjust = 1), + axis.text.y.right = NULL, + axis.ticks = element_line(), + axis.ticks.length = unit(0.15, "cm"), + axis.title = element_text(colour = "black"), + axis.title.x = element_text( + margin = margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) + ), + axis.title.x.top = NULL, + axis.title.y = element_text( + margin = margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), + angle = 90 + ), + axis.title.y.right = NULL, + legend.background = element_rect(colour = NA, fill = legend_bg), + legend.key = element_rect(fill = panel_bg, colour = NA), + legend.key.size = unit(1.2, "lines"), + legend.key.height = NULL, + legend.key.width = NULL, + legend.margin = NULL, + legend.spacing = unit(0.2, "cm"), + legend.spacing.x = NULL, + legend.spacing.y = NULL, + legend.text = element_text(size = rel(0.8)), + legend.text.align = NULL, + legend.title = element_text(size = rel(0.8), face = "bold", hjust = 0, + colour = "black"), + legend.title.align = NULL, + legend.position = "right", + legend.direction = NULL, + legend.justification = "center", + legend.box = NULL, + legend.box.margin = margin(t = half_line, r = half_line, b = half_line, + l = half_line), + legend.box.background = 
element_rect(colour = NA, fill = legend_bg), + legend.box.spacing = unit(0.2, "cm"), + panel.background = element_rect(fill = panel_bg, colour = NA), + panel.border = element_blank(), + panel.grid = element_line(colour = panel_grid), + panel.grid.minor = element_line(colour = panel_grid, size = 0.25), + panel.spacing = unit(half_line, "pt"), + panel.spacing.x = NULL, + panel.spacing.y = NULL, + panel.ontop = FALSE, + strip.background = element_rect(fill = "#8E9DA7", colour = NA), + strip.text = element_text(size = rel(0.8), colour = "#F3F3F3"), + strip.text.x = element_text(margin = margin(t = half_line, b = half_line)), + strip.text.y = element_text(margin = margin(r = half_line, l = half_line), + angle = -90), + strip.switch.pad.grid = unit(0.1, "cm"), + strip.switch.pad.wrap = unit(0.1, "cm"), + strip.placement = "outside", + plot.background = element_rect(colour = NA, fill = plot_bg), + plot.title = element_text(size = rel(1.2), + margin = margin(0, 0, half_line, 0)), + plot.subtitle = element_text(size = rel(1), + margin = margin(0, 0, half_line, 0)), + plot.caption = element_text(size = rel(0.6), + margin = margin(0, 0, half_line, 0)), + plot.margin = margin(t = half_line, r = half_line, b = half_line, + l = half_line), + plot.tag = element_text(size = rel(1.2), hjust = 0.5, vjust = 0.5), + plot.tag.position = "topleft", + complete = TRUE + ) +} +theme_set(theme_inbo()) +update_geom_defaults("line", list(colour = "#356196")) +update_geom_defaults("hline", list(colour = "#356196")) +update_geom_defaults("boxplot", list(colour = "#356196")) +update_geom_defaults("smooth", list(colour = "#356196")) +``` + +## Introduction + +Sometimes, a large dataframe has one or more variables with a small number of unique combinations. +E.g. a dataframe with factor variables. + +In such a case we can use the `split_by` argument of `write_vc()`. +This will store the large dataframe over a set of tab separated files. 
+One file for every combination of the variables defined by `split_by`. +Every partial data file holds one combination of `split_by`. +We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash. +This hash becomes the base name of the partial data files. +The combination of the hash in the `index.tsv` and the base name of the partial data files makes the information of `split_by` in the partial data file redundant. +We remove the `split_by` variables from the partial data files, reducing their size. + +## When to Split the Dataframe + +Let's set the following variables: + +- $s$: the average number of bytes to store a single line of the `split_by` variables. + +- $r$: the average number of bytes to store a single line of the remaining variables. + +- $h_s$: the number of bytes to store the header of the `split_by` variables. + +- $h_r$: the number of bytes to store the header of the remaining variables. + +- $N$: the number of rows in the dataframe. + +- $N_s$: the number of unique combinations of the `split_by` variables. + +Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation. +The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$. +The `+ 1` originates from the tab character to separate the `split_by` variables from the remaining variables. + +Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables. +The index file requires $h_s$ bytes for the header and $N_s s$ bytes for the data. +The headers of the partial data files require $N_s h_r$ bytes ($N_s$ files and $h_r$ bytes per file). +The data in the partial data files require $N r$ bytes. +The total number of bytes is $T_s = h_s + N_s s + N_s h_r + N r$. + +We can look at the ratio of $T_s$ over $T_0$. 
+ +$$\frac{T_s}{T_0} = \frac{h_s + N_s s + N_s h_r + N r}{h_s + h_r + 1 + N (s + r + 1)}$$ + +Let's simplify the equation by assuming that we need an equal number of characters for the headers and the data ($h_s = s$ and $h_r = r$). + +$$\frac{T_s}{T_0} = \frac{s + N_s s + N_s r + N r}{s + r + 1 + N (s + r + 1)}$$ + +$$\frac{T_s}{T_0} = \frac{s + N_s s + N_s r + N r}{s + r + 1 + N s + N r + N}$$ + +Let's assume that $s = a r$ with $0 < a$ and $N_s = b N$ with $0 < b < 1$. + +$$\frac{T_s}{T_0} = \frac{a r + N a b r + N b r + N r}{a r + r + 1 + N a r + N r + N}$$ + +$$\frac{T_s}{T_0} = \frac{(a + N a b + N b + N) r}{(N + 1) (a r + r + 1)}$$ + +$$\frac{T_s}{T_0} = \frac{a + N a b + N b + N}{(N + 1) (a + 1 + 1 / r)}$$ $$\frac{T_s}{T_0} = \frac{a + (a b + b + 1) N }{(N + 1) (a + 1 + 1 / r)}$$ + +When $N$ is large, we can state that $a \lll N$ and $N / (N + 1) \approx 1$. + +$$\frac{T_s}{T_0} \approx \frac{a b + b + 1}{a + 1 + 1 / r}$$ + +```{r ratio, fig.cap = "Storage space required using `split_by` relative to storing a single file.", echo = FALSE} +combinations <- expand.grid( + a = c(0.25, 0.5, 1, 2, 4), + b = seq(0, 1, length = 41), + r = c(10, 100, 1000) +) +combinations$ratio <- with( + combinations, + (a * b + b + 1) / (a + 1 + 1 / r) +) +ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) + + geom_hline(yintercept = 1, linetype = 2) + + geom_line() + + facet_wrap(~ paste("r =", r)) + + scale_x_continuous( + "b = N_s / N", + labels = function(x) { + paste0(100 * x, "%") + } + ) + + scale_y_continuous( + "Relative amount of disc space", + labels = function(x) { + paste0(100 * x, "%") + } + ) + + scale_colour_manual( + "a = s / r", + values = inbo_colours, + labels = c("1/4", "1/2", "1", "2", "4") + ) +``` + +The figure illustrates that using `split_by` is more efficient when the number of unique combinations ($N_s$) of the `split_by` variables is much smaller than the number of rows in the dataframe ($N$). 
+The efficiency also increases when the storage for a single combination of `split_by` variables ($s$) is larger than the storage needed for a single line of the remaining variables ($r$). +The storage needed for a single line of the remaining variables ($r$) doesn't influence the efficiency. + +## Benchmarking + +```{r load_data, echo = FALSE} +airbag <- readRDS( + system.file("efficiency", "airbag.rds", package = "git2rdata") +) +``` + +```{r set_tmp_dir} +library(git2rdata) +root <- tempfile("git2rdata-split-by") +dir.create(root) +``` + +```{r get_write_timings, eval = system.file("split_by", "write_timings.rds", package = "git2rdata") == ""} +library(microbenchmark) +mb <- microbenchmark( + part_1 = write_vc(airbag, "part_1", root, sorting = "X"), + part_2 = write_vc(airbag, "part_2", root, sorting = "X", split_by = "airbag"), + part_3 = write_vc(airbag, "part_3", root, sorting = "X", split_by = "abcat"), + part_4 = write_vc( + airbag, "part_4", root, sorting = "X", split_by = c("airbag", "sex") + ), + part_5 = write_vc(airbag, "part_5", root, sorting = "X", split_by = "dvcat"), + part_6 = write_vc( + airbag, "part_6", root, sorting = "X", split_by = "yearacc" + ), + part_15 = write_vc( + airbag, "part_15", root, sorting = "X", split_by = c("dvcat", "abcat") + ), + part_45 = write_vc( + airbag, "part_45", root, sorting = "X", split_by = "yearVeh" + ), + part_270 = write_vc( + airbag, "part_270", root, sorting = "X", split_by = c("yearacc", "yearVeh") + ) +) +mb$time <- mb$time / 1e6 +``` + +```{r store_write_timings, echo = FALSE} +if (system.file("split_by", "write_timings.rds", package = "git2rdata") == "") { + dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE) + saveRDS(mb, file.path("..", "inst", "split_by", "write_timings.rds")) +} else { + mb <- readRDS( + system.file("split_by", "write_timings.rds", package = "git2rdata") + ) +} +``` + +Splitting the dataframe over more than one file takes more time to write the data. 
+The log time seems to increase quadratically with the log of the number of parts. + +```{r plot_write_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for different number of parts."} +mb$combinations <- as.integer(gsub("part_", "", levels(mb$expr)))[mb$expr] +ggplot(mb, aes(x = combinations, y = time)) + + geom_boxplot(aes(group = combinations)) + + scale_x_log10("Number of parts") + + scale_y_log10("Time (in milliseconds)") +``` + +```{r get_read_timings, eval = system.file("split_by", "read_timings.rds", package = "git2rdata") == ""} +mb_r <- microbenchmark( + part_1 = read_vc("part_1", root), + part_2 = read_vc("part_2", root), + part_3 = read_vc("part_3", root), + part_4 = read_vc("part_4", root), + part_5 = read_vc("part_5", root), + part_6 = read_vc("part_6", root), + part_15 = read_vc("part_15", root), + part_45 = read_vc("part_45", root), + part_270 = read_vc("part_270", root) +) +mb_r$time <- mb_r$time / 1e6 +``` + +```{r store_read_timings, echo = FALSE} +if (system.file("split_by", "read_timings.rds", package = "git2rdata") == "") { + saveRDS(mb_r, file.path("..", "inst", "split_by", "read_timings.rds")) +} else { + mb_r <- readRDS( + system.file("split_by", "read_timings.rds", package = "git2rdata") + ) +} +``` + +A small number of parts does not seem to affect the read timings much. +Above ten parts, the required time for reading seems to increase. +The log time seems to increase quadratically with the log of the number of parts. 
+ +```{r plot_read_timings, echo = FALSE, fig.cap = "Boxplot of the read timings for the different number of parts."} +mb_r$combinations <- as.integer(gsub("part_", "", levels(mb_r$expr)))[mb_r$expr] +ggplot(mb_r, aes(x = combinations, y = time)) + + geom_boxplot(aes(group = combinations)) + + scale_x_log10("Number of parts") + + scale_y_log10("Time (in milliseconds)") +``` From 8d74022c49eba206a86ba052970975ffbda389db Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 18 Sep 2020 14:39:14 +0200 Subject: [PATCH 08/23] Change split_by storage Handle the case where a file stored without split_by is replaced with a version with split_by and vice versa. Also check changes in split_by variables. --- R/write_vc.R | 17 +++++++++++++++++ codemeta.json | 2 +- tests/testthat/test_f_split_by.R | 27 +++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/R/write_vc.R b/R/write_vc.R index 6c949e4..4acbdd0 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -106,6 +106,10 @@ write_vc.character <- function( } } } + assert_that( + unlink(file["raw_file"], recursive = TRUE) == 0, + msg = "Failed to remove existing files." + ) if (length(split_by) == 0) { write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, @@ -232,6 +236,19 @@ compare_meta <- function(new, old) { ) -> extra problems <- c(problems, extra) } + new_split_by <- new[["..generic"]][["split_by"]] + old_split_by <- old[["..generic"]][["split_by"]] + if (!isTRUE(all.equal(new_split_by, old_split_by))) { + sprintf( + "- The split_by variables changed. + - Split_by for the new data: %s. 
+ - Split_by for the old data: %s.", + paste(sprintf("'%s'", new_split_by), collapse = ", "), + paste(sprintf("'%s'", old_split_by), collapse = ", ") + ) -> extra + problems <- c(problems, extra) + } + new <- new[names(new) != "..generic"] old <- old[names(old) != "..generic"] diff --git a/codemeta.json b/codemeta.json index 08f8dd0..8ad6d43 100644 --- a/codemeta.json +++ b/codemeta.json @@ -218,7 +218,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "614.847KB", + "fileSize": "616.079KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R index 657e540..789770f 100644 --- a/tests/testthat/test_f_split_by.R +++ b/tests/testthat/test_f_split_by.R @@ -39,6 +39,33 @@ test_that("write_vc() handles the split_by argument", { check.attributes = FALSE ) + expect_error( + write_vc( + test_data, file = "sorted", root = root, split_by = character(0) + ), + "The split_by variables changed." + ) + expect_warning( + write_vc( + test_data, file = "sorted", root = root, split_by = character(0), + strict = FALSE + ), + "The split_by variables changed." + ) + expect_error( + write_vc( + test_data, file = "sorted", root = root, split_by = "test_factor" + ), + "The split_by variables changed." + ) + expect_warning( + write_vc( + test_data, file = "sorted", root = root, split_by = "test_factor", + strict = FALSE + ), + "The split_by variables changed." 
+ ) + data_file <- list.files( file.path(root, sorted_file[1]), pattern = "[[:xdigit:]]{20}", full.names = TRUE From b5a68690b745c649a9ee1e214d6e023034460bf1 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 22 Sep 2020 16:18:34 +0200 Subject: [PATCH 09/23] datahash() handles split_by objects This simplifies the logical in write_vc() and read_vc() --- NAMESPACE | 1 + R/datahash.R | 42 ++++++++++++++++++++++++++---------------- R/meta.R | 2 +- R/read_vc.R | 4 +--- R/write_vc.R | 10 ++++------ 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 19abdd3..061a942 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -78,6 +78,7 @@ importFrom(git2r,status) importFrom(git2r,workdir) importFrom(methods,setOldClass) importFrom(stats,setNames) +importFrom(utils,file_test) importFrom(utils,packageVersion) importFrom(utils,read.table) importFrom(utils,write.table) diff --git a/R/datahash.R b/R/datahash.R index 3c13188..0512a55 100644 --- a/R/datahash.R +++ b/R/datahash.R @@ -8,23 +8,12 @@ #' @family internal #' @importFrom assertthat assert_that #' @importFrom git2r hash +#' @importFrom utils file_test datahash <- function(file) { - chunk_size <- 1e4 - hashes <- character(chunk_size + 1) - i <- 0 - rawdata <- scan( - file = file, what = character(), nmax = -1, sep = "\n", quote = "", - skip = i * chunk_size, nlines = chunk_size, na.strings = "", - flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE, - blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE, - encoding = "UTF-8", skipNul = FALSE - ) - while (length(rawdata)) { - hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n")) - i <- i + 1 - if (i %% chunk_size == 0) { - hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov - } + if (file_test("-f", file)) { + chunk_size <- 1e4 + hashes <- character(chunk_size + 1) + i <- 0 rawdata <- scan( file = file, what = character(), nmax = -1, sep = "\n", quote = "", skip = i * chunk_size, 
nlines = chunk_size, na.strings = "", @@ -32,6 +21,27 @@ datahash <- function(file) { blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE, encoding = "UTF-8", skipNul = FALSE ) + while (length(rawdata)) { + hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n")) + i <- i + 1 + if (i %% chunk_size == 0) { + hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov + } + rawdata <- scan( + file = file, what = character(), nmax = -1, sep = "\n", quote = "", + skip = i * chunk_size, nlines = chunk_size, na.strings = "", + flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE, + blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE, + encoding = "UTF-8", skipNul = FALSE + ) + } + } else { + hashes <- sapply( + list.files( + file, pattern = "(index|[[:xdigit:]]{20}\\.tsv$)", full.names = TRUE + ), + datahash + ) } hash(paste(hashes, collapse = "")) } diff --git a/R/meta.R b/R/meta.R index 8da3c0f..b213e1c 100644 --- a/R/meta.R +++ b/R/meta.R @@ -262,7 +262,7 @@ Add extra sorting variables to ensure small diffs.", sorted) } generic <- c(generic, sorting = list(sorting)) } - if (length(split_by)) { + if (length(split_by) > 0) { generic <- c(generic, split_by = list(split_by)) } # calculate meta for each column diff --git a/R/read_vc.R b/R/read_vc.R index ec050c4..4b6bdde 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -101,12 +101,10 @@ read_vc.character <- function(file, root = ".") { index[rep(i, nrow(raw_data)), split_by, drop = FALSE], raw_data ) - attr(raw_data, "hash") <- datahash(rf) return(list(raw_data)) }, vector(mode = "list", length = 1) ) - dh <- sha1(vapply(raw_data, attr, character(1), "hash")) raw_data <- do.call(rbind, raw_data)[, col_names] } else { raw_data <- read.table( @@ -116,8 +114,8 @@ read_vc.character <- function(file, root = ".") { comment.char = "", stringsAsFactors = FALSE, fileEncoding = "UTF-8" ) - dh <- datahash(file["raw_file"]) } + dh <- datahash(file["raw_file"]) if 
(meta_data[["..generic"]][["data_hash"]] != dh) { meta_data[["..generic"]][["data_hash"]] <- dh diff --git a/R/write_vc.R b/R/write_vc.R index 4acbdd0..745187f 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -116,7 +116,6 @@ write_vc.character <- function( sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) - data_hash <- datahash(file["raw_file"]) } else { index <- unique(raw_data[split_by]) index[["..hash"]] <- apply(index, 1, sha1) @@ -127,7 +126,7 @@ write_vc.character <- function( row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) detail_names <- colnames(raw_data)[!colnames(raw_data) %in% split_by] - data_hash <- vapply( + vapply( seq_len(nrow(index)), function(i) { matching <- vapply( @@ -144,17 +143,16 @@ write_vc.character <- function( append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) - datahash(rf) + return(TRUE) }, - character(1) + logical(1) ) - data_hash <- sha1(data_hash) } meta_data <- attr(raw_data, "meta") meta_data[["..generic"]][["git2rdata"]] <- as.character( packageVersion("git2rdata") ) - meta_data[["..generic"]][["data_hash"]] <- data_hash + meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) write_yaml(meta_data, file["meta_file"], fileEncoding = "UTF-8") From 44868e4e5020dc9a038481f08580cd5306209a4a Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 22 Sep 2020 18:31:35 +0200 Subject: [PATCH 10/23] Add rename_variable() --- .Rbuildignore | 2 + .gitignore | 2 + DESCRIPTION | 1 + NAMESPACE | 4 + NEWS.md | 9 ++ R/rename_variable.R | 160 +++++++++++++++++++ man/list_data.Rd | 1 + man/prune_meta.Rd | 1 + man/read_vc.Rd | 1 + man/relabel.Rd | 1 + man/rename_variable.Rd | 97 ++++++++++++ man/rm_data.Rd | 1 + man/write_vc.Rd | 1 + tests/testthat/test_g_rename_variable.R | 202 ++++++++++++++++++++++++ 14 files changed, 483 insertions(+) create mode 100644 R/rename_variable.R 
create mode 100644 man/rename_variable.Rd create mode 100644 tests/testthat/test_g_rename_variable.R diff --git a/.Rbuildignore b/.Rbuildignore index af21982..b9532f6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,5 @@ ^codecov.yml$ ^LICENSE.md$ ^\.httr-oauth$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 155cda3..cde4424 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ inst/doc docs .httr-oauth +doc +Meta diff --git a/DESCRIPTION b/DESCRIPTION index 11ae337..015f956 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,5 +67,6 @@ Collate: 'recent_commit.R' 'reexport.R' 'relabel.R' + 'rename_variable.R' 'upgrade_data.R' 'utils.R' diff --git a/NAMESPACE b/NAMESPACE index 061a942..1606515 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,9 @@ S3method(recent_commit,git_repository) S3method(relabel,data.frame) S3method(relabel,default) S3method(relabel,list) +S3method(rename_variable,character) +S3method(rename_variable,default) +S3method(rename_variable,git_repository) S3method(rm_data,character) S3method(rm_data,default) S3method(rm_data,git_repository) @@ -53,6 +56,7 @@ export(push) export(read_vc) export(recent_commit) export(relabel) +export(rename_variable) export(repository) export(rm_data) export(status) diff --git a/NEWS.md b/NEWS.md index a454c1e..624b130 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,18 @@ # git2rdata 0.2.3 +## New features + * `write_vc()` gains an optional `split_by` argument. + See `vignette("split_by")` for more details. +* `rename_variable()` efficiently renames variables in a stored `git2rdata` + object. + +## Bugfixes + * `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message when both the data and metadata are missing. + # git2rdata 0.2.2 * Use the [checklist](https://inbo.github.io/checklist) package for CI. 
diff --git a/R/rename_variable.R b/R/rename_variable.R new file mode 100644 index 0000000..76f829a --- /dev/null +++ b/R/rename_variable.R @@ -0,0 +1,160 @@ +#' Rename a Variable +#' +#' The raw data file contains a header with the variable names. +#' The metadata list the variable names and their type. +#' Changing a variable name and overwriting the `git2rdata` object will result +#' in an error, +#' because it will look like removing an existing variable and adding a new one. +#' Overwriting the object with `strict = FALSE` potentially changes the order of +#' the variables, leading to a large diff. +#' +#' This function solves this by only updating the raw data header and the +#' metadata. +#' @inheritParams write_vc +#' @param change A named vector with the old names as values and the new names +#' as names. +#' @return A named vector with the file paths relative to `root`. +#' @export +#' @examples +#' +#' # initialise a git repo using git2r +#' repo_path <- tempfile("git2rdata-repo-") +#' dir.create(repo_path) +#' repo <- git2r::init(repo_path) +#' git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") +#' +#' # Create a dataframe and store it as an optimized git2rdata object. +#' # Note that write_vc() uses optimization by default. +#' # Stage and commit the git2rdata object. +#' ds <- data.frame( +#' a = c("a1", "a2"), +#' b = c("b2", "b1"), +#' stringsAsFactors = TRUE +#' ) +#' junk <- write_vc(ds, "rename", repo, sorting = "b", stage = TRUE) +#' cm <- commit(repo, "initial commit") +#' # check that the workspace is clean +#' status(repo) +#' +#' # Define change.
+#' change <- c(new_name = "a") +#' rename_variable(file = "rename", change = change, root = repo) +#' # check the changes +#' read_vc("rename", repo) +#' status(repo) +#' cm <- commit(repo, "relabel using a list") +#' +#' # clean up +#' junk <- file.remove( +#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, +#' include.dirs = TRUE, all.files = TRUE)), +#' repo_path) +#' @family storage +rename_variable <- function(file, change, root = ".", ...) { + UseMethod("rename_variable", root) +} + +#' @rdname rename_variable +#' @export +#' @importFrom assertthat assert_that noNA +#' @importFrom yaml read_yaml write_yaml +#' @importFrom utils file_test +rename_variable.character <- function(file, change, root = ".", ...) { + assert_that(is.character(change), noNA(change), length(change) > 0) + assert_that(length(names(change)) > 0, msg = "`change` must have names.") + assert_that( + length(unique(change)) == length(change), + length(unique(names(change))) == length(names(change)), + msg = "The names and values in `change` are not unique." + ) + is_git2rdata(file = file, root = root, message = "error") + file <- clean_data_path(root = root, file = file) + yaml <- read_yaml(file[["meta_file"]]) + assert_that( + all(change %in% names(yaml)), + msg = "Not every old name in `change` present in the `git2rdata` object." + ) + assert_that( + !any(names(change) %in% names(yaml)), + msg = "New name in `change` present in the existing `git2rdata` object." 
+ ) + names(yaml) <- replace_vector(names(yaml), change) + yaml[["..generic"]][["sorting"]] <- replace_vector( + yaml[["..generic"]][["sorting"]], change + ) + if (file_test("-f", file["raw_file"])) { + replace_header(file["raw_file"], change) + } else { + vapply( + c( + file.path(file["raw_file"], "index.tsv"), + list.files( + file["raw_file"], pattern = "[[:xdigit:]]{20}.tsv", full.names = TRUE + ) + ), + replace_header, change = change, logical(1) + ) + yaml[["..generic"]][["split_by"]] <- replace_vector( + yaml[["..generic"]][["split_by"]], change + ) + } + yaml[["..generic"]][["hash"]] <- metadata_hash(yaml) + yaml[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) + write_yaml(yaml, file["meta_file"], fileEncoding = "UTF-8") + + hashes <- remove_root(file = file, root = root) + names(hashes) <- + c( + yaml[["..generic"]][["data_hash"]], + yaml[["..generic"]][["hash"]] + ) + + return(hashes) +} + +replace_vector <- function(x, change) { + if (!any(change %in% x)) { + return(x) + } + for (i in seq_along(change)) { + x[x == change[i]] <- names(change[i]) + } + return(x) +} + +replace_header <- function(x, change) { + raw_data <- readLines(x) + header <- strsplit(raw_data[1], "\t")[[1]] + for (i in seq_along(change)) { + header[header == change[i]] <- names(change)[i] + } + raw_data[1] <- paste0(header, collapse = "\t") + writeLines(text = raw_data, con = x) + return(TRUE) +} + +#' @rdname rename_variable +#' @export +rename_variable.default <- function(file, change, root, ...) { + stop("a 'root' of class ", class(root), " is not supported", + call. 
= FALSE) +} + +#' @rdname rename_variable +#' @export +#' @inheritParams write_vc +#' @inheritParams git2r::add +#' @importFrom assertthat assert_that is.flag noNA +#' @importFrom git2r add workdir +rename_variable.git_repository <- function( + file, change, root, ..., stage = FALSE, force = FALSE +) { + assert_that(is.flag(stage), noNA(stage), is.flag(force), noNA(force)) + hashes <- rename_variable(file = file, root = workdir(root), change = change) + if (!stage) { + return(hashes) + } + + add(root, path = hashes, force = force) + return(hashes) +} diff --git a/man/list_data.Rd b/man/list_data.Rd index 96953a3..435ddd3 100644 --- a/man/list_data.Rd +++ b/man/list_data.Rd @@ -103,6 +103,7 @@ Other storage: \code{\link{prune_meta}()}, \code{\link{read_vc}()}, \code{\link{relabel}()}, +\code{\link{rename_variable}()}, \code{\link{rm_data}()}, \code{\link{write_vc}()} } diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd index 9a0b6fd..7d4a6de 100644 --- a/man/prune_meta.Rd +++ b/man/prune_meta.Rd @@ -117,6 +117,7 @@ Other storage: \code{\link{list_data}()}, \code{\link{read_vc}()}, \code{\link{relabel}()}, +\code{\link{rename_variable}()}, \code{\link{rm_data}()}, \code{\link{write_vc}()} } diff --git a/man/read_vc.Rd b/man/read_vc.Rd index f2431e5..6976451 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -100,6 +100,7 @@ Other storage: \code{\link{list_data}()}, \code{\link{prune_meta}()}, \code{\link{relabel}()}, +\code{\link{rename_variable}()}, \code{\link{rm_data}()}, \code{\link{write_vc}()} } diff --git a/man/relabel.Rd b/man/relabel.Rd index 0a08c79..5914631 100644 --- a/man/relabel.Rd +++ b/man/relabel.Rd @@ -91,6 +91,7 @@ Other storage: \code{\link{list_data}()}, \code{\link{prune_meta}()}, \code{\link{read_vc}()}, +\code{\link{rename_variable}()}, \code{\link{rm_data}()}, \code{\link{write_vc}()} } diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd new file mode 100644 index 0000000..c24c94b --- /dev/null +++ b/man/rename_variable.Rd @@ -0,0 
+1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rename_variable.R +\name{rename_variable} +\alias{rename_variable} +\alias{rename_variable.character} +\alias{rename_variable.default} +\alias{rename_variable.git_repository} +\title{Rename a Variable} +\usage{ +rename_variable(file, change, root = ".", ...) + +\method{rename_variable}{character}(file, change, root = ".", ...) + +\method{rename_variable}{default}(file, change, root, ...) + +\method{rename_variable}{git_repository}(file, change, root, ..., stage = FALSE, force = FALSE) +} +\arguments{ +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}. +Note that \code{file} must point to a location within \code{root}.} + +\item{change}{A named vector with the old names as values and the new names +as names.} + +\item{root}{The root of a project. Can be a file path or a \code{git-repository}. +Defaults to the current working directory (\code{"."}).} + +\item{...}{parameters used in some methods} + +\item{stage}{Logical value indicating whether to stage the changes after +writing the data. Defaults to \code{FALSE}.} + +\item{force}{Add ignored files. Default is FALSE.} +} +\value{ +A named vector with the file paths relative to \code{root}. +} +\description{ +The raw data file contains a header with the variable names. +The metadata list the variable names and their type. +Changing a variable name and overwriting the \code{git2rdata} object will result +in an error, +because it will look like removing an existing variable and adding a new one. +Overwriting the object with \code{strict = FALSE} potentially changes the order of +the variables, leading to a large diff. +} +\details{ +This function solves this by only updating the raw data header and the +metadata.
+} +\examples{ + +# initialise a git repo using git2r +repo_path <- tempfile("git2rdata-repo-") +dir.create(repo_path) +repo <- git2r::init(repo_path) +git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") + +# Create a dataframe and store it as an optimized git2rdata object. +# Note that write_vc() uses optimization by default. +# Stage and commit the git2rdata object. +ds <- data.frame( + a = c("a1", "a2"), + b = c("b2", "b1"), + stringsAsFactors = TRUE +) +junk <- write_vc(ds, "rename", repo, sorting = "b", stage = TRUE) +cm <- commit(repo, "initial commit") +# check that the workspace is clean +status(repo) + +# Define change. +change <- c(new_name = "a") +rename_variable(file = "rename", change = change, root = repo) +# check the changes +read_vc("rename", repo) +status(repo) +cm <- commit(repo, "relabel using a list") + +# clean up +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) +} +\seealso{ +Other storage: +\code{\link{list_data}()}, +\code{\link{prune_meta}()}, +\code{\link{read_vc}()}, +\code{\link{relabel}()}, +\code{\link{rm_data}()}, +\code{\link{write_vc}()} +} +\concept{storage} diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 6478e66..31d4052 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -134,6 +134,7 @@ Other storage: \code{\link{prune_meta}()}, \code{\link{read_vc}()}, \code{\link{relabel}()}, +\code{\link{rename_variable}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/write_vc.Rd b/man/write_vc.Rd index 819b8ca..b9aa4e9 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -171,6 +171,7 @@ Other storage: \code{\link{prune_meta}()}, \code{\link{read_vc}()}, \code{\link{relabel}()}, +\code{\link{rename_variable}()}, \code{\link{rm_data}()} } \concept{storage} diff --git a/tests/testthat/test_g_rename_variable.R b/tests/testthat/test_g_rename_variable.R new file mode 100644 index 0000000..4d85bd0 --- 
/dev/null +++ b/tests/testthat/test_g_rename_variable.R @@ -0,0 +1,202 @@ +test_that("rename_variable() handles single files", { + root <- tempfile(pattern = "git2rdata-rename") + dir.create(root) + repo <- git2r::init(root) + git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") + files <- suppressWarnings( + write_vc(test_data, file = "unsorted", root = repo, stage = TRUE) + ) + cm <- commit(repo, "initial commit") + + # unsorted unstaged + change <- c("new_var" = "test_Date") + expect_silent({ + rf <- rename_variable(file = files[1], change = change, root = repo) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["unstaged"]]) > 0) + expect_length(git2r::status(repo)[["staged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_identical(test_data[, change], changed_df[, names(change)]) + git2r::reset(cm, "hard") + + files <- write_vc( + test_data, file = "sorted", root = repo, sorting = "test_Date", stage = TRUE + ) + cm <- commit(repo, "sorted") + # staged & sorted on changed variable + change <- c("new_var" = "test_Date") + expect_silent({ + rf <- rename_variable( + file = files[1], change = change, root = repo, stage = TRUE + ) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["staged"]]) > 0) + expect_length(git2r::status(repo)[["unstaged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != 
colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_equivalent(sorted_test_data[, change], changed_df[, names(change)]) + git2r::reset(cm, "hard") + + # staged & sorted on other variable + change <- c("new_var" = "test_numeric") + expect_silent({ + rf <- rename_variable( + file = files[1], change = change, root = repo, stage = TRUE + ) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["staged"]]) > 0) + expect_length(git2r::status(repo)[["unstaged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_equivalent(sorted_test_data[, change], changed_df[, names(change)]) + git2r::reset(cm, "hard") + + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) +}) + +test_that("rename_variable() handles split_by files", { + root <- tempfile(pattern = "git2rdata-rename") + dir.create(root) + repo <- git2r::init(root) + git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") + files <- suppressWarnings( + write_vc( + test_data, file = "unsorted", split_by = "test_factor", root = repo, + stage = TRUE + ) + ) + cm <- commit(repo, "initial commit") + + # unsorted unstaged + change <- c("new_var" = "test_Date") + expect_silent({ + rf <- rename_variable(file = files[1], change = change, root = repo) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["unstaged"]]) > 0) + expect_length(git2r::status(repo)[["staged"]], 0) + 
expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + git2r::reset(cm, "hard") + + files <- write_vc( + test_data, file = "sorted", root = repo, sorting = "test_Date", + split_by = "test_factor", stage = TRUE + ) + cm <- commit(repo, "sorted") + # staged & sorted on changed variable + change <- c("new_var" = "test_Date") + expect_silent({ + rf <- rename_variable( + file = files[1], change = change, root = repo, stage = TRUE + ) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["staged"]]) > 0) + expect_length(git2r::status(repo)[["unstaged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_equivalent( + test_data[order(test_data$test_factor, test_data$test_Date), change], + changed_df[, names(change)] + ) + git2r::reset(cm, "hard") + + # staged & split_by variable + change <- c("new_var" = "test_factor") + expect_silent({ + rf <- rename_variable( + file = files[1], change = change, root = repo, stage = TRUE + ) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["staged"]]) > 0) + expect_length(git2r::status(repo)[["unstaged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = 
repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_equivalent( + test_data[order(test_data$test_factor, test_data$test_Date), change], + changed_df[, names(change)] + ) + git2r::reset(cm, "hard") + + # staged & sorted on other variable + change <- c("new_var" = "test_numeric") + expect_silent({ + rf <- rename_variable( + file = files[1], change = change, root = repo, stage = TRUE + ) + }) + expect_identical(unname(files), unname(rf)) + expect_true(length(git2r::status(repo)[["staged"]]) > 0) + expect_length(git2r::status(repo)[["unstaged"]], 0) + expect_length(git2r::status(repo)[["untracked"]], 0) + expect_silent({ + changed_df <- read_vc(rf[1], root = repo) + }) + expect_identical(ncol(test_data), ncol(changed_df)) + updated <- which(colnames(test_data) != colnames(changed_df)) + expect_identical(length(updated), length(change)) + expect_identical(colnames(test_data)[updated], unname(change)) + expect_identical(colnames(changed_df)[updated], names(change)) + expect_equivalent( + test_data[order(test_data$test_factor, test_data$test_Date), change], + changed_df[, names(change)] + ) + git2r::reset(cm, "hard") + + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) +}) + +test_that("rename_variable() handles wrong type of root", { + expect_error( + rename_variable(root = 1), + "a 'root' of class numeric is not supported" + ) +}) From 30f8c32b32eac222bf53aa57752aae8fd80a33fe Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 22 Sep 2020 18:33:31 +0200 Subject: [PATCH 11/23] Bump package version --- DESCRIPTION | 2 +- NEWS.md | 2 +- codemeta.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 015f956..636ebe6 100644 --- 
a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.2.3 +Version: 0.3.0 Authors@R: c(person(given = "Thierry", family = "Onkelinx", diff --git a/NEWS.md b/NEWS.md index 624b130..8adc6a5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# git2rdata 0.2.3 +# git2rdata 0.3.0 ## New features diff --git a/codemeta.json b/codemeta.json index 8ad6d43..4dfdc0c 100644 --- a/codemeta.json +++ b/codemeta.json @@ -14,7 +14,7 @@ ], "issueTracker": "https://github.com/ropensci/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.2.3", + "version": "0.3.0", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", @@ -218,7 +218,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "616.079KB", + "fileSize": "632.425KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", From 2a981d28e298b8c7ba9e2aa82db53892d42cf788 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 23 Sep 2020 00:43:52 +0200 Subject: [PATCH 12/23] fix example --- NEWS.md | 1 - R/rename_variable.R | 1 - codemeta.json | 2 +- man/rename_variable.Rd | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8adc6a5..540f473 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,7 +12,6 @@ * `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message when both the data and metadata are missing. - # git2rdata 0.2.2 * Use the [checklist](https://inbo.github.io/checklist) package for CI. 
diff --git a/R/rename_variable.R b/R/rename_variable.R index 76f829a..b4eb121 100644 --- a/R/rename_variable.R +++ b/R/rename_variable.R @@ -42,7 +42,6 @@ #' # check the changes #' read_vc("rename", repo) #' status(repo) -#' cm <- commit(repo, "relabel using a list") #' #' # clean up #' junk <- file.remove( diff --git a/codemeta.json b/codemeta.json index 4dfdc0c..7f15db2 100644 --- a/codemeta.json +++ b/codemeta.json @@ -218,7 +218,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "632.425KB", + "fileSize": "632.335KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd index c24c94b..4d720cd 100644 --- a/man/rename_variable.Rd +++ b/man/rename_variable.Rd @@ -77,7 +77,6 @@ rename_variable(file = "rename", change = change, root = repo) # check the changes read_vc("rename", repo) status(repo) -cm <- commit(repo, "relabel using a list") # clean up junk <- file.remove( From 0f4f10743b2e1cbd30bbe8f8c2ef70d2f34aed59 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 23 Sep 2020 10:15:53 +0200 Subject: [PATCH 13/23] Run test with R devel on Ubuntu 20.04 --- .github/workflows/check_on_different_r_os.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_on_different_r_os.yml b/.github/workflows/check_on_different_r_os.yml index 01cfc5a..aa2d2be 100644 --- a/.github/workflows/check_on_different_r_os.yml +++ b/.github/workflows/check_on_different_r_os.yml @@ -18,9 +18,9 @@ jobs: fail-fast: false matrix: config: - - {os: macOS-latest, r: 'devel'} - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - {os: ubuntu-16.04, r: 'oldrel', rspm: 
"https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} env: From e60f10a8603ad4c52d6a3e616c20060a2face947 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 23 Sep 2020 10:16:54 +0200 Subject: [PATCH 14/23] Use git2r::hash() instead of digest::sha1() This removes the digest dependency --- DESCRIPTION | 1 - NAMESPACE | 2 -- R/read_vc.R | 1 - R/write_vc.R | 5 ++--- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 636ebe6..05cae3a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,7 +35,6 @@ Depends: R (>= 3.5.0) Imports: assertthat, - digest, git2r (>= 0.23.0), methods, yaml diff --git a/NAMESPACE b/NAMESPACE index 1606515..df4c980 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,11 +68,9 @@ importFrom(assertthat,has_name) importFrom(assertthat,is.flag) importFrom(assertthat,is.string) importFrom(assertthat,noNA) -importFrom(digest,sha1) importFrom(git2r,add) importFrom(git2r,commit) importFrom(git2r,hash) -importFrom(git2r,hashfile) importFrom(git2r,last_commit) importFrom(git2r,odb_blobs) importFrom(git2r,pull) diff --git a/R/read_vc.R b/R/read_vc.R index 4b6bdde..657a5d0 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -30,7 +30,6 @@ read_vc.default <- function(file, root) { #' @importFrom yaml read_yaml #' @importFrom utils read.table #' @importFrom stats setNames -#' @importFrom git2r hashfile read_vc.character <- function(file, root = ".") { assert_that(is.string(file), is.string(root)) root <- normalizePath(root, winslash = "/", mustWork = TRUE) diff --git a/R/write_vc.R b/R/write_vc.R index 745187f..54faf18 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -52,10 +52,9 @@ write_vc.default <- function( #' This creates a separate file for every combination. 
#' @export #' @importFrom assertthat assert_that is.string is.flag -#' @importFrom digest sha1 #' @importFrom yaml read_yaml write_yaml #' @importFrom utils write.table -#' @importFrom git2r hashfile +#' @importFrom git2r hash write_vc.character <- function( x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ..., split_by = character(0) @@ -118,7 +117,7 @@ write_vc.character <- function( ) } else { index <- unique(raw_data[split_by]) - index[["..hash"]] <- apply(index, 1, sha1) + index[["..hash"]] <- hash(apply(index, 1, paste, collapse = "\t")) dir.create(file["raw_file"], showWarnings = FALSE, recursive = TRUE) write.table( x = index, file = file.path(file["raw_file"], "index.tsv"), From d403e1a7b6d99b60cb2eeafeee86ccef818bfc4d Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 23 Sep 2020 10:23:50 +0200 Subject: [PATCH 15/23] Improve unit test on subsecond commits --- codemeta.json | 14 +------------- tests/testthat/test_d_recent_commit.R | 4 ++-- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/codemeta.json b/codemeta.json index 7f15db2..74bcd9c 100644 --- a/codemeta.json +++ b/codemeta.json @@ -175,18 +175,6 @@ }, "sameAs": "https://CRAN.R-project.org/package=assertthat" }, - { - "@type": "SoftwareApplication", - "identifier": "digest", - "name": "digest", - "provider": { - "@id": "https://cran.r-project.org", - "@type": "Organization", - "name": "Comprehensive R Archive Network (CRAN)", - "url": "https://cran.r-project.org" - }, - "sameAs": "https://CRAN.R-project.org/package=digest" - }, { "@type": "SoftwareApplication", "identifier": "git2r", @@ -218,7 +206,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "632.335KB", + "fileSize": "632.219KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", diff --git 
a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R index 52dcfc2..e5fe909 100644 --- a/tests/testthat/test_d_recent_commit.R +++ b/tests/testthat/test_d_recent_commit.R @@ -66,10 +66,10 @@ expect_identical( target <- file.path(git2r::workdir(root), "subsecond.txt") while (TRUE) { - writeLines(sample(letters), con = target) + writeLines(letters, con = target) git2r::add(root, target) cm_1 <- commit(root, "first subsecond") - writeLines(sample(letters), con = target) + writeLines(LETTERS, con = target) git2r::add(root, target) cm_2 <- commit(root, "second subsecond") output <- suppressWarnings( From ede240d48c5d58bd5f21132a69cc44e995b0dee8 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 23 Sep 2020 11:38:20 +0200 Subject: [PATCH 16/23] Set explicit timezone in unit tests. Required to get tests running under R-devel. --- codemeta.json | 2 +- tests/testthat/setup_test_data.R | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/codemeta.json b/codemeta.json index 74bcd9c..de04c34 100644 --- a/codemeta.json +++ b/codemeta.json @@ -206,7 +206,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "632.219KB", + "fileSize": "632.073KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": "https://codecov.io/gh/ropensci/git2rdata", diff --git a/tests/testthat/setup_test_data.R b/tests/testthat/setup_test_data.R index 9c6820f..fd47a4e 100644 --- a/tests/testthat/setup_test_data.R +++ b/tests/testthat/setup_test_data.R @@ -17,7 +17,8 @@ test_data <- data.frame( test_logical = sample(c(TRUE, FALSE), size = test_n, replace = TRUE), test_POSIXct = as.POSIXct( sample(.Machine$integer.max, size = test_n, replace = TRUE), - origin = "1970-01-01" + origin = "1970-01-01", + tz = "UTC" ), test_Date = as.Date( c(sample(1e5, size = test_n - 1, replace = TRUE), 16000), @@ -31,13 +32,11 @@ 
sorted_test_data <- test_data[order(test_data$test_Date), ] git2rdata:::set_local_locale(old_locale) sorted_test_data$test_character <- enc2utf8(sorted_test_data$test_character) rownames(sorted_test_data) <- NULL -attr(sorted_test_data$test_POSIXct, "tzone") <- "UTC" test_subset <- head(test_data, ceiling(test_n / 2)) sorted_test_subset <- test_subset[order(test_subset$test_Date), ] rownames(sorted_test_subset) <- NULL -attr(sorted_test_subset$test_POSIXct, "tzone") <- "UTC" test_na <- test_data for (i in seq_along(test_na)) { @@ -49,4 +48,3 @@ sorted_test_na <- test_na[ ] git2rdata:::set_local_locale(old_locale) rownames(sorted_test_na) <- NULL -attr(sorted_test_na$test_POSIXct, "tzone") <- "UTC" From a442314b6851beb9ebea7ffc257e7100561dfe18 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Wed, 23 Sep 2020 13:56:39 +0200 Subject: [PATCH 17/23] vignette split_by: simplify intro --- vignettes/split_by.Rmd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index 100a6fb..159115e 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -129,10 +129,9 @@ In such a case we can use the `split_by` argument of `write_vc()`. This will store the large dataframe over a set of tab separated files. One file for every combination of the variables defined by `split_by`. Every partial data file holds one combination of `split_by`. -We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash. -This hash becomes the base name of the partial data files. -The combination of the hash in the `index.tsv` and the base name of the partial data files makes the information of `split_by` in the partial data file redundant. We remove the `split_by` variables from the partial data files, reducing their size. +We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash for each combination. +This hash becomes the base name of the partial data files. 
## When to Split the Dataframe From cd20c8477f5f1112245a0ecf988f0491de00fb04 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Wed, 23 Sep 2020 13:57:33 +0200 Subject: [PATCH 18/23] vignette split_by: use $...$ --- vignettes/split_by.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index 159115e..b0fea12 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -150,14 +150,14 @@ Let's set the following variables: - $N_s$: the number of unique combinations of the `split_by` variables. Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation. -The total number of bytes is `T_0 = h_s + h_r + 1 + N (s + r + 1)`. -The `+ 1` originates from the tab character to separate the `split_by` variables from the remaining variables. +The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$. +The $+ 1$ originates from the tab character to separate the `split_by` variables from the remaining variables. Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables. -`h_s` bytes for the header and `N_s s` for the data. +It will use $h_s$ bytes for the header and $N_s s$ for the data. The headers of the partial data files require $N_s h_r$ bytes ($N_s$ files and $h_r$ byte per file). The data in the partial data files require $N r$ bytes. -The total number of bytes is `T_s = h_s + N_s s + N_s h_r + N r`. +The total number of bytes is $T_s = h_s + N_s s + N_s h_r + N r$. We can look at the ratio of $T_s$ over $T_0$. 
From bbdc9f467413b76287c700561c8c4acf2a675b24 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Wed, 23 Sep 2020 13:57:49 +0200 Subject: [PATCH 19/23] vignette split_by: subscript in axis label --- vignettes/split_by.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index b0fea12..847679f 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -196,7 +196,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) + geom_line() + facet_wrap(~ paste("r =", r)) + scale_x_continuous( - "b = N_s / N", + expression(b~{"="}~N[s]~{"/"}~N), labels = function(x) { paste0(100 * x, "%") } From 96b1a5e9b5439c404950eceb39ec87323394316d Mon Sep 17 00:00:00 2001 From: florisvdh Date: Wed, 23 Sep 2020 13:58:16 +0200 Subject: [PATCH 20/23] vignette split_by: fix typo y axis label --- vignettes/split_by.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index 847679f..8c691d8 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -202,7 +202,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) + } ) + scale_y_continuous( - "Relative amount of disc space", + "Relative amount of disk space", labels = function(x) { paste0(100 * x, "%") } From f89d5d82f6a0dc4d446e1024314396f820b1ba66 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Wed, 23 Sep 2020 14:08:43 +0200 Subject: [PATCH 21/23] Update codemeta.json --- codemeta.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codemeta.json b/codemeta.json index de04c34..e5d42f6 100644 --- a/codemeta.json +++ b/codemeta.json @@ -206,7 +206,7 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "fileSize": "632.073KB", + "fileSize": "762.31KB", "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", "contIntegration": 
"https://codecov.io/gh/ropensci/git2rdata", From 3f44c55d7b76705339dbd078e3e3ccb8d00fd8be Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 13 Jan 2021 15:28:09 +0100 Subject: [PATCH 22/23] Mention that we also use the split_by variables for sorting --- R/write_vc.R | 1 + man/meta.Rd | 3 ++- man/write_vc.Rd | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/write_vc.R b/R/write_vc.R index 54faf18..146e800 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -50,6 +50,7 @@ write_vc.default <- function( #' @rdname write_vc #' @param split_by An optional vector of variables name to split the text files. #' This creates a separate file for every combination. +#' We prepend these variables to the vector of `sorting` variables. #' @export #' @importFrom assertthat assert_that is.string is.flag #' @importFrom yaml read_yaml write_yaml diff --git a/man/meta.Rd b/man/meta.Rd index 7be6e6f..c7190b9 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -62,7 +62,8 @@ See \code{vignette("efficiency", package = "git2rdata")} for an illustration of the importance of sorting.} \item{split_by}{An optional vector of variables name to split the text files. -This creates a separate file for every combination.} +This creates a separate file for every combination. +We prepend these variables to the vector of \code{sorting} variables.} } \value{ the optimized vector \code{x} with \code{meta} attribute. diff --git a/man/write_vc.Rd b/man/write_vc.Rd index b9aa4e9..ed92e31 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -76,7 +76,8 @@ Defaults to \code{TRUE}.} \item{...}{parameters used in some methods} \item{split_by}{An optional vector of variables name to split the text files. -This creates a separate file for every combination.} +This creates a separate file for every combination. +We prepend these variables to the vector of \code{sorting} variables.} \item{stage}{Logical value indicating whether to stage the changes after writing the data. 
Defaults to \code{FALSE}.} From 2ed454e07bf92ea3ee1c8fe2cca1364d675d3c21 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 13 Jan 2021 15:56:22 +0100 Subject: [PATCH 23/23] Tweak the split_by vignette --- vignettes/split_by.Rmd | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index 8c691d8..490a08a 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -123,16 +123,22 @@ update_geom_defaults("smooth", list(colour = "#356196")) ## Introduction Sometimes, a large dataframe has one or more variables with a small number of unique combinations. -E.g. a dataframe with factor variables. +E.g. a dataframe with one or more factor variables. +Storing the entire dataframe as a single text file requires storing lots of replicated data. +Each row stores the information for every variable, even if a subset of these variables remains constant over a subset of the data. In such a case we can use the `split_by` argument of `write_vc()`. This will store the large dataframe over a set of tab separated files. One file for every combination of the variables defined by `split_by`. -Every partial data file holds one combination of `split_by`. +Every partial data file holds the other variables for one combination of `split_by`. We remove the `split_by` variables from the partial data files, reducing their size. We add an `index.tsv` containing the combinations of the `split_by` variables and a unique hash for each combination. This hash becomes the base name of the partial data files. +Splitting the dataframe into smaller files makes them easier to handle in a version control system. +The overall size depends on the amount of replication in the dataframe. +More on that in the next section. 
+ ## When to Split the Dataframe Let's set the following variables: @@ -151,7 +157,7 @@ Let's set the following variables: Storing the dataframe with `write_vc()` without `split_by` requires $h_s + h_r + 1$ bytes for the header and $s + r + 1$ bytes for every observation. The total number of bytes is $T_0 = h_s + h_r + 1 + N (s + r + 1)$. -The $+ 1$ originates from the tab character to separate the `split_by` variables from the remaining variables. +Both $+ 1$ originate from the tab character to separate the `split_by` variables from the remaining variables. Storing the dataframe with `write_vc()` with `split_by` requires an index file to store the combinations of the `split_by` variables. It will use $h_s$ bytes for the header and $N_s s$ for the data.