From e09967484c476688d94787fb36013486321763ef Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 11 Oct 2024 11:50:24 +0200 Subject: [PATCH 01/16] write_vc() gains an append argument --- R/write_vc.R | 20 +++++++++++++++++--- man/write_vc.Rd | 5 +++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/R/write_vc.R b/R/write_vc.R index 50715ec..a33de91 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -58,12 +58,16 @@ write_vc.default <- function( #' @importFrom git2r hash write_vc.character <- function( x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, - na = "NA", ..., split_by = character(0) + na = "NA", ..., append = FALSE, split_by = character(0) ) { assert_that( - inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na), - noNA(na), no_whitespace(na), is.flag(strict), is.flag(optimize) + inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na), + noNA(na), no_whitespace(na), is.flag(strict), is.flag(optimize), + is.flag(append), noNA(append), noNA(strict), noNA(optimize) ) + if (append) { + x <- append_df(x = x, file = file, root = root) + } root <- normalizePath(root, winslash = "/", mustWork = TRUE) file <- clean_data_path(root = root, file = file) if (!file.exists(dirname(file["raw_file"]))) { @@ -331,3 +335,13 @@ remove_root <- function(file, root) { file[has_root] <- substr(file[has_root], n_root + 1, nchar(file[has_root])) return(file) } + +#' @importFrom assertthat assert_that +append_df <- function(x, file, root) { + assert_that(inherits(x, "data.frame")) + if (!is_git2rdata(file = file, root = root, message = "none")) { + return(x) + } + read_vc(file = file, root = root) |> + rbind(x) +} diff --git a/man/write_vc.Rd b/man/write_vc.Rd index 8a4e7d9..ecc25bc 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -27,6 +27,7 @@ write_vc( optimize = TRUE, na = "NA", ..., + append = FALSE, split_by = character(0) ) @@ -79,6 +80,10 @@ Defaults to \code{TRUE}.} This creates a separate file for every combination. We prepend these variables to the vector of \code{sorting} variables.} +\item{append}{logical. Only relevant if \code{file} is a character + string. If \code{TRUE}, the output is appended to the + file. If \code{FALSE}, any existing file of the name is destroyed.} + \item{stage}{Logical value indicating whether to stage the changes after writing the data. Defaults to \code{FALSE}.} From 3297092b356158da65528aa971011c6a289027dc Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 11 Oct 2024 12:12:14 +0200 Subject: [PATCH 02/16] update_metadata() gains a stage and force argument --- R/update_metadata.R | 21 ++++++++++++++++----- R/write_vc.R | 4 ++-- man/update_metadata.Rd | 12 +++++++++++- tests/testthat/test_d_description.R | 10 ++++++---- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/R/update_metadata.R b/R/update_metadata.R index f09b6bb..96465b6 100644 --- a/R/update_metadata.R +++ b/R/update_metadata.R @@ -13,30 +13,41 @@ #' @param name a character string with the new table name of the object. #' @param title a character string with the new title of the object. #' @param description a character string with the new description of the object. +#' @param ... parameters used in some methods #' @family storage #' @export #' @importFrom assertthat assert_that has_name update_metadata <- function( - file, root = ".", field_description, name, title, description + file, root = ".", field_description, name, title, description, ... ) { UseMethod("update_metadata", root) } #' @export update_metadata.default <- function( - file, root = ".", field_description, name, title, description + file, root = ".", field_description, name, title, description, ... ) { stop("a 'root' of class ", class(root), " is not supported", call. = FALSE) } #' @export +#' @importFrom assertthat assert_that is.string noNA +#' @importFrom git2r add +#' @inheritParams git2r::add update_metadata.git_repository <- function( - file, root = ".", field_description, name, title, description + file, root = ".", field_description, name, title, description, ..., + stage = FALSE, force = FALSE ) { - update_metadata( + assert_that(is.flag(stage), is.flag(force), noNA(stage), noNA(force)) + file <- update_metadata( file = file, root = workdir(root), name = name, title = title, description = description, field_description = field_description ) + if (!stage) { + return(invisible(file)) + } + add(root, path = file, force = force) + return(invisible(file)) } #' @export @@ -83,7 +94,7 @@ update_metadata.character <- function( as.character() -> old[["..generic"]][["git2rdata"]] metadata_hash(old) -> old[["..generic"]][["hash"]] write_yaml(old, file["meta_file"]) - return(invisible(NULL)) + return(invisible(file["meta_file"])) } #' @importFrom assertthat assert_that is.string diff --git a/R/write_vc.R b/R/write_vc.R index a33de91..ee93e9b 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -188,12 +188,12 @@ setOldClass("git_repository") #' @inheritParams git2r::add #' @export #' @importFrom git2r workdir add -#' @importFrom assertthat assert_that is.flag +#' @importFrom assertthat assert_that is.flag noNA write_vc.git_repository <- function( x, file, root, sorting, strict = TRUE, optimize = TRUE, na = "NA", ..., stage = FALSE, force = FALSE ) { - assert_that(is.flag(stage), is.flag(force)) + assert_that(is.flag(stage), is.flag(force), noNA(stage), noNA(force)) hashes <- write_vc( x = x, file = file, root = workdir(root), sorting = sorting, strict = strict, optimize = optimize, na = na, ... diff --git a/man/update_metadata.Rd b/man/update_metadata.Rd index 968e138..864fc5f 100644 --- a/man/update_metadata.Rd +++ b/man/update_metadata.Rd @@ -4,7 +4,15 @@ \alias{update_metadata} \title{Update the description of a \code{git2rdata} object} \usage{ -update_metadata(file, root = ".", field_description, name, title, description) +update_metadata( + file, + root = ".", + field_description, + name, + title, + description, + ... +) } \arguments{ \item{file}{the name of the git2rdata object. Git2rdata objects cannot @@ -24,6 +32,8 @@ The names of the vector must match the variable names.} \item{title}{a character string with the new title of the object.} \item{description}{a character string with the new description of the object.} + +\item{...}{parameters used in some methods} } \description{ Allows to update the description of the fields, the table name, the title, diff --git a/tests/testthat/test_d_description.R b/tests/testthat/test_d_description.R index d6a3140..6b88b82 100644 --- a/tests/testthat/test_d_description.R +++ b/tests/testthat/test_d_description.R @@ -16,13 +16,14 @@ test_that("description", { "character" ) - expect_null( + expect_type( update_metadata( file = "test", root = root, field_description = c( test_character = "Some information", test_factor = "Some information", test_integer = "Some information" ) - ) + ), + "character" ) expect_is({ @@ -48,12 +49,13 @@ test_that("description", { git2r::add(root, ".gitignore") commit(root, "initial commit") - expect_null( + expect_type( update_metadata( file = "test", root = root, name = "my_table", title = "My Table", description = "This is description for the unit tests", field_description = c(test_character = NA, test_factor = "") - ) + ), + "character" ) expect_is({ output <- read_vc("test", root = root) From e2af5bd84d191d190cdfe0f3cb4682b9eda86991 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 26 Nov 2024 18:11:04 +0100 Subject: [PATCH 03/16] read_vc() with split_by handles empty data --- R/read_vc.R | 55 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/R/read_vc.R b/R/read_vc.R index 2c57d41..ddd346d 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -83,29 +83,38 @@ read_vc.character <- function(file, root = ".") { comment.char = "", stringsAsFactors = FALSE, fileEncoding = "UTF-8" ) - raw_data <- vapply( - seq_len(nrow(index)), - function(i) { - rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")) - raw_data <- read.table( - file = rf, header = TRUE, sep = "\t", quote = "\"", - dec = ".", numerals = "warn.loss", na.strings = na_string, - colClasses = setNames( - col_type[col_classes[!which_split_by]], - col_names[!which_split_by] - ), - comment.char = "", - stringsAsFactors = FALSE, fileEncoding = "UTF-8" - ) - raw_data <- cbind( - index[rep(i, nrow(raw_data)), split_by, drop = FALSE], - raw_data - ) - return(list(raw_data)) - }, - vector(mode = "list", length = 1) - ) - raw_data <- do.call(rbind, raw_data)[, col_names] + if (nrow(index) == 0) { + list( + character = character(0), factor = character(0), integer = integer(0), + numeric = numeric(0) + )[col_classes] |> + setNames(col_names) |> + as.data.frame() -> raw_data + } else { + raw_data <- vapply( + seq_len(nrow(index)), + function(i) { + rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv")) + raw_data <- read.table( + file = rf, header = TRUE, sep = "\t", quote = "\"", + dec = ".", numerals = "warn.loss", na.strings = na_string, + colClasses = setNames( + col_type[col_classes[!which_split_by]], + col_names[!which_split_by] + ), + comment.char = "", + stringsAsFactors = FALSE, fileEncoding = "UTF-8" + ) + raw_data <- cbind( + index[rep(i, nrow(raw_data)), split_by, drop = FALSE], + raw_data + ) + return(list(raw_data)) + }, + vector(mode = "list", length = 1) + ) + raw_data <- do.call(rbind, raw_data)[, col_names] + } } else { raw_data <- read.table( file = file["raw_file"], header = TRUE, sep = ifelse(optimize, "\t", ","), From fb937245e77817b49d35fe80d59aac68b43df8fa Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 26 Nov 2024 18:14:08 +0100 Subject: [PATCH 04/16] bump package version --- .zenodo.json | 2 +- CITATION.cff | 2 +- DESCRIPTION | 2 +- NEWS.md | 4 ++++ inst/CITATION | 4 ++-- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index 9191de3..01e42a1 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,6 +1,6 @@ { "title": "git2rdata: Store and Retrieve Data.frames in a Git Repository", - "version": "0.4.1", + "version": "0.5.0", "license": "GPL-3.0", "upload_type": "software", "description": "

The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(“plain_text”, package = “git2rdata”). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(“version_control”, package = “git2rdata”). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(“workflow”, package = “git2rdata”) gives a toy example. 4) vignette(“efficiency”, package = “git2rdata”) provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.<\/p>", diff --git a/CITATION.cff b/CITATION.cff index 9afa3b0..9dbf321 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -40,4 +40,4 @@ identifiers: value: 10.5281/zenodo.1485309 - type: url value: https://ropensci.github.io/git2rdata/ -version: 0.4.1 +version: 0.5.0 diff --git a/DESCRIPTION b/DESCRIPTION index 50f863d..3c213e4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.4.1 +Version: 0.5.0 Authors@R: c( person("Thierry", "Onkelinx", , "thierry.onkelinx@inbo.be", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-8804-4216", affiliation = "Research Institute for Nature and Forest (INBO)")), diff --git a/NEWS.md b/NEWS.md index f8a08c0..26a570d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# git2rdata 0.5.0 + +* `read_vc()` handles empty datasets stored with `split_by`. + # git2rdata 0.4.1 * Add `update_metadata()` to update the description of a `git2rdata` object. diff --git a/inst/CITATION b/inst/CITATION index 03d5c32..78c6240 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -2,12 +2,12 @@ citHeader("To cite `git2rdata` in publications please use:") # begin checklist entry bibentry( bibtype = "Manual", - title = "git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.4.1", + title = "git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.5.0", author = c( author = c(person(given = "Thierry", family = "Onkelinx"))), year = 2024, url = "https://ropensci.github.io/git2rdata/", abstract = "The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(\"plain_text\", package = \"git2rdata\"). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(\"version_control\", package = \"git2rdata\"). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(\"workflow\", package = \"git2rdata\") gives a toy example. 4) vignette(\"efficiency\", package = \"git2rdata\") provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.", - textVersion = "Onkelinx, Thierry (2024) git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.4.1. https://ropensci.github.io/git2rdata/", + textVersion = "Onkelinx, Thierry (2024) git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.5.0. https://ropensci.github.io/git2rdata/", keywords = "git; version control; plain text data", doi = "10.5281/zenodo.1485309", ) From 35ad6ddaa5f6a40bde76ebf895a2f695bb3e7051 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Sat, 7 Dec 2024 20:02:58 +0100 Subject: [PATCH 05/16] write_vc() and meta() gain a digit argument --- NAMESPACE | 1 + R/meta.R | 53 ++++++-- R/recent_commit.R | 12 +- R/update_metadata.R | 2 +- R/upgrade_data.R | 11 +- R/write_vc.R | 14 ++- man-roxygen/example_io.R | 5 +- man/meta.Rd | 15 ++- man/read_vc.Rd | 5 +- man/recent_commit.Rd | 12 +- man/upgrade_data.Rd | 11 +- man/write_vc.Rd | 16 ++- tests/testthat/setup_test_data.R | 12 ++ tests/testthat/test_a_basics.R | 144 +++++++++------------- tests/testthat/test_b_is_git2rmeta.R | 12 +- tests/testthat/test_b_prune.R | 15 ++- tests/testthat/test_b_update.R | 29 +++-- tests/testthat/test_b_verify_vc.R | 3 +- tests/testthat/test_c_git.R | 78 ++++-------- tests/testthat/test_d_description.R | 3 +- tests/testthat/test_d_recent_commit.R | 6 +- tests/testthat/test_e_validate_metadata.R | 12 +- tests/testthat/test_f_split_by.R | 11 +- tests/testthat/test_g_rename_variable.R | 11 +- 24 files changed, 288 insertions(+), 205 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7371ab5..4fc0da7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ importFrom(assertthat,"on_failure<-") importFrom(assertthat,assert_that) importFrom(assertthat,has_attr) importFrom(assertthat,has_name) +importFrom(assertthat,is.count) importFrom(assertthat,is.flag) importFrom(assertthat,is.string) importFrom(assertthat,noNA) diff --git a/R/meta.R b/R/meta.R index 22483df..614c940 100644 --- a/R/meta.R +++ b/R/meta.R @@ -14,7 +14,7 @@ #' @examples #' meta(c(NA, "'NA'", '"NA"', "abc\tdef", "abc\ndef")) #' meta(1:3) -#' meta(seq(1, 3, length = 4)) +#' meta(seq(1, 3, length = 4), digits = 6) #' meta(factor(c("b", NA, "NA"), levels = c("NA", "b", "c"))) #' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c")), optimize = FALSE) #' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c"), ordered = TRUE)) @@ -29,7 +29,7 @@ #' meta(as.POSIXct("2019-02-01 10:59:59", tz = "CET"), optimize = FALSE) #' meta(as.Date("2019-02-01")) #' meta(as.Date("2019-02-01"), optimize = FALSE) -meta <- function(x, ...) { +meta <- function(x, ..., digits) { UseMethod("meta", x) } @@ -63,8 +63,11 @@ meta.integer <- function(x, ...) { } #' @export -meta.numeric <- function(x, ...) { - list(class = "numeric") -> m +#' @importFrom assertthat assert_that is.count +meta.numeric <- function(x, ..., digits) { + assert_that(is.count(digits)) + x <- signif(x, digits = digits) + list(class = "numeric", digits = as.integer(digits)) -> m class(m) <- "meta_detail" attr(x, "meta") <- m return(x) @@ -218,7 +221,7 @@ meta.Date <- function(x, optimize = TRUE, ...) { #' @inheritParams write_vc meta.data.frame <- function(# nolint x, optimize = TRUE, na = "NA", sorting, strict = TRUE, - split_by = character(0), ... + split_by = character(0), ..., digits ) { assert_that( !has_name(x, "..generic"), @@ -237,13 +240,46 @@ meta.data.frame <- function(# nolint ) dots <- list(...) + float <- vapply(x, is.numeric, logical(1)) & + !vapply(x, is.integer, logical(1)) if (has_name(dots, "old")) { old <- dots$old assert_that(inherits(old, "meta_list")) if (missing(sorting)) { sorting <- old[["..generic"]][["sorting"]] } + if (any(float) && missing(digits)) { + old_numeric <- vapply( + old, FUN.VALUE = logical(1), + FUN = function(x) { + has_name(x, "class") && x$class == "numeric" + } + ) + digits <- vapply( + old[old_numeric], FUN.VALUE = numeric(1), + FUN = function(x) { + x[["digits"]] + } + ) + relevant <- names(float)[float][!names(float)[float] %in% names(digits)] + rep(6L, length(relevant)) -> digits[relevant] + } + } + if (any(float) && missing(digits)) { + digits <- 6L + warning("`digits` was not set. Setting is automatically to 6. See ?meta") } + if (any(float) && is.null(names(digits))) { + stopifnot( + "`digits` must be either named or have length 1" = length(digits) == 1 + ) + digits <- rep(digits, sum(float)) + names(digits) <- names(float)[float] + } + stopifnot( + "`digits` must contain all numeric variables of `x`" = + all(!float) || all(names(float)[float] %in% names(digits)) + ) # apply sorting if (missing(sorting) || is.null(sorting) || !length(sorting)) { @@ -271,12 +307,13 @@ Add extra sorting variables to ensure small diffs.", sorted) if (length(split_by) > 0) { generic <- c(generic, split_by = list(split_by)) } + # calculate meta for each column if (!has_name(dots, "old")) { z <- lapply( colnames(x), function(id, optimize, na) { - meta(x[[id]], optimize = optimize, na = na) + meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]]) }, optimize = optimize, na = na ) @@ -290,7 +327,7 @@ Add extra sorting variables to ensure small diffs.", sorted) meta( x[[id]], optimize = optimize, na = na, index = setNames(old[[id]][["index"]], old[[id]][["labels"]]), - strict = strict + strict = strict, digits = digits[[id]] ) }, optimize = old[["..generic"]][["optimize"]], @@ -305,7 +342,7 @@ Add extra sorting variables to ensure small diffs.", sorted) z_new <- lapply( new, function(id, optimize, na) { - meta(x[[id]], optimize = optimize, na = na) + meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]]) }, optimize = optimize, na = na ) diff --git a/R/recent_commit.R b/R/recent_commit.R index 81a05b2..304e9af 100644 --- a/R/recent_commit.R +++ b/R/recent_commit.R @@ -28,15 +28,19 @@ #' #' # write and commit a first dataframe #' # store the output of write_vc() minimize screen output -#' junk <- write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", -#' stage = TRUE) +#' junk <- write_vc( +#' iris[1:6, ], "iris", repo, sorting = "Sepal.Length", stage = TRUE, +#' digits = 6 +#' ) #' commit(repo, "important analysis", session = TRUE) #' list.files(repo_path) #' Sys.sleep(1.1) # required because git doesn't handle subsecond timings #' #' # write and commit a second dataframe -#' junk <- write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", -#' stage = TRUE) +#' junk <- write_vc( +#' iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", stage = TRUE, +#' digits = 6 +#' ) #' commit(repo, "important analysis", session = TRUE) #' list.files(repo_path) #' Sys.sleep(1.1) # required because git doesn't handle subsecond timings diff --git a/R/update_metadata.R b/R/update_metadata.R index 96465b6..dacd5a6 100644 --- a/R/update_metadata.R +++ b/R/update_metadata.R @@ -52,7 +52,7 @@ update_metadata.git_repository <- function( #' @export update_metadata.character <- function( - file, root = ".", field_description, name, title, description + file, root = ".", field_description, name, title, description, ... ) { root <- normalizePath(root, winslash = "/", mustWork = TRUE) file <- clean_data_path(root = root, file = file) diff --git a/R/upgrade_data.R b/R/upgrade_data.R index f35e1ab..d0a8c79 100644 --- a/R/upgrade_data.R +++ b/R/upgrade_data.R @@ -18,9 +18,14 @@ #' dir.create(root) #' #' # write dataframes to the root -#' write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") -#' write_vc(iris[5:10, ], file = "subdir/iris", root = root, -#' sorting = "Sepal.Length") +#' write_vc( +#' iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length", +#' digits = 6 +#' ) +#' write_vc( +#' iris[5:10, ], file = "subdir/iris", root = root, sorting = "Sepal.Length", +#' digits = 6 +#' ) #' # upgrade a single git2rdata object #' upgrade_data(file = "iris", root = root) #' # use path = "." to upgrade all git2rdata objects under root diff --git a/R/write_vc.R b/R/write_vc.R index ee93e9b..0175737 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -51,6 +51,13 @@ write_vc.default <- function( #' @param split_by An optional vector of variables name to split the text files. #' This creates a separate file for every combination. #' We prepend these variables to the vector of `sorting` variables. +#' @param digits The number of significant digits of the smallest absolute +#' value. +#' The function applies the rounding automatically. +#' Only relevant for numeric variables. +#' Either a single positive integer or a named vector where the names link to +#' the variables in the `data.frame`. +#' Defaults to `6` with a warning. #' @export #' @importFrom assertthat assert_that is.string is.flag #' @importFrom yaml read_yaml write_yaml @@ -58,7 +65,7 @@ write_vc.default <- function( #' @importFrom git2r hash write_vc.character <- function( x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, - na = "NA", ..., append = FALSE, split_by = character(0) + na = "NA", ..., append = FALSE, split_by = character(0), digits ) { assert_that( inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na), @@ -76,7 +83,8 @@ write_vc.character <- function( if (!file.exists(file["meta_file"])) { raw_data <- meta( - x, optimize = optimize, na = na, sorting = sorting, split_by = split_by + x, optimize = optimize, na = na, sorting = sorting, split_by = split_by, + digits = digits ) } else { tryCatch( @@ -91,7 +99,7 @@ write_vc.character <- function( class(old) <- "meta_list" raw_data <- meta( x, optimize = optimize, na = na, sorting = sorting, old = old, - strict = strict, split_by = split_by + strict = strict, split_by = split_by, digits = digits ) problems <- compare_meta(attr(raw_data, "meta"), old) if (length(problems)) { diff --git a/man-roxygen/example_io.R b/man-roxygen/example_io.R index a9adaed..6cf1922 100644 --- a/man-roxygen/example_io.R +++ b/man-roxygen/example_io.R @@ -6,7 +6,10 @@ #' dir.create(root) #' #' # write a dataframe to the directory -#' write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") +#' write_vc( +#' iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length", +#' digits = 6 +#' ) #' # check that a data file (.tsv) and a metadata file (.yml) exist. #' list.files(root, recursive = TRUE) #' # read the git2rdata object from the directory diff --git a/man/meta.Rd b/man/meta.Rd index 58bf948..0d770f8 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -11,7 +11,7 @@ \alias{meta.data.frame} \title{Optimize an Object for Storage as Plain Text and Add Metadata} \usage{ -meta(x, ...) +meta(x, ..., digits) \method{meta}{character}(x, na = "NA", optimize = TRUE, ...) @@ -30,7 +30,8 @@ meta(x, ...) sorting, strict = TRUE, split_by = character(0), - ... + ..., + digits ) } \arguments{ @@ -38,6 +39,14 @@ meta(x, ...) \item{...}{further arguments to the methods.} +\item{digits}{The number of significant digits of the smallest absolute +value. +The function applies the rounding automatically. +Only relevant for numeric variables. +Either a single positive integer or a named vector where the names link to +the variables in the \code{data.frame}. +Defaults to \code{6} with a warning.} + \item{na}{the string to use for missing values in the data.} \item{optimize}{If \code{TRUE}, recode the data to get smaller text files. @@ -93,7 +102,7 @@ Add \code{strict = FALSE} to enforce the new order of factor levels. \examples{ meta(c(NA, "'NA'", '"NA"', "abc\tdef", "abc\ndef")) meta(1:3) -meta(seq(1, 3, length = 4)) +meta(seq(1, 3, length = 4), digits = 6) meta(factor(c("b", NA, "NA"), levels = c("NA", "b", "c"))) meta(factor(c("b", NA, "a"), levels = c("a", "b", "c")), optimize = FALSE) meta(factor(c("b", NA, "a"), levels = c("a", "b", "c"), ordered = TRUE)) diff --git a/man/read_vc.Rd b/man/read_vc.Rd index 909bf7c..afc5e96 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -38,7 +38,10 @@ root <- tempfile("git2rdata-") dir.create(root) # write a dataframe to the directory -write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") +write_vc( + iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length", + digits = 6 +) # check that a data file (.tsv) and a metadata file (.yml) exist. list.files(root, recursive = TRUE) # read the git2rdata object from the directory diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index 79fc8c7..4e3a30a 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -42,15 +42,19 @@ git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") # write and commit a first dataframe # store the output of write_vc() minimize screen output -junk <- write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", - stage = TRUE) +junk <- write_vc( + iris[1:6, ], "iris", repo, sorting = "Sepal.Length", stage = TRUE, + digits = 6 +) commit(repo, "important analysis", session = TRUE) list.files(repo_path) Sys.sleep(1.1) # required because git doesn't handle subsecond timings # write and commit a second dataframe -junk <- write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", - stage = TRUE) +junk <- write_vc( + iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", stage = TRUE, + digits = 6 +) commit(repo, "important analysis", session = TRUE) list.files(repo_path) Sys.sleep(1.1) # required because git doesn't handle subsecond timings diff --git a/man/upgrade_data.Rd b/man/upgrade_data.Rd index b103ead..467068e 100644 --- a/man/upgrade_data.Rd +++ b/man/upgrade_data.Rd @@ -54,9 +54,14 @@ root <- tempfile("git2rdata-") dir.create(root) # write dataframes to the root -write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") -write_vc(iris[5:10, ], file = "subdir/iris", root = root, - sorting = "Sepal.Length") +write_vc( + iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length", + digits = 6 +) +write_vc( + iris[5:10, ], file = "subdir/iris", root = root, sorting = "Sepal.Length", + digits = 6 +) # upgrade a single git2rdata object upgrade_data(file = "iris", root = root) # use path = "." to upgrade all git2rdata objects under root diff --git a/man/write_vc.Rd b/man/write_vc.Rd index ecc25bc..492ac9c 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -28,7 +28,8 @@ write_vc( na = "NA", ..., append = FALSE, - split_by = character(0) + split_by = character(0), + digits ) \method{write_vc}{git_repository}( @@ -84,6 +85,14 @@ We prepend these variables to the vector of \code{sorting} variables.} string. If \code{TRUE}, the output is appended to the file. If \code{FALSE}, any existing file of the name is destroyed.} +\item{digits}{The number of significant digits of the smallest absolute +value. +The function applies the rounding automatically. +Only relevant for numeric variables. +Either a single positive integer or a named vector where the names link to +the variables in the \code{data.frame}. +Defaults to \code{6} with a warning.} + \item{stage}{Logical value indicating whether to stage the changes after writing the data. Defaults to \code{FALSE}.} @@ -112,7 +121,10 @@ root <- tempfile("git2rdata-") dir.create(root) # write a dataframe to the directory -write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") +write_vc( + iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length", + digits = 6 +) # check that a data file (.tsv) and a metadata file (.yml) exist. list.files(root, recursive = TRUE) # read the git2rdata object from the directory diff --git a/tests/testthat/setup_test_data.R b/tests/testthat/setup_test_data.R index fd47a4e..cbe3794 100644 --- a/tests/testthat/setup_test_data.R +++ b/tests/testthat/setup_test_data.R @@ -33,10 +33,21 @@ git2rdata:::set_local_locale(old_locale) sorted_test_data$test_character <- enc2utf8(sorted_test_data$test_character) rownames(sorted_test_data) <- NULL +sorted_test_data_6 <- sorted_test_data +sorted_test_data_6$test_numeric <- signif(sorted_test_data_6$test_numeric, 6) + +sorted_test_data_4 <- sorted_test_data +sorted_test_data_4$test_numeric <- signif(sorted_test_data_4$test_numeric, 4) + + test_subset <- head(test_data, ceiling(test_n / 2)) sorted_test_subset <- test_subset[order(test_subset$test_Date), ] rownames(sorted_test_subset) <- NULL +sorted_test_subset_6 <- sorted_test_subset +sorted_test_subset_6$test_numeric <- signif( + sorted_test_subset_6$test_numeric, 6 +) test_na <- test_data for (i in seq_along(test_na)) { @@ -46,5 +57,6 @@ old_locale <- git2rdata:::set_c_locale() sorted_test_na <- test_na[ order(test_na$test_Date, test_na$test_integer, test_na$test_numeric), ] +sorted_test_na$test_numeric <- signif(sorted_test_na$test_numeric, 6) git2rdata:::set_local_locale(old_locale) rownames(sorted_test_na) <- NULL diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index 7f17f04..a666a29 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -21,10 +21,9 @@ test_that("write_vc() and read_vc() on a file system", { "file should not contain '..'" ) expect_is( - suppressWarnings( - output <- write_vc( - x = test_data, file = "test.txt", root = root, sorting = "test_Date" - ) + output <- write_vc( + x = test_data, file = "test.txt", root = root, sorting = "test_Date", + digits = 6 ), "character" ) @@ -32,31 +31,28 @@ test_that("write_vc() and read_vc() on a file system", { expect_identical(unname(output), c("test.tsv", "test.yml")) expect_true(all(file.exists(git2rdata:::clean_data_path(root, "test")))) expect_equal( - stored <- read_vc(file = "test.xls", root = root), - sorted_test_data, + stored <- read_vc(file = "test.xls", root = root), sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } expect_identical( - suppressWarnings(write_vc(x = test_data, file = "test.xls", root = root)), - output + write_vc(x = test_data, file = "test.xls", root = root), output ) expect_error( write_vc( - data.frame(junk = 5), file = "test", root = root, sorting = "junk" + data.frame(junk = 5), file = "test", root = root, sorting = "junk", + digits = 6 ), "The data was not overwritten because of the issues below." ) expect_error( - suppressWarnings( - write_vc(x = test_data, file = "test", root = root, optimize = FALSE) + write_vc( + x = test_data, file = "test", root = root, optimize = FALSE, digits = 6 ), "New data is verbose, whereas old data was optimized" ) @@ -81,7 +77,7 @@ test_that("write_vc() and read_vc() on a file system", { expect_is( output <- write_vc( x = test_data, file = file.path("a", "verbose"), root = root, - sorting = "test_Date", optimize = FALSE + sorting = "test_Date", optimize = FALSE, digits = 6 ), "character" ) @@ -90,14 +86,11 @@ test_that("write_vc() and read_vc() on a file system", { ) expect_equal( stored <- read_vc(file = file.path("a", "verbose"), root = root), - sorted_test_data, - check.attributes = FALSE + sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } @@ -108,31 +101,30 @@ test_that("write_vc() and read_vc() on a file system", { expect_is( output <- write_vc( - test_na, file = "na", root = root, + test_na, file = "na", root = root, digits = 6, sorting = c("test_Date", "test_integer", "test_numeric") ), "character" ) expect_equal( stored <- read_vc(file = "na", root = root), - sorted_test_na, - check.attributes = FALSE + sorted_test_na, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_na[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_na[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_na$", i) ) } expect_error( - write_vc(test_data, file = "error", root = root, sorting = 1), + write_vc(test_data, file = "error", root = root, sorting = 1, digits = 6), "sorting is not a character vector" ) expect_error( - write_vc(test_data, file = "error", root = root, sorting = "junk"), + write_vc( + test_data, file = "error", root = root, sorting = "junk", digits = 6 + ), "All sorting variables must be available" ) expect_false(any(file.exists(git2rdata:::clean_data_path(root, "sorting")))) @@ -164,19 +156,17 @@ test_that("write_vc() and read_vc() on a file system", { test_changed <- test_data test_changed$junk <- test_changed$test_character expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), + write_vc(test_changed, file = "sorting", root = root), "New data has a different number of variables" ) test_changed$test_character <- NULL expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), - "New variables: junk" + write_vc(test_changed, file = "sorting", root = root), "New variables: junk" ) test_changed <- test_data test_changed$test_character <- factor(test_changed$test_character) expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root - )), + write_vc(test_changed, file = "sorting", root = root), "Change in class: 'test_character' from character to factor" ) expect_error( @@ -194,8 +184,7 @@ test_that("write_vc() and read_vc() on a file system", { ordered = FALSE ) expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root - )), + write_vc(test_changed, file = "sorting", root = root), "'test_ordered' changes from ordinal to nominal" ) @@ -203,22 +192,21 @@ test_that("write_vc() and read_vc() on a file system", { test_no$test_ordered <- NULL expect_is( output <- write_vc( - x = test_no, file = "no_ordered", root = root, sorting = "test_Date" + x = test_no, file = "no_ordered", root = root, sorting = "test_Date", + digits = 6 ), "character" ) sorted_test_no <- sorted_test_data sorted_test_no$test_ordered <- NULL + sorted_test_no$test_numeric <- signif(sorted_test_no$test_numeric, 6) expect_equal( - stored <- read_vc(file = "no_ordered", root = root), - sorted_test_no, + stored <- read_vc(file = "no_ordered", root = root), sorted_test_no, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_no[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_no[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } @@ -227,82 +215,65 @@ test_that("write_vc() and read_vc() on a file system", { test_that( "meta() works on complex", { z <- complex(real = runif(10), imaginary = runif(10)) + expect_equal(mz <- meta(z), z, check.attributes = FALSE) + expect_true(assertthat::has_attr(mz, "meta")) expect_equal( - mz <- meta(z), - z, - check.attributes = FALSE + attr(mz, "meta"), list(class = "complex"), check.attributes = FALSE ) - expect_true(assertthat::has_attr(mz, "meta")) - expect_equal(attr(mz, "meta"), list(class = "complex"), - check.attributes = FALSE) } ) - test_that("user specified na strings work", { x <- data.frame( - a = c(NA, "NA", "b"), - b = factor(c("NA", NA, "d")), - z = c(1:2, NA), - y = c(pi, NA, Inf), - stringsAsFactors = FALSE + a = c(NA, "NA", "b"), b = factor(c("NA", NA, "d")), z = c(1:2, NA), + y = c(pi, NA, Inf), stringsAsFactors = FALSE ) root <- tempfile("na_string") dir.create(root) expect_error( - suppressWarnings( - write_vc(x, "test_na_string_verbose", root, "a", optimize = FALSE) + write_vc( + x, "test_na_string_verbose", root, "a", optimize = FALSE, digits = 6 ), "one of the strings matches the NA string" ) expect_is( - fn <- suppressWarnings( - write_vc(x, "test_na_string_verbose", root, "a", optimize = FALSE, - na = "junk") + fn <- write_vc( + x, "test_na_string_verbose", root, "a", optimize = FALSE, na = "junk", + digits = 6 ), "character" ) old_locale <- git2rdata:::set_c_locale() - expect_equal( - read_vc(fn[1], root), - x[order(x$a), ], - check.attributes = FALSE - ) + target <- x[order(x$a), ] + target$y <- signif(target$y, 6) + expect_equal(read_vc(fn[1], root), target, check.attributes = FALSE) git2rdata:::set_local_locale(old_locale) expect_identical( - grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), - 2:4 + grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), 2:4 ) expect_error( - suppressWarnings( - write_vc(x, "test_na_string_verbose", root, "a", optimize = FALSE, - na = "different") + write_vc( + x, "test_na_string_verbose", root, "a", optimize = FALSE, na = "different" ), "New data uses 'different' as NA string, whereas old data used 'junk'" ) expect_is( - fn <- suppressWarnings( - write_vc(x, "test_na_string_optimize", root, "a", na = "junk") + fn <- write_vc( + x, "test_na_string_optimize", root, "a", na = "junk", digits = 6 ), "character" ) old_locale <- git2rdata:::set_c_locale() - expect_equal( - read_vc(fn[1], root), - x[order(x$a), ], - check.attributes = FALSE - ) + expect_equal(read_vc(fn[1], root), target, check.attributes = FALSE) git2rdata:::set_local_locale(old_locale) expect_identical( - grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), - 2:4 + grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), 2:4 ) }) test_that("write_vc() allows changes in factor levels", { x <- data.frame( - test_factor = factor(c("a", "b")), - stringsAsFactors = FALSE + test_factor = factor(c("a", "b")), stringsAsFactors = FALSE ) root <- tempfile("factor_levels") dir.create(root) @@ -321,14 +292,17 @@ test_that("write_vc() allows changes in factor levels", { ) x$test_factor <- factor(x$test_factor, levels = c("a", "b", "c")) expect_error( - write_vc(x, "factor_levels", root), - "New factor labels for 'test_factor'" + write_vc(x, "factor_levels", root), "New factor labels for 'test_factor'" ) }) test_that("meta attributes are printed as yaml", { - expect_output(print(suppressWarnings(attr(meta(test_data), "meta"))), - "hash: d8b9851bcc840c6203c39f70c514803e7acb96d0") - expect_output(print(attr(meta(test_data$test_factor), "meta")), - "class: factor.*\nordered: no") + expect_output( + print(suppressWarnings(attr(meta(test_data, digits = 6), "meta"))), + "hash: [0-9a-f]{40}" + ) + expect_output( + print(attr(meta(test_data$test_factor), "meta")), + "class: factor.*\nordered: no" + ) }) diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R index 3aa32f4..72ab715 100644 --- a/tests/testthat/test_b_is_git2rmeta.R +++ b/tests/testthat/test_b_is_git2rmeta.R @@ -29,7 +29,9 @@ test_that("is_git2rmeta checks metadata", { ) file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) file.remove(file.path(root, junk[2])) @@ -127,7 +129,9 @@ test_that("is_git2rmeta checks metadata", { test_that("is_git2rdata checks data", { file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) yaml::write_yaml(correct_yaml, file.path(root, junk[2])) correct_data <- readLines(file.path(root, junk[1]), encoding = "UTF-8") @@ -163,7 +167,9 @@ git2r::config(root, user.name = "Alice", user.email = "alice@example.org") test_that("is_git2rmeta handle git repositories", { file <- basename(tempfile(tmpdir = git2r::workdir(root))) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) expect_true(is_git2rmeta(file = file, root = root)) expect_true(is_git2rdata(file = file, root = root)) }) diff --git a/tests/testthat/test_b_prune.R b/tests/testthat/test_b_prune.R index 4fd70fc..22dbdc7 100644 --- a/tests/testthat/test_b_prune.R +++ b/tests/testthat/test_b_prune.R @@ -13,10 +13,13 @@ test_that("rm_data & prune_meta", { expect_error(prune_meta(root), root) dir.create(root) expect_null(prune_meta(root, path = "junk")) - write_vc(test_data, file = "test", root = root, sorting = "test_Date") + write_vc( + test_data, file = "test", root = root, sorting = "test_Date", + digits = 6 + ) write_vc( test_data, file = file.path("a", "verbose"), root = root, - sorting = "test_Date", optimize = FALSE + sorting = "test_Date", optimize = FALSE, digits = 6 ) current <- list.files(root, recursive = TRUE) @@ -42,13 +45,15 @@ test_that("rm_data & prune_meta", { expect_identical(rm_data(root, path = "."), character(0)) expect_identical(list.files(root, recursive = TRUE), current) - write_vc(test_data, file = "test1", root = root, sorting = "test_Date") + write_vc( + test_data, file = "test1", root = root, sorting = "test_Date", digits = 6 + ) junk <- write_vc( - test_data, file = "test2", root = root, sorting = "test_Date" + test_data, file = "test2", root = root, sorting = "test_Date", digits = 6 ) write_vc( test_data, file = file.path("a", "test2"), root = root, - sorting = "test_Date" + sorting = "test_Date", digits = 6 ) meta_data <- yaml::read_yaml(file.path(root, junk[2])) meta_data[["..generic"]] <- NULL diff --git a/tests/testthat/test_b_update.R b/tests/testthat/test_b_update.R index dfa72b9..8c5f2c8 100644 --- a/tests/testthat/test_b_update.R +++ b/tests/testthat/test_b_update.R @@ -18,7 +18,7 @@ original <- data.frame( stringsAsFactors = FALSE ) test_that("updates to logical", { - write_vc(original, "logical", root, sorting = "test_logical") + write_vc(original, "logical", root, sorting = "test_logical", digits = 6) updated <- matrix(TRUE, ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated) expect_is( @@ -30,7 +30,7 @@ test_that("updates to logical", { expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) }) test_that("updates to integer", { - write_vc(original, "integer", root, sorting = "test_logical") + write_vc(original, "integer", root, sorting = "test_logical", digits = 6) updated <- matrix(1L, ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated) expect_is( @@ -42,7 +42,7 @@ test_that("updates to integer", { expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) }) test_that("updates to numeric", { - write_vc(original, "numeric", root, sorting = "test_logical") + write_vc(original, "numeric", root, sorting = "test_logical", digits = 6) updated <- matrix(pi, ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated) expect_is( @@ -51,10 +51,12 @@ test_that("updates to numeric", { ), "character" ) - expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) + expect_equal( + read_vc(fn[1], root), signif(updated, 6), check.attributes = FALSE + ) }) test_that("updates to character", { - write_vc(original, "character", root, sorting = "test_logical") + write_vc(original, "character", root, sorting = "test_logical", digits = 6) updated <- matrix("xyz", ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated, stringsAsFactor = FALSE) expect_is( @@ -66,7 +68,7 @@ test_that("updates to character", { expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) }) test_that("updates to factor", { - write_vc(original, "factor", root, sorting = "test_logical") + write_vc(original, "factor", root, sorting = "test_logical", digits = 6) updated <- matrix("xyz", ncol = ncol(original), dimnames = dimnames(original)) updated <- apply(updated, 2, list) updated <- as.data.frame(lapply(updated, factor, levels = c("xyz", "abc"))) @@ -87,7 +89,7 @@ test_that("updates to factor", { ) }) test_that("updates to Date", { - write_vc(original, "Date", root, sorting = "test_logical") + write_vc(original, "Date", root, sorting = "test_logical", digits = 6) updated <- matrix(Sys.Date(), ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated) @@ -100,10 +102,13 @@ test_that("updates to Date", { expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) }) test_that("updates to POSIXct", { - write_vc(original, "POSIXct", root, sorting = "test_logical") - updated <- matrix(Sys.time(), ncol = ncol(original), - dimnames = dimnames(original)) - updated <- as.data.frame(updated) + write_vc(original, "POSIXct", root, sorting = "test_logical", digits = 6) + Sys.time() |> + as.POSIXct(tz = "UTC") |> + list() |> + rep(ncol(original)) |> + setNames(colnames(original)) |> + as.data.frame() -> updated expect_is( suppressWarnings( fn <- write_vc(updated, "POSIXct", root, strict = FALSE) @@ -113,7 +118,7 @@ test_that("updates to POSIXct", { expect_equal(read_vc(fn[1], root), updated, check.attributes = FALSE) }) test_that("updates to complex", { - write_vc(original, "complex", root, sorting = "test_logical") + write_vc(original, "complex", root, sorting = "test_logical", digits = 6) updated <- matrix(complex(imaginary = 1), ncol = ncol(original), dimnames = dimnames(original)) updated <- as.data.frame(updated) diff --git a/tests/testthat/test_b_verify_vc.R b/tests/testthat/test_b_verify_vc.R index 41f35f6..93210d7 100644 --- a/tests/testthat/test_b_verify_vc.R +++ b/tests/testthat/test_b_verify_vc.R @@ -2,7 +2,8 @@ test_that("verify_vc", { root <- tempfile(pattern = "git2rdata-verify-vc") dir.create(root) write_vc( - x = test_data, file = "test.txt", root = root, sorting = "test_integer" + x = test_data, file = "test.txt", root = root, sorting = "test_integer", + digits = 6 ) expect_s3_class( verify_vc("test.txt", root = root, variables = "test_integer"), diff --git a/tests/testthat/test_c_git.R b/tests/testthat/test_c_git.R index c4dd751..2d80371 100644 --- a/tests/testthat/test_c_git.R +++ b/tests/testthat/test_c_git.R @@ -8,7 +8,7 @@ git2r::add(root, ".gitignore") commit(root, "initial commit") expect_identical(rm_data(root, "."), character(0)) untracked <- write_vc( - test_data, file = "untracked", root = root, sorting = "test_Date" + test_data, file = "untracked", root = root, sorting = "test_Date", digits = 6 ) expect_equal( status(root, ignored = TRUE), @@ -19,21 +19,19 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "untracked", root = root), - sorted_test_data, + stored <- read_vc(file = "untracked", root = root), sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } staged <- write_vc( - test_data, file = "staged", root = root, sorting = "test_Date", stage = TRUE + test_data, file = "staged", root = root, sorting = "test_Date", stage = TRUE, + digits = 6 ) expect_equal( status(root, ignored = TRUE), @@ -44,21 +42,19 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "staged", root = root), - sorted_test_data, + stored <- read_vc(file = "staged", root = root), sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } ignored <- write_vc( - test_data, file = "ignore", root = root, sorting = "test_Date", stage = TRUE + test_data, file = "ignore", root = root, sorting = "test_Date", stage = TRUE, + digits = 6 ) expect_equal( status(root, ignored = TRUE), @@ -69,21 +65,18 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "ignore", root = root), - sorted_test_data, + stored <- read_vc(file = "ignore", root = root), sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } forced <- write_vc( - test_data, file = file.path("forced", "force"), root = root, + test_data, file = file.path("forced", "force"), root = root, digits = 6, sorting = "test_Date", stage = TRUE, force = TRUE ) expect_equal( @@ -98,22 +91,18 @@ expect_equal( ) expect_equal( stored <- read_vc(file = file.path("forced", "force"), root = root), - sorted_test_data, - check.attributes = FALSE + sorted_test_data_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_data_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_data$", i) ) } commit(root, "add data") staged <- write_vc( - test_subset, - file = "staged", root = root, stage = FALSE + test_subset, file = "staged", root = root, stage = FALSE, digits = 6 ) expect_equal( status(root, ignored = TRUE), @@ -124,22 +113,18 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "staged", root = root), - sorted_test_subset, + stored <- read_vc(file = "staged", root = root), sorted_test_subset_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_subset[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_subset_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_subset$", i) ) } staged <- write_vc( - test_subset, - file = "staged", root = root, stage = TRUE + test_subset, file = "staged", root = root, stage = TRUE, digits = 6 ) expect_equal( status(root, ignored = TRUE), @@ -150,15 +135,12 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "staged", root = root), - sorted_test_subset, + stored <- read_vc(file = "staged", root = root), sorted_test_subset_6, check.attributes = FALSE ) for (i in colnames(stored)) { expect_equal( - stored[[i]], - sorted_test_subset[[i]], - label = paste0("stored$", i), + stored[[i]], sorted_test_subset_6[[i]], label = paste0("stored$", i), expected.label = paste0("sorted_test_subset$", i) ) } @@ -167,13 +149,11 @@ commit(root, "update data") expect_null(prune_meta(root, path = "junk")) staged <- write_vc( - test_data, - file = "staged", root = root, stage = TRUE + test_data, file = "staged", root = root, stage = TRUE, digits = 6 ) current <- list.files(git2r::workdir(root), recursive = TRUE) expect_identical( - rm_data(root = root, path = "."), - file.path("forced", "force.tsv") + rm_data(root = root, path = "."), file.path("forced", "force.tsv") ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], @@ -200,8 +180,7 @@ expect_null(prune_meta(root, path = ".")) git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") staged <- write_vc( - test_data, - file = "staged", root = root, stage = TRUE + test_data, file = "staged", root = root, stage = TRUE ) expect_identical( rm_data(root = root, path = ".", type = "m"), @@ -225,8 +204,7 @@ expect_identical( git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") staged <- write_vc( - test_data, - file = "staged", root = root, stage = TRUE + test_data, file = "staged", root = root, stage = TRUE ) expect_identical( rm_data(root = root, path = ".", type = "i", stage = TRUE), @@ -250,12 +228,10 @@ expect_identical( git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") ignored <- write_vc( - test_data, file = "ignore", root = root, sorting = "test_Date", stage = TRUE -) -staged <- write_vc( - test_data, - file = "staged", root = root, stage = TRUE + test_data, file = "ignore", root = root, sorting = "test_Date", stage = TRUE, + digits = 6 ) +staged <- write_vc(test_data, file = "staged", root = root, stage = TRUE) expect_identical( rm_data(root = root, path = ".", type = "all", stage = TRUE), c( diff --git a/tests/testthat/test_d_description.R b/tests/testthat/test_d_description.R index 6b88b82..b02a508 100644 --- a/tests/testthat/test_d_description.R +++ b/tests/testthat/test_d_description.R @@ -11,7 +11,8 @@ test_that("description", { expect_is( write_vc( - x = test_data, file = "test.txt", root = root, sorting = "test_Date" + x = test_data, file = "test.txt", root = root, sorting = "test_Date", + digits = 6 ), "character" ) diff --git a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R index 38e9237..699c962 100644 --- a/tests/testthat/test_d_recent_commit.R +++ b/tests/testthat/test_d_recent_commit.R @@ -15,13 +15,13 @@ git2r::config(root, user.name = "Alice", user.email = "alice@example.org") write_vc( test_data[1:2, ], file = "test1", root = root, stage = TRUE, - sorting = "test_Date" + sorting = "test_Date", digits = 6 ) commit_1 <- commit(root, "initial commit") write_vc( test_data[3:4, ], file = file.path("junk", "test1"), root = root, - stage = TRUE, sorting = "test_Date" + stage = TRUE, sorting = "test_Date", digits = 6 ) commit_2 <- commit(root, "second file") @@ -34,7 +34,7 @@ commit_3 <- commit(root, "update first file") write_vc( test_data[7:8, ], file = "test3", root = root, stage = TRUE, - sorting = "test_Date" + sorting = "test_Date", digits = 6 ) Sys.sleep(subsecond) commit_4 <- commit(root, "add third file") diff --git a/tests/testthat/test_e_validate_metadata.R b/tests/testthat/test_e_validate_metadata.R index 4a0e38c..7501f1a 100644 --- a/tests/testthat/test_e_validate_metadata.R +++ b/tests/testthat/test_e_validate_metadata.R @@ -3,7 +3,9 @@ root <- tempfile("git2rdata-check-meta") dir.create(root) test_that("read_vc() checks hash", { file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) junk_yaml <- correct_yaml junk_yaml[["test_Date"]] <- NULL @@ -34,7 +36,9 @@ test_that("read_vc() checks hash", { test_that("read_vc() handles changes in rawdata", { file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) correct_data <- readLines(file.path(root, junk[1]), encoding = "UTF-8") correct_header <- strsplit(correct_data[1], "\t")[[1]] junk_data <- correct_data @@ -51,7 +55,9 @@ test_that("read_vc() handles changes in rawdata", { test_that("write_vc() checks existing metadata", { file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = file, root = root, sorting = "test_Date", digits = 6 + ) correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) junk_yaml <- correct_yaml junk_yaml[["test_Date"]] <- NULL diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R index 921bce5..418b888 100644 --- a/tests/testthat/test_f_split_by.R +++ b/tests/testthat/test_f_split_by.R @@ -15,13 +15,13 @@ test_that("write_vc() handles the split_by argument", { ) expect_equal( z[order(z$test_numeric), ], - test_data[order(test_data$test_numeric), ], + sorted_test_data_6[order(sorted_test_data_6$test_numeric), ], check.attributes = FALSE ) expect_is({ sorted_file <- write_vc( - test_data, file = "sorted", root = root, + test_data, file = "sorted", root = root, digits = 6, sorting = "test_Date", split_by = "test_factor" ) }, @@ -35,13 +35,16 @@ test_that("write_vc() handles the split_by argument", { ) expect_equal( z, - test_data[order(test_data$test_factor, test_data$test_Date), ], + sorted_test_data_6[ + order(sorted_test_data_6$test_factor, sorted_test_data_6$test_Date), + ], check.attributes = FALSE ) expect_error( write_vc( - test_data, file = "sorted", root = root, split_by = character(0) + test_data, file = "sorted", root = root, split_by = character(0), + digits = 6 ), "The split_by variables changed." ) diff --git a/tests/testthat/test_g_rename_variable.R b/tests/testthat/test_g_rename_variable.R index 78dc202..a22dc5a 100644 --- a/tests/testthat/test_g_rename_variable.R +++ b/tests/testthat/test_g_rename_variable.R @@ -29,7 +29,8 @@ test_that("rename_variable() handles single files", { git2r::reset(cm, "hard") files <- write_vc( - test_data, file = "sorted", root = repo, sorting = "test_Date", stage = TRUE + test_data, file = "sorted", root = repo, sorting = "test_Date", + stage = TRUE, digits = 6 ) cm <- commit(repo, "sorted") # staged & sorted on changed variable @@ -73,7 +74,7 @@ test_that("rename_variable() handles single files", { expect_identical(length(updated), length(change)) expect_identical(colnames(test_data)[updated], unname(change)) expect_identical(colnames(changed_df)[updated], names(change)) - expect_equivalent(sorted_test_data[, change], changed_df[, names(change)]) + expect_equivalent(sorted_test_data_6[, change], changed_df[, names(change)]) git2r::reset(cm, "hard") }) @@ -111,7 +112,7 @@ test_that("rename_variable() handles split_by files", { files <- write_vc( test_data, file = "sorted", root = repo, sorting = "test_Date", - split_by = "test_factor", stage = TRUE + split_by = "test_factor", stage = TRUE, digits = 6 ) cm <- commit(repo, "sorted") # staged & sorted on changed variable @@ -184,7 +185,9 @@ test_that("rename_variable() handles split_by files", { expect_identical(colnames(test_data)[updated], unname(change)) expect_identical(colnames(changed_df)[updated], names(change)) expect_equivalent( - test_data[order(test_data$test_factor, test_data$test_Date), change], + signif( + test_data[order(test_data$test_factor, test_data$test_Date), change], 6 + ), changed_df[, names(change)] ) git2r::reset(cm, "hard") From 43dbedc1b1c6ac2bd77dea0c19571ae4e2fa638d Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Sun, 8 Dec 2024 13:40:25 +0100 Subject: [PATCH 06/16] add unit tests on the new digits argument --- R/meta.R | 2 +- tests/testthat/test_a_basics.R | 61 ++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/R/meta.R b/R/meta.R index 614c940..6e55c18 100644 --- a/R/meta.R +++ b/R/meta.R @@ -65,7 +65,7 @@ meta.integer <- function(x, ...) { #' @export #' @importFrom assertthat assert_that is.count meta.numeric <- function(x, ..., digits) { - assert_that(is.count(digits)) + stopifnot("`digits` must be a strict positive integer" = is.count(digits)) x <- signif(x, digits = digits) list(class = "numeric", digits = as.integer(digits)) -> m class(m) <- "meta_detail" diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index a666a29..490a254 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -306,3 +306,64 @@ test_that("meta attributes are printed as yaml", { "class: factor.*\nordered: no" ) }) + +test_that("digits works as expected", { + x <- data.frame( + a = c(exp(1), pi), b = c(1.23456789, 1.23456789), + stringsAsFactors = FALSE + ) + root <- tempfile("digits") + dir.create(root) + expect_warning( + fn <- write_vc(x, "default", root, sorting = "a"), + "`digits` was not set." + ) + expect_equal( + read_vc(fn[1], root), check.attributes = FALSE, + signif(x, 6) + ) + + expect_is( + fn <- write_vc(x, "digits", root, digits = 4, sorting = "a"), + "character" + ) + expect_equal( + read_vc(fn[1], root), check.attributes = FALSE, + signif(x, 4) + ) + write_vc(x, "digits", root, digits = 6) + expect_equal( + read_vc(fn[1], root), check.attributes = FALSE, + signif(x, 6) + ) + + expect_is( + fn <- write_vc(x, "delta", root, digits = c(a = 4, b = 5), sorting = "a"), + "character" + ) + expect_equal( + read_vc(fn[1], root), check.attributes = FALSE, + data.frame(a = signif(x$a, 4), b = signif(x$b, 5)) + ) + expect_is( + fn <- write_vc(x, "delta", root, digits = c(a = 5, b = 4)), + "character" + ) + expect_equal( + read_vc(fn[1], root), check.attributes = FALSE, + data.frame(a = signif(x$a, 5), b = signif(x$b, 4)) + ) + + expect_error( + write_vc(x, "faults", root, digits = c(4, 5), sorting = "a"), + "`digits` must be either named" + ) + expect_error( + write_vc(x, "faults", root, digits = c(a = 4, b = NA), sorting = "a"), + "`digits` must be a strict positive integer" + ) + expect_error( + write_vc(x, "faults", root, digits = c(a = 4, c = 5), sorting = "a"), + "`digits` must contain all numeric variables" + ) +}) From c3b5ae4bf729b7f61b63b86e47b25ed9317925da Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 9 Dec 2024 11:54:51 +0100 Subject: [PATCH 07/16] =?UTF-8?q?=F0=9F=93=9D=20Update=20NEWS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 26a570d..2346424 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,9 @@ # git2rdata 0.5.0 * `read_vc()` handles empty datasets stored with `split_by`. +* `write_vc()` and `meta()` gain a `digits` argument. + The arguments specifies the number of significant digits to store for numeric + values. # git2rdata 0.4.1 From 7c92e6db4e165984e86ff3793099fd5659666360 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 9 Dec 2024 15:29:33 +0100 Subject: [PATCH 08/16] =?UTF-8?q?=F0=9F=91=B7=20Check=20PR=20title?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that PR's to the main branch have a version number as title. --- .github/workflows/pr_title.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/pr_title.yml diff --git a/.github/workflows/pr_title.yml b/.github/workflows/pr_title.yml new file mode 100644 index 0000000..553db64 --- /dev/null +++ b/.github/workflows/pr_title.yml @@ -0,0 +1,20 @@ +name: 'PR Title Checker' +on: + pull_request: + types: [edited, opened, synchronize, reopened] + branches: + - main + - master + +jobs: + title-check: + runs-on: ubuntu-latest + steps: + - uses: naveenk1223/action-pr-title@master + with: + regex: 'Version [0-9]+\.[0-9]+(\.[0-9]+)?$' # Regex the title should match. + allowed_prefixes: ':bookmark:' # title should start with the given prefix + prefix_case_sensitive: true # title prefix are case insensitive + min_length: 11 # Min length of the title + max_length: -1 # Max length of the title + name: Check PR title From 95e8d42edaf12a2422af3557671989c34c7b5f2b Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Thu, 12 Dec 2024 20:59:06 +0100 Subject: [PATCH 09/16] =?UTF-8?q?=F0=9F=90=9B=20Meta()=20handles=20overwri?= =?UTF-8?q?ting=20files=20stored=20with=20older=20versions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Older versions had no digits argument stored in the yaml. Fixes #76 --- R/meta.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/meta.R b/R/meta.R index 6e55c18..6ee9bfd 100644 --- a/R/meta.R +++ b/R/meta.R @@ -252,7 +252,7 @@ meta.data.frame <- function(# nolint old_numeric <- vapply( old, FUN.VALUE = logical(1), FUN = function(x) { - has_name(x, "class") && x$class == "numeric" + has_name(x, "class") && x$class == "numeric" && has_name(x, "digits") } ) digits <- vapply( From 631bf80f94d35f53ed49107b7b782d6643777564 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Thu, 12 Dec 2024 21:01:05 +0100 Subject: [PATCH 10/16] =?UTF-8?q?=E2=9C=85=20Using=20stage=20with=20update?= =?UTF-8?q?=5Fmetadata()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test if update_metadata() correctly handles the stage argument. --- tests/testthat/test_d_description.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/testthat/test_d_description.R b/tests/testthat/test_d_description.R index b02a508..9eed121 100644 --- a/tests/testthat/test_d_description.R +++ b/tests/testthat/test_d_description.R @@ -71,4 +71,29 @@ test_that("description", { expect_output(print(output), "display_metadata") expect_output(summary(output), "display_metadata") expect_output(display_metadata(output), "Table name: my_table") + + expect_is(current_status <- status(root), "git_status") + expect_equal( + unname(unlist(current_status$untracked)), c("test.tsv", "test.yml") + ) + expect_equal(unname(current_status$staged), list()) + + expect_type( + update_metadata( + file = "test", root = root, name = "staged_table", title = "Staged table", + description = "This is description for the unit tests", stage = TRUE, + field_description = c(test_character = NA, test_factor = "") + ), + "character" + ) + expect_is({ + output <- read_vc("test", root = root) + }, "git2rdata" + ) + expect_output(display_metadata(output), "Table name: staged_table") + + expect_is(current_status <- status(root), "git_status") + expect_equal(unname(unlist(current_status$untracked)), "test.tsv") + expect_equal(unname(unlist(current_status$staged)), "test.yml") + }) From 2a1255785558652401190debdd89d7f14d03fa4f Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 13 Dec 2024 21:15:32 +0100 Subject: [PATCH 11/16] =?UTF-8?q?=E2=9C=A8=20Add=20first=20version=20of=20?= =?UTF-8?q?data=5Fpackage()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a data-package.json based on csv files without additional metadata. --- DESCRIPTION | 2 + NAMESPACE | 1 + R/data_package.R | 87 +++++++++++++++++++++++++++++++++++++++++ man/data_package.Rd | 32 +++++++++++++++ man/display_metadata.Rd | 1 + man/list_data.Rd | 1 + man/prune_meta.Rd | 1 + man/read_vc.Rd | 1 + man/relabel.Rd | 1 + man/rename_variable.Rd | 1 + man/rm_data.Rd | 1 + man/update_metadata.Rd | 1 + man/verify_vc.Rd | 1 + man/write_vc.Rd | 1 + 14 files changed, 132 insertions(+) create mode 100644 R/data_package.R create mode 100644 man/data_package.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 3c213e4..0658f8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,6 +46,7 @@ Imports: yaml Suggests: ggplot2, + jsonlite, knitr, microbenchmark, rmarkdown, @@ -60,6 +61,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 Collate: 'clean_data_path.R' + 'data_package.R' 'datahash.R' 'display_metadata.R' 'git2rdata_package.R' diff --git a/NAMESPACE b/NAMESPACE index 4fc0da7..0778fdc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,6 +51,7 @@ S3method(write_vc,character) S3method(write_vc,default) S3method(write_vc,git_repository) export(commit) +export(data_package) export(display_metadata) export(is_git2rdata) export(is_git2rmeta) diff --git a/R/data_package.R b/R/data_package.R new file mode 100644 index 0000000..3d0ac35 --- /dev/null +++ b/R/data_package.R @@ -0,0 +1,87 @@ +#' Create a Data Package for a directory of CSV files +#' +#' @description +#' Create a `datapackage.json` file for a directory of CSV files. +#' The function will look for all `.csv` files in the directory and its +#' subdirectories. +#' It will then create a `datapackage.json` file with the metadata of each CSV +#' file. +#' +#' @param path the directory in which to create the `datapackage.json` file. +#' @family storage +#' @export +#' @importFrom assertthat assert_that is.string noNA +data_package <- function(path = ".") { + assert_that( + is.string(path), noNA(path), requireNamespace("jsonlite", quietly = TRUE) + ) + stopifnot("`path` is not a directory" = file_test("-d", path)) + + data_files <- list.files(path, pattern = ".csv$", recursive = TRUE) + relevant <- vapply( + data_files, FUN = is_git2rdata, FUN.VALUE = logical(1), root = path + ) + stopifnot( + "no non-optimized git2rdata objects found at `path`" = any(relevant) + ) + data_files <- data_files[relevant] + + list( + resources = vapply( + data_files, path = path, FUN = data_resource, + FUN.VALUE = vector(mode = "list", length = 1) + ) |> + unname() + ) |> + jsonlite::toJSON(pretty = TRUE, auto_unbox = TRUE) |> + writeLines(file.path(path, "datapackage.json")) + return(file.path(path, "datapackage.json")) +} + +#' @importFrom assertthat assert_that is.string noNA +#' @importFrom yaml read_yaml +data_resource <- function(file, path = ".") { + assert_that( + is.string(file), is.string(path), noNA(file), noNA(path) + ) + stopifnot("`path` is not a directory" = file_test("-d", path)) + + clean_data_path(root = path, file = file)[2] |> + read_yaml() -> metadata + list( + name = file, path = file, "encoding" = "utf-8", + format = "csv", media_type = "text/csv", + hash = paste0("sha1:", metadata[["..generic"]][["data_hash"]]), + schema = list( + fields = vapply( + names(metadata)[-1], metadata = metadata, FUN = field_schema, + FUN.VALUE = vector(mode = "list", length = 1) + ) |> + unname(), + missingValues = list( + c(value = metadata[["..generic"]][["NA string"]], label = "missing") + ) + ) + ) |> + list() +} + +field_schema <- function(x, metadata) { + list(switch( + metadata[[x]]$class, + "character" = list(name = x, type = "string"), + "Date" = list(name = x, type = "date"), + "logical" = list( + name = x, type = "boolean", trueValues = c("TRUE", "true"), + falseValues = c("FALSE", "false") + ), + "factor" = list( + name = x, type = "string", categories = metadata[[x]][["labels"]], + categoriesOrdered = metadata[[x]][["ordered"]] + ), + "integer" = list(name = x, type = "integer"), + "numeric" = list(name = x, type = "number"), + "POSIXct" = list(name = x, type = "datetime"), + stop("field_schema() can't handle ", metadata[[x]]$class) + )) +} diff --git a/man/data_package.Rd b/man/data_package.Rd new file mode 100644 index 0000000..dff82cf --- /dev/null +++ b/man/data_package.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_package.R +\name{data_package} +\alias{data_package} +\title{Create a Data Package for a directory of CSV files} +\usage{ +data_package(path = ".") +} +\arguments{ +\item{path}{the directory in which to create the \code{datapackage.json} file.} +} +\description{ +Create a \code{datapackage.json} file for a directory of CSV files. +The function will look for all \code{.csv} files in the directory and its +subdirectories. +It will then create a \code{datapackage.json} file with the metadata of each CSV +file. +} +\seealso{ +Other storage: +\code{\link{display_metadata}()}, +\code{\link{list_data}()}, +\code{\link{prune_meta}()}, +\code{\link{read_vc}()}, +\code{\link{relabel}()}, +\code{\link{rename_variable}()}, +\code{\link{rm_data}()}, +\code{\link{update_metadata}()}, +\code{\link{verify_vc}()}, +\code{\link{write_vc}()} +} +\concept{storage} diff --git a/man/display_metadata.Rd b/man/display_metadata.Rd index 732ba4f..11db43d 100644 --- a/man/display_metadata.Rd +++ b/man/display_metadata.Rd @@ -16,6 +16,7 @@ Display metadata for a \code{git2rdata} object } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, \code{\link{read_vc}()}, diff --git a/man/list_data.Rd b/man/list_data.Rd index 22dd8ed..b0b610e 100644 --- a/man/list_data.Rd +++ b/man/list_data.Rd @@ -92,6 +92,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{prune_meta}()}, \code{\link{read_vc}()}, diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd index 754f961..db5b73a 100644 --- a/man/prune_meta.Rd +++ b/man/prune_meta.Rd @@ -106,6 +106,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{read_vc}()}, diff --git a/man/read_vc.Rd b/man/read_vc.Rd index afc5e96..37174c9 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -94,6 +94,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/relabel.Rd b/man/relabel.Rd index 9d6486f..10be43f 100644 --- a/man/relabel.Rd +++ b/man/relabel.Rd @@ -82,6 +82,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd index 3fd8f25..5876df0 100644 --- a/man/rename_variable.Rd +++ b/man/rename_variable.Rd @@ -80,6 +80,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 21b4d77..606c48f 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -122,6 +122,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/update_metadata.Rd b/man/update_metadata.Rd index 864fc5f..3914b9f 100644 --- a/man/update_metadata.Rd +++ b/man/update_metadata.Rd @@ -44,6 +44,7 @@ field from the metadata. } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/verify_vc.Rd b/man/verify_vc.Rd index 0867f7d..5be8d0d 100644 --- a/man/verify_vc.Rd +++ b/man/verify_vc.Rd @@ -24,6 +24,7 @@ data.frame. } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, diff --git a/man/write_vc.Rd b/man/write_vc.Rd index 492ac9c..49f75c1 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -177,6 +177,7 @@ status(repo) } \seealso{ Other storage: +\code{\link{data_package}()}, \code{\link{display_metadata}()}, \code{\link{list_data}()}, \code{\link{prune_meta}()}, From d677eaa6e6ca222094b1cff352779c06de50e4f8 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Sat, 14 Dec 2024 15:35:40 +0100 Subject: [PATCH 12/16] =?UTF-8?q?=E2=9C=A8=20Use=20available=20metadata=20?= =?UTF-8?q?when=20creating=20a=20data=20package?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit data_package() uses the metadata generated by update_metadata() --- R/data_package.R | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/R/data_package.R b/R/data_package.R index 3d0ac35..e5148cc 100644 --- a/R/data_package.R +++ b/R/data_package.R @@ -49,8 +49,8 @@ data_resource <- function(file, path = ".") { clean_data_path(root = path, file = file)[2] |> read_yaml() -> metadata list( - name = file, path = file, "encoding" = "utf-8", - format = "csv", media_type = "text/csv", + name = coalesce(metadata[["..generic"]][["name"]], file), path = file, + "encoding" = "utf-8", format = "csv", media_type = "text/csv", hash = paste0("sha1:", metadata[["..generic"]][["data_hash"]]), schema = list( fields = vapply( @@ -62,12 +62,15 @@ data_resource <- function(file, path = ".") { c(value = metadata[["..generic"]][["NA string"]], label = "missing") ) ) - ) |> + ) -> dr + extra <- c("title", "description") + metadata[["..generic"]][extra[extra %in% names(metadata[["..generic"]])]] |> + c(dr) |> list() } field_schema <- function(x, metadata) { - list(switch( + switch( metadata[[x]]$class, "character" = list(name = x, type = "string"), "Date" = list(name = x, type = "date"), @@ -81,7 +84,24 @@ field_schema <- function(x, metadata) { ), "integer" = list(name = x, type = "integer"), "numeric" = list(name = x, type = "number"), - "POSIXct" = list(name = x, type = "datetime"), + "POSIXct" = list( + name = x, type = "datetime", format = "%Y-%m-%dT%H:%M:%SZ" + ), stop("field_schema() can't handle ", metadata[[x]]$class) - )) + ) -> fs + if ("description" %in% names(metadata[[x]])) { + fs$description <- metadata[[x]][["description"]] + } + return(list(fs)) +} + +coalesce <- function(...) { + dots <- list(...) + if (length(dots) == 0) { + return(NULL) + } + if (!is.null(dots[[1]])) { + return(dots[[1]]) + } + coalesce(dots[-1]) } From 05a13799ffd84136f8c1b0c4b8474ad0cf05d16d Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Sat, 14 Dec 2024 15:37:10 +0100 Subject: [PATCH 13/16] =?UTF-8?q?=E2=9C=85=20Add=20tests=20for=20data=5Fpa?= =?UTF-8?q?ckage()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic test for data_pacakge() --- tests/testthat/test_e_data_package.R | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tests/testthat/test_e_data_package.R diff --git a/tests/testthat/test_e_data_package.R b/tests/testthat/test_e_data_package.R new file mode 100644 index 0000000..2578abb --- /dev/null +++ b/tests/testthat/test_e_data_package.R @@ -0,0 +1,28 @@ +test_that("datapackage", { + root <- tempfile("datapackage") + dir.create(root) + write_vc( + x = test_data, file = "human_readable", root = root, optimize = FALSE, + sorting = "test_integer", digits = 4 + ) + expect_identical( + data_package(path = root), file.path(root, "datapackage.json") + ) + write_vc( + x = test_data, file = "human_readable_meta", root = root, optimize = FALSE, + sorting = "test_integer", digits = 4 + ) + update_metadata( + file = "human_readable_meta", root = root, title = "Test title", + description = "Test description", name = "test", + field_description = c( + test_integer = "Test integer", test_numeric = "Test numeric", + test_character = "Test character", test_factor = "Test factor", + test_logical = "Test logical", test_Date = "Test date", + test_POSIXct = "Test POSIXct", test_ordered = "Test ordered" + ) + ) + expect_identical( + data_package(path = root), file.path(root, "datapackage.json") + ) +}) From a6fc1b13ce5aab0d41b69b5a8b3c6f548aa482e0 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Thu, 23 Jan 2025 18:55:39 +0100 Subject: [PATCH 14/16] =?UTF-8?q?=F0=9F=90=9B=20Bugfix=20in=20coalesce()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/data_package.R | 2 +- tests/testthat/test_a_coalesce.R | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test_a_coalesce.R diff --git a/R/data_package.R b/R/data_package.R index e5148cc..a170105 100644 --- a/R/data_package.R +++ b/R/data_package.R @@ -103,5 +103,5 @@ coalesce <- function(...) { if (!is.null(dots[[1]])) { return(dots[[1]]) } - coalesce(dots[-1]) + do.call(coalesce, dots[-1]) } diff --git a/tests/testthat/test_a_coalesce.R b/tests/testthat/test_a_coalesce.R new file mode 100644 index 0000000..f84022e --- /dev/null +++ b/tests/testthat/test_a_coalesce.R @@ -0,0 +1,6 @@ +test_that("coalesce()", { + expect_equal(coalesce(), NULL) + expect_equal(coalesce(NULL), NULL) + expect_equal(coalesce(NULL, 1), 1) + expect_equal(coalesce(NULL, NULL, 1), 1) +}) From 419fb7cab9f2edb8ac17a3d475bc30d5ece5e465 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Thu, 23 Jan 2025 19:53:15 +0100 Subject: [PATCH 15/16] =?UTF-8?q?=F0=9F=93=9D=20Update=20citation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- inst/CITATION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/CITATION b/inst/CITATION index 78c6240..ab4419e 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -4,10 +4,10 @@ bibentry( bibtype = "Manual", title = "git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.5.0", author = c( author = c(person(given = "Thierry", family = "Onkelinx"))), - year = 2024, + year = 2025, url = "https://ropensci.github.io/git2rdata/", abstract = "The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(\"plain_text\", package = \"git2rdata\"). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(\"version_control\", package = \"git2rdata\"). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(\"workflow\", package = \"git2rdata\") gives a toy example. 4) vignette(\"efficiency\", package = \"git2rdata\") provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.", - textVersion = "Onkelinx, Thierry (2024) git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.5.0. https://ropensci.github.io/git2rdata/", + textVersion = "Onkelinx, Thierry (2025) git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.5.0. https://ropensci.github.io/git2rdata/", keywords = "git; version control; plain text data", doi = "10.5281/zenodo.1485309", ) From a954bae926ed630c2b4ea90ea1f7e0c19822f063 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 24 Jan 2025 10:47:44 +0100 Subject: [PATCH 16/16] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Require=20R=204.1.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because of the use of the base pipe. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0658f8a..5eb3554 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,7 @@ URL: https://ropensci.github.io/git2rdata/, https://doi.org/10.5281/zenodo.1485309 BugReports: https://github.com/ropensci/git2rdata/issues Depends: - R (>= 3.5.0) + R (>= 4.1.0) Imports: assertthat, git2r (>= 0.23.0),