diff --git a/.Rbuildignore b/.Rbuildignore index 408770f..672c633 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,6 +4,7 @@ ^_pkgdown.yml$ ^appveyor\.yml$ ^codemeta\.json$ +^.zenodo\.json$ ^docs$ ^man-roxygen$ ^pkgdown$ diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 0000000..e9f93d2 --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,34 @@ +{ + "creators": [ + { + "affiliation": "Research Institute for Nature and Forest (INBO)", + "name": "Onkelinx, Thierry", + "orcid": "0000-0001-8804-4216" + } + ], + "contributors": [ + { + "affiliation": "Research Institute for Nature and Forest (INBO)", + "name": "Onkelinx, Thierry", + "orcid": "0000-0001-8804-4216", + "type": ["Contactperson", "ProjectLeader"] + }, + { + "affiliation": "Research Institute for Nature and Forest (INBO)", + "name": "Vanderhaeghe, Floris", + "orcid": "0000-0002-6378-6229", + "type": "Projectmember" + }, + { + "name": "Research Institute for Nature and Forest (INBO)", + "type": "Rightsholder" + } + ], + "keywords": [ + "r", + "version control", + "data.frame", + "plain text" + ], + "license": "GPL-3" +} diff --git a/DESCRIPTION b/DESCRIPTION index 8443ba6..e9324c5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.0.3 +Version: 0.0.4 Authors@R: c( person( "Thierry", "Onkelinx", role = c("aut", "cre"), @@ -14,7 +14,7 @@ Authors@R: c( "Research Institute for Nature and Forest", role = c("cph", "fnd"), email = "info@inbo.be")) Description: Make versioning of data.frame easy and efficient using git repositories. -Depends: R (>= 3.4.0) +Depends: R (>= 3.5.0) Imports: assertthat, git2r (>= 0.23.0), @@ -36,12 +36,15 @@ BugReports: https://github.com/inbo/git2rdata/issues Collate: 'clean_data_path.R' 'git2rdata-package.R' + 'write_vc.R' + 'is_git2rdata.R' + 'is_git2rmeta.R' 'list_data.R' 'meta.R' - 'write_vc.R' 'prune.R' 'read_vc.R' 'recent_commit.R' 'reexport.R' 'relabel.R' + 'upgrade_data.R' VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 14eae60..a11c0fe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,12 @@ S3method(format,meta_detail) S3method(format,meta_list) +S3method(is_git2rdata,character) +S3method(is_git2rdata,default) +S3method(is_git2rdata,git_repository) +S3method(is_git2rmeta,character) +S3method(is_git2rmeta,default) +S3method(is_git2rmeta,git_repository) S3method(list_data,character) S3method(list_data,default) S3method(list_data,git_repository) @@ -22,6 +28,7 @@ S3method(prune_meta,git_repository) S3method(read_vc,character) S3method(read_vc,default) S3method(read_vc,git_repository) +S3method(recent_commit,default) S3method(recent_commit,git_repository) S3method(relabel,data.frame) S3method(relabel,default) @@ -29,10 +36,15 @@ S3method(relabel,list) S3method(rm_data,character) S3method(rm_data,default) S3method(rm_data,git_repository) +S3method(upgrade_data,character) +S3method(upgrade_data,default) +S3method(upgrade_data,git_repository) S3method(write_vc,character) S3method(write_vc,default) S3method(write_vc,git_repository) export(commit) +export(is_git2rdata) +export(is_git2rmeta) export(list_data) export(meta) export(prune_meta) @@ -44,6 +56,7 @@ export(relabel) export(repository) export(rm_data) export(status) +export(upgrade_data) export(write_vc) importFrom(assertthat,"on_failure<-") importFrom(assertthat,assert_that) @@ -64,6 +77,7 @@ importFrom(git2r,status) importFrom(git2r,workdir) importFrom(methods,setOldClass) importFrom(stats,setNames) +importFrom(utils,packageVersion)
importFrom(utils,read.table) importFrom(utils,write.table) importFrom(yaml,as.yaml) diff --git a/NEWS.md b/NEWS.md index d4da48d..b95a2fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,47 +1,86 @@ +git2rdata 0.0.4 (2019-05-16) +============================ + +### BREAKING FEATURES + + * `write_vc()` stores the `git2rdata` version number in the metadata. Use `upgrade_data()` to update existing data. + +### NEW FEATURES + + * `read_vc()` checks the metadata hash. A mismatch results in an error. + * The metadata gains a data hash. A mismatch throws a warning when reading the object. This tolerates updating the data by other software, while informing the user that such a change occurred. + * `is_git2rmeta()` validates metadata. + * `list_data()` lists files with valid metadata. + * `rm_data()` and `prune_meta()` remove files with valid metadata. Other files are untouched. + * Files with invalid metadata yield a warning with `list_data()`, `rm_data()` and `prune_meta()`. + +### Bugfixes + + * `write_vc()` and `relabel()` handle empty strings (`''`) in characters and factors (#24). + * `read_vc()` no longer treats `#` as a comment character. + * `read_vc()` handles non-ASCII characters on Windows. + +### Other changes + + * Use a faster algorithm to detect duplicates (suggestion by @brodieG). + * Improve documentation. + * Fix typos in documentation, vignettes and README. + * Add a ROpenSci review badge to the README. + * The README mentions an upper bound on the size of dataframes. + * Set lifecycle to "maturing" and repo status to "active". + * The functions handle `root` containing regular expressions. + * Rework `vignette("workflow", package = "git2rdata")`. + * Update timings in `vignette("efficiency", package = "git2rdata")`. + * Minor tweaks in `vignette("plain_text", package = "git2rdata")`. + +git2rdata 0.0.3 (2019-03-12) +============================ + + * Fix typos in documentation, vignettes and README. + git2rdata 0.0.2 (2019-02-26) ============================ ### BREAKING CHANGES - * metadata is added as a list to the objects rather than in YAML format. - * the [yaml](https://cran.r-project.org/package=yaml) package is used to store the metadata list in YAML format. - * `write_vc()` now uses the 'strict' argument instead of 'override' - * the functionality `rm_data()` is split into `rm_data()` and `prune_meta()` (#9) + * `meta()` appends the metadata as a list to the objects rather than in YAML format. + * `yaml::write_yaml()` writes the metadata list in YAML format. + * `write_vc()` now uses the 'strict' argument instead of 'override'. + * `rm_data()` removes the data files. Use `prune_meta()` to remove left-over metadata files (#9). ### NEW FEATURES - * vignette on [efficiency](../articles/efficiency.html) added (#2) - * existing vignette was split over three vignettes - * focus on the [plain text format](../articles/plain_text.html) - * focus on [version control](../articles/version_control.html) - * focus on [workflows](../articles/workflow.html) - * S4 methods are replaced by S3 methods (#8) - * optimized factors use stable indices, resulting in smaller diffs when levels are added or removed (#13) - * use `relabel()` to alter factor levels without changing their index (#13) - * the raw data is written and read by base R functions instead of `readr` functions (#7) - * `write_vc()` and `read_vc()` use the current working directory as default root (#6, @florisvdh) - * the user can specify a string to code missing values (default = `NA`).
This allows the storage of the character string `"NA"`. + * Vignette on [efficiency](../articles/efficiency.html) added (#2). + * Three separate vignettes instead of one large vignette. + * Focus on the [plain text format](../articles/plain_text.html). + * Focus on [version control](../articles/version_control.html). + * Focus on [workflows](../articles/workflow.html). + * S3 methods replace the old S4 methods (#8). + * Optimized factors use stable indices. Adding or removing levels results in smaller diffs (#13). + * Use `relabel()` to alter factor levels without changing their index (#13). + * `write.table()` stores the raw data instead of `readr::write_tsv()` (#7). This avoids the `readr` dependency. + * `write_vc()` and `read_vc()` use the current working directory as default root (#6, @florisvdh). + * The user can specify a string to code missing values (default = `NA`). This allows the storage of the character string `"NA"`. * `write_vc()` returns a list of issues which potentially result in large diffs. - * `list_data()` returns a vector with dataframes in the repository + * `list_data()` returns a vector with dataframes in the repository. ### Other changes - * `write_vc()` allows to use a custom NA string - * each helpfile contains a working example (#11) - * README updated (#12) - * Updated the rationale with links to the vignettes - * `git2rdata` has a hexsticker logo - * A DOI is added - * The installation instructions use `remotes` and build the vignettes - * `auto_commit()` was removed because of limited extra functionality over `git2r::commit()` - * dataframes are read and written by base R functions instead of `readr` functions + * `write_vc()` allows using a custom `NA` string. + * Each helpfile contains a working example (#11). + * README updated (#12). + * Updated the rationale with links to the vignettes. + * `git2rdata` has a hexsticker logo. + * Add the [![DOI](https://zenodo.org/badge/147685405.svg)](https://zenodo.org/badge/latestdoi/147685405). + * The installation instructions use `remotes` and build the vignettes. + * We removed `auto_commit()` because of limited extra functionality over `git2r::commit()`. git2rdata 0.0.1 (2018-11-12) ============================ ### NEW FEATURES - * use readr to write and read plain text files - * allows storage of strings with "NA" or special characters - * handle ordered factors - * stop handling complex numbers + * Use `readr` to write and read plain text files. + * Allow storage of strings with "NA" or special characters. + * Handle ordered factors. + * Stop handling complex numbers. diff --git a/R/clean_data_path.R b/R/clean_data_path.R index 6151a86..9d54957 100644 --- a/R/clean_data_path.R +++ b/R/clean_data_path.R @@ -1,13 +1,15 @@ -#' Clean the data path +#' Clean the Data Path #' Strips any file extension from the path and adds the `".tsv"` and `".yml"` #' file extensions #' @inheritParams write_vc -#' @param normalize normalize the path? Defaults to TRUE -#' @return a named vector with "raw_file" and "meta_file", refering to the -#' `".tsv"` and `".yml"` files +#' @param normalize Normalize the path? Defaults to `TRUE`. +#' @return A named vector with "raw_file" and "meta_file", referring to the +#' `".tsv"` and `".yml"` files.
#' @noRd #' @family internal +#' @importFrom assertthat assert_that is.flag noNA clean_data_path <- function(root, file, normalize = TRUE) { + assert_that(is.flag(normalize), noNA(normalize)) dir_name <- dirname(file) file <- gsub("\\..*$", "", basename(file)) if (dir_name == ".") { @@ -15,7 +17,7 @@ clean_data_path <- function(root, file, normalize = TRUE) { } else { path <- file.path(root, dir_name, file) } - if (isTRUE(normalize)) { + if (normalize) { path <- normalizePath(path, winslash = "/", mustWork = FALSE) } c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R new file mode 100644 index 0000000..2cacef7 --- /dev/null +++ b/R/is_git2rdata.R @@ -0,0 +1,72 @@ +#' Check Whether a Git2rdata Object is Valid. +#' +#' A valid git2rdata object has valid metadata. The data hash must match the +#' data hash stored in the metadata. +#' @inheritParams write_vc +#' @inheritParams is_git2rmeta +#' @return A logical value. `TRUE` in case of a valid git2rdata object. +#' Otherwise `FALSE`. +#' @rdname is_git2rdata +#' @export +#' @family internal +#' @template example-isgit2r +is_git2rdata <- function(file, root = ".", + message = c("none", "warning", "error")) { + UseMethod("is_git2rdata", root) +} + +#' @export +is_git2rdata.default <- function(file, root, message) { + stop("a 'root' of class ", class(root), " is not supported") +} + +#' @export +#' @importFrom assertthat assert_that is.string +#' @importFrom yaml read_yaml as.yaml +#' @importFrom utils packageVersion +#' @importFrom git2r hash +is_git2rdata.character <- function(file, root = ".", + message = c("none", "warning", "error")) { + assert_that(is.string(file), is.string(root)) + message <- match.arg(message) + root <- normalizePath(root, winslash = "/", mustWork = TRUE) + check_meta <- is_git2rmeta(file = file, root = root, message = message) + if (!check_meta) { + return(FALSE) + } + file <- clean_data_path(root = root, file = file) + + if (!file.exists(file["raw_file"])) { + msg <- "Data file missing." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + + # read the metadata + meta_data <- read_yaml(file["meta_file"]) + + correct <- names(meta_data) + correct <- paste(correct[correct != "..generic"], collapse = "\t") + header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8") + if (correct != header) { + msg <- paste("Corrupt data, incorrect header. Expecting:", correct) + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + + if (meta_data[["..generic"]][["data_hash"]] != hashfile(file[["raw_file"]])) { + msg <- "Corrupt data, mismatching data hash." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + + return(TRUE) +} + +#' @export +#' @importFrom git2r workdir +#' @include write_vc.R +is_git2rdata.git_repository <- function( + file, root, message = c("none", "warning", "error")) { + is_git2rdata(file = file, root = workdir(root), message = message) +} diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R new file mode 100644 index 0000000..a1c5f3f --- /dev/null +++ b/R/is_git2rmeta.R @@ -0,0 +1,101 @@ +#' Check Whether a Git2rdata Object Has Valid Metadata. +#' +#' Valid metadata is a file with `.yml` extension. It has a top level item +#' `..generic`. This item contains `git2rdata` (the version number), `hash` (a +#' hash on the metadata) and `data_hash` (a hash on the data file). The version +#' number must be the current version. 
+#' @inheritParams write_vc +#' @param message a single value indicating the type of messages on top of the +#' logical value. `"none"`: no messages, `"warning"`: issue a warning in case of +#' an invalid metadata file. `"error"`: an invalid metadata file results in an +#' error. Defaults to `"none"`. +#' @return A logical value. `TRUE` in case of a valid metadata file. Otherwise +#' `FALSE`. +#' @rdname is_git2rmeta +#' @export +#' @family internal +#' @template example-isgit2r +is_git2rmeta <- function(file, root = ".", + message = c("none", "warning", "error")) { + UseMethod("is_git2rmeta", root) +} + +#' @export +is_git2rmeta.default <- function(file, root, + message = c("none", "warning", "error")) { + stop("a 'root' of class ", class(root), " is not supported") +} + +#' @export +#' @importFrom assertthat assert_that is.string +#' @importFrom yaml read_yaml +#' @importFrom utils packageVersion +is_git2rmeta.character <- function(file, root = ".", + message = c("none", "warning", "error")) { + assert_that(is.string(file), is.string(root)) + message <- match.arg(message) + root <- normalizePath(root, winslash = "/", mustWork = TRUE) + file <- clean_data_path(root = root, file = file) + + if (!file.exists(file["meta_file"])) { + msg <- "Metadata file missing." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + + # read the metadata + meta_data <- read_yaml(file["meta_file"]) + if (!has_name(meta_data, "..generic")) { + msg <- "No '..generic' element." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + if (!has_name(meta_data[["..generic"]], "hash")) { + msg <- "Corrupt metadata, no hash found." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + if (!has_name(meta_data[["..generic"]], "git2rdata")) { + msg <- "Data stored using an older version of `git2rdata`. +See `?upgrade_data()`." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + if (package_version(meta_data[["..generic"]][["git2rdata"]]) < + packageVersion("git2rdata")) { + msg <- "Data stored using an older version of `git2rdata`. +See `?upgrade_data()`." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + if (!has_name(meta_data[["..generic"]], "data_hash")) { + msg <- "Corrupt metadata, no data hash found." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + current_hash <- meta_data[["..generic"]][["hash"]] + if (current_hash != metadata_hash(meta_data)) { + msg <- "Corrupt metadata, mismatching hash." + switch(message, error = stop(msg), warning = warning(msg)) + return(FALSE) + } + + return(TRUE) +} + +#' @export +#' @importFrom git2r workdir +#' @include write_vc.R +is_git2rmeta.git_repository <- function( + file, root, message = c("none", "warning", "error")) { + is_git2rmeta(file = file, root = workdir(root), message = message) +} + +#' @importFrom yaml as.yaml +#' @importFrom git2r hash +metadata_hash <- function(meta_data) { + meta_data[["..generic"]][["git2rdata"]] <- NULL + meta_data[["..generic"]][["hash"]] <- NULL + meta_data[["..generic"]][["data_hash"]] <- NULL + hash(as.yaml(meta_data)) +} diff --git a/R/list_data.R b/R/list_data.R index 99db4e8..bbb8c76 100644 --- a/R/list_data.R +++ b/R/list_data.R @@ -1,10 +1,16 @@ -#' List available data files +#' List Available Git2rdata Files Containing Data +#' +#' The function returns the names of all valid git2rdata objects. 
This implies +#' `.tsv` files with a matching **valid** metadata file (`.yml`). **Invalid** +#' metadata files result in a warning. The function ignores **valid** metadata +#' files without matching raw data (`.tsv`). #' @param root the `root` of the repository. Either a path or a `git-repository` #' @param path relative `path` from the `root`. Defaults to the `root` #' @inheritParams base::list.files #' @export #' @template example-prune -#' @return a character vector is dataframe names, including their relative path +#' @return A character vector of git2rdata object names, including their +#' relative path. #' @family storage list_data <- function(root = ".", path = ".", recursive = TRUE) { UseMethod("list_data", root) @@ -22,14 +28,23 @@ list_data.character <- function(root = ".", path = ".", recursive = TRUE) { root <- normalizePath(root, winslash = "/", mustWork = TRUE) path <- normalizePath(file.path(root, path), winslash = "/", mustWork = TRUE) - data_files <- list.files(path, pattern = "\\.tsv$", recursive = TRUE, + data_files <- list.files(path, pattern = "\\.tsv$", recursive = recursive, full.names = TRUE) - meta_files <- list.files(path, pattern = "\\.yml$", recursive = TRUE, + meta_files <- list.files(path, pattern = "\\.yml$", recursive = recursive, full.names = TRUE) data_files <- gsub("\\.tsv$", "", data_files) meta_files <- gsub("\\.yml$", "", meta_files) + meta_files <- meta_files[meta_files %in% data_files] + meta_files_base <- remove_root(file = meta_files, root = root) + check <- vapply(X = meta_files_base, FUN = is_git2rmeta, + FUN.VALUE = NA, root = root, message = "none") + if (any(!check)) { + warning("Invalid metadata files found. See ?is_git2rmeta():\n", + paste(meta_files_base[!check], collapse = "\n")) + } + meta_files <- meta_files[check] data_files <- data_files[data_files %in% meta_files] - gsub(paste0("^", root, "/"), "", data_files) + remove_root(file = data_files, root = root) } #' @export diff --git a/R/meta.R b/R/meta.R index a44a431..309c3b4 100644 --- a/R/meta.R +++ b/R/meta.R @@ -1,10 +1,13 @@ -#' Optimize a vector for storage as plain text and add meta data +#' Optimize an Object for Storage as Plain Text and Add Metadata #' -#' \code{\link{write_vc}} applies this function automatically on your -#' data.frame. -#' @param x the vector -#' @param ... further arguments to the methods -#' @return the optimized vector `x` with `meta` attribute +#' @description +#' Prepares a vector for storage. When relevant, `meta()` optimizes the object +#' for storage by changing the format to one which needs fewer characters. The +#' metadata, stored in the `meta` attribute, contains all required information to +#' backtransform the optimized format into the original format. +#' @param x the vector. +#' @param ... further arguments to the methods. +#' @return the optimized vector `x` with `meta` attribute. #' @export #' @docType methods #' @family internal @@ -67,10 +70,15 @@ meta.numeric <- function(x, ...) { #' @export #' @rdname meta -#' @param optimize recode the data to get smaller text files. Defaults to TRUE -#' @param index an optional named vector with existing factor indices. The names must match the existing factor levels. Unmatched levels from `x` will get new indices. +#' @param optimize If `TRUE`, recode the data to get smaller text files. If +#' `FALSE`, `meta()` converts the data to character. Defaults to `TRUE`. +#' @param index an optional named vector with existing factor indices. The names +#' must match the existing factor levels.
Unmatched levels from `x` will get new +#' indices. #' @inheritParams utils::write.table +#' @importFrom assertthat assert_that is.flag noNA meta.factor <- function(x, optimize = TRUE, na = "NA", index, ...) { + assert_that(is.flag(optimize), noNA(optimize)) if (missing(index) || is.null(index)) { index <- seq_along(levels(x)) names(index) <- levels(x) @@ -82,10 +90,14 @@ meta.factor <- function(x, optimize = TRUE, na = "NA", index, ...) { candidate_index <- candidate_index[!candidate_index %in% index] extra_index <- candidate_index[seq_along(new_levels)] names(extra_index) <- levels(x)[new_levels] - index <- c(index, extra_index)[levels(x)] + new_index <- c(index, extra_index) + index <- new_index[levels(x)] + empty <- levels(x) == "" + index[empty] <- new_index[names(new_index) == ""] + names(index)[empty] <- "" } - if (isTRUE(optimize)) { + if (optimize) { z <- index[x] } else { assert_that(is.string(na), noNA(na), no_whitespace(na)) @@ -96,8 +108,8 @@ Please use a different NA string or use optimize = TRUE", call. = FALSE) z <- meta(as.character(x), optimize = optimize, na = na, ...) } - m <- list(class = "factor", na_string = na, optimize = isTRUE(optimize), - labels = names(index), index = unname(index), + m <- list(class = "factor", na_string = na, optimize = optimize, + labels = enc2utf8(names(index)), index = unname(index), ordered = is.ordered(x)) class(m) <- "meta_detail" attr(z, "meta") <- m @@ -106,11 +118,13 @@ Please use a different NA string or use optimize = TRUE", call. = FALSE) #' @export #' @rdname meta +#' @importFrom assertthat assert_that is.flag noNA meta.logical <- function(x, optimize = TRUE, ...){ - if (isTRUE(optimize)) { + assert_that(is.flag(optimize), noNA(optimize)) + if (optimize) { x <- as.integer(x) } - m <- list(class = "logical", optimize = isTRUE(optimize)) + m <- list(class = "logical", optimize = optimize) class(m) <- "meta_detail" attr(x, "meta") <- m return(x) @@ -126,8 +140,10 @@ meta.complex <- function(x, ...) { #' @export #' @rdname meta +#' @importFrom assertthat assert_that is.flag noNA meta.POSIXct <- function(x, optimize = TRUE, ...) { - if (isTRUE(optimize)) { + assert_that(is.flag(optimize), noNA(optimize)) + if (optimize) { z <- unclass(x) m <- list(class = "POSIXct", optimize = TRUE, origin = "1970-01-01 00:00:00", timezone = "UTC") @@ -143,12 +159,14 @@ meta.POSIXct <- function(x, optimize = TRUE, ...) { #' @export #' @rdname meta +#' @importFrom assertthat assert_that is.flag noNA meta.Date <- function(x, optimize = TRUE, ...){ - if (isTRUE(optimize)) { + assert_that(is.flag(optimize), noNA(optimize)) + if (optimize) { z <- as.integer(x) m <- list(class = "Date", optimize = TRUE, origin = "1970-01-01") } else { - z <- format(x, format = "%Y-%m-%d", tz = "UTC") + z <- format(x, format = "%Y-%m-%d") m <- list(class = "Date", optimize = FALSE, format = "%Y-%m-%d") } class(m) <- "meta_detail" @@ -158,9 +176,22 @@ meta.Date <- function(x, optimize = TRUE, ...){ #' @export #' @importFrom assertthat assert_that -#' @importFrom git2r hash +#' @importFrom utils packageVersion +#' @description +#' In case of a data.frame, `meta()` applies itself to each of the columns. The +#' `meta` attribute becomes a named list containing the metadata for each column +#' plus an additional `..generic` element. `..generic` is a reserved name for +#' the metadata and not allowed as column name in a `data.frame`. +#' +#' \code{\link{write_vc}} uses this function to prepare a dataframe for storage. 
+#' Existing metadata is passed through the optional `old` argument. This +#' argument is intended for internal use. +#' @rdname meta +#' @inheritParams write_vc meta.data.frame <- function(x, optimize = TRUE, na = "NA", sorting, ...) { - assert_that(!has_name(x, "..generic"), msg = "'..generic' is a reserved name") + assert_that( + !has_name(x, "..generic"), + msg = "'..generic' is a reserved name and not allowed as column name") generic <- list(optimize = optimize, "NA string" = na) dots <- list(...) @@ -173,18 +204,23 @@ meta.data.frame <- function(x, optimize = TRUE, na = "NA", sorting, ...) { } # apply sorting - if (missing(sorting) || is.null(sorting)) { - warning("no sorting applied") + if (missing(sorting) || is.null(sorting) || !length(sorting)) { + warning("No sorting applied. +Sorting is strongly recommended in combination with version control.") } else { assert_that(is.character(sorting)) - assert_that(all(sorting %in% colnames(x)), - msg = "all sorting variables must be available") - if (anyDuplicated(x[sorting])) { - warning( -"sorting results in ties. Add extra sorting variables to ensure small diffs." - ) + assert_that( + all(sorting %in% colnames(x)), + msg = "All sorting variables must be available in the data.frame") + if (nrow(x) > 1) { + x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint + if (any_duplicated(x[sorting])) { + sorted <- paste(sprintf("'%s'", sorting), collapse = ", ") + sorted <- sprintf("Sorting on %s results in ties. +Add extra sorting variables to ensure small diffs.", sorted) + warning(sorted) + } } - x <- x[do.call(order, x[sorting]), , drop = FALSE] # nolint generic <- c(generic, sorting = list(sorting)) } # calculate meta for each column @@ -227,7 +263,9 @@ meta.data.frame <- function(x, optimize = TRUE, na = "NA", sorting, ...) { ) m <- c(..generic = list(generic), m) class(m) <- "meta_list" - m[["..generic"]] <- c(m[["..generic"]], hash = hash(as.yaml(m))) + m[["..generic"]] <- c( + list(git2rdata = as.character(packageVersion("git2rdata"))), + m[["..generic"]], hash = metadata_hash(m)) z <- lapply(z, `attr<-`, "meta", NULL) # convert z to dataframe and add metadata list @@ -266,3 +304,25 @@ print.meta_list <- function(x, ...) { print.meta_detail <- function(x, ...) { cat(format(x), sep = "\n") } + +delta <- function(a, b) { + ifelse( + is.na(a), + is.na(b), + ifelse(is.na(b), FALSE, a == b) + ) +} + +any_duplicated <- function(x) { + y <- vapply( + x, + function(z) { + delta(z[-1], z[-length(z)]) + }, + logical(nrow(x) - 1) + ) + if (inherits(y, "matrix")) { + y <- rowSums(y) + } + sum(y == ncol(x)) > 0 +} diff --git a/R/prune.R b/R/prune.R index c967ca3..4b22c6d 100644 --- a/R/prune.R +++ b/R/prune.R @@ -1,8 +1,19 @@ -#' Remove data files +#' Remove Data Files From Git2rdata Objects #' -#' Removes all data (`.tsv` files) from the `path` when they have accompanying metadata (`.yml` file). The metadata remains untouched. See the [workflow](https://inbo.github.io/git2rdata/articles/workflow.html) vignette (`vignette("workflow", package = "git2rdata")`) for some examples on how to use this. -#' @param path the directory in which to clean all the data files -#' @param recursive remove files in subdirectories too +#' @description +#' Remove the data (`.tsv`) file from all valid git2rdata objects at the `path`. +#' The metadata remains untouched. A warning lists any git2rdata object with +#' **invalid** metadata. The function keeps any `.tsv` file with +#' invalid metadata or from non-git2rdata objects.
+#' +#' Use this function with caution since it will remove all valid data files +#' without asking for confirmation. We strongly recommend using this +#' function on files under version control. See +#' `vignette("workflow", package = "git2rdata")` for some examples on how to use +#' this. +#' @param path the directory in which to clean all the data files. The directory +#' is relative to `root`. +#' @param recursive remove files in subdirectories too. #' @return returns invisibily a vector of removed files names. The paths are #' relative to `root`. #' @inheritParams write_vc @@ -41,7 +52,13 @@ rm_data.character <- function( #' @importFrom git2r workdir add #' @include write_vc.R #' @param stage stage the changes after removing the files. Defaults to FALSE. -#' @param type which classes of files should be removed. `unmodified` are files in the git history and unchanged since the last commit. `modified` are files in the git history and changed since the last commit. `ignored` refers to file listed in a `.gitignore` file. Selecting `modified` will remove both `unmodified` and `modified` data files. Selecting `ìgnored` will remove `unmodified`, `modified` and `ignored` data files. `all` refers to all visible data files, inclusing `untracked` files. The argument can be abbreviated to the first letter. +#' @param type Defines the classes of files to remove. `unmodified` are files in +#' the git history and unchanged since the last commit. `modified` are files in +#' the git history and changed since the last commit. `ignored` refers to files +#' listed in a `.gitignore` file. Selecting `modified` will remove both +#' `unmodified` and `modified` data files. Selecting `ignored` will remove +#' `unmodified`, `modified` and `ignored` data files. `all` refers to all +#' visible data files, including `untracked` files. #' @rdname rm_data rm_data.git_repository <- function( root, path = NULL, recursive = TRUE, ..., stage = FALSE, @@ -78,9 +95,18 @@ rm_data.git_repository <- function( return(invisible(to_do)) } -#' Prune metadata files +#' Prune Metadata Files +#' +#' @description +#' Removes all **valid** metadata (`.yml` files) from the `path` when they don't +#' have accompanying data (`.tsv` file). **Invalid** metadata triggers a warning +#' without removing the metadata file. #' -#' Removes all metadata (`.yml` files) from the `path` when they don't have accompanying data (`.tsv` file). See the [workflow](https://inbo.github.io/git2rdata/articles/workflow.html) vignette (`vignette("workflow", package = "git2rdata")`) for some examples on how to use this. +#' Use this function with caution since it will remove all valid metadata files +#' without asking for confirmation. We strongly recommend using this +#' function on files under version control. See +#' `vignette("workflow", package = "git2rdata")` for some examples on how to use +#' this. #' @inheritParams rm_data #' @return returns invisibily a vector of removed files names. The paths are #' relative to `root`. @@ -102,7 +128,7 @@ prune_meta.default <- function( } #' @export -#' @importFrom assertthat assert_that is.flag +#' @importFrom assertthat assert_that is.flag noNA prune_meta.character <- function( root = ".", path = NULL, recursive = TRUE, ...
){ @@ -114,24 +140,25 @@ prune_meta.character <- function( if (!dir.exists(path)) { return(invisible(NULL)) } - assert_that(is.flag(recursive)) + assert_that(is.flag(recursive), noNA(recursive)) - to_do <- list.files( - path = path, - pattern = "\\.yml$", - recursive = recursive, - full.names = TRUE - ) - keep <- list.files( - path = path, - pattern = "\\.tsv$", - recursive = recursive, - full.names = TRUE - ) + to_do <- list.files(path = path, pattern = "\\.yml$", recursive = recursive, + full.names = TRUE) + keep <- list.files(path = path, pattern = "\\.tsv$", recursive = recursive, + full.names = TRUE) keep <- gsub("\\.tsv$", ".yml", keep) to_do <- to_do[!to_do %in% keep] + to_do_base <- remove_root(file = to_do, root = root) + check <- vapply(X = gsub(".yml$", "", to_do_base), FUN = is_git2rmeta, + FUN.VALUE = NA, root = root, message = "none") + if (any(!check)) { + warning("Invalid metadata files found. See ?is_git2rmeta():\n", + paste(to_do_base[!check], collapse = "\n")) + } + to_do <- to_do[check] + file.remove(to_do) - to_do <- gsub(paste0("^", root, "/"), "", to_do) + to_do <- remove_root(file = to_do, root = root) return(invisible(to_do)) } @@ -140,7 +167,7 @@ prune_meta.character <- function( #' @importFrom assertthat assert_that is.flag #' @importFrom git2r workdir add #' @include write_vc.R -#' @param stage stage the changes after removing the files. Defaults to FALSE. +#' @param stage stage the changes after removing the files. Defaults to `FALSE`. #' @rdname prune_meta prune_meta.git_repository <- function( root, path = NULL, recursive = TRUE, ..., stage = FALSE @@ -179,7 +206,9 @@ prune_meta.git_repository <- function( )) changed <- gsub("\\.tsv$", ".yml", file.path(root_wd, changed, fsep = "/")) if (any(to_do %in% changed)) { - stop("cannot remove and stage metadata when data is removed but unstaged") + stop( +"cannot remove and stage metadata in combination with removed but unstaged data" + ) } } else { changed <- unlist(status( @@ -191,7 +220,7 @@ prune_meta.git_repository <- function( } } file.remove(to_do) - to_do <- gsub(sprintf("^%s/(.*)$", root_wd), "\\1", to_do) + to_do <- remove_root(file = to_do, root = root_wd) if (stage) { add(repo = root, path = to_do) diff --git a/R/read_vc.R b/R/read_vc.R index 40cc340..ddee500 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -1,8 +1,17 @@ -#' Read a \code{data.frame} +#' Read a Git2rdata Object from Disk +#' +#' @description +#' `read_vc()` handles git2rdata objects stored by `write_vc()`. It reads and +#' verifies the metadata file (`.yml`). Then it reads and verifies the raw data. +#' The last step is backtransforming any transformation done by `meta()` to +#' return the `data.frame` as stored by `write_vc()`. +#' +#' `read_vc()` is an S3 generic on `root` which currently handles `"character"` +#' (a path) and `"git-repository"` (from `git2r`). S3 methods for other version +#' control system could be added. #' -#' Note that the dataframe has to be written with `write_vc()` before it can be read with `read_vc()`. #' @inheritParams write_vc -#' @return The \code{data.frame} with the file names and hashes as attributes +#' @return The `data.frame` with the file names and hashes as attributes. 
#' @rdname read_vc #' @export #' @family storage @@ -27,6 +36,17 @@ read_vc.character <- function(file, root = ".") { root <- normalizePath(root, winslash = "/", mustWork = TRUE) file <- clean_data_path(root = root, file = file) + tryCatch( + is_git2rdata(file = remove_root(file = file["meta_file"], root = root), + root = root, message = "error"), + error = function(e) { + if (e$message == "Corrupt data, mismatching data hash.") { + warning("Mismatching data hash. Data altered outside of git2rdata.") + } else { + stop(e$message) + } + } + ) assert_that( all(file.exists(file)), msg = "raw file and/or meta file missing" ) @@ -34,7 +54,6 @@ read_vc.character <- function(file, root = ".") { # read the metadata meta_data <- read_yaml(file["meta_file"]) - assert_that(has_name(meta_data, "..generic")) optimize <- meta_data[["..generic"]][["optimize"]] if (optimize) { col_type <- c( @@ -58,8 +77,8 @@ read_vc.character <- function(file, root = ".") { raw_data <- read.table( file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", dec = ".", numerals = "warn.loss", na.strings = na_string, - colClasses = setNames(col_type[col_classes], col_names), - stringsAsFactors = FALSE, fileEncoding = "UTF-8", encoding = "UTF-8" + colClasses = setNames(col_type[col_classes], col_names), comment.char = "", + stringsAsFactors = FALSE, fileEncoding = "UTF-8" ) # reinstate factors diff --git a/R/recent_commit.R b/R/recent_commit.R index 6d8a11f..90048a1 100644 --- a/R/recent_commit.R +++ b/R/recent_commit.R @@ -1,12 +1,21 @@ -#' Most recent file change +#' Retrieve the Most Recent File Change #' -#' Retrieve the most recent commit in which a file or data object was added or updated. +#' @description +#' Retrieve the most recent commit that added or updated a file or git2rdata +#' object. This does not imply that the file still exists at the current HEAD, as it +#' ignores the deletion of files. +#' +#' Use this information to document the current version of a file or git2rdata +#' object in an analysis. Since it refers to the most recent change of this +#' file, it remains unchanged by committing changes to other files. You can +#' also use it to track if data got updated, requiring an analysis to +#' be rerun. See `vignette("workflow", package = "git2rdata")`. #' @inheritParams write_vc -#' @param root The root of a project. Can be a file path or a `git-repository` -#' @param data does `file` refers to a data object (TRUE) or to a file (FALSE). -#' Defaults to FALSE. +#' @param root The root of a project. Can be a file path or a `git-repository`. +#' @param data does `file` refer to a data object (`TRUE`) or to a file (`FALSE`). +#' Defaults to `FALSE`. #' @return a `data.frame` with `commit`, `author` and `when` for the most recent -#' commit in which the file was altered +#' commit that adds or updates the file.
#' @export #' @family version_control #' @examples @@ -17,22 +26,27 @@ #' git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") #' #' # write and commit a first dataframe -#' write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", stage = TRUE) +#' # store the output of write_vc() minimize screen output +#' junk <- write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", +#' stage = TRUE) #' commit(repo, "important analysis", session = TRUE) #' list.files(repo_path) #' Sys.sleep(1.1) # required because git doesn't handle subsecond timings #' #' # write and commit a second dataframe -#' write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", stage = TRUE) +#' junk <- write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", +#' stage = TRUE) #' commit(repo, "important analysis", session = TRUE) #' list.files(repo_path) #' Sys.sleep(1.1) # required because git doesn't handle subsecond timings #' #' # write and commit a new version of the first dataframe -#' write_vc(iris[7:12, ], "iris", repo, stage = TRUE) +#' junk <- write_vc(iris[7:12, ], "iris", repo, stage = TRUE) #' list.files(repo_path) #' commit(repo, "important analysis", session = TRUE) #' +#' +#' #' # find out in which commit a file was last changed #' #' # "iris.tsv" was last updated in the third commit @@ -41,39 +55,54 @@ #' recent_commit("iris.yml", repo) #' # "iris2.yml" was last updated in the second commit #' recent_commit("iris2.yml", repo) -#' # the data object "iris" was last updated in the third commit +#' # the git2rdata object "iris" was last updated in the third commit #' recent_commit("iris", repo, data = TRUE) #' -#' # remove a dataframe and commit it +#' # remove a dataframe and commit it to see what happens with deleted files #' file.remove(file.path(repo_path, "iris.tsv")) #' prune_meta(repo, ".") #' commit(repo, message = "remove iris", all = TRUE, session = TRUE) +#' list.files(repo_path) #' -#' # still points to the third commit as it is the latest commit in which the +#' # still points to the third commit as this is the latest commit in which the #' # data was present #' recent_commit("iris", repo, data = TRUE) +#' +#' #' clean up +#' junk <- file.remove( +#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, +#' include.dirs = TRUE, all.files = TRUE)), +#' repo_path) recent_commit <- function(file, root, data = FALSE){ UseMethod("recent_commit", root) } #' @export -#' @importFrom assertthat assert_that is.string is.flag -#' @importFrom git2r odb_blobs last_commit +recent_commit.default <- function(file, root, data = FALSE) { + stop("a 'root' of class ", class(root), " is not supported", call. 
= FALSE) +} + +#' @export +#' @importFrom assertthat assert_that is.string is.flag noNA +#' @importFrom git2r odb_blobs last_commit workdir recent_commit.git_repository <- function(file, root, data = FALSE) { - assert_that(is.string(file), is.flag(data)) + assert_that(is.string(file), is.flag(data), noNA(data)) - if (isTRUE(data)) { - file <- clean_data_path(root = ".", file, normalize = FALSE) + path <- unique(dirname(file)) + if (path == ".") { + path <- "" + } + if (data) { + file <- clean_data_path(root = workdir(root), file, normalize = FALSE) } name <- basename(file) - path <- gsub("^\\./?", "", unique(dirname(file))) blobs <- odb_blobs(root) blobs <- blobs[blobs$path == path & blobs$name %in% name, ] blobs <- blobs[blobs$when <= as.data.frame(last_commit(root))$when, ] blobs <- blobs[blobs$when == max(blobs$when), c("commit", "author", "when")] blobs <- unique(blobs) if (nrow(blobs) > 1) { - warning("Multiple commits within the same second") + warning("More than one commit within the same second") } rownames(blobs) <- NULL blobs diff --git a/R/reexport.R b/R/reexport.R index f454a09..f482284 100644 --- a/R/reexport.R +++ b/R/reexport.R @@ -1,42 +1,42 @@ -#' Reexported function from git2r +#' Reexported Function From `git2r` #' -#' See \code{\link[git2r]{repository}} +#' See \code{\link[git2r]{repository}} in `git2r`. #' @name repository #' @importFrom git2r repository #' @family version_control #' @export NULL -#' Reexported function from git2r +#' Reexported Function From `git2r` #' -#' See \code{\link[git2r]{status}} +#' See \code{\link[git2r]{status}} in `git2r`. #' @name status #' @importFrom git2r status #' @family version_control #' @export NULL -#' Reexported function from git2r +#' Reexported Function From `git2r` #' -#' See \code{\link[git2r]{commit}} +#' See \code{\link[git2r]{commit}} in `git2r`. #' @name commit #' @importFrom git2r commit #' @family version_control #' @export NULL -#' Reexported function from git2r +#' Reexported Function From `git2r` #' -#' See \code{\link[git2r]{pull}} +#' See \code{\link[git2r]{pull}} in `git2r`. #' @name pull #' @importFrom git2r pull #' @family version_control #' @export NULL -#' Reexported function from git2r +#' Reexported Function From `git2r` #' -#' See \code{\link[git2r]{push}} +#' See \code{\link[git2r]{push}} in `git2r`. #' @name push #' @importFrom git2r push #' @family version_control diff --git a/R/relabel.R b/R/relabel.R index 3a41183..230bc42 100644 --- a/R/relabel.R +++ b/R/relabel.R @@ -1,41 +1,67 @@ -#' Relabel factor levels +#' Relabel Factor Levels by Updating the Metadata #' -#' Imaging the situation where we have a dataframe with a factor variable and we +#' Imagine the situation where we have a dataframe with a factor variable and we #' have stored it with `write_vc(optimize = TRUE)`. The raw data file contains #' the factor indices and the metadata contains the link between the factor -#' index and the corresponding label. +#' index and the corresponding label. See +#' `vignette("version_control", package = "git2rdata")`. In such a case, +#' relabeling a factor can be fast and lightweight by updating the metadata. #' @inheritParams write_vc -#' @param change either list or a data.frame. In case of a list is a named list -#' with named vectors. The name of list elements must match the names of the -#' variables. The names of the vector elements must match the existing factor -#' labels. The values represent the new factor labels. 
In case of a data.frame -#' it needs to have the variables `factor` (name of the factor), `old` (the old) -#' factor label and `new` (the new factor label). Other columns are ignored. -#' @return invisible `NULL` +#' @param change either a `list` or a `data.frame`. In case of a `list`, it is a +#' named `list` with named `vectors`. The names of list elements must match the +#' names of the variables. The names of the vector elements must match the +#' existing factor labels. The values represent the new factor labels. In case +#' of a `data.frame`, it needs to have the variables `factor` (the name of the +#' factor), `old` (the old factor label) and `new` (the new factor label). +#' `relabel()` ignores all other columns. +#' @return invisible `NULL`. #' @export #' @examples #' -#' # setup a directory -#' root <- tempfile("git2rdata-relabel") -#' dir.create(root) +#' # initialise a git repo using git2r +#' repo_path <- tempfile("git2rdata-repo-") +#' dir.create(repo_path) +#' repo <- git2r::init(repo_path) +#' git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") #' -#' # create a dataframe and store it +#' # Create a dataframe and store it as an optimized git2rdata object. +#' # Note that write_vc() uses optimization by default. +#' # Stage and commit the git2rdata object. #' ds <- ds <- data.frame(a = c("a1", "a2"), b = c("b2", "b1")) -#' write_vc(ds, "relabel", root, sorting = "b") +#' junk <- write_vc(ds, "relabel", repo, sorting = "b", stage = TRUE) +#' cm <- commit(repo, "initial commit") +#' # check that the workspace is clean +#' status(repo) #' -#' # define new labels as a list and apply them +#' # Define new labels as a list and apply them to the git2rdata object. #' new_labels <- list( #' a = list(a2 = "a3") #' ) -#' relabel("relabel", root, new_labels) +#' relabel("relabel", repo, new_labels) +#' # check the changes +#' read_vc("relabel", repo) +#' # relabel() changed the metadata, not the raw data +#' status(repo) +#' git2r::add(repo, "relabel.*") +#' cm <- commit(repo, "relabel using a list") #' -#' # define new labels as a dataframe and apply them +#' # Define new labels as a dataframe and apply them to the git2rdata object #' change <- data.frame( #' factor = c("a", "a", "b"), #' old = c("a3", "a1", "b2"), #' new = c("c2", "c1", "b3") #' ) -#' relabel("relabel", root, change) +#' relabel("relabel", repo, change) +#' # check the changes +#' read_vc("relabel", repo) +#' # relabel() changed the metadata, not the raw data +#' status(repo) +#' +#' # clean up +#' junk <- file.remove( +#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, +#' include.dirs = TRUE, all.files = TRUE)), +#' repo_path) #' @family storage relabel <- function(file, root = ".", change) { UseMethod("relabel", change) @@ -50,20 +76,21 @@ relabel.default <- function(file, root, change) { #' @importFrom git2r workdir hash #' @importFrom assertthat assert_that is.string has_name #' @importFrom yaml read_yaml write_yaml +#' @importFrom utils packageVersion relabel.list <- function(file, root = ".", change) { if (inherits(root, "git_repository")) { return(relabel(file = file, root = workdir(root), change = change)) } assert_that(is.string(root), is.string(file)) - assert_that(!is.null(names(change)), msg = "'change' must be named") + assert_that(!is.null(names(change)), msg = "'change' has no names") root <- normalizePath(root, winslash = "/", mustWork = TRUE) + is_git2rmeta(file = file, root = root, message = "error") file <- clean_data_path(root = root, file = file) assert_that(
all(file.exists(file)), msg = "raw file and/or meta file missing" ) meta_data <- read_yaml(file["meta_file"]) - assert_that(has_name(meta_data, "..generic")) optimize <- meta_data[["..generic"]][["optimize"]] if (!optimize) { stop("relabeling factors on verbose data leads to large diffs. @@ -79,6 +106,13 @@ Use write_vc() instead.") msg = sprintf("the names in '%s' don't match existing labels", id) ) names(meta_data[[id]][["labels"]]) <- meta_data[[id]][["labels"]] + + if (any(names(change[[id]]) == "")) { + empty_change <- which(names(change[[id]]) == "") + empty_meta <- which(names(meta_data[[id]][["labels"]]) == "") + meta_data[[id]][["labels"]][empty_meta] <- change[[id]][empty_change] + change[[id]] <- change[[id]][-empty_change] + } meta_data[[id]][["labels"]][names(change[[id]])] <- change[[id]] meta_data[[id]][["labels"]] <- unname(meta_data[[id]][["labels"]]) assert_that( @@ -86,9 +120,9 @@ Use write_vc() instead.") msg = sprintf("relabeling '%s' leads to duplicated labels", id) ) } - meta_data[["..generic"]][["hash"]] <- NULL - meta_data[["..generic"]] <- c(meta_data[["..generic"]], - hash = hash(as.yaml(meta_data))) + meta_data[["..generic"]][["hash"]] <- metadata_hash(meta_data) + meta_data[["..generic"]][["git2rdata"]] <- + as.character(packageVersion("git2rdata")) write_yaml(meta_data, file["meta_file"]) return(invisible(NULL)) } diff --git a/R/upgrade_data.R b/R/upgrade_data.R new file mode 100644 index 0000000..333b59d --- /dev/null +++ b/R/upgrade_data.R @@ -0,0 +1,118 @@ +#' Upgrade Files to the New Version +#' +#' Updates the data written by older versions to the current data format +#' standard. Works both on a single file and (recursively) on a path. The +#' `".yml"` file must contain a `"..generic"` element. `upgrade_data()` ignores +#' all other files. +#' @inheritParams write_vc +#' @param verbose display a message with the update status. Defaults to `TRUE`. +#' @param path specify `path` instead of `file` to update all git2rdata objects +#' in this directory and its subdirectories. `path` is relative to `root`. Use +#' `path = "."` to upgrade all git2rdata objects under `root`. +#' @export +#' @return the git2rdata object names. +#' @family internal +#' @examples +#' # create a directory +#' root <- tempfile("git2rdata-") +#' dir.create(root) +#' +#' # write dataframes to the root +#' write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") +#' write_vc(iris[5:10, ], file = "subdir/iris", root = root, +#' sorting = "Sepal.Length") +#' # upgrade a single git2rdata object +#' upgrade_data(file = "iris", root = root) +#' # use path = "." to upgrade all git2rdata objects under root +#' upgrade_data(path = ".", root = root) +#' +#' # clean up +#' junk <- file.remove(list.files(root, full.names = TRUE), root) upgrade_data <- function(file, root = ".", verbose, ..., path) { + UseMethod("upgrade_data", root) +} + +#' @export +upgrade_data.default <- function(file, root, verbose, path, ...) { + stop("a 'root' of class ", class(root), " is not supported", call.
= FALSE) +} + +#' @importFrom assertthat assert_that is.string is.flag noNA +#' @importFrom yaml read_yaml write_yaml +#' @importFrom utils packageVersion +#' @export +upgrade_data.character <- function( + file, root = ".", verbose = TRUE, ..., path) { + assert_that(is.string(root), is.flag(verbose), noNA(verbose)) + root <- normalizePath(root, winslash = "/", mustWork = TRUE) + if (missing(file)) { + assert_that(!missing(path), + msg = "specify either 'file' or 'path'") + assert_that(is.string(path)) + full_path <- normalizePath(file.path(root, path), winslash = "/", + mustWork = TRUE) + ymls <- list.files(path = full_path, pattern = "\\.yml$", recursive = TRUE) + files <- vapply(file.path(path, ymls), upgrade_data, root = root, + verbose = verbose, FUN.VALUE = "") + return(files) + } + assert_that(missing(path), msg = "specify either 'file' or 'path'") + assert_that(is.string(file)) + file <- clean_data_path(root = root, file = file) + + meta_data <- read_yaml(file["meta_file"]) + target <- remove_root(file = file["meta_file"], root = root) + target <- gsub(".yml", "", target) + if (!has_name(meta_data, "..generic")) { + message(target, " is not a git2rdata object") + return(target) + } + assert_that( + has_name(meta_data[["..generic"]], "hash"), + msg = paste(target, "has corrupt metadata, no hash found.") + ) + if (has_name(meta_data[["..generic"]], "git2rdata")) { + if (package_version(meta_data[["..generic"]][["git2rdata"]]) == + packageVersion("git2rdata") + ) { + if (verbose) { + message(target, " already up to date") + } + return(target) + } + } + assert_that( + meta_data[["..generic"]][["hash"]] == metadata_hash(meta_data), + msg = paste(target, "has corrupt metadata: mismatching hash.") + ) + meta_data[["..generic"]][["git2rdata"]] <- + as.character(packageVersion("git2rdata")) + if (!has_name(meta_data[["..generic"]], "data_hash")) { + meta_data[["..generic"]][["data_hash"]] <- hashfile(file["raw_file"]) + } + write_yaml(meta_data, file["meta_file"], fileEncoding = "UTF-8") + if (verbose) { + message(file["meta_file"], " updated") + } + return(target) +} + +#' @rdname upgrade_data +#' @inheritParams write_vc.git_repository +#' @inheritParams git2r::add +#' @export +#' @importFrom git2r workdir add +#' @importFrom assertthat assert_that is.flag noNA +#' @importFrom git2r workdir add +upgrade_data.git_repository <- function( + file, root = ".", verbose = TRUE, ..., path, stage = FALSE, force = FALSE +) { + assert_that(is.flag(stage), noNA(stage), is.flag(force), noNA(force)) + file <- upgrade_data(file = file, root = workdir(root), verbose = verbose, + path = path, ...) + if (!stage) { + return(file) + } + add(root, path = paste0(file, ".yml"), force = force) + return(file) +} diff --git a/R/write_vc.R b/R/write_vc.R index 90df947..ab7a9bc 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -1,19 +1,25 @@ -#' Write a \code{data.frame} +#' Store a Data.Frame as a Git2rdata Object on Disk #' -#' This will create two files. The `".tsv"` file contains the raw data. -#' The `".yml"` contains the meta data on the columns in YAML format. -#' @param x the `data.frame -#' @param file the name of the file without file extension. Can include a -#' relative path. It is relative to the `root`. +#' A git2rdata object consists of two files. The `".tsv"` file contains the raw +#' data as a plain text tab separated file. The `".yml"` contains the metadata +#' on the columns in plain text YAML format. See `vignette("plain_text", package = "git2rdata")` for more details on the implementation.
+#' @param x the `data.frame`. +#' @param file the name of the git2rdata object. Git2rdata objects cannot +#' have dots in their name. The name may include a relative path. `file` is a +#' path relative to the `root`. #' @param root The root of a project. Can be a file path or a `git-repository`. -#' Defaults to the current working directory ("."). -#' @param sorting a vector of column names defining which columns to use for -#' sorting \code{x} and in what order to use them. Only required when writing -#' new metadata. -#' @param strict What to do when the metadata changes. `strict = FALSE` will -#' overwrite the data with a warning listing the changes, `strict = TRUE` will -#' return an error and leave the data as is. Default to `TRUE` -#' @param ... additional parameters used in some methods +#' Defaults to the current working directory (`"."`). +#' @param sorting an optional vector of column names defining which columns to +#' use for sorting `x` and in what order to use them. Omitting `sorting` yields +#' a warning. Add `sorting` to avoid this warning. Strongly recommended +#' in combination with version control. See +#' `vignette("efficiency", package = "git2rdata")` for an illustration of the +#' importance of sorting. +#' @param strict What to do when the metadata changes. `strict = FALSE` +#' overwrites the data and the metadata with a warning listing the changes, +#' `strict = TRUE` returns an error and leaves the data and metadata as is. +#' Defaults to `TRUE`. +#' @param ... parameters used in some methods #' @inheritParams meta #' @inheritParams utils::write.table #' @return a named vector with the file paths relative to `root`. The names @@ -21,6 +27,8 @@ #' @export #' @family storage #' @template example-io +#' @note `..generic` is a reserved name for the metadata and cannot be used as +#' column name in a `data.frame`. write_vc <- function( x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ... @@ -55,6 +63,13 @@ write_vc.character <- function( } if (file.exists(file["meta_file"])) { + tryCatch( + is_git2rmeta(file = remove_root(file = file["meta_file"], root = root), + root = root, message = "error"), + error = function(e) { + stop(paste("Existing metadata file is invalid.", e$message, sep = "\n")) + } + ) old <- read_yaml(file["meta_file"]) class(old) <- "meta_list" raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting, @@ -62,27 +77,31 @@ write_vc.character <- function( problems <- compare_meta(attr(raw_data, "meta"), old) if (length(problems)) { if (strict) { + problems <- c( + "The data was not overwritten because of the issues below.", +"See vignette('version_control', package = 'git2rdata') for more information.", + "", problems) stop(paste(problems, collapse = "\n"), call. 
= FALSE) } warning(paste(problems, collapse = "\n")) if (missing(sorting) && !is.null(old[["..generic"]][["sorting"]])) { sorting <- old[["..generic"]][["sorting"]] } - write_yaml(attr(raw_data, "meta"), file["meta_file"], - fileEncoding = "UTF-8") } } else { raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting) - write_yaml(attr(raw_data, "meta"), file["meta_file"], - fileEncoding = "UTF-8") } write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) + meta_data <- attr(raw_data, "meta") + meta_data[["..generic"]][["data_hash"]] <- hashfile(file["raw_file"]) + write_yaml(meta_data, file["meta_file"], + fileEncoding = "UTF-8") - hashes <- gsub(paste0("^", root, "/"), "", file) + hashes <- remove_root(file = file, root = root) names(hashes) <- hashfile(file) return(hashes) @@ -92,7 +111,8 @@ write_vc.character <- function( setOldClass("git_repository") #' @rdname write_vc -#' @param stage stage the changes after writing the data. Defaults to FALSE +#' @param stage Logical value indicating whether to stage the changes after +#' writing the data. Defaults to `FALSE`. #' @inheritParams git2r::add #' @export #' @importFrom git2r workdir add @@ -114,6 +134,8 @@ write_vc.git_repository <- function( } compare_meta <- function(new, old) { + new[["..generic"]][["data_hash"]] <- NULL + old[["..generic"]][["data_hash"]] <- NULL problems <- character(0) if (isTRUE(all.equal(new, old))) { return(problems) @@ -124,7 +146,8 @@ compare_meta <- function(new, old) { problems <- c( problems, sprintf( - "new data is %s, whereas old data was %s", + "- New data is %s, whereas old data was %s. + Check the 'optimized' argument.", ifelse(new_optimize, "optimized", "verbose"), ifelse(old_optimize, "optimized", "verbose") ) @@ -134,7 +157,8 @@ compare_meta <- function(new, old) { problems <- c( problems, sprintf( - "new data uses '%s' as NA string, whereas old data used '%s'", + "- New data uses '%s' as NA string, whereas old data used '%s'. + Check the 'NA' argument.", new[["..generic"]][["NA string"]], old[["..generic"]][["NA string"]] ) ) @@ -142,34 +166,33 @@ compare_meta <- function(new, old) { new_sorting <- new[["..generic"]][["sorting"]] old_sorting <- old[["..generic"]][["sorting"]] if (!isTRUE(all.equal(new_sorting, old_sorting))) { - common_sorting <- seq_len(min(length(new_sorting), length(old_sorting))) - if (any(new_sorting[common_sorting] != old_sorting[common_sorting])) { - problems <- c(problems, "new data uses different variables for sorting") - } - if (length(old_sorting) > length(common_sorting)) { - problems <- c(problems, "new data uses less variables for sorting") - } else if (length(new_sorting) > length(common_sorting)) { - problems <- c(problems, "new data uses more variables for sorting") - } + sprintf( + "- The sorting variables changed. + - Sorting for the new data: %s. 
+ - Sorting for the old data: %s.", + paste(sprintf("'%s'", new_sorting), collapse = ", "), + paste(sprintf("'%s'", old_sorting), collapse = ", ") + ) -> extra + problems <- c(problems, extra) } new <- new[names(new) != "..generic"] old <- old[names(old) != "..generic"] if (length(new) != length(old)) { - problems <- c(problems, "new data has a different number of variables") + problems <- c(problems, "- New data has a different number of variables.") } if (!all(names(new) %in% names(old))) { problems <- c(problems, - paste( - "new variables:", + sprintf( + "- New variables: %s.", paste(names(new)[!names(new) %in% names(old)], collapse = ", ") ) ) } if (!all(names(old) %in% names(new))) { problems <- c(problems, - paste( - "deleted variables:", + sprintf( + "- Deleted variables: %s.", paste(names(old)[!names(old) %in% names(new)], collapse = ", ") ) ) @@ -181,7 +204,7 @@ compare_meta <- function(new, old) { delta <- which(old_class != new_class) if (length(delta)) { problems <- c(problems, - sprintf("change in class: %s from %s to %s", common_variables[delta], + sprintf("- Change in class: '%s' from %s to %s.", common_variables[delta], old_class[delta], new_class[delta]) ) } @@ -193,19 +216,29 @@ compare_meta <- function(new, old) { problems <- c( problems, sprintf( - "%s changes from %s to %s", id, + "- '%s' changes from %s to %s.", id, ifelse(old[[id]]$ordered, "ordinal", "nominal"), ifelse(new[[id]]$ordered, "ordinal", "nominal") ) ) } if (!isTRUE(all.equal(old[[id]][["labels"]], new[[id]][["labels"]]))) { - problems <- c(problems, sprintf("new factor labels for %s", id)) + problems <- c(problems, sprintf("- New factor labels for '%s'.", id)) } if (!isTRUE(all.equal(old[[id]][["index"]], new[[id]][["index"]]))) { - problems <- c(problems, sprintf("new indices labels for %s", id)) + problems <- c(problems, sprintf("- New indices for '%s'.", id)) } } return(problems) } + +#' @noRd +#' @param file the file including the path +#' @param root the path of the root +remove_root <- function(file, root) { + n_root <- nchar(root) + 1 + has_root <- substr(file, 1, n_root) == paste0(root, "/") + file[has_root] <- substr(file[has_root], n_root + 1, nchar(file[has_root])) + return(file) +} diff --git a/README.md b/README.md index 08c246b..2887919 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,43 @@ # The `git2rdata` package -[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) -[![lifecycle](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) +[![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) +[![](https://badges.ropensci.org/263_status.svg)](https://github.com/ropensci/software-review/issues/263) + [![Licence](https://img.shields.io/badge/licence-GPL--3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0.en.html) [![minimal R version](https://img.shields.io/badge/R%3E%3D-3.4.0-6666ff.svg)](https://cran.r-project.org/) +[![DOI](https://zenodo.org/badge/147685405.svg)](https://zenodo.org/badge/latestdoi/147685405) + [![Travis-CI Build 
Status](https://travis-ci.org/inbo/git2rdata.svg?branch=master)](https://travis-ci.org/inbo/git2rdata) [![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/a3idhi9f6ls9xu8r/branch/master?svg=true)](https://ci.appveyor.com/project/ThierryO/git2rdata/branch/master) [![codecov](https://codecov.io/gh/inbo/git2rdata/branch/master/graph/badge.svg)](https://codecov.io/gh/inbo/git2rdata) -[![DOI](https://zenodo.org/badge/147685405.svg)](https://zenodo.org/badge/latestdoi/147685405) + +

Please visit the git2rdata website at https://inbo.github.io/git2rdata/. The vignette code on the website links to rendered versions of the vignettes. Functions link to their help files.

+ ## Rationale The `git2rdata` package is an R package for writing and reading dataframes as plain text files. Important information is stored in a metadata file. -1. Storing metadata allows to maintain the classes of variables. By default, the data is optimized for file storage prior to writing. This makes the data less human readable and can be turned off. Details on the implementation are available in the [plain text](https://inbo.github.io/git2rdata/articles/plain_text.html) vignette. -1. Storing metadata also allows to minimize row based [diffs](https://en.wikipedia.org/wiki/Diff) between two consecutive [commits](https://en.wikipedia.org/wiki/Commit_(version_control)). This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in the [version control](https://inbo.github.io/git2rdata/articles/version_control.html) vignette. Although `git2rdata` was envisioned with a [git](https://git-scm.com/) workflow in mind, it can also be used in combination with other version control systems like [subversion](https://subversion.apache.org/) or [mercurial](https://www.mercurial-scm.org/). -1. `git2rdata` is intended to facilitate a reproducible and traceable workflow. A toy example is given in the [workflow](https://inbo.github.io/git2rdata/articles/workflow.html) vignette. -1. The [efficiency](https://inbo.github.io/git2rdata/articles/efficiency.html) vignette provides some insight into the efficiency in terms of file storage, git repository size and speed for writing and reading. +1. Storing metadata allows to maintain the classes of variables. By default, the data is optimized for file storage prior to writing. The optimization is most effective on data containing factors. The optimization makes the data less human readable and can be turned off. Details on the implementation are available in `vignette("plain_text", package = "git2rdata")`. +1. Storing metadata also allows to minimize row based [diffs](https://en.wikipedia.org/wiki/Diff) between two consecutive [commits](https://en.wikipedia.org/wiki/Commit_(version_control)). This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in `vignette("version_control", package = "git2rdata")`. Although `git2rdata` was envisioned with a [git](https://git-scm.com/) workflow in mind, it can also be used in combination with other version control systems like [subversion](https://subversion.apache.org/) or [mercurial](https://www.mercurial-scm.org/). +1. `git2rdata` is intended to facilitate a reproducible and traceable workflow. A toy example is given in `vignette("workflow", package = "git2rdata")`. +1. `vignette("efficiency", package = "git2rdata")` provides some insight into the efficiency in terms of file storage, git repository size and speed for writing and reading. + +## Why Use Git2rdata? + +- You can store dataframes as plain text files. +- The dataframe you read has exactly the same information content as the one you wrote. + - No changes in data type. + - Factors keep their original levels, including their order. + - Date and date-time are stored in an unambiguous format, documented in the metadata. +- The data and the metadata are stored in a standard and open format, making it readable by other software. +- Data and metadata are checked during the reading. The user is informed if there is tampering with the data or metadata. 
+- Git2rdata integrates with the [`git2r`](https://cran.r-project.org/package=git2r) package for working with git repository from R. + - Another option is using git2rdata solely for writing to disk and handle the plain text files with your favourite version control system outside of R. +- The optimization reduces the required disk space by about 30% for both the working directory and the git history. +- Reading data from a HDD is 30% faster than `read.table()`, writing to a HDD takes about 70% more time than `write.table()`. +- Git2rdata is useful as a tool in a reproducible and traceable workflow. See `vignette("workflow", package = "git2rdata")`. +- You can detect when a file was last modified in the git history. Use this to check whether an existing analysis is obsolete due to new data. This allows to not rerun up to date analyses, saving resources. ## Installation @@ -36,26 +58,47 @@ remotes::install_github( remotes::install_github("inbo/git2rdata")) ``` -## Main usage +## Usage in a Nutshell Dataframes are stored using `write_vc()` and retrieved with `read_vc()`. Both functions share the arguments `root` and `file`. `root` refers to a base location where the dataframe should be stored. It can either point to a local directory or a local git repository. `file` is the file name to use and can include a path relative to `root`. Make sure the relative path stays within `root`. ```r +# using a local directory library(git2rdata) -root <- "~/myproject" # local directory -root <- git2r::repository("~/my_git_repo") # git repository +root <- "~/myproject" write_vc(my_data, file = "rel_path/filename", root = root) read_vc(file = "rel_path/filename", root = root) +root <- git2r::repository("~/my_git_repo") # git repository ``` +More details on store dataframes as plain text files in `vignette("plain_text", package = "git2rdata")`. + +```r +# using a git repository +library(git2rdata) +repo <- repository("~/my_git_repo") +pull(repo) +write_vc(my_data, file = "rel_path/filename", root = repo, stage = TRUE) +commit(repo, "My message") +push(repo) +read_vc(file = "rel_path/filename", root = repo) +``` + +Please read `vignette("version_control", package = "git2rdata")` for more details on using git2rdata in combination with version control. + +## What data sizes can `git2rdata` handle? + +The recommendation for git repositories is to use files smaller than 100 MB, an overall repository size less than 1 GB and less than 25k files. The individual file size is the limiting factor. Storing the airbag dataset ([`DAAG::nassCDS`](https://cran.r-project.org/package=DAAG)) with `write_vc()` requires on average 68 (optimized) or 97 (verbose) byte per record. The 100 MB file limit for this data is reached after about 1.5 million (optimize) or 1 million (verbose) observations. Your mileage might vary. 
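+As a rough illustration of this arithmetic (a sketch only: the 68 and 97 byte averages come from the airbag example above and will differ for your own data):
+
+```r
+# back-of-envelope estimate of how many records fit in a 100 MB file
+bytes_per_record <- c(optimized = 68, verbose = 97)
+file_limit <- 100 * 2^20             # 100 MB expressed in bytes
+floor(file_limit / bytes_per_record)
+# roughly 1.5 million (optimized) and 1.1 million (verbose) records
+```
+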
+ ## Citation Please use the output of `citation("git2rdata")` -## Folder structure +## Folder Structure - `R`: The source scripts of the [R](https://cran.r-project.org/) functions with documentation in [Roxygen](https://github.com/klutometis/roxygen) format - `man`: The help files in [Rd](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Rd-format) format +- `inst/efficiency`: pre-calculated data to speed up `vignette("efficiency", package = "git2rdata")` - `testthat`: R scripts with unit tests using the [testthat](http://testthat.r-lib.org/) framework - `vignettes`: source code for the vignettes describing the package - `man-roxygen`: templates for documentation in Roxygen format @@ -66,6 +109,8 @@ Please use the output of `citation("git2rdata")` ``` git2rdata ├── .github +├─┬ inst +│ └── efficiency ├── man ├── man-roxygen ├── pkgdown diff --git a/_pkgdown.yml b/_pkgdown.yml index af4d7fc..0020739 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -7,7 +7,7 @@ navbar: - text: Tutorials href: articles/index.html menu: - - text: Storing dataframes as plain text + - text: Getting started storing dataframes as plain text href: articles/plain_text.html - text: Storing dataframes under version control href: articles/version_control.html diff --git a/codemeta.json b/codemeta.json index a599ad9..1872de9 100644 --- a/codemeta.json +++ b/codemeta.json @@ -1,23 +1,21 @@ { - "@context": [ - "https://doi.org/10.5063/schema/codemeta-2.0", - "http://schema.org" - ], + "@context": ["https://doi.org/10.5063/schema/codemeta-2.0", "http://schema.org"], "@type": "SoftwareSourceCode", "identifier": "git2rdata", "description": "Make versioning of data.frame easy and efficient using git repositories.", "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository", "codeRepository": "https://github.com/inbo/git2rdata", + "relatedLink": "https://doi.org/10.5281/zenodo.1485309", "issueTracker": "https://github.com/inbo/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.0.3", + "version": "0.0.4", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", - "version": "3.5.2", + "version": "3.6.0", "url": "https://r-project.org" }, - "runtimePlatform": "R version 3.5.2 (2018-12-20)", + "runtimePlatform": "R version 3.6.0 (2019-04-26)", "author": [ { "@type": "Person", @@ -27,6 +25,15 @@ "@id": "https://orcid.org/0000-0001-8804-4216" } ], + "contributor": [ + { + "@type": "Person", + "givenName": "Floris", + "familyName": "Vanderhaeghe", + "email": "floris.vanderhaeghe@inbo.be", + "@id": "https://orcid.org/0000-0002-6378-6229" + } + ], "copyrightHolder": [ { "@type": "Organization", @@ -117,7 +124,7 @@ "@type": "SoftwareApplication", "identifier": "R", "name": "R", - "version": ">= 3.4.0" + "version": ">= 3.5.0" }, { "@type": "SoftwareApplication", @@ -162,30 +169,10 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "readme": "https://github.com/inbo/git2rdata/blob/master/README.md", - "fileSize": "336.015KB", - "contIntegration": [ - "https://travis-ci.org/inbo/git2rdata", - "https://ci.appveyor.com/project/ThierryO/git2rdata/branch/master", - "https://codecov.io/gh/inbo/git2rdata" - ], - "developmentStatus": "https://www.repostatus.org/#wip", "releaseNotes": "https://github.com/inbo/git2rdata/blob/master/NEWS.md", - "keywords": [ - "r", - "rstats", - "r-package", - "version-control", - "reproducible-research" - ], - "relatedLink": "https://doi.org/10.5281/zenodo.1485309", - "contributor": [ - { - "@type": "Person", - "givenName": "Floris", - 
"familyName": "Vanderhaeghe", - "email": "floris.vanderhaeghe@inbo.be", - "@id": "https://orcid.org/0000-0002-6378-6229" - } - ] + "readme": "https://github.com/inbo/git2rdata/blob/master/README.md", + "fileSize": "347.568KB", + "contIntegration": ["https://travis-ci.org/inbo/git2rdata", "https://ci.appveyor.com/project/ThierryO/git2rdata/branch/master", "https://codecov.io/gh/inbo/git2rdata"], + "developmentStatus": ["https://www.repostatus.org/#active", "https://www.tidyverse.org/lifecycle/#maturing"], + "keywords": ["r", "rstats", "r-package", "version-control", "reproducible-research"] } diff --git a/inst/efficiency/file_timings.rds b/inst/efficiency/file_timings.rds index 9b0b9ad..8c0ac74 100644 Binary files a/inst/efficiency/file_timings.rds and b/inst/efficiency/file_timings.rds differ diff --git a/inst/efficiency/git_size.rds b/inst/efficiency/git_size.rds index cb5643e..a220848 100644 Binary files a/inst/efficiency/git_size.rds and b/inst/efficiency/git_size.rds differ diff --git a/inst/efficiency/read_timings.rds b/inst/efficiency/read_timings.rds index 74f43f6..03c19c0 100644 Binary files a/inst/efficiency/read_timings.rds and b/inst/efficiency/read_timings.rds differ diff --git a/man-roxygen/example-io.R b/man-roxygen/example-io.R index a00db19..b5ae749 100644 --- a/man-roxygen/example-io.R +++ b/man-roxygen/example-io.R @@ -7,23 +7,24 @@ #' #' # write a dataframe to the directory #' write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") -#' # check that a data file (.tsv) and a meta data file (.yml) are created +#' # check that a data file (.tsv) and a metadata file (.yml) exist. #' list.files(root, recursive = TRUE) -#' # read the data from the directory +#' # read the git2rdata object from the directory #' read_vc("iris", root) #' -#' # store a new version +#' # store a new version with different observations but the same metadata #' write_vc(iris[1:5, ], "iris", root) #' list.files(root, recursive = TRUE) -#' # store a new version in case the meta data must change +#' # Removing a column requires version requires new metadata. +#' # Add strict = FALSE to override the existing metadata. #' write_vc( #' iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE #' ) #' list.files(root, recursive = TRUE) -#' # storing the first version again required another update of the meta data +#' # storing the orignal version again requires another update of the metadata #' write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE) #' list.files(root, recursive = TRUE) -#' # storing the data in verbose format leads to larger files +#' # optimize = FALSE stores the data more verbose. This requires larger files. #' write_vc( #' iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE #' ) @@ -31,7 +32,7 @@ #' #' #' -#' ## on git repo +#' ## on git repo using a git2r::git-repository #' #' # initialise a git repo using the git2r package #' repo_path <- tempfile("git2rdata-repo-") @@ -39,31 +40,27 @@ #' repo <- git2r::init(repo_path) #' git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") #' -#' # store a dataframe in git repo +#' # store a dataframe in git repo. #' write_vc(iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length") +#' # This git2rdata object is not staged by default. 
#' status(repo) #' # read a dataframe from a git repo #' read_vc("iris", repo) #' -#' # store a new version of in the git repo +#' # store a new version in the git repo and stage it in one go #' write_vc(iris[1:5, ], "iris", repo, stage = TRUE) #' status(repo) #' -#' # store a version with altered meta data -#' write_vc( -#' iris[1:6, -2], "iris", repo, sorting = "Sepal.Length", strict = FALSE -#' ) -#' status(repo) -#' -#' # store the original version again -#' write_vc( -#' iris[1:6, ], "iris", repo, sorting = "Sepal.Width", strict = FALSE, -#' stage = TRUE -#' ) -#' status(repo) -#' -#' # store a verbose version in separate files +#' # store a verbose version in a different gir2data object #' write_vc( #' iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE #' ) #' status(repo) +#' +#' # clean up +#' junk <- file.remove( +#' list.files(root, full.names = TRUE, recursive = TRUE), root) +#' junk <- file.remove( +#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, +#' include.dirs = TRUE, all.files = TRUE)), +#' repo_path) diff --git a/man-roxygen/example-isgit2r.R b/man-roxygen/example-isgit2r.R new file mode 100644 index 0000000..ac184be --- /dev/null +++ b/man-roxygen/example-isgit2r.R @@ -0,0 +1,26 @@ +#' @examples +#' # create a directory +#' root <- tempfile("git2rdata-") +#' dir.create(root) +#' +#' # store a file +#' write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +#' # check the stored file +#' is_git2rmeta("iris", root) +#' is_git2rdata("iris", root) +#' +#' # Remove the metadata from the existing git2rdata object. Then it stops +#' # being a git2rdata object. +#' junk <- file.remove(file.path(root, "iris.yml")) +#' is_git2rmeta("iris", root) +#' is_git2rdata("iris", root) +#' +#' # recreate the file and remove the data and keep the metadata. It stops being +#' # a git2rdata object, but the metadata remains valid. +#' write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +#' junk <- file.remove(file.path(root, "iris.tsv")) +#' is_git2rmeta("iris", root) +#' is_git2rdata("iris", root) +#' +#' # clean up +#' junk <- file.remove(list.files(root, full.names = TRUE), root) diff --git a/man-roxygen/example-prune.R b/man-roxygen/example-prune.R index f4fd703..d8fcba8 100644 --- a/man-roxygen/example-prune.R +++ b/man-roxygen/example-prune.R @@ -5,21 +5,32 @@ #' root <- tempfile("git2rdata-") #' dir.create(root) #' -#' # store a dataframe -#' write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") -#' # list the available data and the files +#' # store a dataframe as git2rdata object. 
Capture the result to minimise +#' # screen output +#' junk <- write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +#' # write a standard tab separate file (non git2rdata object) +#' write.table(iris, file = file.path(root, "standard.tsv"), sep = "\t") +#' # write a YAML file +#' yml <- list( +#' authors = list( +#' "Research Institute for Nature and Forest" = list( +#' href = "https://www.inbo.be/en"))) +#' yaml::write_yaml(yml, file = file.path(root, "_pkgdown.yml")) +#' +#' # list the git2rdata objects #' list_data(root) +#' # list the files #' list.files(root, recursive = TRUE) #' -#' # remove all .tsv files with an associated .yml file +#' # remove all .tsv files from valid git2rdata objects #' rm_data(root, path = ".") -#' # check the removal of the data +#' # check the removal of the .tsv file #' list.files(root, recursive = TRUE) #' list_data(root) #' -#' # remove dangling meta data files +#' # remove dangling git2rdata metadata files #' prune_meta(root, path = ".") -#' # check the removal of the meta data +#' # check the removal of the metadata #' list.files(root, recursive = TRUE) #' list_data(root) #' @@ -48,8 +59,17 @@ #' list_data(repo) #' status(repo) #' -#' # remove dangling meta data +#' # remove dangling metadata #' prune_meta(repo, path = ".") #' # check the removal #' list_data(repo) #' status(repo) +#' +#' # clean up +#' junk <- file.remove( +#' list.files(root, full.names = TRUE, recursive = TRUE), root) +#' junk <- file.remove( +#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, +#' include.dirs = TRUE, all.files = TRUE)), +#' repo_path) + diff --git a/man/commit.Rd b/man/commit.Rd index a03d22a..9a8c656 100644 --- a/man/commit.Rd +++ b/man/commit.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/reexport.R \name{commit} \alias{commit} -\title{Reexported function from git2r} +\title{Reexported Function From \code{git2r}} \description{ -See \code{\link[git2r]{commit}} +See \code{\link[git2r]{commit}} in \code{git2r}. } \seealso{ Other version_control: \code{\link{pull}}, diff --git a/man/is_git2rdata.Rd b/man/is_git2rdata.Rd new file mode 100644 index 0000000..d75e415 --- /dev/null +++ b/man/is_git2rdata.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/is_git2rdata.R +\name{is_git2rdata} +\alias{is_git2rdata} +\title{Check Whether a Git2rdata Object is Valid.} +\usage{ +is_git2rdata(file, root = ".", message = c("none", "warning", "error")) +} +\arguments{ +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} + +\item{root}{The root of a project. Can be a file path or a \code{git-repository}. +Defaults to the current working directory (\code{"."}).} + +\item{message}{a single value indicating the type of messages on top of the +logical value. \code{"none"}: no messages, \code{"warning"}: issue a warning in case of +an invalid metadata file. \code{"error"}: an invalid metadata file results in an +error. Defaults to \code{"none"}.} +} +\value{ +A logical value. \code{TRUE} in case of a valid git2rdata object. +Otherwise \code{FALSE}. +} +\description{ +A valid git2rdata object has valid metadata. The data hash must match the +data hash stored in the metadata. 
+} +\examples{ +# create a directory +root <- tempfile("git2rdata-") +dir.create(root) + +# store a file +write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +# check the stored file +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# Remove the metadata from the existing git2rdata object. Then it stops +# being a git2rdata object. +junk <- file.remove(file.path(root, "iris.yml")) +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# recreate the file and remove the data and keep the metadata. It stops being +# a git2rdata object, but the metadata remains valid. +write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +junk <- file.remove(file.path(root, "iris.tsv")) +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# clean up +junk <- file.remove(list.files(root, full.names = TRUE), root) +} +\seealso{ +Other internal: \code{\link{is_git2rmeta}}, + \code{\link{meta}}, \code{\link{upgrade_data}} +} +\concept{internal} diff --git a/man/is_git2rmeta.Rd b/man/is_git2rmeta.Rd new file mode 100644 index 0000000..74222e2 --- /dev/null +++ b/man/is_git2rmeta.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/is_git2rmeta.R +\name{is_git2rmeta} +\alias{is_git2rmeta} +\title{Check Whether a Git2rdata Object Has Valid Metadata.} +\usage{ +is_git2rmeta(file, root = ".", message = c("none", "warning", "error")) +} +\arguments{ +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} + +\item{root}{The root of a project. Can be a file path or a \code{git-repository}. +Defaults to the current working directory (\code{"."}).} + +\item{message}{a single value indicating the type of messages on top of the +logical value. \code{"none"}: no messages, \code{"warning"}: issue a warning in case of +an invalid metadata file. \code{"error"}: an invalid metadata file results in an +error. Defaults to \code{"none"}.} +} +\value{ +A logical value. \code{TRUE} in case of a valid metadata file. Otherwise +\code{FALSE}. +} +\description{ +Valid metadata is a file with \code{.yml} extension. It has a top level item +\code{..generic}. This item contains \code{git2rdata} (the version number), \code{hash} (a +hash on the metadata) and \code{data_hash} (a hash on the data file). The version +number must be the current version. +} +\examples{ +# create a directory +root <- tempfile("git2rdata-") +dir.create(root) + +# store a file +write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +# check the stored file +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# Remove the metadata from the existing git2rdata object. Then it stops +# being a git2rdata object. +junk <- file.remove(file.path(root, "iris.yml")) +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# recreate the file and remove the data and keep the metadata. It stops being +# a git2rdata object, but the metadata remains valid. 
+write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +junk <- file.remove(file.path(root, "iris.tsv")) +is_git2rmeta("iris", root) +is_git2rdata("iris", root) + +# clean up +junk <- file.remove(list.files(root, full.names = TRUE), root) +} +\seealso{ +Other internal: \code{\link{is_git2rdata}}, + \code{\link{meta}}, \code{\link{upgrade_data}} +} +\concept{internal} diff --git a/man/list_data.Rd b/man/list_data.Rd index 5e7c31e..b5105c0 100644 --- a/man/list_data.Rd +++ b/man/list_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/list_data.R \name{list_data} \alias{list_data} -\title{List available data files} +\title{List Available Git2rdata Files Containing Data} \usage{ list_data(root = ".", path = ".", recursive = TRUE) } @@ -14,10 +14,14 @@ list_data(root = ".", path = ".", recursive = TRUE) \item{recursive}{logical. Should the listing recurse into directories?} } \value{ -a character vector is dataframe names, including their relative path +A character vector of git2rdata object names, including their +relative path. } \description{ -List available data files +The function returns the names of all valid git2rdata objects. This implies +\code{.tsv} files with a matching \strong{valid} metadata file (\code{.yml}). \strong{Invalid} +metadata files result in a warning. The function ignores \strong{valid} metadata +files without matching raw data (\code{.tsv}). } \examples{ ## on file system @@ -26,21 +30,32 @@ List available data files root <- tempfile("git2rdata-") dir.create(root) -# store a dataframe -write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") -# list the available data and the files +# store a dataframe as git2rdata object. Capture the result to minimise +# screen output +junk <- write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +# write a standard tab separate file (non git2rdata object) +write.table(iris, file = file.path(root, "standard.tsv"), sep = "\\t") +# write a YAML file +yml <- list( + authors = list( + "Research Institute for Nature and Forest" = list( + href = "https://www.inbo.be/en"))) +yaml::write_yaml(yml, file = file.path(root, "_pkgdown.yml")) + +# list the git2rdata objects list_data(root) +# list the files list.files(root, recursive = TRUE) -# remove all .tsv files with an associated .yml file +# remove all .tsv files from valid git2rdata objects rm_data(root, path = ".") -# check the removal of the data +# check the removal of the .tsv file list.files(root, recursive = TRUE) list_data(root) -# remove dangling meta data files +# remove dangling git2rdata metadata files prune_meta(root, path = ".") -# check the removal of the meta data +# check the removal of the metadata list.files(root, recursive = TRUE) list_data(root) @@ -69,11 +84,19 @@ rm_data(repo, path = ".") list_data(repo) status(repo) -# remove dangling meta data +# remove dangling metadata prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) + +# clean up +junk <- file.remove( + list.files(root, full.names = TRUE, recursive = TRUE), root) +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{prune_meta}}, diff --git a/man/meta.Rd b/man/meta.Rd index 417d585..4bb6bc9 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -8,7 +8,8 @@ \alias{meta.logical} \alias{meta.POSIXct} \alias{meta.Date} -\title{Optimize a vector for storage as plain text and add meta data} +\alias{meta.data.frame} +\title{Optimize an Object for 
Storage as Plain Text and Add Metadata} \usage{ meta(x, ...) @@ -21,24 +22,47 @@ meta(x, ...) \method{meta}{POSIXct}(x, optimize = TRUE, ...) \method{meta}{Date}(x, optimize = TRUE, ...) + +\method{meta}{data.frame}(x, optimize = TRUE, na = "NA", sorting, ...) } \arguments{ -\item{x}{the vector} +\item{x}{the vector.} -\item{...}{further arguments to the methods} +\item{...}{further arguments to the methods.} \item{na}{the string to use for missing values in the data.} -\item{optimize}{recode the data to get smaller text files. Defaults to TRUE} +\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. If +\code{FALSE}, \code{meta()} converts the data to character. Defaults to \code{TRUE}.} + +\item{index}{an optional named vector with existing factor indices. The names +must match the existing factor levels. Unmatched levels from \code{x} will get new +indices.} -\item{index}{an optional named vector with existing factor indices. The names must match the existing factor levels. Unmatched levels from \code{x} will get new indices.} +\item{sorting}{an optional vector of column names defining which columns to +use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields +a warning. Add \code{sorting} to avoid this warning. Strongly recommended +in combination with version control. See +\code{vignette("efficiency", package = "git2rdata")} for an illustration of the +importance of sorting.} } \value{ -the optimized vector \code{x} with \code{meta} attribute +the optimized vector \code{x} with \code{meta} attribute. } \description{ -\code{\link{write_vc}} applies this function automatically on your -data.frame. +Prepares a vector for storage. When relevant, \code{meta()}optimizes the object +for storage by changing the format to one which needs less characters. The +metadata stored in the \code{meta} attribute, contains all required information to +backtransform the optimized format into the original format. + +In case of a data.frame, \code{meta()} applies itself to each of the columns. The +\code{meta} attribute becomes a named list containing the metadata for each column +plus an additional \code{..generic} element. \code{..generic} is a reserved name for +the metadata and not allowed as column name in a \code{data.frame}. + +\code{\link{write_vc}} uses this function to prepare a dataframe for storage. +Existing metadata is passed through the optional \code{old} argument. This +argument intendent for internal use. } \examples{ meta(c(NA, "'NA'", '"NA"', "abc\\tdef", "abc\\ndef")) @@ -59,4 +83,8 @@ meta(as.POSIXct("2019-02-01 10:59:59", tz = "CET"), optimize = FALSE) meta(as.Date("2019-02-01")) meta(as.Date("2019-02-01"), optimize = FALSE) } +\seealso{ +Other internal: \code{\link{is_git2rdata}}, + \code{\link{is_git2rmeta}}, \code{\link{upgrade_data}} +} \concept{internal} diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd index 7513624..c8b8b73 100644 --- a/man/prune_meta.Rd +++ b/man/prune_meta.Rd @@ -3,7 +3,7 @@ \name{prune_meta} \alias{prune_meta} \alias{prune_meta.git_repository} -\title{Prune metadata files} +\title{Prune Metadata Files} \usage{ prune_meta(root = ".", path = NULL, recursive = TRUE, ...) @@ -12,22 +12,31 @@ prune_meta(root = ".", path = NULL, recursive = TRUE, ...) } \arguments{ \item{root}{The root of a project. Can be a file path or a \code{git-repository}. 
-Defaults to the current working directory (".").} +Defaults to the current working directory (\code{"."}).} -\item{path}{the directory in which to clean all the data files} +\item{path}{the directory in which to clean all the data files. The directory +is relative to \code{root}.} -\item{recursive}{remove files in subdirectories too} +\item{recursive}{remove files in subdirectories too.} -\item{...}{additional parameters used in some methods} +\item{...}{parameters used in some methods} -\item{stage}{stage the changes after removing the files. Defaults to FALSE.} +\item{stage}{stage the changes after removing the files. Defaults to \code{FALSE}.} } \value{ returns invisibily a vector of removed files names. The paths are relative to \code{root}. } \description{ -Removes all metadata (\code{.yml} files) from the \code{path} when they don't have accompanying data (\code{.tsv} file). See the \href{https://inbo.github.io/git2rdata/articles/workflow.html}{workflow} vignette (\code{vignette("workflow", package = "git2rdata")}) for some examples on how to use this. +Removes all \strong{valid} metadata (\code{.yml} files) from the \code{path} when they don't +have accompanying data (\code{.tsv} file). \strong{Invalid} metadata triggers a warning +without removing the metadata file. + +Use this function with caution since it will remove all valid metadata files +without asking for confirmation. We strongly recommend to use this +function on files under version control. See +\code{vignette("workflow", package = "git2rdata")} for some examples on how to use +this. } \examples{ ## on file system @@ -36,21 +45,32 @@ Removes all metadata (\code{.yml} files) from the \code{path} when they don't ha root <- tempfile("git2rdata-") dir.create(root) -# store a dataframe -write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") -# list the available data and the files +# store a dataframe as git2rdata object. 
Capture the result to minimise +# screen output +junk <- write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +# write a standard tab separate file (non git2rdata object) +write.table(iris, file = file.path(root, "standard.tsv"), sep = "\\t") +# write a YAML file +yml <- list( + authors = list( + "Research Institute for Nature and Forest" = list( + href = "https://www.inbo.be/en"))) +yaml::write_yaml(yml, file = file.path(root, "_pkgdown.yml")) + +# list the git2rdata objects list_data(root) +# list the files list.files(root, recursive = TRUE) -# remove all .tsv files with an associated .yml file +# remove all .tsv files from valid git2rdata objects rm_data(root, path = ".") -# check the removal of the data +# check the removal of the .tsv file list.files(root, recursive = TRUE) list_data(root) -# remove dangling meta data files +# remove dangling git2rdata metadata files prune_meta(root, path = ".") -# check the removal of the meta data +# check the removal of the metadata list.files(root, recursive = TRUE) list_data(root) @@ -79,11 +99,19 @@ rm_data(repo, path = ".") list_data(repo) status(repo) -# remove dangling meta data +# remove dangling metadata prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) + +# clean up +junk <- file.remove( + list.files(root, full.names = TRUE, recursive = TRUE), root) +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{list_data}}, diff --git a/man/pull.Rd b/man/pull.Rd index 71a49f2..f5a6511 100644 --- a/man/pull.Rd +++ b/man/pull.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/reexport.R \name{pull} \alias{pull} -\title{Reexported function from git2r} +\title{Reexported Function From \code{git2r}} \description{ -See \code{\link[git2r]{pull}} +See \code{\link[git2r]{pull}} in \code{git2r}. } \seealso{ Other version_control: \code{\link{commit}}, diff --git a/man/push.Rd b/man/push.Rd index 524d91b..c02d752 100644 --- a/man/push.Rd +++ b/man/push.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/reexport.R \name{push} \alias{push} -\title{Reexported function from git2r} +\title{Reexported Function From \code{git2r}} \description{ -See \code{\link[git2r]{push}} +See \code{\link[git2r]{push}} in \code{git2r}. } \seealso{ Other version_control: \code{\link{commit}}, diff --git a/man/read_vc.Rd b/man/read_vc.Rd index 4a72f3c..92b18a1 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -2,22 +2,30 @@ % Please edit documentation in R/read_vc.R \name{read_vc} \alias{read_vc} -\title{Read a \code{data.frame}} +\title{Read a Git2rdata Object from Disk} \usage{ read_vc(file, root = ".") } \arguments{ -\item{file}{the name of the file without file extension. Can include a -relative path. It is relative to the \code{root}.} +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} \item{root}{The root of a project. Can be a file path or a \code{git-repository}. -Defaults to the current working directory (".").} +Defaults to the current working directory (\code{"."}).} } \value{ -The \code{data.frame} with the file names and hashes as attributes +The \code{data.frame} with the file names and hashes as attributes. } \description{ -Note that the dataframe has to be written with \code{write_vc()} before it can be read with \code{read_vc()}. 
+\code{read_vc()} handles git2rdata objects stored by \code{write_vc()}. It reads and +verifies the metadata file (\code{.yml}). Then it reads and verifies the raw data. +The last step is backtransforming any transformation done by \code{meta()} to +return the \code{data.frame} as stored by \code{write_vc()}. + +\code{read_vc()} is an S3 generic on \code{root} which currently handles \code{"character"} +(a path) and \code{"git-repository"} (from \code{git2r}). S3 methods for other version +control system could be added. } \examples{ ## on file system @@ -28,23 +36,24 @@ dir.create(root) # write a dataframe to the directory write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") -# check that a data file (.tsv) and a meta data file (.yml) are created +# check that a data file (.tsv) and a metadata file (.yml) exist. list.files(root, recursive = TRUE) -# read the data from the directory +# read the git2rdata object from the directory read_vc("iris", root) -# store a new version +# store a new version with different observations but the same metadata write_vc(iris[1:5, ], "iris", root) list.files(root, recursive = TRUE) -# store a new version in case the meta data must change +# Removing a column requires version requires new metadata. +# Add strict = FALSE to override the existing metadata. write_vc( iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE ) list.files(root, recursive = TRUE) -# storing the first version again required another update of the meta data +# storing the orignal version again requires another update of the metadata write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE) list.files(root, recursive = TRUE) -# storing the data in verbose format leads to larger files +# optimize = FALSE stores the data more verbose. This requires larger files. write_vc( iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE ) @@ -52,7 +61,7 @@ list.files(root, recursive = TRUE) -## on git repo +## on git repo using a git2r::git-repository # initialise a git repo using the git2r package repo_path <- tempfile("git2rdata-repo-") @@ -60,34 +69,30 @@ dir.create(repo_path) repo <- git2r::init(repo_path) git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") -# store a dataframe in git repo +# store a dataframe in git repo. write_vc(iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length") +# This git2rdata object is not staged by default. 
status(repo) # read a dataframe from a git repo read_vc("iris", repo) -# store a new version of in the git repo +# store a new version in the git repo and stage it in one go write_vc(iris[1:5, ], "iris", repo, stage = TRUE) status(repo) -# store a version with altered meta data -write_vc( - iris[1:6, -2], "iris", repo, sorting = "Sepal.Length", strict = FALSE -) -status(repo) - -# store the original version again -write_vc( - iris[1:6, ], "iris", repo, sorting = "Sepal.Width", strict = FALSE, - stage = TRUE -) -status(repo) - -# store a verbose version in separate files +# store a verbose version in a different gir2data object write_vc( iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE ) status(repo) + +# clean up +junk <- file.remove( + list.files(root, full.names = TRUE, recursive = TRUE), root) +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{list_data}}, diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index dc7cab4..9e5ce25 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -2,25 +2,34 @@ % Please edit documentation in R/recent_commit.R \name{recent_commit} \alias{recent_commit} -\title{Most recent file change} +\title{Retrieve the Most Recent File Change} \usage{ recent_commit(file, root, data = FALSE) } \arguments{ -\item{file}{the name of the file without file extension. Can include a -relative path. It is relative to the \code{root}.} +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} -\item{root}{The root of a project. Can be a file path or a \code{git-repository}} +\item{root}{The root of a project. Can be a file path or a \code{git-repository}.} -\item{data}{does \code{file} refers to a data object (TRUE) or to a file (FALSE). -Defaults to FALSE.} +\item{data}{does \code{file} refers to a data object (\code{TRUE}) or to a file (\code{FALSE}). +Defaults to \code{FALSE}.} } \value{ a \code{data.frame} with \code{commit}, \code{author} and \code{when} for the most recent -commit in which the file was altered +commit that adds op updates the file. } \description{ -Retrieve the most recent commit in which a file or data object was added or updated. +Retrieve the most recent commit that added or updated a file or git2rdata +object. This does not imply that file still exists at the current HEAD as it +ignores the deletion of files. + +Use this information to document the current version of file or git2rdata +object in an analysis. Since it refers to the most recent change of this +file, it remains unchanged by committing changes to other files. You can +also use it to track if data got updated, requirering an analysis to +be rerun. See \code{vignette("workflow", package = "git2rdata")}. 
} \examples{ # initialise a git repo using git2r @@ -30,22 +39,27 @@ repo <- git2r::init(repo_path) git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") # write and commit a first dataframe -write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", stage = TRUE) +# store the output of write_vc() minimize screen output +junk <- write_vc(iris[1:6, ], "iris", repo, sorting = "Sepal.Length", + stage = TRUE) commit(repo, "important analysis", session = TRUE) list.files(repo_path) Sys.sleep(1.1) # required because git doesn't handle subsecond timings # write and commit a second dataframe -write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", stage = TRUE) +junk <- write_vc(iris[7:12, ], "iris2", repo, sorting = "Sepal.Length", + stage = TRUE) commit(repo, "important analysis", session = TRUE) list.files(repo_path) Sys.sleep(1.1) # required because git doesn't handle subsecond timings # write and commit a new version of the first dataframe -write_vc(iris[7:12, ], "iris", repo, stage = TRUE) +junk <- write_vc(iris[7:12, ], "iris", repo, stage = TRUE) list.files(repo_path) commit(repo, "important analysis", session = TRUE) + + # find out in which commit a file was last changed # "iris.tsv" was last updated in the third commit @@ -54,17 +68,24 @@ recent_commit("iris.tsv", repo) recent_commit("iris.yml", repo) # "iris2.yml" was last updated in the second commit recent_commit("iris2.yml", repo) -# the data object "iris" was last updated in the third commit +# the git2rdata object "iris" was last updated in the third commit recent_commit("iris", repo, data = TRUE) -# remove a dataframe and commit it +# remove a dataframe and commit it to see what happens with deleted files file.remove(file.path(repo_path, "iris.tsv")) prune_meta(repo, ".") commit(repo, message = "remove iris", all = TRUE, session = TRUE) +list.files(repo_path) -# still points to the third commit as it is the latest commit in which the +# still points to the third commit as this is the latest commit in which the # data was present recent_commit("iris", repo, data = TRUE) + +#' clean up +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other version_control: \code{\link{commit}}, diff --git a/man/relabel.Rd b/man/relabel.Rd index f13a7e5..41d49a0 100644 --- a/man/relabel.Rd +++ b/man/relabel.Rd @@ -2,56 +2,83 @@ % Please edit documentation in R/relabel.R \name{relabel} \alias{relabel} -\title{Relabel factor levels} +\title{Relabel Factor Levels by Updating the Metadata} \usage{ relabel(file, root = ".", change) } \arguments{ -\item{file}{the name of the file without file extension. Can include a -relative path. It is relative to the \code{root}.} +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} \item{root}{The root of a project. Can be a file path or a \code{git-repository}. -Defaults to the current working directory (".").} +Defaults to the current working directory (\code{"."}).} -\item{change}{either list or a data.frame. In case of a list is a named list -with named vectors. The name of list elements must match the names of the -variables. The names of the vector elements must match the existing factor -labels. The values represent the new factor labels. 
In case of a data.frame -it needs to have the variables \code{factor} (name of the factor), \code{old} (the old) -factor label and \code{new} (the new factor label). Other columns are ignored.} +\item{change}{either a \code{list} or a \code{data.frame}. In case of a \code{list} is a +named \code{list} with named \code{vectors}. The names of list elements must match the +names of the variables. The names of the vector elements must match the +existing factor labels. The values represent the new factor labels. In case +of a \code{data.frame} it needs to have the variables \code{factor} (name of the +factor), \code{old} (the old) factor label and \code{new} (the new factor label). +\code{relabel()} ignores all other columns.} } \value{ -invisible \code{NULL} +invisible \code{NULL}. } \description{ -Imaging the situation where we have a dataframe with a factor variable and we +Imagine the situation where we have a dataframe with a factor variable and we have stored it with \code{write_vc(optimize = TRUE)}. The raw data file contains the factor indices and the metadata contains the link between the factor -index and the corresponding label. +index and the corresponding label. See +\code{vignette("version_control", package = "git2rdata")}. In such a case, +relabeling a factor can be fast and lightweight by updating the metadata. } \examples{ -# setup a directory -root <- tempfile("git2rdata-relabel") -dir.create(root) +# initialise a git repo using git2r +repo_path <- tempfile("git2rdata-repo-") +dir.create(repo_path) +repo <- git2r::init(repo_path) +git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") -# create a dataframe and store it +# Create a dataframe and store it as an optimized git2rdata object. +# Note that write_vc() uses optimization by default. +# Stage and commit the git2rdata object. ds <- ds <- data.frame(a = c("a1", "a2"), b = c("b2", "b1")) -write_vc(ds, "relabel", root, sorting = "b") +junk <- write_vc(ds, "relabel", repo, sorting = "b", stage = TRUE) +cm <- commit(repo, "initial commit") +# check that the workspace is clean +status(repo) -# define new labels as a list and apply them +# Define new labels as a list and apply them to the git2rdata object. 
new_labels <- list( a = list(a2 = "a3") ) -relabel("relabel", root, new_labels) +relabel("relabel", repo, new_labels) +# check the changes +read_vc("relabel", repo) +# relabel() changed the metadata, not the raw data +status(repo) +git2r::add(repo, "relabel.*") +cm <- commit(repo, "relabel using a list") -# define new labels as a dataframe and apply them +# Define new labels as a dataframe and apply them to the git2rdata object change <- data.frame( factor = c("a", "a", "b"), old = c("a3", "a1", "b2"), new = c("c2", "c1", "b3") ) -relabel("relabel", root, change) +relabel("relabel", repo, change) +# check the changes +read_vc("relabel", repo) +# relabel() changed the metadata, not the raw data +status(repo) + +# clean up +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{list_data}}, diff --git a/man/repository.Rd b/man/repository.Rd index 4062fbb..9e653d8 100644 --- a/man/repository.Rd +++ b/man/repository.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/reexport.R \name{repository} \alias{repository} -\title{Reexported function from git2r} +\title{Reexported Function From \code{git2r}} \description{ -See \code{\link[git2r]{repository}} +See \code{\link[git2r]{repository}} in \code{git2r}. } \seealso{ Other version_control: \code{\link{commit}}, diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 66dbf85..8f64aef 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -3,7 +3,7 @@ \name{rm_data} \alias{rm_data} \alias{rm_data.git_repository} -\title{Remove data files} +\title{Remove Data Files From Git2rdata Objects} \usage{ rm_data(root = ".", path = NULL, recursive = TRUE, ...) @@ -13,24 +13,40 @@ rm_data(root = ".", path = NULL, recursive = TRUE, ...) } \arguments{ \item{root}{The root of a project. Can be a file path or a \code{git-repository}. -Defaults to the current working directory (".").} +Defaults to the current working directory (\code{"."}).} -\item{path}{the directory in which to clean all the data files} +\item{path}{the directory in which to clean all the data files. The directory +is relative to \code{root}.} -\item{recursive}{remove files in subdirectories too} +\item{recursive}{remove files in subdirectories too.} -\item{...}{additional parameters used in some methods} +\item{...}{parameters used in some methods} \item{stage}{stage the changes after removing the files. Defaults to FALSE.} -\item{type}{which classes of files should be removed. \code{unmodified} are files in the git history and unchanged since the last commit. \code{modified} are files in the git history and changed since the last commit. \code{ignored} refers to file listed in a \code{.gitignore} file. Selecting \code{modified} will remove both \code{unmodified} and \code{modified} data files. Selecting \code{ìgnored} will remove \code{unmodified}, \code{modified} and \code{ignored} data files. \code{all} refers to all visible data files, inclusing \code{untracked} files. The argument can be abbreviated to the first letter.} +\item{type}{Defines the classes of files to remove. \code{unmodified} are files in +the git history and unchanged since the last commit. \code{modified} are files in +the git history and changed since the last commit. \code{ignored} refers to file +listed in a \code{.gitignore} file. Selecting \code{modified} will remove both +\code{unmodified} and \code{modified} data files. 
Selecting \code{ìgnored} will remove +\code{unmodified}, \code{modified} and \code{ignored} data files. \code{all} refers to all +visible data files, inclusing \code{untracked} files.} } \value{ returns invisibily a vector of removed files names. The paths are relative to \code{root}. } \description{ -Removes all data (\code{.tsv} files) from the \code{path} when they have accompanying metadata (\code{.yml} file). The metadata remains untouched. See the \href{https://inbo.github.io/git2rdata/articles/workflow.html}{workflow} vignette (\code{vignette("workflow", package = "git2rdata")}) for some examples on how to use this. +Remove the data (\code{.tsv}) file from all valid git2rdata objects at the \code{path}. +The metadata remains untouched. A warning lists any git2rdata object with +\strong{invalid} metadata. The function keeps any \code{.tsv} file with +invalid metadata or from non-git2rdata objects. + +Use this function with caution since it will remove all valid data files +without asking for confirmation. We strongly recommend to use this +function on files under version control. See +\code{vignette("workflow", package = "git2rdata")} for some examples on how to use +this. } \examples{ ## on file system @@ -39,21 +55,32 @@ Removes all data (\code{.tsv} files) from the \code{path} when they have accompa root <- tempfile("git2rdata-") dir.create(root) -# store a dataframe -write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") -# list the available data and the files +# store a dataframe as git2rdata object. Capture the result to minimise +# screen output +junk <- write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") +# write a standard tab separate file (non git2rdata object) +write.table(iris, file = file.path(root, "standard.tsv"), sep = "\\t") +# write a YAML file +yml <- list( + authors = list( + "Research Institute for Nature and Forest" = list( + href = "https://www.inbo.be/en"))) +yaml::write_yaml(yml, file = file.path(root, "_pkgdown.yml")) + +# list the git2rdata objects list_data(root) +# list the files list.files(root, recursive = TRUE) -# remove all .tsv files with an associated .yml file +# remove all .tsv files from valid git2rdata objects rm_data(root, path = ".") -# check the removal of the data +# check the removal of the .tsv file list.files(root, recursive = TRUE) list_data(root) -# remove dangling meta data files +# remove dangling git2rdata metadata files prune_meta(root, path = ".") -# check the removal of the meta data +# check the removal of the metadata list.files(root, recursive = TRUE) list_data(root) @@ -82,11 +109,19 @@ rm_data(repo, path = ".") list_data(repo) status(repo) -# remove dangling meta data +# remove dangling metadata prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) + +# clean up +junk <- file.remove( + list.files(root, full.names = TRUE, recursive = TRUE), root) +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{list_data}}, diff --git a/man/status.Rd b/man/status.Rd index 124e066..8b372eb 100644 --- a/man/status.Rd +++ b/man/status.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/reexport.R \name{status} \alias{status} -\title{Reexported function from git2r} +\title{Reexported Function From \code{git2r}} \description{ -See \code{\link[git2r]{status}} +See \code{\link[git2r]{status}} in \code{git2r}. 
} \seealso{ Other version_control: \code{\link{commit}}, diff --git a/man/upgrade_data.Rd b/man/upgrade_data.Rd new file mode 100644 index 0000000..17674cc --- /dev/null +++ b/man/upgrade_data.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/upgrade_data.R +\name{upgrade_data} +\alias{upgrade_data} +\alias{upgrade_data.git_repository} +\title{Upgrade Files to the New Version} +\usage{ +upgrade_data(file, root = ".", verbose, ..., path) + +\method{upgrade_data}{git_repository}(file, root = ".", verbose = TRUE, + ..., path, stage = FALSE, force = FALSE) +} +\arguments{ +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} + +\item{root}{The root of a project. Can be a file path or a \code{git-repository}. +Defaults to the current working directory (\code{"."}).} + +\item{verbose}{display a message with the update status. Defaults to \code{TRUE}.} + +\item{...}{parameters used in some methods} + +\item{path}{specify \code{path} instead of \code{file} to update all git2rdata objects +in this directory and it's subdirectories. \code{path} is relative to \code{root}. Use +\code{path = "."} to upgrade all git2rdata objects under \code{root}.} + +\item{stage}{Logical value indicating whether to stage the changes after +writing the data. Defaults to \code{FALSE}.} + +\item{force}{Add ignored files. Default is FALSE.} +} +\value{ +the git2rdata object names. +} +\description{ +Updates the data written by older versions to the current data format +standard. Works both on a single file and (recursively) on a path. The +\code{".yml"} file must contain a \code{"..generic"} element. \code{upgrade_data()} ignores +all other files. +} +\examples{ +# create a directory +root <- tempfile("git2rdata-") +dir.create(root) + +# write dataframes to the root +write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") +write_vc(iris[5:10, ], file = "subdir/iris", root = root, + sorting = "Sepal.Length") +# upgrade a single git2rdata object +upgrade_data(file = "iris", root = root) +# use path = "." to upgrade all git2rdata objects under root +upgrade_data(path = ".", root = root) + +# clean up +junk <- file.remove(list.files(root, full.names = TRUE), root) +} +\seealso{ +Other internal: \code{\link{is_git2rdata}}, + \code{\link{is_git2rmeta}}, \code{\link{meta}} +} +\concept{internal} diff --git a/man/write_vc.Rd b/man/write_vc.Rd index eb58764..ce43ad8 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -3,7 +3,7 @@ \name{write_vc} \alias{write_vc} \alias{write_vc.git_repository} -\title{Write a \code{data.frame}} +\title{Store a Data.Frame as a Git2rdata Object on Disk} \usage{ write_vc(x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ...) @@ -12,29 +12,36 @@ write_vc(x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ..., stage = FALSE, force = FALSE) } \arguments{ -\item{x}{the `data.frame} +\item{x}{the \code{data.frame}.} -\item{file}{the name of the file without file extension. Can include a -relative path. It is relative to the \code{root}.} +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}.} \item{root}{The root of a project. Can be a file path or a \code{git-repository}. 
-Defaults to the current working directory (".").} +Defaults to the current working directory (\code{"."}).} -\item{sorting}{a vector of column names defining which columns to use for -sorting \code{x} and in what order to use them. Only required when writing -new metadata.} +\item{sorting}{an optional vector of column names defining which columns to +use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields +a warning; add \code{sorting} to avoid it. Strongly recommended +in combination with version control. See +\code{vignette("efficiency", package = "git2rdata")} for an illustration of the +importance of sorting.} -\item{strict}{What to do when the metadata changes. \code{strict = FALSE} will -overwrite the data with a warning listing the changes, \code{strict = TRUE} will -return an error and leave the data as is. Default to \code{TRUE}} +\item{strict}{What to do when the metadata changes. \code{strict = FALSE} +overwrites the data and the metadata with a warning listing the changes, +\code{strict = TRUE} returns an error and leaves the data and metadata as is. +Defaults to \code{TRUE}.} -\item{optimize}{recode the data to get smaller text files. Defaults to TRUE} +\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. If +\code{FALSE}, \code{meta()} converts the data to character. Defaults to \code{TRUE}.} \item{na}{the string to use for missing values in the data.} -\item{...}{additional parameters used in some methods} +\item{...}{parameters used in some methods} -\item{stage}{stage the changes after writing the data. Defaults to FALSE} +\item{stage}{Logical value indicating whether to stage the changes after +writing the data. Defaults to \code{FALSE}.} \item{force}{Add ignored files. Default is FALSE.} } @@ -43,8 +50,13 @@ a named vector with the file paths relative to \code{root}. The names contain the hashes of the files. } \description{ -This will create two files. The \code{".tsv"} file contains the raw data. -The \code{".yml"} contains the meta data on the columns in YAML format. +A git2rdata object consists of two files. The \code{".tsv"} file contains the raw +data as a plain text tab separated file. The \code{".yml"} contains the metadata +on the columns in plain text YAML format. See \code{vignette("plain_text", package = "git2rdata")} for more details on the implementation. +} +\note{ +\code{..generic} is a reserved name for the metadata and cannot be used as +a column name in a \code{data.frame}. } \examples{ ## on file system @@ -55,23 +67,24 @@ dir.create(root) # write a dataframe to the directory write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length") -# check that a data file (.tsv) and a meta data file (.yml) are created +# check that a data file (.tsv) and a metadata file (.yml) exist. list.files(root, recursive = TRUE) -# read the data from the directory +# read the git2rdata object from the directory read_vc("iris", root) -# store a new version +# store a new version with different observations but the same metadata write_vc(iris[1:5, ], "iris", root) list.files(root, recursive = TRUE) -# store a new version in case the meta data must change +# Removing a column requires new metadata. +# Add strict = FALSE to override the existing metadata.
write_vc( iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE ) list.files(root, recursive = TRUE) -# storing the first version again required another update of the meta data +# storing the orignal version again requires another update of the metadata write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE) list.files(root, recursive = TRUE) -# storing the data in verbose format leads to larger files +# optimize = FALSE stores the data more verbose. This requires larger files. write_vc( iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE ) @@ -79,7 +92,7 @@ list.files(root, recursive = TRUE) -## on git repo +## on git repo using a git2r::git-repository # initialise a git repo using the git2r package repo_path <- tempfile("git2rdata-repo-") @@ -87,34 +100,30 @@ dir.create(repo_path) repo <- git2r::init(repo_path) git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") -# store a dataframe in git repo +# store a dataframe in git repo. write_vc(iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length") +# This git2rdata object is not staged by default. status(repo) # read a dataframe from a git repo read_vc("iris", repo) -# store a new version of in the git repo +# store a new version in the git repo and stage it in one go write_vc(iris[1:5, ], "iris", repo, stage = TRUE) status(repo) -# store a version with altered meta data -write_vc( - iris[1:6, -2], "iris", repo, sorting = "Sepal.Length", strict = FALSE -) -status(repo) - -# store the original version again -write_vc( - iris[1:6, ], "iris", repo, sorting = "Sepal.Width", strict = FALSE, - stage = TRUE -) -status(repo) - -# store a verbose version in separate files +# store a verbose version in a different gir2data object write_vc( iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE ) status(repo) + +# clean up +junk <- file.remove( + list.files(root, full.names = TRUE, recursive = TRUE), root) +junk <- file.remove( + rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, + include.dirs = TRUE, all.files = TRUE)), + repo_path) } \seealso{ Other storage: \code{\link{list_data}}, diff --git a/sticker/sticker.R b/sticker/sticker.R index 4432084..8b3b42b 100644 --- a/sticker/sticker.R +++ b/sticker/sticker.R @@ -1,5 +1,6 @@ library(tidyverse) library(cowplot) +scale <- 1 dx <- 1 dy <- sqrt(2) corner <- 1/3 @@ -23,7 +24,7 @@ icon <- tribble( annotate( "text", label = "TXT", colour = git_colour, x = dx / 2, y = dy / 3, hjust = 0.5, vjust = 0.5, - size = 5, family = "Flanders Art Sans" + size = 5 * scale, family = "Flanders Art Sans" ) + theme_void() meta <- tribble( @@ -45,7 +46,7 @@ meta <- tribble( annotate( "text", label = "meta", colour = git_colour, x = dx / 2, y = dy / 3, hjust = 0.5, vjust = 0.5, - size = 5, family = "Flanders Art Sans" + size = 5 * scale, family = "Flanders Art Sans" ) + theme_void() hexagon <- tibble( @@ -81,24 +82,25 @@ sticker <- ggdraw() + draw_plot(hexagon) + draw_label( "git2rdata", x = 0.5, y = 0.8, - colour = git_colour, fontfamily = "Flanders Art Sans", size = 20 + colour = git_colour, fontfamily = "Flanders Art Sans", size = 20 * scale ) + draw_plot(df, x = -0.27, scale = 0.3) + draw_label( - "\u21C4", colour = git_colour, fontfamily = "Flanders Art Sans", size = 40 + "\u21C4", colour = git_colour, fontfamily = "Flanders Art Sans", + size = 40 * scale ) + draw_image("git.png", x = 0.25, y = -0.18, scale = 0.25) + draw_label( - "+", colour = git_colour, fontfamily = "Flanders Art Sans", size = 40, - x = 
0.75 + "+", colour = git_colour, fontfamily = "Flanders Art Sans", + size = 40 * scale, x = 0.75 ) + draw_plot(meta, x = 0.35, y = 0.15, scale = 0.2) + draw_plot(icon, x = 0.15, y = 0.15, scale = 0.2) save_plot( filename = "../man/figures/logo.png", sticker, - base_height = 278 / 72, - base_width = 240 / 72, + base_height = scale * 278 / 72, + base_width = scale * 240 / 72, dpi = 72, bg = NA ) diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index bbd44ef..ea9715d 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -40,26 +40,23 @@ expect_identical( ) expect_error( write_vc(data.frame(junk = 5), file = "test", root = root, sorting = "junk"), -"new data uses different variables for sorting -new data has a different number of variables -new variables: junk -deleted variables: test_character, test_factor, test_ordered, test_integer" + "The data was not overwritten because of the issues below." ) expect_error( write_vc(x = test_data, file = "test", root = root, optimize = FALSE), - "new data is verbose, whereas old data was optimized" + "New data is verbose, whereas old data was optimized" ) expect_warning( write_vc(x = test_data, file = "test", root = root, optimize = FALSE, strict = FALSE), - "new data is verbose, whereas old data was optimized" + "New data is verbose, whereas old data was optimized" ) expect_error( write_vc( x = test_data[, colnames(test_data) != "test_Date"], file = "test", root = root ), - "all sorting variables must be available" + "All sorting variables must be available" ) expect_false(any(file.exists(git2rdata:::clean_data_path(root, "a/verbose")))) @@ -87,7 +84,7 @@ for (i in colnames(stored)) { } expect_error( write_vc(x = test_data, file = "a/verbose", root = root), - "new data is optimized, whereas old data was verbose" + "New data is optimized, whereas old data was verbose" ) expect_is( @@ -117,50 +114,54 @@ expect_error( ) expect_error( write_vc(test_data, file = "error", root = root, sorting = "junk"), - "all sorting variables must be available" + "All sorting variables must be available" ) expect_false(any(file.exists(git2rdata:::clean_data_path(root, "sorting")))) +expect_warning( + write_vc(test_data, file = "error", root = root, sorting = character(0)), + "No sorting applied" +) expect_warning( output <- write_vc(test_data, file = "sorting", root = root, sorting = "test_factor"), - "sorting results in ties" + "Sorting on 'test_factor' results in ties" ) expect_is(output, "character") expect_true(all(file.exists(git2rdata:::clean_data_path(root, "sorting")))) expect_warning( write_vc(test_data, file = "sorting", root = root, sorting = c("test_factor", "test_Date"), strict = FALSE), - "new data uses more variables for sorting" + "The sorting variables changed" ) expect_error( suppressWarnings( write_vc(test_data, file = "sorting", root = root, sorting = "test_factor") ), - "new data uses less variables for sorting" + "The sorting variables changed" ) test_changed <- test_data test_changed$junk <- test_changed$test_character expect_error( suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), - "new data has a different number of variables" + "New data has a different number of variables" ) test_changed$test_character <- NULL expect_error( suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), - "new variables: junk\ndeleted variables: test_character" + "New variables: junk" ) test_changed <- test_data test_changed$test_character <- 
factor(test_changed$test_character) expect_error( suppressWarnings(write_vc(test_changed, file = "sorting", root = root )), - "change in class: test_character from character to factor" + "Change in class: 'test_character' from character to factor" ) expect_error( suppressWarnings( write_vc(test_data, file = "sorting", root = root, sorting = "test_logical") ), - "new data uses different variables for sorting" + "The sorting variables changed" ) test_changed <- test_data test_changed$test_ordered <- factor( @@ -171,7 +172,7 @@ test_changed$test_ordered <- factor( expect_error( suppressWarnings(write_vc(test_changed, file = "sorting", root = root )), - "test_ordered changes from ordinal to nominal" + "'test_ordered' changes from ordinal to nominal" ) test_no <- test_data @@ -252,7 +253,7 @@ test_that("user specified na strings work", { write_vc(x, "test_na_string_verbose", root, "a", optimize = FALSE, na = "different") ), - "new data uses 'different' as NA string, whereas old data used 'junk'" + "New data uses 'different' as NA string, whereas old data used 'junk'" ) expect_is( fn <- suppressWarnings( @@ -269,6 +270,7 @@ test_that("user specified na strings work", { grep("junk", readLines(file.path(root, fn[1]))), 2:4 ) + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("write_vc() allows changes in factor levels", { @@ -285,8 +287,9 @@ test_that("write_vc() allows changes in factor levels", { x$test_factor <- factor(x$test_factor, levels = c("a", "b", "c")) expect_error( write_vc(x, "factor_levels", root), - "new factor labels for test_factor\nnew indices labels for test_factor" + "New factor labels for 'test_factor'" ) + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("meta attributes are printed as yaml", { diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R new file mode 100644 index 0000000..b61d018 --- /dev/null +++ b/tests/testthat/test_b_is_git2rmeta.R @@ -0,0 +1,180 @@ +context("validate metadata") +root <- tempfile("git2rdata-is_git2rmeta") +dir.create(root) +test_that("is_git2rmeta checks root", { + expect_error(is_git2rmeta(file = "junk", root = 1), + "a 'root' of class numeric is not supported") + expect_error(is_git2rdata(file = "junk", root = 1), + "a 'root' of class numeric is not supported") +}) + +test_that("is_git2rmeta checks metadata", { + expect_false(is_git2rmeta(file = "junk", root = root)) + expect_false(is_git2rdata(file = "junk", root = root)) + expect_error(is_git2rmeta(file = "junk", root = root, message = "error"), + "Metadata file missing.") + expect_warning(is_git2rmeta(file = "junk", root = root, message = "warning"), + "Metadata file missing.") + expect_false( + suppressWarnings( + is_git2rmeta(file = "junk", root = root, message = "warning") + ) + ) + expect_warning(is_git2rdata(file = "junk", root = root, message = "warning"), + "Metadata file missing.") + expect_false( + suppressWarnings( + is_git2rdata(file = "junk", root = root, message = "warning") + ) + ) + + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "No '..generic' element.") + expect_warning(is_git2rmeta(file = file, 
root = root, message = "warning"), + "No '..generic' element.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["hash"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Corrupt metadata, no hash found.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Corrupt metadata, no hash found.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Data stored using an older version of `git2rdata`.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Data stored using an older version of `git2rdata`.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.3" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Data stored using an older version of `git2rdata`.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Data stored using an older version of `git2rdata`.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["data_hash"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Corrupt metadata, no data hash found.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Corrupt metadata, no data hash found.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) + + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["hash"]] <- "zzz" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rmeta(file = file, root = root)) + expect_error(is_git2rmeta(file = file, root = root, message = "error"), + "Corrupt metadata, mismatching hash.") + expect_warning(is_git2rmeta(file = file, root = root, message = "warning"), + "Corrupt metadata, mismatching hash.") + expect_false( + suppressWarnings( + is_git2rmeta(file = file, root = root, message = "warning") + ) + ) +}) + +test_that("is_git2rdata checks data", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["data_hash"]] <- "zzz" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_false(is_git2rdata(file = file, root = root)) + expect_error(is_git2rdata(file = file, root = root, message = "error"), + "Corrupt data, mismatching data hash") + expect_warning(is_git2rdata(file = file, root = root, message = "warning"), + "Corrupt data, mismatching data hash") + expect_false( + 
suppressWarnings( + is_git2rdata(file = file, root = root, message = "warning") + ) + ) + + yaml::write_yaml(correct_yaml, file.path(root, junk[2])) + correct_data <- readLines(file.path(root, junk[1])) + junk_header <- correct_data + junk_header[1] <- "junk" + writeLines(junk_header, file.path(root, junk[1])) + expect_false(is_git2rdata(file = file, root = root)) + expect_error(is_git2rdata(file = file, root = root, message = "error"), + "Corrupt data, incorrect header.") + expect_warning(is_git2rdata(file = file, root = root, message = "warning"), + "Corrupt data, incorrect header.") + expect_false( + suppressWarnings( + is_git2rdata(file = file, root = root, message = "warning") + ) + ) + + file.remove(file.path(root, junk[1])) + expect_false(is_git2rdata(file = file, root = root)) + expect_error(is_git2rdata(file = file, root = root, message = "error"), + "Data file missing.") + expect_warning(is_git2rdata(file = file, root = root, message = "warning"), + "Data file missing.") + expect_false( + suppressWarnings( + is_git2rdata(file = file, root = root, message = "warning") + ) + ) +}) + +root <- git2r::init(root) +git2r::config(root, user.name = "Alice", user.email = "alice@example.org") + +test_that("is_git2rmeta handle git repositories", { + file <- basename(tempfile(tmpdir = git2r::workdir(root))) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + expect_true(is_git2rmeta(file = file, root = root)) + expect_true(is_git2rdata(file = file, root = root)) +}) + +file.remove(list.files(git2r::workdir(root), recursive = TRUE, + full.names = TRUE)) +file.remove(list.files(git2r::workdir(root), recursive = TRUE, + include.dirs = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_b_prune.R b/tests/testthat/test_b_prune.R index d8a7bf0..27173b4 100644 --- a/tests/testthat/test_b_prune.R +++ b/tests/testthat/test_b_prune.R @@ -4,7 +4,7 @@ expect_error(rm_data(root = 1), "a 'root' of class numeric is not supported") expect_error(prune_meta(root = 1), "a 'root' of class numeric is not supported") expect_error(list_data(root = 1), "a 'root' of class numeric is not supported") -root <- tempfile(pattern = "git2rdata-") +root <- tempfile(pattern = "git2rdata-prune") root <- normalizePath(root, winslash = "/", mustWork = FALSE) expect_error(rm_data(root, "."), root) expect_error(prune_meta(root), root) @@ -34,4 +34,41 @@ file.remove(file.path(root, "test.yml")) current <- list.files(root, recursive = TRUE) expect_identical(rm_data(root, path = "."), character(0)) expect_identical(list.files(root, recursive = TRUE), current) -file.remove(file.path(root, "test.tsv")) + +write_vc(test_data, file = "test1", root = root, sorting = "test_Date") +junk <- write_vc(test_data, file = "test2", root = root, sorting = "test_Date") +write_vc(test_data, file = "a/test2", root = root, sorting = "test_Date") +meta_data <- yaml::read_yaml(file.path(root, junk[2])) +meta_data[["..generic"]] <- NULL +yaml::write_yaml(meta_data, file = file.path(root, junk[2])) +yaml::write_yaml(meta_data, file = file.path(root, "a", junk[2])) +expect_warning( + list_data(root = root, path = ".", recursive = FALSE), + "Invalid metadata files found.*:\ntest2" +) +expect_warning( + list_data(root = root, path = ".", recursive = TRUE), + "Invalid metadata files found.*:\na/test2\ntest2" +) +current <- list.files(root, recursive = TRUE) +expect_warning( + rm_data(root = root, path = "."), + "Invalid metadata files found.*:\na/test2\ntest2" +) +expect_identical(current[current != "test1.tsv"], + 
list.files(root, recursive = TRUE)) +file.remove(file.path(root, "test2.tsv")) +current <- list.files(root, recursive = TRUE) +expect_warning( + prune_meta(root = root, path = "."), + "Invalid metadata files found.*:\ntest2" +) +expect_identical(current[current != "test1.yml"], + list.files(root, recursive = TRUE)) + +file.remove( + list.files(root, recursive = TRUE, full.names = TRUE) +) +file.remove( + list.files(root, recursive = TRUE, include.dirs = TRUE, full.names = TRUE) +) diff --git a/tests/testthat/test_c_git.R b/tests/testthat/test_c_git.R index 3ba0075..efa45d8 100644 --- a/tests/testthat/test_c_git.R +++ b/tests/testthat/test_c_git.R @@ -1,5 +1,5 @@ context("write_vc() and read_vc() on a git-repository") -root <- tempfile(pattern = "git2rdata-") +root <- tempfile(pattern = "git2rdata-git") dir.create(root) root <- git2r::init(root) git2r::config(root, user.name = "Alice", user.email = "alice@example.org") @@ -118,8 +118,8 @@ staged <- write_vc( expect_equal( status(root, ignored = TRUE), list( - staged = list(), unstaged = "staged.tsv", untracked = unname(untracked), - ignored = unname(ignored) + staged = list(), unstaged = c("staged.tsv", "staged.yml"), + untracked = unname(untracked), ignored = unname(ignored) ), check.attributes = FALSE ) @@ -144,8 +144,8 @@ staged <- write_vc( expect_equal( status(root, ignored = TRUE), list( - staged = "staged.tsv", unstaged = list(), untracked = unname(untracked), - ignored = unname(ignored) + staged = c("staged.tsv", "staged.yml"), unstaged = list(), + untracked = unname(untracked), ignored = unname(ignored) ), check.attributes = FALSE ) @@ -181,7 +181,7 @@ expect_identical( ) expect_error( prune_meta(root = root, path = ".", stage = TRUE), - "cannot remove and stage metadata when data is removed but unstaged" +"cannot remove and stage metadata in combination with removed but unstaged data" ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], diff --git a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R index 3de15e2..251fe57 100644 --- a/tests/testthat/test_d_recent_commit.R +++ b/tests/testthat/test_d_recent_commit.R @@ -4,8 +4,12 @@ context("recent_commit") # therefore Sys.sleep(subsecond) is added before each commit subsecond <- 1.2 +expect_error(recent_commit(file = "junk", root = NULL), + "a 'root' of class NULL is not supported") + root <- tempfile(pattern = "git2rdata-recent") dir.create(root) + root <- git2r::init(root) git2r::config(root, user.name = "Alice", user.email = "alice@example.org") @@ -60,23 +64,18 @@ expect_identical( ) ) -write_vc( - test_data[11:12, ], file = "subsecond", root = root, stage = TRUE, - sorting = "test_Date" -) +target <- file.path(git2r::workdir(root), "subsecond.txt") +write.table(test_data[11, ], file = target) +git2r::add(root, target) commit_6 <- commit(root, "first subsecond") -write_vc( - test_data[13:14, ], file = "subsecond", root = root, stage = TRUE, - sorting = "test_Date" -) +write.table(test_data[12, ], file = target) +git2r::add(root, target) commit_7 <- commit(root, "second subsecond") -write_vc( - test_data[15:16, ], file = "subsecond", root = root, stage = TRUE, - sorting = "test_Date" -) +write.table(test_data[13, ], file = target) +git2r::add(root, target) commit_8 <- commit(root, "third subsecond") expect_warning( - output <- recent_commit(file = "subsecond", root, data = TRUE), - "Multiple commits within the same second" + output <- recent_commit(file = "subsecond.txt", root), + "More than one commit within 
the same second" ) expect_true(all(output$commit %in% c(commit_6$sha, commit_7$sha, commit_8$sha))) diff --git a/tests/testthat/test_e_empty_label.R b/tests/testthat/test_e_empty_label.R new file mode 100644 index 0000000..bc4416d --- /dev/null +++ b/tests/testthat/test_e_empty_label.R @@ -0,0 +1,101 @@ +context("empty label") +root <- tempfile("git2rdata-empty-label") +dir.create(root) + +test_that("write_vc handles empty labels", { + # "" is first level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("", "a", "b")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) + expect_is( + original <- write_vc(mydfr, file = file, root = root), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) + + # "" is middle level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("a", "", "b")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) + expect_is( + original <- write_vc(mydfr, file = file, root = root), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) + + # "" is last level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("a", "b", "")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) + expect_is( + original <- write_vc(mydfr, file = file, root = root), + "character" + ) + expect_equivalent(mydf, mydfr <- read_vc(file = file, root = root)) +}) + +test_that("relabel handles empty labels", { + change <- data.frame(factor = "var", old = "", new = "something") + + # "" is first level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("", "a", "b")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + relabel(file = file, root = root, change = change) + expect_is(mydfr <- read_vc(file = file, root = root), "data.frame") + + # "" is middle level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("a", "", "b")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + relabel(file = file, root = root, change = change) + expect_is(mydfr <- read_vc(file = file, root = root), "data.frame") + + # "" is last level + mydf <- data.frame( + id = 1:6, + var = factor(c("", "", "a", "b", NA, NA), levels = c("a", "b", "")) + ) + file <- basename(tempfile(tmpdir = root)) + expect_is( + original <- write_vc(mydf, file = file, root = root, sorting = "id"), + "character" + ) + relabel(file = file, root = root, change = change) + expect_is(mydfr <- read_vc(file = file, root = root), "data.frame") +}) + +file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_e_non_ascii.R b/tests/testthat/test_e_non_ascii.R new file mode 100644 index 0000000..de838fe --- /dev/null +++ b/tests/testthat/test_e_non_ascii.R @@ -0,0 +1,40 @@ 
+context("check writing non ASCII characters") +root <- tempfile("git2rdata-empty-label") +dir.create(root) +characters <- data.frame(a = c("€$£ @&#§µ^ ()[]{}|²³<>/\\*+- ,;:.?!~", + "äàáâã ëèéê ïìíî öòóô üùúû ÿ ç ñ", + "ÄÀÁ ËÈÉÊ ÏÌÍÎ ÖÒÓÔ ÜÙÚÛ Ñ"), + stringsAsFactors = FALSE) +characters <- characters[order(characters$a), , drop = FALSE] # nolint + +test_that("special character are written properly as character", { + file <- basename(tempfile(tmpdir = root)) + expect_is( + junk <- write_vc(characters, file = file, root = root, sorting = "a"), + "character" + ) + expect_equivalent(read_vc(file = file, root = root), characters) +}) + +test_that("special character are written properly as optimized factor", { + characters$a <- factor(characters$a) + file <- basename(tempfile(tmpdir = root)) + expect_is( + junk <- write_vc(characters, file = file, root = root, sorting = "a"), + "character" + ) + expect_equivalent(read_vc(file = file, root = root), characters) +}) + +test_that("special character are written properly as verbose factor", { + characters$a <- factor(characters$a) + file <- basename(tempfile(tmpdir = root)) + expect_is( + junk <- write_vc(characters, file = file, root = root, sorting = "a", + optimize = FALSE), + "character" + ) + expect_equivalent(read_vc(file = file, root = root), characters) +}) + +file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_e_upgrade.R b/tests/testthat/test_e_upgrade.R new file mode 100644 index 0000000..1d9817f --- /dev/null +++ b/tests/testthat/test_e_upgrade.R @@ -0,0 +1,147 @@ +context("upgrade to new version") +root <- tempfile("git2rdata-upgrade") +dir.create(root) +test_that("read_vc() checks version", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.3" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + read_vc(file = file, root = root), + "Data stored using an older version of `git2rdata`." + ) + + junk_yaml[["..generic"]][["git2rdata"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + read_vc(file = file, root = root), + "Data stored using an older version of `git2rdata`." + ) +}) + +test_that("relabel() checks version", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + new_labels <- list(test_factor = list(a = "xyz")) + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.3" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + relabel(file = file, root = root, change = new_labels), + "Data stored using an older version of `git2rdata`." + ) + + junk_yaml[["..generic"]][["git2rdata"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + relabel(file = file, root = root, change = new_labels), + "Data stored using an older version of `git2rdata`." 
+ ) +}) + +test_that("upgrade_data() validates metadata", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + expect_error( + upgrade_data(file = file, root = pi), + "a 'root' of class numeric is not supported" + ) + + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- NULL + junk_yaml[["test_Date"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + upgrade_data(file = file, root = root), + "corrupt metadata: mismatching hash." + ) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- NULL + junk_yaml[["..generic"]][["hash"]] <- "zzz" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + upgrade_data(file = file, root = root), + "corrupt metadata: mismatching hash." + ) + junk_yaml[["..generic"]][["hash"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + upgrade_data(file = file, root = root), + "corrupt metadata, no hash found." + ) + junk_yaml[["..generic"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_message( + junk <- upgrade_data(file = file, root = root), + "is not a git2rdata object" + ) + expect_equivalent(file, junk) +}) + +file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) + +test_that("upgrade_data() works from 0.0.3 to 0.0.4", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + old_yaml <- correct_yaml + old_yaml[["..generic"]][["git2rdata"]] <- NULL + old_yaml[["..generic"]][["data_hash"]] <- NULL + yaml::write_yaml(old_yaml, file.path(root, junk[2])) + expect_message( + files <- upgrade_data(file = file, root = root, verbose = TRUE), + paste0(file, ".yml updated") + ) + expect_message( + files <- upgrade_data(file = file, root = root, verbose = TRUE), + paste(file, "already up to date") + ) + expect_equivalent(read_vc(file = file, root = root), sorted_test_data) + + root <- git2r::init(root) + git2r::config(root, user.name = "Alice", user.email = "alice@example.org") + yaml::write_yaml(old_yaml, file.path(git2r::workdir(root), junk[2])) + git2r::add(root, paste0(file, c(".tsv", ".yml"))) + initial_commit <- commit(root, "initial commit", all = TRUE) + expect_message( + files <- upgrade_data(file = file, root = root, verbose = TRUE), + paste0(file, ".yml updated") + ) + expect_equal( + status(root), + list(staged = list(), unstaged = list(paste0(files, ".yml")), + untracked = list()), + check.attributes = FALSE + ) + expect_message( + files <- upgrade_data(file = file, root = root, verbose = TRUE, + stage = TRUE), + paste(file, "already up to date") + ) + expect_equal( + status(root), + list( + staged = list(paste0(files, ".yml")), unstaged = list(), + untracked = list() + ), + check.attributes = FALSE + ) + + file <- basename(tempfile(tmpdir = git2r::workdir(root))) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + expect_error( + upgrade_data(file = file, path = ".", root = root, verbose = TRUE), + "specify either 'file' or 'path'" + ) + expect_is( + upgrade_data(path = ".", root = root, verbose = TRUE), + "character" + ) +}) + +file.remove(list.files(git2r::workdir(root), recursive = TRUE, + full.names = TRUE)) diff --git a/tests/testthat/test_e_validate_metadata.R b/tests/testthat/test_e_validate_metadata.R new 
file mode 100644 index 0000000..c7017d7 --- /dev/null +++ b/tests/testthat/test_e_validate_metadata.R @@ -0,0 +1,65 @@ +context("validate metadata when reading") +root <- tempfile("git2rdata-check-meta") +dir.create(root) +test_that("read_vc() checks hash", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["test_Date"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + read_vc(file = file, root = root), + "Corrupt metadata, mismatching hash." + ) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["hash"]] <- "zzz" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + read_vc(file = file, root = root), + "Corrupt metadata, mismatching hash." + ) + junk_yaml[["..generic"]][["hash"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + read_vc(file = file, root = root), + "Corrupt metadata, no hash found." + ) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["data_hash"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error(read_vc(file = file, root = root), + "Corrupt metadata, no data hash found.") +}) + +test_that("read_vc() handles changes in rawdata", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_data <- readLines(file.path(root, junk[1])) + correct_header <- strsplit(correct_data[1], "\t")[[1]] + junk_data <- correct_data + junk_data[1] <- paste(correct_header[-1], collapse = "\t") + writeLines(junk_data, file.path(root, junk[1])) + expect_error(read_vc(file = file, root = root), + "Corrupt data, incorrect header.") + writeLines(correct_data[1:2], file.path(root, junk[1])) + expect_warning( + read_vc(file = file, root = root), + "Mismatching data hash. Data altered outside of git2rdata." + ) +}) + +test_that("write_vc() checks existing metadata", { + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["test_Date"]] <- NULL + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_error( + write_vc(test_data, file = file, root = root, sorting = "test_Date"), + "Existing metadata file is invalid" + ) +}) + +file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/vignettes/efficiency.Rmd b/vignettes/efficiency.Rmd index dfb8c04..4878821 100644 --- a/vignettes/efficiency.Rmd +++ b/vignettes/efficiency.Rmd @@ -1,11 +1,11 @@ --- -title: "Efficiency in terms of storage and time" +title: "Efficiency in Terms of Storage and Time" author: "Thierry Onkelinx" output: rmarkdown::html_vignette: fig_caption: yes vignette: > - %\VignetteIndexEntry{Efficiency in terms of storage and time} + %\VignetteIndexEntry{Efficiency in Terms of Storage and Time} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} %\VignetteDepends{git2r} @@ -147,9 +147,9 @@ if (system.file("efficiency", "airbag.rds", package = "git2rdata") == "") { str(airbag) ``` -## Data storage +## Data Storage -### On a file system +### On a File System We start by writing the dataset as is with `write.table()`, `saveRDS()`, `write_vc()` and `write_vc()` without storage optimization. Note that `write_vc()` uses optimization by default. 
Since `write_vc()` creates two files for each data set, we take their combined file size into account. @@ -188,7 +188,7 @@ kable( ) ``` -The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. The figure below illustrates the huge gain as soon as the level labels contain a few characters. The gain is less pronounced when the factor has a large number of levels. The optimization fails only in extreme cases with very short factor labels and a high number of labels. +The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. The figure below illustrates the huge gain as soon as the level labels contain a few characters. The gain is less pronounced when the factor has a large number of levels. The optimization fails only in extreme cases with very short factor labels and a high number of levels. ```{r factor_label_length, echo = FALSE, fig.cap = "Effect of the label length on the efficiency of storing factor optimized, assuming 1000 observations", warning = FALSE} ratio <- function(label_length = 1:20, n_levels = 9, n_obs = 1000) { @@ -270,7 +270,7 @@ ggplot(f_ratio, aes(x = observations, y = ratio, colour = levels)) + scale_colour_manual("number of \nlevels", values = inbo_colours) ``` -### In git repositories +### In Git Repositories Here we will simulate how much space the data requires when the history is stored in a git repository. We will create a git repository for each method and store several subsets of the same data. Each commit contains a new version of the data. Each version is a random sample containing 90% of the observations of the `airbag` data. Two consecutive versions of the subset will have about 90% of the observations in common. 10% of the observations will be replaced by other observations. @@ -342,7 +342,7 @@ if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") { Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest posibble diff. We also test `write.table()` with a stable ordering of the observations and variables. -The randomised `write.table()` yields the largest git repository, converging to about 6.5 times the size of a git repository based on the sorted `write.table()`. `saveRDS()` yields a 25% reduction in repostory size compared to the randomised `write.table()`, but still is almost 5 times larger than the sorted `write.table()`. Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because the git repository will be compressed too. The optimized `write_vc()` starts at 83% and converges toward 72%, the verbose version starts at 90% and converges towards 105%. There is a clear gain when using `write_vc()` with optimization in terms of storage size and the availability of metadata. The verbose option of `write_vc()` lacks the gain in terms of storage size but still has the metadata advantage. +The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. 
`saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repostory size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because the git repository will be compressed too. The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. There is a clear gain when using `write_vc()` with optimization in terms of storage size and the availability of metadata. The verbose option of `write_vc()` lacks the gain in terms of storage size but still has the metadata advantage. ```{r plot_git_size, echo = FALSE, fig.cap = "Size of the git history using the different storage methods."} rs <- lapply( @@ -388,7 +388,7 @@ ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) + The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hunderd times and yields a distribution of timings for each expression. -### Writing data +### Writing Data ```{r get_file_timings, eval = system.file("efficiency", "file_timings.rds", package = "git2rdata") == ""} library(microbenchmark) @@ -412,7 +412,15 @@ if (system.file("efficiency", "file_timings.rds", package = "git2rdata") == "") } ``` -`write_vc()` takes 67% to 86% more time than `write.table()` because it needs to prepare the metadata and sort the observations and variables. When overwriting existing data, the new data is checked against the existing metadata. `saveRDS()` requires only 44% of the time that `write.table()` needs. +```{r median_write, echo = FALSE} +median_time <- aggregate(time ~ expr, data = mb, FUN = median) +write_ratio <- 100 * median_time$time / + median_time$time[median_time$expr == "write.table"] +names(write_ratio) <- median_time$expr +``` + + +`write_vc()` takes `r paste(sprintf("%.0f%%", -100 + write_ratio[grep("write_vc", names(write_ratio))]), collapse = " to ")` more time than `write.table()` because it needs to prepare the metadata and sort the observations and variables. When overwriting existing data, the new data is checked against the existing metadata. `saveRDS()` requires only `r sprintf("%.0f%%", write_ratio["saveRDS"])` of the time that `write.table()` needs. ```{r plot_file_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for the different methods."} mb$expr <- reorder(mb$expr, mb$time, FUN = median) @@ -423,7 +431,7 @@ ggplot(mb, aes(x = expr, y = time)) + theme(axis.title.x = element_blank()) ``` -### Reading data +### Reading Data ```{r get_read_timings, eval = system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""} mb <- microbenchmark( @@ -446,7 +454,14 @@ if (system.file("efficiency", "read_timings.rds", package = "git2rdata") == "") } ``` -The timings on reading the data is another story. 
Reading the binary format takes about 8% of the time needed to read the standard plain text format using `read.table()`. `read_vc()` takes about 65% (optimized) and 79% (verbose) of the time needed by `read.table()`, which at first seems strange because `read_vc()` calls `read.table()` to read the files and has some extra work to convert the variables to the correct data type. The main difference is that `read_vc()` knows the required data type _a priori_ and passes this info to `read.table()`. Otherwise, `read.table()` has to guess the correct data type from the file. +```{r median_read, echo = FALSE} +median_time <- aggregate(time ~ expr, data = mb, FUN = median) +read_ratio <- 100 * median_time$time / + median_time$time[median_time$expr == "read.table"] +names(read_ratio) <- median_time$expr +``` + +The timings on reading the data is another story. Reading the binary format takes about `r sprintf("%.0f%%", read_ratio["readRDS"])` of the time needed to read the standard plain text format using `read.table()`. `read_vc()` takes about `r sprintf("%.0f%%", read_ratio["read_vc.optim"])` (optimized) and `r sprintf("%.0f%%", read_ratio["read_vc.verbose"])` (verbose) of the time needed by `read.table()`, which at first seems strange because `read_vc()` calls `read.table()` to read the files and has some extra work to convert the variables to the correct data type. The main difference is that `read_vc()` knows the required data type _a priori_ and passes this info to `read.table()`. Otherwise, `read.table()` has to guess the correct data type from the file. ```{r plot_read_timings, echo = FALSE, fig.cap = "Boxplots for the read timings for the different methods."} mb$expr <- factor( diff --git a/vignettes/plain_text.Rmd b/vignettes/plain_text.Rmd index 53c7577..880b727 100644 --- a/vignettes/plain_text.Rmd +++ b/vignettes/plain_text.Rmd @@ -1,9 +1,9 @@ --- -title: "Getting started" +title: "Getting Started Storing Dataframes as Plain Text" author: "Thierry Onkelinx" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Getting started} + %\VignetteIndexEntry{Getting Started Storing Dataframes as Plain Text} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -24,10 +24,10 @@ This vignette motivates why we wrote `git2rdata` and illustrates how you can use ### Maintaining variable classes -R has several options to store dataframes as plain text files from R. Base R has `write.table()` and its companions like `write.csv()` .Some other options are `readr::write_delim()`, `readr::write_csv()` and `readr::write_tsv()`. Each of them writes a dataframe as a plain text file by converting all variables into characters. After reading the file, the conversion is reversed. However, the distinction between `character` and `factor` is lost in translation. `read.table()` converts by default all strings to factors, `readr::read_csv()` keeps by default all strings as character. The factor levels are another thing which is lost. These functions determine factor levels based on the observed levels in the plain text file. Hence factor levels without observations will disappear. The order of the factor levels is also determined by the available levels in the plain text file, which can be different from the original order. +R has several options to store dataframes as plain text files from R. Base R has `write.table()` and its companions like `write.csv()`. Some other options are `data.table::fwrite()`, `readr::write_delim()`, `readr::write_csv()` and `readr::write_tsv()`. 
Each of them writes a dataframe as a plain text file by converting all variables into characters. After reading the file, the conversion is reversed. However, the distinction between `character` and `factor` is lost in translation. `read.table()` converts by default all strings to factors, `readr::read_csv()` keeps by default all strings as character. The factor levels are another thing which is lost. These functions determine factor levels based on the observed levels in the plain text file. Hence factor levels without observations will disappear. The order of the factor levels is also determined by the available levels in the plain text file, which can be different from the original order. The `write_vc()` and `read_vc()` functions from `git2rdata` keep track of the class of each variable and, in case of a factor, also of the factor levels and their order. Hence this function pair preserves the information content of the dataframe. The `vc` suffix stands for **v**ersion **c**ontrol as these functions use their full capacity in combination with a version control system. - +Efficiency in terms of storage and time ### Optimizing file storage Plain text files require more disk space than binary files. This is the price we have to pay for a readable file format. The default option of `write_vc()` is to minimize file size as much as possible prior to writing. Since we use a tab delimited file format, we can omit quotes around character variables. This saves 2 bytes per row for each character variable. Quotes are added automatically in the exceptional cases when they are needed, e.g. to store a string that contains tab or newline characters. In such cases, quotes are only used in row-variable combinations where the exception occurs. @@ -36,7 +36,7 @@ Since we store the class of each variable, further file size reductions can be a - `logical` is written as 0 (FALSE), 1 (TRUE) or NA to the data - `factor` is stored as its indices in the data. The index and labels of levels and their order are stored in the metadata. -- `POSIXct` is written as an integer to the data. The class and the origin are stored in the metadata. Timestamps are always stored and returned as UTC. +- `POSIXct` is written as a numeric to the data. The class and the origin are stored in the metadata. Timestamps are always stored and returned as UTC. - `Date` is written as an integer to the data. The class and the origin are stored in the metadata. Storing the factors, POSIXct and Date as their index, makes them less user readable. The user can turn off this optimization when user readability is more important than file size. @@ -45,11 +45,11 @@ Storing the factors, POSIXct and Date as their index, makes them less user reada Another main goal of `git2rdata` is to optimise the storage of the plain text files under version control. `write_vc()` and `read_vc()` has methods for interacting with [git](https://git-scm.com/) repositories using the `git2r` framework. Users who want to use git without `git2r` or use a different version control system (e.g. [Subversion](https://subversion.apache.org/), [Mercurial](https://www.mercurial-scm.org/)), still can use `git2rdata` to write the files to disk and uses their preferred workflow on version control. -Hence, `write_vc()` will always perform checks to look for changes which potentially lead to large diffs. More details on this in the [version control](https://inbo.github.io/git2rdata/articles/version_control.html) vignette. Some problems will always yield a warning. 
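As a minimal sketch of the optimization described in the bullets above (not part of the changed sources; it only assumes an installed `git2rdata`), the same data frame can be written once optimized and once verbose, and the raw `.tsv` files show the difference:

```r
library(git2rdata)
root <- tempfile("git2rdata-sketch")
dir.create(root)
x <- data.frame(
  when = as.Date("2019-01-01") + 0:2,
  what = factor(c("a", "b", "a"))
)
# optimized (default): the factor is stored as integer indices and the
# Date as an integer; the labels, levels and origin move to the metadata
junk <- write_vc(x, "optimized", root, sorting = "when")
readLines(file.path(root, "optimized.tsv"))
# verbose: factor labels and ISO formatted dates stay human readable
junk <- write_vc(x, "verbose", root, sorting = "when", optimize = FALSE)
readLines(file.path(root, "verbose.tsv"))
```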
Other problems will yield an error by default. The user can turn these errors into warnings by setting the `strict = FALSE` argument. +Hence, `write_vc()` will always perform checks to look for changes which potentially lead to large diffs. More details on this in `vignette("version_control", package = "git2rdata")`. Some problems will always yield a warning. Other problems will yield an error by default. The user can turn these errors into warnings by setting the `strict = FALSE` argument. As this vignette ignores the part on version control, we will always use `write_vc(strict = FALSE)` and hide the warnings to improve the readability. -## Basic usage +## Basic Usage Let's start by setting up the environment. We need a directory to store the data and a dataframe to store. @@ -78,7 +78,7 @@ x <- data.frame( str(x) ``` -## Storing optimized +## Storing Optimized Use `write_vc()` to store the dataframe. The `root` argument refers to the base directory where the data is stored. The `file` argument is used as the base name of the files. The data file gets a `.tsv` extension, the metadata file a `.yml` extension. `file` can include a relative path starting from `root`. @@ -100,7 +100,7 @@ print_file("first_test.yml", path) ``` -## Storing verbose +## Storing Verbose Adding `optimize = FALSE` to `write_vc()` will keep the raw data in a human readable format. The metadata file is slightly different. The most obvious is the `optimize: no` tag and the different hash. Another difference is the metadata for POSIXct and Date classes. They will no longer have an origin tag but a format tag. @@ -113,11 +113,11 @@ print_file("verbose.tsv", path, 10) print_file("verbose.yml", path) ``` -## Efficiency in terms of file storage +## Efficiency in Terms of File Storage Storing dataframes optimized or verbose has an impact on the required file size. A comparison can be found in the [efficiency](efficiency.html#on-a-file-system) vignette. -## Reading data +## Reading Data The data can be retrieved with `read_vc()`. This function will reinstate the variables to their original state. @@ -130,7 +130,7 @@ all.equal(x, y2, check.attributes = FALSE) As `read_vc()` requires the meta data, it can only read dataframes which were stored by `write_vc()`. -## Missing values +## Missing Values `write_vc()` has an `na` argument which specifies the string which is used to indicate missing values. Because we avoid using quotes, this string must be different from any character value in the data. This includes factor labels when the data is stored verbose. This is checked and will always return an error, even with `strict = FALSE`. diff --git a/vignettes/version_control.Rmd b/vignettes/version_control.Rmd index 3c08d9e..7128556 100644 --- a/vignettes/version_control.Rmd +++ b/vignettes/version_control.Rmd @@ -1,9 +1,9 @@ --- -title: "Optimizing storage for version control" +title: "Optimizing Storage for Version Control" author: "Thierry Onkelinx" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Optimizing storage for version control} + %\VignetteIndexEntry{Optimizing Storage for Version Control} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -19,9 +19,9 @@ options(width = 83) ## Introduction -This vignette focuses on what `git2rdata` does to make storing dataframes under version control more efficient and convenient. All details on the actual file format are described in the [plain text storage](plain_text.html) vignette. Hence we not discuss the `optimize` and `na` arguments. 
+This vignette focuses on what `git2rdata` does to make storing dataframes under version control more efficient and convenient. All details on the actual file format are described in `vignette("plain_text", package = "git2rdata")`. Hence we will not discuss the `optimize` and `na` arguments to the `write_vc()` function. -We will neither illustrate the efficiency of `write_vc()` and `read_vc()` since that is covered in the [efficiency](efficiency.html) vignette. +We will not illustrate the efficiency of `write_vc()` and `read_vc()` since that is covered in `vignette("efficiency", package = "git2rdata")`. ## Setup @@ -56,9 +56,9 @@ A critical assumption made by `git2rdata` is that all information is contained w Version control systems like [git](https://git-scm.com/), [subversion](https://subversion.apache.org/) or [mercurial](https://www.mercurial-scm.org/) focus on accurately keeping track of _any_ change in the files. Two observations switching place in a plain text file _is_ a change, although the information content^[_sensu_ `git2rdata`] doesn't change. Therefore `git2rdata` helps the user to prepare the plain text files in such a way that any change in the version history is an actual change in the information content. -## Sorting observations +## Sorting Observations -Version control systems often track changes in plain text files based on row based differences. In layman's terms it only records which lines in a file are removed and which lines are inserted at what location. Changing an existing line implies removing the old version and inserting the new one. This is illustrated is the minimal example below. +Version control systems often track changes in plain text files based on row based differences. In layman's terms they only record which lines in a file are removed and which lines are inserted at what location. Changing an existing line implies removing the old version and inserting the new one. This is illustrated in the minimal example below. Original version @@ -69,7 +69,7 @@ A,B 3,12 ``` -Altered version. The row containing `1, 10` was moved to the last line. The row containing `3,12` was changed to `3,12` +Altered version. The row containing `1, 10` was moved to the last line. The row containing `3,12` was changed to `3,0` ``` A,B @@ -145,7 +145,7 @@ fn <- write_vc(x[sample(nrow(x)), ], "row_order", root, sorting = c("y", "x")) fn <- write_vc(x[sample(nrow(x), replace = TRUE), ], "row_order", root) ``` -## Sorting variables +## Sorting Variables The order of the variables (columns) has an even bigger impact on a row based diff. Let's revisit our minimal example. Suppose that we swap `A` and `B` from our [original example](#sorting-observations). The new data looks as below. @@ -156,7 +156,7 @@ B,A 13,3 ``` -The resulting diff is maximal because every single row was updated. Yet none of the information was changed. Hence, it is crucial to maintain column order when storing dataframes as plain text files under version control. This is illustrated on a more realistic data set in the [efficiency](efficiency.html#in-git-repositories) vignette. +The resulting diff is maximal because every single row was updated. Yet none of the information was changed. Hence, it is crucial to maintain column order when storing dataframes as plain text files under version control. This is illustrated on a more realistic data set in the `vignette("efficiency", package = "git2rdata")` vignette. 
 ```diff
 -A,B
@@ -178,9 +178,9 @@ write_vc(x[sample(nrow(x)), sample(ncol(x))], "column_order", root)
 print_file("column_order.tsv", root, n = 5)
 ```
 
-## Handling factors optimized
+## Handling Factors Optimized
 
-The [plain text](plain_text.html#optimizing-file-storage) and [efficieny](efficiency.html#on-a-file-system) vignette illustrate how a factor can be stored more efficiently when storing their index in the data file and the indices and labels in the metadata. We take this even a bit further: what happens if new data arrives and an extra factor level is required?
+`vignette("plain_text", package = "git2rdata")` and `vignette("efficiency", package = "git2rdata")` illustrate how a factor can be stored more efficiently by storing its index in the data file and the indices and labels in the metadata. We take this even a bit further: what happens if new data arrives and an extra factor level is required?
 
 ```{r factor}
 old <- data.frame(color = c("red", "blue"))
@@ -222,7 +222,7 @@ write_vc(ordered, "factor", root, sorting = "color", strict = FALSE)
 print_file("factor.yml", root)
 ```
 
-## Relabelling a factor
+## Relabelling a Factor
 
 The example below will store a dataframe, relabel the factor levels and store it again using `write_vc()`. Notice that both the labels and the indices are updated. Hence creating a large diff, where just updating the labels would be sufficient.
diff --git a/vignettes/workflow.Rmd b/vignettes/workflow.Rmd
index ea67b0a..6c767f2 100644
--- a/vignettes/workflow.Rmd
+++ b/vignettes/workflow.Rmd
@@ -1,9 +1,9 @@
 ---
-title: "Suggested workflow for storing a variable set of dataframes under version control"
+title: "Suggested Workflow for Storing a Variable Set of Dataframes under Version Control"
 author: "Thierry Onkelinx"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Suggested workflow for storing a variable set of dataframes under version control}
+  %\VignetteIndexEntry{Suggested Workflow for Storing a Variable Set of Dataframes under Version Control}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
   %\VignetteDepends{git2r}
@@ -20,179 +20,200 @@ set.seed(20120225)
 
 ## Introduction
 
-This vignette describes a suggested workflow for storing dataframes. This time we use a `git2r::repository()` object as the root. This adds git functionality to `write_vc()` and `read_vc()`, provided by the [`git2r`](https://cran.r-project.org/package=git2r) package. This allows to pull, stage, commit and push from within R.
+This vignette describes a suggested workflow for storing a snapshot of dataframes as git2rdata objects under version control. The workflow comes in two flavours:
 
-The rationale behind this workflow is that we have read-only access to a database containing the raw data. The database is beyond our control. Observations in the database can be added, removed or updated without our knowledge. These changes cannot be traced in the database.
+ 1. A single repository holding both the data and the analysis code. The single repository set-up is simple. A single reference (e.g. commit) points to both the data and the code.
+ 1. One repository holding the data and a second repository holding the code. The data and the code have an independent history under a two repository set-up. Documenting the analysis requires one reference to each repository. Such a set-up is useful for repeating the same analysis (stable code) on updated data.
 
-The database defines a variable number of dataframe (e.g. species can be added or removed). We have defined a standard analysis which should run for each group. We want to repeat the analyses with some predefined frequency (e.g. once every year). In order to make the analyses reproducible, we want to store the relevant data in a git repository.
+In this vignette we use a `git2r::repository()` object as the root. This adds git functionality to `write_vc()` and `read_vc()`, provided by the [`git2r`](https://cran.r-project.org/package=git2r) package. This allows us to pull, stage, commit and push from within R.
+
+Each commit in the data git repository describes a complete snapshot of the data at the time of the commit. The difference between two commits can consist of changes in existing git2rdata objects (updated observations, new observations, deleted observations or updated metadata). Besides updating the existing git2rdata objects, we can also add new git2rdata objects or remove existing ones. Such higher level additions and deletions need to be tracked as well.
+
+We illustrate the workflow with a mock analysis on the `datasets::beaver1` and `datasets::beaver2` datasets.
 
 ## Setup
 
-We start by initializing a git repository. `git2rdata` assumes that is already done. Therefore we'll use the `git2r` functions to to so.
+We start by initializing a git repository. `git2rdata` assumes that this is already done. Therefore we'll use the `git2r` functions to do so. We start by creating a local bare repository. In practice we will use a remote on an external server (GitHub, Gitlab, Bitbucket, ...). The example below creates a local git repository with an upstream git repository. Any other workflow to create a similar structure is fine.
 
 ```{r initialize}
 # initialize a bare git repo to be used as remote
 remote <- tempfile("git2rdata-workflow-remote")
+remote <- normalizePath(remote, winslash = "/")
 dir.create(remote)
 git2r::init(remote, bare = TRUE)
 # initialize a local git repo
 path <- tempfile("git2rdata-workflow")
+path <- normalizePath(path, winslash = "/")
 dir.create(path)
 init_repo <- git2r::clone(remote, path, progress = FALSE)
 git2r::config(init_repo, user.name = "me", user.email = "me@me.com")
 # add an initial commit with .gitignore file
-writeLines("C.*", file.path(path, ".gitignore"))
-git2r::add(init_repo, ".gitignore")
+writeLines("*extra*", file.path(path, ".gitignore"))
+git2r::add(init_repo, ".gitignore", force = TRUE)
 git2r::commit(init_repo, message = "Initial commit")
 # push initial commit to remote
 git2r::push(init_repo, "origin", "refs/heads/master")
+rm(init_repo)
 ```
 
-Next we need a function to mimic the data selection from the database.
-
-```{r dummy_data}
-generate_data <- function(x, n = rpois(1, 10)) {
-  if (missing(x)) {
-    coef <- runif(2)
-    sigma <- rchisq(1, 10) / 10
-    covariate <- runif(n)
-    old_covariate <- numeric(0)
-    old_response <- numeric(0)
-  } else {
-    coef <- attr(x, "coef")
-    sigma <- attr(x, "sigma")
-    x <- x[rbinom(nrow(x), size = 1, prob = 0.9) == 1, ]
-    update <- rbinom(nrow(x), size = 1, prob = 0.1) == 1
-    covariate <- c(x$covariate[update], runif(n))
-    old_covariate <- x$covariate[!update]
-    old_response <- x$response[!update]
-  }
-  response <- coef[1] + covariate * coef[2] +
-    rnorm(length(covariate), sd = sigma)
-  x <- data.frame(
-    covariate = c(old_covariate, covariate),
-    response = c(old_response, response)
-  )
-  attr(x, "coef") <- coef
-  attr(x, "sigma") <- sigma
-  return(x)
-}
-```
+## Structuring Git2rdata Objects Within a Project
 
-## Storing dataframes into a git repository
 
+`git2rdata` imposes very little structure. Both the `.tsv` and the `.yml` file need to be in the same folder. That's it. For the sake of simplicity, in this vignette we dump all git2rdata objects at the root of the repository.
+
+However, this might not be a good idea for a real project. We recommend using at least a different directory tree for each import script. This directory can go into the root of a data only repository. It goes in the `data` directory in case of a data and code repository. Or the `inst` directory in case of an R package.
+
+Your project might need a different directory structure. Feel free to implement the most relevant data structure for your project.
 
-### First commit
+## Storing Dataframes _ad Hoc_ into a Git Repository
+
+### First Commit
 
-Suppose that we have two groups at the first point in time. We read the data for these groups from the database. We also store them in a list called `content` to be reused in the [next section](#automated-workflow-for-storing-dataframes).
+In the first commit we use `datasets::beaver1`. We connect to the git repository using `repository()`. Note that this assumes that `path` is an existing git repository. Now we can write the dataset as a git2rdata object in the repository. If the `root` argument of `write_vc()` is a `git_repository`, it gains two additional arguments: `stage` and `force`. Setting `stage = TRUE`, will automatically stage the files written by `write_vc()`.
 
-Then we connect to the git repository using `repository()`. Note that this assumes that `path` is an existing git repository. Now we can write each group to a dedicated data file in the repository. If the `root` argument of `write_vc()` is a `git_repository`, it gains two additional arguments: `stage` and `force`. Setting `stage = TRUE`, will automatically stage the files written by `write_vc()`.
 
 ```{r store_data_1}
-A <- generate_data()
-B <- generate_data()
-content <- list(list(A = A, B = B))
-
 library(git2rdata)
 repo <- repository(path)
-fn <- write_vc(A, "A", repo, sorting = "covariate", stage = TRUE)
-fn <- write_vc(B, "B", repo, sorting = "covariate", stage = TRUE)
+fn <- write_vc(beaver1, "beaver", repo, sorting = "time", stage = TRUE)
 ```
 
 We can use `status()` to check that the required files are written and staged. Then we `commit()` the changes.
 
+```{r avoid_subsecond_commit, echo = FALSE}
+Sys.sleep(1.2)
+```
+
+
 ```{r commit_data_1}
 status(repo)
-cm <- commit(repo, message = "First commit")
-cat(cm$message)
+cm1 <- commit(repo, message = "First commit")
+cat(cm1$message)
 ```
 
-### Second commit
+### Second Commit
 
-Let's assume that at a second point in time group A has updated data, group B remains unchanged and a new group C emerges. We write all three data sets to the repo. `status()` indicated that only the data of group A has changed.
+The second commit adds `beaver2`.
 
 ```{r store_data_2}
-A <- generate_data(A)
-C <- generate_data()
-content <- c(content, list(list(A = A, B = B, C = C)))
-
-fn <- write_vc(A, "A", repo, sorting = "covariate", stage = TRUE)
-fn <- write_vc(B, "B", repo, sorting = "covariate", stage = TRUE)
-fn <- write_vc(C, "C", repo, sorting = "covariate", stage = TRUE)
+fn <- write_vc(beaver2, "extra_beaver", repo, sorting = "time", stage = TRUE)
 status(repo)
 ```
 
-Notice that group C is not listed in the `status()`, although it was written to the repository. The reason is that we set a `.gitignore` which contains `"C\.*`, so group C is ignored. We can force it to be staged by setting `force = TRUE`
+Notice that `extra_beaver` is not listed in the `status()`, although it was written to the repository. The reason is that we set a `.gitignore` which contains `"*extra*"`, so any git2rdata object with a name containing "extra" is ignored. We can force it to be staged by setting `force = TRUE`.
+
+```{r avoid_subsecond_commit2, echo = FALSE}
+Sys.sleep(1.2)
+```
 
 ```{r}
-list.files(path)
-fn <- write_vc(C, "C", repo, sorting = "covariate", stage = TRUE, force = TRUE)
+status(repo, ignored = TRUE)
+fn <- write_vc(beaver2, "extra_beaver", repo, sorting = "time", stage = TRUE,
+  force = TRUE)
 status(repo)
-cm <- commit(repo, message = "Second commit")
+cm2 <- commit(repo, message = "Second commit")
 ```
 
-### Third commit
+### Third Commit
 
-During the third point in time, group A is removed, group B unchanged and group C updated. So we remove group A and write the two other groups. We use `all = TRUE` to stage the unstaged removal of group A. Since group C was force into the history, `.gitignore` is overruled for these two files.
+At this point in time we decide that a single git2rdata object containing the data of both beavers is more relevant. We add an ID variable for each of the animals. This requires updating the `sorting` to eliminate ties, and `strict = FALSE` to update the metadata. The "extra_beaver" git2rdata object is no longer needed, so we remove it. We use `all = TRUE` to stage the removal of "extra_beaver" while committing the changes.
 
+```{r avoid_subsecond_commit3, echo = FALSE}
+Sys.sleep(1.2)
+```
+
 ```{r store_data_3}
-C <- generate_data(C)
-content <- c(content, list(list(B = B, C = C)))
-
-file.remove(file.path(path, c("A.tsv", "A.yml")))
-fn <- write_vc(B, "B", repo, sorting = "covariate", stage = TRUE)
-fn <- write_vc(C, "C", repo, sorting = "covariate", stage = TRUE)
+beaver1$beaver <- 1
+beaver2$beaver <- 2
+beaver <- rbind(beaver1, beaver2)
+fn <- write_vc(beaver, "beaver", repo, sorting = c("beaver", "time"),
+  strict = FALSE, stage = TRUE)
+file.remove(list.files(path, "extra", full.names = TRUE))
 status(repo)
-cm <- commit(repo, message = "Third commit", all = TRUE)
+cm3 <- commit(repo, message = "Third commit", all = TRUE)
 status(repo)
 ```
 
-## Automated workflow for storing dataframes
+## Scripted Workflow for Storing Dataframes
 
-To mimic a changing dataset we reuse the list `content` created above. This contains the relevant data at the different points in time. We create a custom function to store the data in an automated way. In pratice we will run this function each time we want to make a snapshot of the data. In this example we emulate that by applying it to each element of `content`.
+We strongly recommend adding git2rdata objects through an import script instead of adding them [_ad hoc_](#storing-dataframes-ad-hoc-into-a-git-repository). Store this script in the (analysis) repository. It documents the creation of the git2rdata objects. Rerun this script whenever updated data becomes available.
 
-We start by pulling the remote repository to make sure that our local repository has the latest version. Then we want to write the dataframe for each group. But how do we detect which groups are no longer present? A straightforward workaround for this problem is to first remove all data files. Then write all currently existing dataframes to the repository. Since we only removed the data files, any preexisting metadata is still available. After writing all existing dataframes we only are left with cleaning dangling metadata files. To make this process more convenient we created `rm_data()` and `prune_meta()`. `prune_meta()` will remove any `.yml` file without matching `.tsv` file. `rm_data()` removes by default all `.tsv` files with associated `.yml` file. When applied on a `git_repository` object, there is an extra fail-safe because then it will only remove unmodified files. _Caveat_: when applied on a path, it will remove _all_ data files, without warning. Even when the path points to a git repository. So use `rm_data()` and `prune_meta()` with care.
+Old versions of the import script and the associated git2rdata objects remain available through the version control history. Remove obsolete git2rdata objects from the import script. This keeps both the import script and the working directory tidy and minimal.
 
-The last steps in the function consist of committing the changes and push them to the remote repository. We had to add a `Sys.sleep(2)` to avoid commits within the same second. This should not be needed in a real-life situation.
+Basically, the import script should create all git2rdata objects within a given directory tree. This has the advantage that we can start the import script by clearing any existing git2rdata object in this directory. Any git2rdata object which is no longer created by the import script gets removed, without the need to track which git2rdata objects existed in the previous version.
 
-Please note that the function below is intended as a template. In practice, step 3 would contain user defined functions to create the relevant dataframes and store them using `write_vc()`.
+The brute force method of removing all files or all `.tsv` / `.yml` pairs is not a good idea. This removes the existing metadata, which we need for efficient storage (see `vignette("efficiency", package = "git2rdata")`). A better solution is to use `rm_data()` on the directory at the start of the import script. This removes all `.tsv` files which have valid metadata. The existing metadata remains untouched at this point.
 
-```{r automated_flow}
-store_data <- function(df, repo) {
-  # step 1: update the local repository
-  pull(repo)
-  # step 2: remove all exisiting data files
-  rm_data(repo, path = ".", type = "all", stage = TRUE)
-  # step 3: create and write all relevant dataframe
-  lapply(
-    names(df),
-    function(i) {
-      write_vc(df[[i]], i, root = repo, sorting = "covariate",
-        stage = TRUE, force = TRUE)
-    }
-  )
-  # step 4: remove dangling metadata
-  prune_meta(repo, path = ".", stage = TRUE)
-  # step 5: commit the changes
-  commit(repo, "Scripted commit from git2rdata", session = TRUE)
-  # step 6: update the remote repository
-  push(repo)
-  # avoid subsecond commits, only needed in this toy example
-  Sys.sleep(2)
-}
-```
+Then write all git2rdata objects and stage them. Unchanged objects will not lead to a diff, even if we first deleted and then recreated them. The script won't recreate the `.tsv` file of obsolete git2rdata objects. Use `prune_meta()` to remove any leftover metadata files.
+
+Commit and push the changes at the end of the script.
 
-Make a new clone on the remote repo and store `content` in it.
+Below is an example script recreating the "beaver" git2rdata object from the [third commit](#third-commit).
-```{r run_automated}
-path2 <- tempfile("git2rdata-workflow")
-dir.create(path2)
-init_repo2 <- git2r::clone(remote, path2, progress = FALSE)
-git2r::config(init_repo2, user.name = "me", user.email = "me@me.com")
-done <- lapply(content, store_data, repo = repository(path2))
+```{r eval = FALSE}
+# load package
+library(git2rdata)
+# step 1: setup the repository and data path
+repo <- repository(".")
+data_path <- "data/beaver"
+# step 1b: sync the repository with the remote
+pull(repo = repo)
+# step 2: remove all existing data files
+rm_data(root = repo, path = data_path, stage = TRUE)
+
+# step 3: write all relevant git2rdata objects to the data path
+beaver1$beaver <- 1
+beaver2$beaver <- 2
+body_temp <- rbind(beaver1, beaver2)
+fn <- write_vc(x = body_temp, file = file.path(data_path, "body_temperature"),
+  root = repo, sorting = c("beaver", "time"), stage = TRUE)
+
+# step 4: remove any dangling metadata files
+prune_meta(root = repo, path = data_path, stage = TRUE)
+
+# step 5: commit the changes
+cm <- commit(repo = repo, message = "import")
+# step 5b: sync the repository with the remote
+push(repo = repo)
 ```
 
-## Analysis workflow with reproducible data
+## R Package Workflow for Storing Dataframes
+
+We recommend a two repository set-up in case of recurring analyses. These are relatively stable analyses which have to run with some frequency on updated data (e.g. once a month). Then it is worthwhile to convert the analyses into an R package. Long scripts can be converted into a set of shorter functions which are much easier to document and maintain. An R package offers lots of [functionality](http://r-pkgs.had.co.nz/check.html) out of the box to check the quality of your code.
+
+The example below converts the import script above into a function. We illustrate how you can use Roxygen2 (see `vignette("roxygen2", package = "roxygen2")`) tags to document the function and to list its dependencies. Note that we added `session = TRUE` to `commit()`. This will append the `sessionInfo()` at the time of the commit to the commit message, thus documenting all loaded R packages and their versions. This documents the code used to create the git2rdata object since your analysis code resides in a dedicated package with its own version number. We strongly recommend running the import from a fresh R session. Then the `sessionInfo()` at commit time is limited to those packages which are strictly required for the import. Consider running the import from the command line, e.g. `Rscript -e 'mypackage::import_body_temp("path/to/root")'`.
+
+```{r eval = FALSE}
+#' Import the beaver body temperature data
+#' @param path the root of the git repository
+#' @importFrom git2rdata repository pull rm_data write_vc prune_meta commit push
+#' @export
+import_body_temp <- function(path) {
+  # step 1: setup the repository and data path
+  repo <- repository(path)
+  data_path <- "data/beaver"
+  # step 1b: sync the repository with the remote
+  pull(repo = repo)
+  # step 2: remove all existing data files
+  rm_data(root = repo, path = data_path, stage = TRUE)
+
+  # step 3: write all relevant git2rdata objects to the data path
+  beaver1$beaver <- 1
+  beaver2$beaver <- 2
+  body_temp <- rbind(beaver1, beaver2)
+  fn <- write_vc(x = body_temp, file = file.path(data_path, "body_temperature"),
+    root = repo, sorting = c("beaver", "time"), stage = TRUE)
+
+  # step 4: remove any dangling metadata files
+  prune_meta(root = repo, path = data_path, stage = TRUE)
+
+  # step 5: commit the changes
+  cm <- commit(repo = repo, message = "import", session = TRUE)
+  # step 5b: sync the repository with the remote
+  push(repo = repo)
+}
+```
+
+## Analysis Workflow with Reproducible Data
 
 The example below is a small trivial example of a standardized analysis in which the source of the data is documented by describing the name of the data, the repository URL and the commit. We can use this information when reporting the results. This makes the data underlying the results traceable.
 
@@ -203,44 +224,49 @@ analysis <- function(ds_name, repo) {
     dataset = ds_name,
     repository = git2r::remote_url(repo),
     commit = recent_commit(ds_name, repo, data = TRUE),
-    model = lm(response ~ covariate, data = ds)
+    model = lm(temp ~ activ, data = ds)
   )
 }
 
 report <- function(x) {
   knitr::kable(
     coef(summary(x$model)),
-    caption = sprintf("Datasource: repository: %s commit: %s dataset: %s",
-                      x$repository, x$commit$commit, x$dataset)
+    caption = sprintf("**dataset:** %s \n**commit:** %s \n**repository:** %s",
+                      x$dataset, x$commit$commit, x$repository)
   )
 }
 ```
 
 In this case we can run every analysis by looping over the list of datasets in the repository.
 
-```{r run_current_analyses}
-repo <- repository(path2)
+```{r run_current_analyses, results = "asis"}
+repo <- repository(path)
 current <- lapply(list_data(repo), analysis, repo = repo)
 names(current) <- list_data(repo)
-report(current$B)
-report(current$C)
+result <- lapply(current, report)
+junk <- lapply(result, print)
 ```
 
-The example below does exactly the same thing for the previous commit.
+The example below does exactly the same thing for the first and second commit.
 
-```{r run_previous_analyses}
-# checkout previous commit
-current_commit <- git2r::last_commit(repo)
-previous_commit <- git2r::parents(current_commit)[[1]]
-git2r::checkout(previous_commit)
+```{r run_previous_analyses, results = "asis"}
+# checkout first commit
+git2r::checkout(cm1)
+# do analysis
+previous <- lapply(list_data(repo), analysis, repo = repo)
+names(previous) <- list_data(repo)
+result <- lapply(previous, report)
+junk <- lapply(result, print)
+# checkout second commit
+git2r::checkout(cm2)
 # do analysis
 previous <- lapply(list_data(repo), analysis, repo = repo)
 names(previous) <- list_data(repo)
-report(previous$B)
-report(previous$C)
+result <- lapply(previous, report)
+junk <- lapply(result, print)
 ```
 
-If you inspect the reported results carefully you'll notice that the output for dataset "B" is identical. This makes sense since dataset B didn't change during the last commit. Dataset "C" did change, which results in different estimates _and_ a different commit hash.
+If you inspect the reported results carefully you'll notice that all the output (coefficients and commit hash) for the "beaver" object is identical for the first and second commit. This makes sense since the "beaver" object didn't change during the second commit. The output for the current (third) commit is different because the dataset changed.
 
 ### Long running analysis
 
-Imagine the case where an individual analysis takes quite a while to run. We store the most recent version of each analysis and add the information from `recent_commit()`. When preparing the analysis, you can run `recent_commit()` again on the dataset and compare the commit hash with that one of the currently available analysis. If the commit hashes match, then the data hasn't changed. So there is no need to rerun the analysis^[assuming the code for running the analysis didn't change.], saving valuable computing resources.
+Imagine the case where an individual analysis takes quite a while to run. We store the most recent version of each analysis and add the information from `recent_commit()`. When preparing the analysis, you can run `recent_commit()` again on the dataset and compare the commit hash with that of the currently available analysis. If the commit hashes match, then the data hasn't changed. So there is no need to rerun the analysis^[assuming the code for running the analysis didn't change.], saving valuable computing resources and time.