diff --git a/.Rbuildignore b/.Rbuildignore index d8347fd..423d7a1 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^.*\.Rproj$ ^cran-comments\.md$ ^CRAN-RELEASE$ +^\.httr-oauth$ diff --git a/.gitignore b/.gitignore index 138866d..155cda3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata inst/doc docs +.httr-oauth diff --git a/DESCRIPTION b/DESCRIPTION index d75d4c9..2dd70bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.1.0.9000 +Version: 0.1.0.9003 Authors@R: c( person( "Thierry", "Onkelinx", role = c("aut", "cre"), @@ -14,6 +14,10 @@ Authors@R: c( "Peter", "Desmet", role = "ctb", email = "peter.desmet@inbo.be", comment = c(ORCID = "0000-0002-8442-8025")), + person( + "Els", "Lommelen", role = "ctb", + email = "els.lommelen@inbo.be", + comment = c(ORCID = "0000-0002-3481-5684")), person( "Research Institute for Nature and Forest", role = c("cph", "fnd"), email = "info@inbo.be")) @@ -40,6 +44,7 @@ URL: https://github.com/ropensci/git2rdata, https://doi.org/10.5281/zenodo.14853 BugReports: https://github.com/ropensci/git2rdata/issues Collate: 'clean_data_path.R' + 'datahash.R' 'git2rdata-package.R' 'write_vc.R' 'is_git2rdata.R' diff --git a/NEWS.md b/NEWS.md index 72ad52a..3f4e14b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,31 @@ +git2rdata 0.1.0.9003 (2019-11-07) +================================= + +### BREAKING FEATURES + + * reordering factor levels requires `strict = TRUE` + +git2rdata 0.1.0.9002 (2019-09-27) +================================= + +### BREAKING FEATURES + + * sorting is based on the "C" locale + * the data hash is based on the plain text file + +git2rdata 0.1.0.9001 (2019-09-09) +================================= + +### BREAKING FEATURES + + * Calculation of data hash has changed, due to which `read_vc()` will once warn that data are altered outside git2rdata when reading a previously written git2rdata object (#53). + * `read_vc()` only works with data stored with version >= 0.1.0.9001. Use `upgrade_data()` on data written with an earlier version. + * `is_git2rdata()` and `upgrade_data()` do not test equality in data hashes anymore (but `read_vc()` still does). + +### Bugfixes + + * The same data hash is generated on Linux and Windows machines (#49). + git2rdata 0.1.0.9000 (2019-08-13) ================================= diff --git a/R/datahash.R b/R/datahash.R new file mode 100644 index 0000000..3c13188 --- /dev/null +++ b/R/datahash.R @@ -0,0 +1,61 @@ +#' Calculate Hash of Dataframe +#' Calculates a hash based on the given data that is compatible with different +#' operating systems. +#' @param file The file to calculate the hash. +#' @return A 40 hexadecimal character quasi-unique code representing the +#' dataframe. +#' @noRd +#' @family internal +#' @importFrom assertthat assert_that +#' @importFrom git2r hash +datahash <- function(file) { + chunk_size <- 1e4 + hashes <- character(chunk_size + 1) + i <- 0 + rawdata <- scan( + file = file, what = character(), nmax = -1, sep = "\n", quote = "", + skip = i * chunk_size, nlines = chunk_size, na.strings = "", + flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE, + blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE, + encoding = "UTF-8", skipNul = FALSE + ) + while (length(rawdata)) { + hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n")) + i <- i + 1 + if (i %% chunk_size == 0) { + hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov + } + rawdata <- scan( + file = file, what = character(), nmax = -1, sep = "\n", quote = "", + skip = i * chunk_size, nlines = chunk_size, na.strings = "", + flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE, + blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE, + encoding = "UTF-8", skipNul = FALSE + ) + } + hash(paste(hashes, collapse = "")) +} + +#' Set the C locale for standardized sorting +#' @noRd +#' @return a named vector with the old locale +set_c_locale <- function() { + old_ctype <- Sys.getlocale(category = "LC_CTYPE") + old_collate <- Sys.getlocale(category = "LC_COLLATE") + old_time <- Sys.getlocale(category = "LC_TIME") + Sys.setlocale(category = "LC_CTYPE", locale = "C") + Sys.setlocale(category = "LC_COLLATE", locale = "C") + Sys.setlocale(category = "LC_TIME", locale = "C") + return(c(ctype = old_ctype, collate = old_collate, time = old_time)) +} + +#' Reset the old locale +#' @param locale the output of `set_c_locale()` +#' @return invisible `NULL` +#' @noRd +set_local_locale <- function(locale) { + Sys.setlocale(category = "LC_CTYPE", locale = locale["ctype"]) + Sys.setlocale(category = "LC_COLLATE", locale = locale["collate"]) + Sys.setlocale(category = "LC_TIME", locale = locale["time"]) + return(invisible(NULL)) +} diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R index 0aeaa87..8ec1779 100644 --- a/R/is_git2rdata.R +++ b/R/is_git2rdata.R @@ -1,7 +1,6 @@ #' Check Whether a Git2rdata Object is Valid. #' -#' A valid git2rdata object has valid metadata. The data hash must match the -#' data hash stored in the metadata. +#' A valid git2rdata object has valid metadata. #' @inheritParams write_vc #' @inheritParams is_git2rmeta #' @return A logical value. `TRUE` in case of a valid git2rdata object. @@ -24,7 +23,6 @@ is_git2rdata.default <- function(file, root, message) { #' @importFrom assertthat assert_that is.string #' @importFrom yaml read_yaml as.yaml #' @importFrom utils packageVersion -#' @importFrom git2r hash is_git2rdata.character <- function(file, root = ".", message = c("none", "warning", "error")) { assert_that(is.string(file), is.string(root)) @@ -56,13 +54,6 @@ is_git2rdata.character <- function(file, root = ".", return(FALSE) } - if (meta_data[["..generic"]][["data_hash"]] != hashfile(file[["raw_file"]])) { - msg <- "Corrupt data, mismatching data hash." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } - return(TRUE) } diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R index e01a103..7066b3f 100644 --- a/R/is_git2rmeta.R +++ b/R/is_git2rmeta.R @@ -66,7 +66,7 @@ See `?upgrade_data()`." return(FALSE) } if (package_version(meta_data[["..generic"]][["git2rdata"]]) < - package_version("0.0.5")) { + package_version("0.1.0.9001")) { msg <- "Data stored using an older version of `git2rdata`. See `?upgrade_data()`." switch(message, error = stop(msg, call. = FALSE), diff --git a/R/meta.R b/R/meta.R index da9be6d..ebee960 100644 --- a/R/meta.R +++ b/R/meta.R @@ -1,7 +1,7 @@ #' Optimize an Object for Storage as Plain Text and Add Metadata #' #' @description -#' Prepares a vector for storage. When relevant, `meta()`optimizes the object +#' Prepares a vector for storage. When relevant, `meta()` optimizes the object #' for storage by changing the format to one which needs less characters. The #' metadata stored in the `meta` attribute, contains all required information to #' back-transform the optimized format into the original format. @@ -38,6 +38,7 @@ meta <- function(x, ...) { #' @importFrom assertthat assert_that is.string noNA meta.character <- function(x, na = "NA", ...) { assert_that(is.string(na), noNA(na), no_whitespace(na)) + x <- enc2utf8(x) if (na %in% x) { stop("one of the strings matches the NA string ('", na, "') Please use a different NA string or consider using a factor.", call. = FALSE) @@ -70,21 +71,41 @@ meta.numeric <- function(x, ...) { #' @export #' @rdname meta -#' @param optimize If `TRUE`, recode the data to get smaller text files. If -#' `FALSE`, `meta()` converts the data to character. Defaults to `TRUE`. -#' @param index an optional named vector with existing factor indices. The names -#' must match the existing factor levels. Unmatched levels from `x` will get new -#' indices. +#' @param optimize If `TRUE`, recode the data to get smaller text files. +#' If `FALSE`, `meta()` converts the data to character. +#' Defaults to `TRUE`. +#' @param index An optional named vector with existing factor indices. +#' The names must match the existing factor levels. +#' Unmatched levels from `x` will get new indices. #' @inheritParams utils::write.table #' @importFrom assertthat assert_that is.flag noNA -meta.factor <- function(x, optimize = TRUE, na = "NA", index, ...) { - assert_that(is.flag(optimize), noNA(optimize)) +#' @note The default order of factor levels depends on the current locale. +#' See \code{\link{Comparison}} for more details on that. +#' The same code on a different locale might result in a different sorting. +#' `meta()` ignores, with a warning, any change in the order of factor levels. +#' Add `strict = FALSE` to enforce the new order of factor levels. +meta.factor <- function( + x, optimize = TRUE, na = "NA", index, strict = TRUE, ... +) { + assert_that(is.flag(optimize), noNA(optimize), is.flag(strict), noNA(strict)) + levels(x) <- enc2utf8(levels(x)) if (missing(index) || is.null(index)) { index <- seq_along(levels(x)) names(index) <- levels(x) } else { assert_that(is.integer(index)) assert_that(anyDuplicated(index) == 0, msg = "duplicate indices") + + if ( + strict && + all(names(index) %in% levels(x)) && + all(levels(x) %in% names(index)) && + any(levels(x) != names(index)) + ) { + warning("Same levels with a different order detected. +This change is ignored. Use `strict = FALSE` to reorder the factor.") + x <- factor(x, levels = names(index)) + } new_levels <- which(!levels(x) %in% names(index)) candidate_index <- seq_len(length(new_levels) + length(index)) candidate_index <- candidate_index[!candidate_index %in% index] @@ -101,15 +122,16 @@ meta.factor <- function(x, optimize = TRUE, na = "NA", index, ...) { z <- index[x] } else { assert_that(is.string(na), noNA(na), no_whitespace(na)) - if (na %in% levels(x)) { - stop("one of the levels matches the NA string ('", na, "'). -Please use a different NA string or use optimize = TRUE", call. = FALSE) - } + assert_that( + !na %in% levels(x), + msg = paste0("one of the levels matches the NA string ('", na, "'). +Please use a different NA string or use optimize = TRUE") + ) z <- meta(as.character(x), optimize = optimize, na = na, ...) } m <- list(class = "factor", na_string = na, optimize = optimize, - labels = enc2utf8(names(index)), index = unname(index), + labels = names(index), index = unname(index), ordered = is.ordered(x)) class(m) <- "meta_detail" attr(z, "meta") <- m @@ -119,7 +141,7 @@ Please use a different NA string or use optimize = TRUE", call. = FALSE) #' @export #' @rdname meta #' @importFrom assertthat assert_that is.flag noNA -meta.logical <- function(x, optimize = TRUE, ...){ +meta.logical <- function(x, optimize = TRUE, ...) { assert_that(is.flag(optimize), noNA(optimize)) if (optimize) { x <- as.integer(x) @@ -160,7 +182,7 @@ meta.POSIXct <- function(x, optimize = TRUE, ...) { #' @export #' @rdname meta #' @importFrom assertthat assert_that is.flag noNA -meta.Date <- function(x, optimize = TRUE, ...){ +meta.Date <- function(x, optimize = TRUE, ...) { assert_that(is.flag(optimize), noNA(optimize)) if (optimize) { z <- as.integer(x) @@ -188,7 +210,9 @@ meta.Date <- function(x, optimize = TRUE, ...){ #' argument intended for internal use. #' @rdname meta #' @inheritParams write_vc -meta.data.frame <- function(x, optimize = TRUE, na = "NA", sorting, ...) { +meta.data.frame <- function(# nolint + x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ... +) { assert_that( !has_name(x, "..generic"), msg = "'..generic' is a reserved name and not allowed as column name") @@ -213,7 +237,9 @@ Sorting is strongly recommended in combination with version control.") all(sorting %in% colnames(x)), msg = "All sorting variables must be available in the data.frame") if (nrow(x) > 1) { + old_locale <- set_c_locale() x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint + set_local_locale(old_locale) if (any_duplicated(x[sorting])) { sorted <- paste(sprintf("'%s'", sorting), collapse = ", ") sorted <- sprintf("Sorting on %s results in ties. @@ -229,14 +255,16 @@ Add extra sorting variables to ensure small diffs.", sorted) if (length(common)) { z_common <- lapply( common, - function(id, optimize, na) { + function(id, optimize, na, strict) { meta( x[[id]], optimize = optimize, na = na, - index = setNames(old[[id]][["index"]], old[[id]][["labels"]]) + index = setNames(old[[id]][["index"]], old[[id]][["labels"]]), + strict = strict ) }, optimize = old[["..generic"]][["optimize"]], - na = old[["..generic"]][["NA string"]] + na = old[["..generic"]][["NA string"]], + strict = strict ) names(z_common) <- common } else { diff --git a/R/prune.R b/R/prune.R index b9708d3..e60c239 100644 --- a/R/prune.R +++ b/R/prune.R @@ -22,14 +22,14 @@ #' @template example-prune rm_data <- function( root = ".", path = NULL, recursive = TRUE, ... -){ +) { UseMethod("rm_data", root) } #' @export rm_data.default <- function( root, path = NULL, recursive = TRUE, ... -){ +) { stop("a 'root' of class ", class(root), " is not supported", call. = FALSE) } @@ -37,7 +37,7 @@ rm_data.default <- function( #' @importFrom assertthat assert_that is.flag rm_data.character <- function( root = ".", path = NULL, recursive = TRUE, ... -){ +) { to_do <- list_data(root = root, path = path, recursive = recursive) if (length(to_do) == 0) { return(to_do) @@ -63,7 +63,7 @@ rm_data.character <- function( rm_data.git_repository <- function( root, path = NULL, recursive = TRUE, ..., stage = FALSE, type = c("unmodified", "modified", "ignored", "all") -){ +) { type <- match.arg(type) to_do <- list_data(root = root, path = path, recursive = recursive) if (length(to_do) == 0) { @@ -116,14 +116,14 @@ rm_data.git_repository <- function( #' @template example-prune prune_meta <- function( root = ".", path = NULL, recursive = TRUE, ... -){ +) { UseMethod("prune_meta", root) } #' @export prune_meta.default <- function( root, path = NULL, recursive = TRUE, ... -){ +) { stop("a 'root' of class ", class(root), " is not supported", call. = FALSE) } @@ -131,7 +131,7 @@ prune_meta.default <- function( #' @importFrom assertthat assert_that is.flag noNA prune_meta.character <- function( root = ".", path = NULL, recursive = TRUE, ... -){ +) { assert_that(is.string(root)) root <- normalizePath(root, winslash = "/", mustWork = TRUE) assert_that(is.string(path)) @@ -171,7 +171,7 @@ prune_meta.character <- function( #' @rdname prune_meta prune_meta.git_repository <- function( root, path = NULL, recursive = TRUE, ..., stage = FALSE -){ +) { root_wd <- normalizePath(workdir(root), winslash = "/") assert_that(is.string(path)) path <- file.path(root_wd, path) diff --git a/R/read_vc.R b/R/read_vc.R index fb01999..618b830 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -40,12 +40,7 @@ read_vc.character <- function(file, root = ".") { is_git2rdata(file = remove_root(file = file["meta_file"], root = root), root = root, message = "error"), error = function(e) { - if (e$message == "Corrupt data, mismatching data hash.") { - warning("Mismatching data hash. Data altered outside of git2rdata.", - call. = FALSE) - } else { - stop(e$message, call. = FALSE) - } + stop(e$message, call. = FALSE) } ) assert_that( @@ -56,25 +51,24 @@ read_vc.character <- function(file, root = ".") { # read the metadata meta_data <- read_yaml(file["meta_file"]) optimize <- meta_data[["..generic"]][["optimize"]] - if (optimize) { - col_type <- c( - character = "character", factor = "integer", integer = "integer", - numeric = "numeric", logical = "integer", Date = "integer", - POSIXct = "numeric", complex = "complex" - ) - } else { - col_type <- c( + col_type <- list( + c( character = "character", factor = "character", integer = "integer", numeric = "numeric", logical = "logical", Date = "Date", POSIXct = "character", complex = "complex" + ), + c( + character = "character", factor = "integer", integer = "integer", + numeric = "numeric", logical = "integer", Date = "integer", + POSIXct = "numeric", complex = "complex" ) - } + )[[optimize + 1]] na_string <- meta_data[["..generic"]][["NA string"]] details <- meta_data[names(meta_data) != "..generic"] col_names <- names(details) col_classes <- vapply(details, "[[", character(1), "class") - # read the raw data + # read the raw data and check the data hash raw_data <- read.table( file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", dec = ".", numerals = "warn.loss", na.strings = na_string, @@ -82,6 +76,28 @@ read_vc.character <- function(file, root = ".") { stringsAsFactors = FALSE, fileEncoding = "UTF-8" ) + dh <- datahash(file["raw_file"]) + if (meta_data[["..generic"]][["data_hash"]] != dh) { + meta_data[["..generic"]][["data_hash"]] <- dh + warning("Mismatching data hash. Data altered outside of git2rdata.", + call. = FALSE) + } + + raw_data <- reinstate( + raw_data = raw_data, col_names = col_names, col_classes = col_classes, + details = details, optimize = optimize + ) + + names(file) <- + c( + meta_data[["..generic"]][["data_hash"]], + meta_data[["..generic"]][["hash"]] + ) + attr(raw_data, "source") <- file + return(raw_data) +} + +reinstate <- function(raw_data, col_names, col_classes, details, optimize) { # reinstate factors for (id in col_names[col_classes == "factor"]) { if (optimize) { @@ -118,21 +134,19 @@ read_vc.character <- function(file, root = ".") { } } - if (optimize) { - # reinstate logical - for (id in col_names[col_classes == "logical"]) { - raw_data[[id]] <- as.logical(raw_data[[id]]) - } - - # reinstage Date - for (id in col_names[col_classes == "Date"]) { - raw_data[[id]] <- as.Date(raw_data[[id]], - origin = details[[id]][["origin"]]) - } + if (!optimize) { + return(raw_data) + } + # reinstate logical + for (id in col_names[col_classes == "logical"]) { + raw_data[[id]] <- as.logical(raw_data[[id]]) } - names(file) <- hashfile(file) - attr(raw_data, "source") <- file + # reinstage Date + for (id in col_names[col_classes == "Date"]) { + raw_data[[id]] <- as.Date(raw_data[[id]], + origin = details[[id]][["origin"]]) + } return(raw_data) } diff --git a/R/recent_commit.R b/R/recent_commit.R index d2c392a..6cb0602 100644 --- a/R/recent_commit.R +++ b/R/recent_commit.R @@ -12,7 +12,8 @@ #' be rerun. See `vignette("workflow", package = "git2rdata")`. #' @inheritParams write_vc #' @param root The root of a project. Can be a file path or a `git-repository`. -#' @param data does `file` refers to a data object (`TRUE`) or to a file (`FALSE`). +#' @param data does `file` refers to a data object (`TRUE`) or to a file +#' (`FALSE`)? #' Defaults to `FALSE`. #' @return a `data.frame` with `commit`, `author` and `when` for the most recent #' commit that adds op updates the file. @@ -73,7 +74,7 @@ #' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, #' include.dirs = TRUE, all.files = TRUE)), #' repo_path) -recent_commit <- function(file, root, data = FALSE){ +recent_commit <- function(file, root, data = FALSE) { UseMethod("recent_commit", root) } diff --git a/R/relabel.R b/R/relabel.R index 334f758..d32f60c 100644 --- a/R/relabel.R +++ b/R/relabel.R @@ -131,7 +131,7 @@ Use write_vc() instead.", call. = FALSE) #' @export #' @importFrom assertthat assert_that has_name #' @importFrom stats setNames -relabel.data.frame <- function(file, root, change) { +relabel.data.frame <- function(file, root, change) { #nolint assert_that( has_name(change, "factor"), has_name(change, "old"), diff --git a/R/upgrade_data.R b/R/upgrade_data.R index a10744e..df8ec30 100644 --- a/R/upgrade_data.R +++ b/R/upgrade_data.R @@ -73,7 +73,7 @@ upgrade_data.character <- function( ) if (has_name(meta_data[["..generic"]], "git2rdata")) { if (package_version(meta_data[["..generic"]][["git2rdata"]]) >= - package_version("0.0.5") + package_version("0.1.0.9001") ) { if (verbose) { message(target, " already up to date") @@ -81,17 +81,15 @@ upgrade_data.character <- function( return(target) } meta_data[["..generic"]][["git2rdata"]] <- NULL + meta_data[["..generic"]][["data_hash"]] <- NULL } - assert_that( - meta_data[["..generic"]][["hash"]] == metadata_hash(meta_data), - msg = paste(target, "has corrupt metadata: mismatching hash.") - ) meta_data[["..generic"]] <- c( git2rdata = as.character(packageVersion("git2rdata")), meta_data[["..generic"]] ) if (!has_name(meta_data[["..generic"]], "data_hash")) { - meta_data[["..generic"]][["data_hash"]] <- hashfile(file["raw_file"]) + # recalculate the data hash + meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) } write_yaml(meta_data, file["meta_file"], fileEncoding = "UTF-8") if (verbose) { diff --git a/R/write_vc.R b/R/write_vc.R index 67b11a9..40be4a7 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -1,8 +1,10 @@ #' Store a Data.Frame as a Git2rdata Object on Disk #' -#' A git2rdata object consists of two files. The `".tsv"` file contains the raw -#' data as a plain text tab separated file. The `".yml"` contains the metadata -#' on the columns in plain text YAML format. See `vignette("plain text", package = "git2rdata")` for more details on the implementation. +#' A git2rdata object consists of two files. +#' The `".tsv"` file contains the raw data as a plain text tab separated file. +#' The `".yml"` contains the metadata on the columns in plain text YAML format. +#' See `vignette("plain text", package = "git2rdata")` for more details on the +#' implementation. #' @param x the `data.frame`. #' @param file the name of the git2rdata object. Git2rdata objects cannot #' have dots in their name. The name may include a relative path. `file` is a @@ -52,7 +54,7 @@ write_vc.default <- function( write_vc.character <- function( x, file, root = ".", sorting, strict = TRUE, optimize = TRUE, na = "NA", ... -){ +) { assert_that( inherits(x, "data.frame"), is.string(file), is.string(root), is.string(na), noNA(na), no_whitespace(na), is.flag(strict), is.flag(optimize) @@ -63,7 +65,9 @@ write_vc.character <- function( dir.create(dirname(file["raw_file"]), recursive = TRUE) } - if (file.exists(file["meta_file"])) { + if (!file.exists(file["meta_file"])) { + raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting) + } else { tryCatch( is_git2rmeta(file = remove_root(file = file["meta_file"], root = root), root = root, message = "error"), @@ -75,7 +79,7 @@ write_vc.character <- function( old <- read_yaml(file["meta_file"]) class(old) <- "meta_list" raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting, - old = old) + old = old, strict = strict) problems <- compare_meta(attr(raw_data, "meta"), old) if (length(problems)) { problems <- c( @@ -94,8 +98,6 @@ write_vc.character <- function( sorting <- old[["..generic"]][["sorting"]] } } - } else { - raw_data <- meta(x, optimize = optimize, na = na, sorting = sorting) } write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, @@ -106,12 +108,16 @@ write_vc.character <- function( meta_data[["..generic"]][["git2rdata"]] <- as.character( packageVersion("git2rdata") ) - meta_data[["..generic"]][["data_hash"]] <- hashfile(file["raw_file"]) + meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) write_yaml(meta_data, file["meta_file"], fileEncoding = "UTF-8") hashes <- remove_root(file = file, root = root) - names(hashes) <- hashfile(file) + names(hashes) <- + c( + meta_data[["..generic"]][["data_hash"]], + meta_data[["..generic"]][["hash"]] + ) return(hashes) } @@ -129,7 +135,7 @@ setOldClass("git_repository") write_vc.git_repository <- function( x, file, root, sorting, strict = TRUE, optimize = TRUE, na = "NA", ..., stage = FALSE, force = FALSE -){ +) { assert_that(is.flag(stage), is.flag(force)) hashes <- write_vc( x = x, file = file, root = workdir(root), sorting = sorting, @@ -218,8 +224,18 @@ compare_meta <- function(new, old) { ) } - common_variables <- common_variables[old_class == new_class] - old_class <- old_class[old_class == new_class] + problems <- compare_factors( + problems = problems, + common_variables = common_variables[old_class == new_class], + old_class = old_class[old_class == new_class], + old = old, + new = new + ) + + return(problems) +} + +compare_factors <- function(problems, common_variables, old_class, old, new) { for (id in common_variables[old_class == "factor"]) { if (old[[id]]$ordered != new[[id]]$ordered) { problems <- c( @@ -238,10 +254,8 @@ compare_meta <- function(new, old) { problems <- c(problems, sprintf("- New indices for '%s'.", id)) } } - return(problems) } - #' @noRd #' @param file the file including the path #' @param root the path of the root diff --git a/README.md b/README.md index f9a63cc..243d035 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,10 @@ The `git2rdata` package is an R package for writing and reading dataframes as pl - Git2rdata is useful as a tool in a reproducible and traceable workflow. See `vignette("workflow", package = "git2rdata")`. - You can detect when a file was last modified in the git history. Use this to check whether an existing analysis is obsolete due to new data. This allows to not rerun up to date analyses, saving resources. +## Talk About `git2rdata` at useR!2019 in Toulouse, France + + + ## Installation Install from CRAN diff --git a/codemeta.json b/codemeta.json index 3f33af0..28870e3 100644 --- a/codemeta.json +++ b/codemeta.json @@ -14,14 +14,14 @@ ], "issueTracker": "https://github.com/ropensci/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.0.9000", + "version": "0.1.0.9002", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", - "version": "3.6.0", + "version": "3.6.1", "url": "https://r-project.org" }, - "runtimePlatform": "R version 3.6.0 (2019-04-26)", + "runtimePlatform": "R version 3.6.1 (2019-07-05)", "author": [ { "@type": "Person", @@ -45,6 +45,13 @@ "familyName": "Desmet", "email": "peter.desmet@inbo.be", "@id": "https://orcid.org/0000-0002-8442-8025" + }, + { + "@type": "Person", + "givenName": "Els", + "familyName": "Lommelen", + "email": "els.lommelen@inbo.be", + "@id": "https://orcid.org/0000-0002-3481-5684" } ], "copyrightHolder": [ @@ -196,7 +203,7 @@ ], "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", - "fileSize": "338.973KB", + "fileSize": "362.855KB", "contIntegration": [ "https://travis-ci.org/inbo/git2rdata", "https://ci.appveyor.com/project/ThierryO/git2rdata/branch/master", diff --git a/inst/efficiency/file_timings.rds b/inst/efficiency/file_timings.rds index 8c0ac74..48bb68a 100644 Binary files a/inst/efficiency/file_timings.rds and b/inst/efficiency/file_timings.rds differ diff --git a/inst/efficiency/git_size.rds b/inst/efficiency/git_size.rds index a220848..0c14df9 100644 Binary files a/inst/efficiency/git_size.rds and b/inst/efficiency/git_size.rds differ diff --git a/inst/efficiency/read_timings.rds b/inst/efficiency/read_timings.rds index 03c19c0..632a38e 100644 Binary files a/inst/efficiency/read_timings.rds and b/inst/efficiency/read_timings.rds differ diff --git a/man/git2rdata-package.Rd b/man/git2rdata-package.Rd index 149b804..91ae826 100644 --- a/man/git2rdata-package.Rd +++ b/man/git2rdata-package.Rd @@ -24,6 +24,7 @@ Other contributors: \itemize{ \item Floris Vanderhaeghe \email{floris.vanderhaeghe@inbo.be} (0000-0002-6378-6229) [contributor] \item Peter Desmet \email{peter.desmet@inbo.be} (0000-0002-8442-8025) [contributor] + \item Els Lommelen \email{els.lommelen@inbo.be} (0000-0002-3481-5684) [contributor] \item Research Institute for Nature and Forest \email{info@inbo.be} [copyright holder, funder] } diff --git a/man/is_git2rdata.Rd b/man/is_git2rdata.Rd index df01b44..31cd867 100644 --- a/man/is_git2rdata.Rd +++ b/man/is_git2rdata.Rd @@ -25,8 +25,7 @@ A logical value. \code{TRUE} in case of a valid git2rdata object. Otherwise \code{FALSE}. } \description{ -A valid git2rdata object has valid metadata. The data hash must match the -data hash stored in the metadata. +A valid git2rdata object has valid metadata. } \examples{ # create a directory diff --git a/man/meta.Rd b/man/meta.Rd index 36c5f89..2b83b2c 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -15,7 +15,8 @@ meta(x, ...) \method{meta}{character}(x, na = "NA", ...) -\method{meta}{factor}(x, optimize = TRUE, na = "NA", index, ...) +\method{meta}{factor}(x, optimize = TRUE, na = "NA", index, + strict = TRUE, ...) \method{meta}{logical}(x, optimize = TRUE, ...) @@ -23,7 +24,8 @@ meta(x, ...) \method{meta}{Date}(x, optimize = TRUE, ...) -\method{meta}{data.frame}(x, optimize = TRUE, na = "NA", sorting, ...) +\method{meta}{data.frame}(x, optimize = TRUE, na = "NA", sorting, + strict = TRUE, ...) } \arguments{ \item{x}{the vector.} @@ -32,12 +34,18 @@ meta(x, ...) \item{na}{the string to use for missing values in the data.} -\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. If -\code{FALSE}, \code{meta()} converts the data to character. Defaults to \code{TRUE}.} +\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. +If \code{FALSE}, \code{meta()} converts the data to character. +Defaults to \code{TRUE}.} -\item{index}{an optional named vector with existing factor indices. The names -must match the existing factor levels. Unmatched levels from \code{x} will get new -indices.} +\item{index}{An optional named vector with existing factor indices. +The names must match the existing factor levels. +Unmatched levels from \code{x} will get new indices.} + +\item{strict}{What to do when the metadata changes. \code{strict = FALSE} +overwrites the data and the metadata with a warning listing the changes, +\code{strict = TRUE} returns an error and leaves the data and metadata as is. +Defaults to \code{TRUE}.} \item{sorting}{an optional vector of column names defining which columns to use for sorting \code{x} and in what order to use them. Omitting \code{sorting} yields @@ -50,7 +58,7 @@ importance of sorting.} the optimized vector \code{x} with \code{meta} attribute. } \description{ -Prepares a vector for storage. When relevant, \code{meta()}optimizes the object +Prepares a vector for storage. When relevant, \code{meta()} optimizes the object for storage by changing the format to one which needs less characters. The metadata stored in the \code{meta} attribute, contains all required information to back-transform the optimized format into the original format. @@ -64,6 +72,13 @@ the metadata and not allowed as column name in a \code{data.frame}. Existing metadata is passed through the optional \code{old} argument. This argument intended for internal use. } +\note{ +The default order of factor levels depends on the current locale. +See \code{\link{Comparison}} for more details on that. +The same code on a different locale might result in a different sorting. +\code{meta()} ignores, with a warning, any change in the order of factor levels. +Add \code{strict = FALSE} to enforce the new order of factor levels. +} \examples{ meta(c(NA, "'NA'", '"NA"', "abc\\tdef", "abc\\ndef")) meta(1:3) diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index 4f2ba37..a3705b6 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -14,7 +14,8 @@ Note that \code{file} must point to a location within \code{root}.} \item{root}{The root of a project. Can be a file path or a \code{git-repository}.} -\item{data}{does \code{file} refers to a data object (\code{TRUE}) or to a file (\code{FALSE}). +\item{data}{does \code{file} refers to a data object (\code{TRUE}) or to a file +(\code{FALSE})? Defaults to \code{FALSE}.} } \value{ diff --git a/man/write_vc.Rd b/man/write_vc.Rd index 8bfa62c..2b85132 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -34,8 +34,9 @@ overwrites the data and the metadata with a warning listing the changes, \code{strict = TRUE} returns an error and leaves the data and metadata as is. Defaults to \code{TRUE}.} -\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. If -\code{FALSE}, \code{meta()} converts the data to character. Defaults to \code{TRUE}.} +\item{optimize}{If \code{TRUE}, recode the data to get smaller text files. +If \code{FALSE}, \code{meta()} converts the data to character. +Defaults to \code{TRUE}.} \item{na}{the string to use for missing values in the data.} @@ -51,9 +52,11 @@ a named vector with the file paths relative to \code{root}. The names contain the hashes of the files. } \description{ -A git2rdata object consists of two files. The \code{".tsv"} file contains the raw -data as a plain text tab separated file. The \code{".yml"} contains the metadata -on the columns in plain text YAML format. See \code{vignette("plain text", package = "git2rdata")} for more details on the implementation. +A git2rdata object consists of two files. +The \code{".tsv"} file contains the raw data as a plain text tab separated file. +The \code{".yml"} contains the metadata on the columns in plain text YAML format. +See \code{vignette("plain text", package = "git2rdata")} for more details on the +implementation. } \note{ \code{..generic} is a reserved name for the metadata and is a forbidden diff --git a/tests/testthat/setup_test_data.R b/tests/testthat/setup_test_data.R index 37d4397..9c6820f 100644 --- a/tests/testthat/setup_test_data.R +++ b/tests/testthat/setup_test_data.R @@ -1,6 +1,9 @@ test_n <- 100 test_data <- data.frame( - test_character = sample(LETTERS, size = test_n, replace = TRUE), + test_character = c( + sample(LETTERS, size = test_n - 10, replace = TRUE), + c("é", "&", "à", "µ", "ç", "€", "|", "#", "@", "$") + ), test_factor = sample( factor(c("a", "b"), levels = c("a", "b", "c")), size = test_n, replace = TRUE @@ -23,7 +26,10 @@ test_data <- data.frame( stringsAsFactors = FALSE ) +old_locale <- git2rdata:::set_c_locale() sorted_test_data <- test_data[order(test_data$test_Date), ] +git2rdata:::set_local_locale(old_locale) +sorted_test_data$test_character <- enc2utf8(sorted_test_data$test_character) rownames(sorted_test_data) <- NULL attr(sorted_test_data$test_POSIXct, "tzone") <- "UTC" @@ -37,8 +43,10 @@ test_na <- test_data for (i in seq_along(test_na)) { test_na[sample(test_n, size = ceiling(0.1 * test_n)), i] <- NA } +old_locale <- git2rdata:::set_c_locale() sorted_test_na <- test_na[ order(test_na$test_Date, test_na$test_integer, test_na$test_numeric), ] +git2rdata:::set_local_locale(old_locale) rownames(sorted_test_na) <- NULL attr(sorted_test_na$test_POSIXct, "tzone") <- "UTC" diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index af138b3..9e4e78a 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -247,13 +247,15 @@ test_that("user specified na strings work", { ), "character" ) + old_locale <- git2rdata:::set_c_locale() expect_equal( read_vc(fn[1], root), x[order(x$a), ], check.attributes = FALSE ) + git2rdata:::set_local_locale(old_locale) expect_identical( - grep("junk", readLines(file.path(root, fn[1]))), + grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), 2:4 ) expect_error( @@ -269,13 +271,15 @@ test_that("user specified na strings work", { ), "character" ) + old_locale <- git2rdata:::set_c_locale() expect_equal( read_vc(fn[1], root), x[order(x$a), ], check.attributes = FALSE ) + git2rdata:::set_local_locale(old_locale) expect_identical( - grep("junk", readLines(file.path(root, fn[1]))), + grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), 2:4 ) file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) @@ -292,6 +296,15 @@ test_that("write_vc() allows changes in factor levels", { fn <- write_vc(x, "factor_levels", root, sorting = "test_factor"), "character" ) + x$test_factor <- factor(x$test_factor, levels = c("b", "a")) + expect_warning( + write_vc(x, "factor_levels", root), + "Same levels with a different order detected" + ) + expect_warning( + write_vc(x, "factor_levels", root, strict = FALSE), + " New factor labels" + ) x$test_factor <- factor(x$test_factor, levels = c("a", "b", "c")) expect_error( write_vc(x, "factor_levels", root), diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R index b61d018..88186f7 100644 --- a/tests/testthat/test_b_is_git2rmeta.R +++ b/tests/testthat/test_b_is_git2rmeta.R @@ -121,22 +121,8 @@ test_that("is_git2rdata checks data", { file <- basename(tempfile(tmpdir = root)) junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["data_hash"]] <- "zzz" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_false(is_git2rdata(file = file, root = root)) - expect_error(is_git2rdata(file = file, root = root, message = "error"), - "Corrupt data, mismatching data hash") - expect_warning(is_git2rdata(file = file, root = root, message = "warning"), - "Corrupt data, mismatching data hash") - expect_false( - suppressWarnings( - is_git2rdata(file = file, root = root, message = "warning") - ) - ) - yaml::write_yaml(correct_yaml, file.path(root, junk[2])) - correct_data <- readLines(file.path(root, junk[1])) + correct_data <- readLines(file.path(root, junk[1]), encoding = "UTF-8") junk_header <- correct_data junk_header[1] <- "junk" writeLines(junk_header, file.path(root, junk[1])) diff --git a/tests/testthat/test_b_special.R b/tests/testthat/test_b_special.R index 884ef27..8a24d08 100644 --- a/tests/testthat/test_b_special.R +++ b/tests/testthat/test_b_special.R @@ -8,27 +8,35 @@ ds <- data.frame( "a\nb", "a\nb\nc", "\na", "a\n", "a\"b", "a\"b\"c", "\"b", "a\"", "\"b\"", "a'b", "a'b'c", "'b", "a'", "'b'", - "a b c", "\"NA\"", "'NA'", NA + "a b c", "\"NA\"", "'NA'", NA, + "\U00E9", "&", "\U00E0", "\U00B5", "\U00E7", "€", "|", "#", "@", "$" ), stringsAsFactors = FALSE ) expect_is( - write_vc(ds, "character", root, sorting = "a"), + output <- write_vc(ds, "character", root, sorting = "a"), "character" ) expect_equal( - junk <- read_vc("character", root), - ds[order(ds$a), , drop = FALSE], # nolint - check.attributes = FALSE + names(output)[1], + "9e5edf55ceadd2c148d6d715ea5d12cc8e1538d8" +) +old_locale <- git2rdata:::set_c_locale() +dso <- ds[order(ds$a), , drop = FALSE] # nolint +git2rdata:::set_local_locale(old_locale) +expect_equal( + junk <- read_vc("character", root), dso, check.attributes = FALSE +) +expect_identical( + names(output), + names(attr(junk, "source")) ) expect_is( write_vc(ds, "character2", root, sorting = "a", optimize = FALSE), "character" ) expect_equal( - junk <- read_vc("character2", root), - ds[order(ds$a), , drop = FALSE], # nolint - check.attributes = FALSE + junk <- read_vc("character2", root), dso, check.attributes = FALSE ) z <- rbind(ds, "NA") z$a <- factor(z$a) @@ -41,14 +49,33 @@ expect_equal( z[order(z$a), , drop = FALSE], # nolint check.attributes = FALSE ) + +old_locale <- git2rdata:::set_c_locale() ds$a <- factor(ds$a) +git2rdata:::set_local_locale(old_locale) expect_is( - write_vc(ds, "factor2", root, sorting = "a", optimize = FALSE), + output <- write_vc(ds, "factor2", root, sorting = "a", optimize = FALSE), "character" ) expect_equal( - read_vc("factor2", root), + junk <- read_vc("factor2", root), ds[order(ds$a), , drop = FALSE], # nolint check.attributes = FALSE ) +expect_equal( + names(output)[1], + "9e5edf55ceadd2c148d6d715ea5d12cc8e1538d8" +) +expect_identical( + names(output), + names(attr(junk, "source")) +) + + +yaml_file <- yaml::read_yaml(file.path(root, "factor2.yml")) +yaml_file[["..generic"]][["data_hash"]] <- "zzz" +yaml::write_yaml(yaml_file, file.path(root, "factor2.yml")) +expect_warning(read_vc("factor2", root = root), + "Mismatching data hash. Data altered outside of git2rdata.") + file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_e_non_ascii.R b/tests/testthat/test_e_non_ascii.R index de838fe..4d9c7b3 100644 --- a/tests/testthat/test_e_non_ascii.R +++ b/tests/testthat/test_e_non_ascii.R @@ -5,7 +5,9 @@ characters <- data.frame(a = c("€$£ @&#§µ^ ()[]{}|²³<>/\\*+- ,;:.?!~", "äàáâã ëèéê ïìíî öòóô üùúû ÿ ç ñ", "ÄÀÁ ËÈÉÊ ÏÌÍÎ ÖÒÓÔ ÜÙÚÛ Ñ"), stringsAsFactors = FALSE) +old_locale <- git2rdata:::set_c_locale() characters <- characters[order(characters$a), , drop = FALSE] # nolint +git2rdata:::set_local_locale(old_locale) test_that("special character are written properly as character", { file <- basename(tempfile(tmpdir = root)) @@ -16,8 +18,11 @@ test_that("special character are written properly as character", { expect_equivalent(read_vc(file = file, root = root), characters) }) +characters$a <- factor(characters$a) +old_locale <- git2rdata:::set_c_locale() +characters <- characters[order(characters$a), , drop = FALSE] # nolint +git2rdata:::set_local_locale(old_locale) test_that("special character are written properly as optimized factor", { - characters$a <- factor(characters$a) file <- basename(tempfile(tmpdir = root)) expect_is( junk <- write_vc(characters, file = file, root = root, sorting = "a"), @@ -27,7 +32,6 @@ test_that("special character are written properly as optimized factor", { }) test_that("special character are written properly as verbose factor", { - characters$a <- factor(characters$a) file <- basename(tempfile(tmpdir = root)) expect_is( junk <- write_vc(characters, file = file, root = root, sorting = "a", diff --git a/tests/testthat/test_e_upgrade.R b/tests/testthat/test_e_upgrade.R index 887bb4c..6df7bf1 100644 --- a/tests/testthat/test_e_upgrade.R +++ b/tests/testthat/test_e_upgrade.R @@ -58,22 +58,6 @@ test_that("upgrade_data() validates metadata", { unname(upgrade_data(file = file, root = root)), file ) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- NULL - junk_yaml[["test_Date"]] <- NULL - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_error( - upgrade_data(file = file, root = root), - "corrupt metadata: mismatching hash." - ) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- NULL - junk_yaml[["..generic"]][["hash"]] <- "zzz" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_error( - upgrade_data(file = file, root = root), - "corrupt metadata: mismatching hash." - ) junk_yaml[["..generic"]][["hash"]] <- NULL yaml::write_yaml(junk_yaml, file.path(root, junk[2])) expect_error( @@ -87,6 +71,18 @@ test_that("upgrade_data() validates metadata", { "is not a git2rdata object" ) expect_equivalent(file, junk) + + file <- basename(tempfile(tmpdir = root)) + junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date", + optimize = FALSE) + correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) + junk_yaml <- correct_yaml + junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.5" + yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_identical( + unname(upgrade_data(file = file, root = root)), + file + ) }) file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_e_validate_metadata.R b/tests/testthat/test_e_validate_metadata.R index c7017d7..748f702 100644 --- a/tests/testthat/test_e_validate_metadata.R +++ b/tests/testthat/test_e_validate_metadata.R @@ -35,7 +35,7 @@ test_that("read_vc() checks hash", { test_that("read_vc() handles changes in rawdata", { file <- basename(tempfile(tmpdir = root)) junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") - correct_data <- readLines(file.path(root, junk[1])) + correct_data <- readLines(file.path(root, junk[1]), encoding = "UTF-8") correct_header <- strsplit(correct_data[1], "\t")[[1]] junk_data <- correct_data junk_data[1] <- paste(correct_header[-1], collapse = "\t")