diff --git a/.Rbuildignore b/.Rbuildignore index b9532f6a..aa57a161 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,17 +1,22 @@ +# checklist +^_pkgdown.yml$ ^.*\.Rproj$ -^\.Rproj\.user$ -^\.github$ -^codemeta\.json$ ^.zenodo\.json$ -^man-roxygen$ -^pkgdown$ -^_pkgdown.yml$ -^docs$ -^cran-comments\.md$ -# checklist +^\.github$ +^\.httr-oauth$ +^\.Rproj\.user$ +^\.zenodo\.json$ ^checklist.yml$ +^CITATION\.cff$ ^codecov.yml$ -^LICENSE.md$ -^\.httr-oauth$ +^codecov\.yml$ +^codemeta\.json$ +^cran-comments\.md$ +^data-raw$ ^doc$ +^docs$ +^LICENSE.md$ +^man-roxygen$ ^Meta$ +^pkgdown$ +^README\.Rmd$ diff --git a/.github/workflows/check_on_branch.yml b/.github/workflows/check_on_branch.yml index 7b4f0f02..ae975696 100644 --- a/.github/workflows/check_on_branch.yml +++ b/.github/workflows/check_on_branch.yml @@ -1,6 +1,7 @@ on: push: branches-ignore: + - main - master - ghpages @@ -10,8 +11,10 @@ jobs: check-package: runs-on: ubuntu-latest name: "check package" + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + ORCID_TOKEN: ${{ secrets.ORCID_TOKEN }} steps: - uses: inbo/actions/check_pkg@master with: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - ORCID_TOKEN: ${{ secrets.ORCID_TOKEN }} + token: ${{ secrets.PAT }} diff --git a/.github/workflows/check_on_different_r_os.yml b/.github/workflows/check_on_different_r_os.yml index aa2d2bea..31901aa8 100644 --- a/.github/workflows/check_on_different_r_os.yml +++ b/.github/workflows/check_on_different_r_os.yml @@ -1,12 +1,14 @@ on: push: branches: + - main - master pull_request: branches: + - main - master -name: R-CMD-check +name: R-CMD-check-OS jobs: R-CMD-check: @@ -21,10 +23,11 @@ jobs: - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} - - {os: ubuntu-16.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} + - {os: ubuntu-20.04, r: 'oldrel', rspm: 
"https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + _R_CHECK_SYSTEM_CLOCK_: false RSPM: ${{ matrix.config.rspm }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} ORCID_TOKEN: ${{ secrets.ORCID_TOKEN }} @@ -61,6 +64,8 @@ jobs: Rscript -e "remotes::install_github('r-hub/sysreqs')" sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") sudo -s eval "$sysreqs" + sudo apt-get install -y libcurl4-openssl-dev + - name: Install dependencies run: | remotes::install_deps(dependencies = TRUE) @@ -77,7 +82,7 @@ jobs: - name: Check env: _R_CHECK_CRAN_INCOMING_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "error", check_dir = "check") shell: Rscript {0} - name: Show testthat output @@ -91,3 +96,4 @@ jobs: with: name: ${{ runner.os }}-r${{ matrix.config.r }}-results path: check + retention-days: 5 diff --git a/.github/workflows/check_on_main.yml b/.github/workflows/check_on_main.yml new file mode 100644 index 00000000..7a183419 --- /dev/null +++ b/.github/workflows/check_on_main.yml @@ -0,0 +1,21 @@ +on: + push: + branches: + - main + - master + schedule: + - cron: '6 0 15 * *' + +name: "check package on main" + +jobs: + check-package: + runs-on: ubuntu-latest + name: "check package" + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + ORCID_TOKEN: ${{ secrets.ORCID_TOKEN }} + steps: + - uses: inbo/actions/check_pkg@master + with: + token: ${{ secrets.PAT }} diff --git a/.github/workflows/check_on_master.yml b/.github/workflows/check_on_master.yml deleted file mode 100644 index 9d1cb838..00000000 --- a/.github/workflows/check_on_master.yml +++ /dev/null @@ -1,19 +0,0 @@ -on: - push: - branches: - - master - schedule: - - cron: '6 0 * * 1' - -name: "check package on master" - -jobs: - check-package: - runs-on: ubuntu-latest - name: "check package" - steps: - - uses: 
inbo/actions/check_pkg@master - with: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - ORCID_TOKEN: ${{ secrets.ORCID_TOKEN }} - token: ${{ secrets.pat }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..c44a7601 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,30 @@ +on: + push: + tags: + - 'v*' + +name: Create Release + +jobs: + build: + name: Create Release + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Get tag message + run: | + TAG_BODY=$(git tag --contains ${{ github.sha }} -n100 | awk '(NR>1)') + echo "::set-output name=TAG_BODY::$TAG_BODY" + id: tag-body + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Release ${{ github.ref }} + body: ${{ steps.tag-body.outputs.TAG_BODY }} + draft: false + prerelease: false diff --git a/.gitignore b/.gitignore index cde4424a..fa45244d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,10 @@ -.Rproj.user -.Rhistory +.httr-oauth .RData +.Rhistory +.Rproj.user .Ruserdata -inst/doc -docs -.httr-oauth +*.html doc +docs +inst/doc Meta diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 00000000..0dc7496d --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,47 @@ +{ + "title": "git2rdata: Store and Retrieve Data.frames in a Git Repository", + "version": "0.4.0", + "description": "The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. 
Details on the implementation are available in vignette(\"plain_text\", package = \"git2rdata\"). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(\"version_control\", package = \"git2rdata\"). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(\"workflow\", package = \"git2rdata\") gives a toy example. 4) vignette(\"efficiency\", package = \"git2rdata\") provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.", + "creators": [ + { + "name": "Onkelinx, Thierry", + "orcid": "https://orcid.org/0000-0001-8804-4216" + } + ], + "upload_type": "software", + "access_right": "open", + "license": "GPL-3.0", + "communities": [ + { + "identifier": "inbo" + } + ], + "contributors": [ + { + "name": "Vanderhaeghe, Floris", + "type": "ProjectMember", + "orcid": "https://orcid.org/0000-0002-6378-6229" + }, + { + "name": "Desmet, Peter", + "type": "ProjectMember", + "orcid": "https://orcid.org/0000-0002-8442-8025" + }, + { + "name": "Lommelen, Els", + "type": "ProjectMember", + "orcid": "https://orcid.org/0000-0002-3481-5684" + }, + { + "name": "Research Institute for Nature and Forest", + "type": "RightsHolder" + }, + { + "name": "Onkelinx, Thierry", + "type": "ContactPerson", + "orcid": "https://orcid.org/0000-0001-8804-4216" + } + ], + "language": "eng", + "keywords": ["R package", "reproducible research", "version control"] +} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..8256b9a0 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,40 @@ +cff-version: 1.2.0 +message: If you use this software, please cite it as below. 
+authors: +- family-names: Onkelinx + given-names: Thierry + orcid: https://orcid.org/0000-0001-8804-4216 +contact: +- email: thierry.onkelinx@inbo.be + family-names: Onkelinx + given-names: Thierry +- email: info@inbo.be + name: Research Institute for Nature and Forest +title: 'git2rdata: Store and Retrieve Data.frames in a Git Repository' +version: 0.4.0 +abstract: The git2rdata package is an R package for writing and reading dataframes + as plain text files. A metadata file stores important information. 1) Storing metadata + allows to maintain the classes of variables. By default, git2rdata optimizes the + data for file storage. The optimization is most effective on data containing factors. + The optimization makes the data less human readable. The user can turn this off + when they prefer a human readable format over smaller files. Details on the implementation + are available in vignette("plain_text", package = "git2rdata"). 2) Storing metadata + also allows smaller row based diffs between two consecutive commits. This is a useful + feature when storing data as plain text files under version control. Details on + this part of the implementation are available in vignette("version_control", package + = "git2rdata"). Although we envisioned git2rdata with a git workflow in mind, you + can use it in combination with other version control systems like subversion or + mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. + vignette("workflow", package = "git2rdata") gives a toy example. 4) vignette("efficiency", + package = "git2rdata") provides some insight into the efficiency of file storage, + git repository size and speed for writing and reading. 
+license: GPL-3.0 +type: software +repository-code: https://github.com/ropensci/git2rdata/ +identifiers: +- type: url + value: https://ropensci.github.io/git2rdata/ +keywords: +- R package +- reproducible research +- version control diff --git a/DESCRIPTION b/DESCRIPTION index e53e9b1d..a2fbc82b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,30 +1,17 @@ Package: git2rdata Title: Store and Retrieve Data.frames in a Git Repository -Version: 0.3.1 -Authors@R: - c(person(given = "Thierry", - family = "Onkelinx", - role = c("aut", "cre"), - email = "thierry.onkelinx@inbo.be", - comment = c(ORCID = "0000-0001-8804-4216")), - person(given = "Floris", - family = "Vanderhaeghe", - role = "ctb", - email = "floris.vanderhaeghe@inbo.be", - comment = c(ORCID = "0000-0002-6378-6229")), - person(given = "Peter", - family = "Desmet", - role = "ctb", - email = "peter.desmet@inbo.be", - comment = c(ORCID = "0000-0002-8442-8025")), - person(given = "Els", - family = "Lommelen", - role = "ctb", - email = "els.lommelen@inbo.be", - comment = c(ORCID = "0000-0002-3481-5684")), - person(given = "Research Institute for Nature and Forest", - role = c("cph", "fnd"), - email = "info@inbo.be")) +Version: 0.4.0 +Authors@R: c( + person("Thierry", "Onkelinx", , "thierry.onkelinx@inbo.be", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-8804-4216")), + person("Floris", "Vanderhaeghe", , "floris.vanderhaeghe@inbo.be", role = "ctb", + comment = c(ORCID = "0000-0002-6378-6229")), + person("Peter", "Desmet", , "peter.desmet@inbo.be", role = "ctb", + comment = c(ORCID = "0000-0002-8442-8025")), + person("Els", "Lommelen", , "els.lommelen@inbo.be", role = "ctb", + comment = c(ORCID = "0000-0002-3481-5684")), + person("Research Institute for Nature and Forest", , , "info@inbo.be", role = c("cph", "fnd")) + ) Description: The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 
1) Storing metadata allows to maintain the classes of @@ -44,10 +31,10 @@ Description: The git2rdata package is an R package for writing and reading traceable workflow. vignette("workflow", package = "git2rdata") gives a toy example. 4) vignette("efficiency", package = "git2rdata") provides some insight into the efficiency of file storage, git - repository size and speed for writing and reading. Please cite using - . + repository size and speed for writing and reading. License: GPL-3 -URL: https://ropensci.github.io/git2rdata/ +URL: https://ropensci.github.io/git2rdata/, + https://github.com/ropensci/git2rdata/ BugReports: https://github.com/ropensci/git2rdata/issues Depends: R (>= 3.5.0) @@ -66,10 +53,9 @@ Suggests: VignetteBuilder: knitr Encoding: UTF-8 -Language: en-GB -LazyData: true +Language: eng Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Collate: 'clean_data_path.R' 'datahash.R' @@ -87,3 +73,4 @@ Collate: 'rename_variable.R' 'upgrade_data.R' 'utils.R' + 'verify_vc.R' diff --git a/NAMESPACE b/NAMESPACE index df4c980a..9e7634f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ export(repository) export(rm_data) export(status) export(upgrade_data) +export(verify_vc) export(write_vc) importFrom(assertthat,"on_failure<-") importFrom(assertthat,assert_that) @@ -81,6 +82,7 @@ importFrom(git2r,workdir) importFrom(methods,setOldClass) importFrom(stats,setNames) importFrom(utils,file_test) +importFrom(utils,flush.console) importFrom(utils,packageVersion) importFrom(utils,read.table) importFrom(utils,write.table) diff --git a/NEWS.md b/NEWS.md index 1d703d04..1b49361a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,20 @@ +# git2rdata 0.4.0 + +## New features + +* `write_vc()` stores non optimised files as comma separated values rather than + tab separated values. + The general public seems to better recognise `.csv` files than `.tsv` files + as being data files. 
+* Add a new function `verify_vc()` which reads a `git2rdata` object and verifies + the presence of a set of variables. + It returns the data upon success. + +## Internal changes + +* Upgrade to Roxygen2 7.1.2 +* Add `inst/CITATION`, `CITATION.cff`, `.zenodo.json` + # git2rdata 0.3.1 * Use `icuSetCollate()` to define a standardised sorting. diff --git a/R/clean_data_path.R b/R/clean_data_path.R index 2f83d6e0..2a5ea67b 100644 --- a/R/clean_data_path.R +++ b/R/clean_data_path.R @@ -11,9 +11,9 @@ clean_data_path <- function(root, file, normalize = TRUE) { assert_that(is.flag(normalize), noNA(normalize)) dir_name <- dirname(file) - if (length(grep("\\.\\.", dir_name))) { - stop("file should not contain '..'") - } + assert_that( + length(grep("\\.\\.", dir_name)) == 0, msg = "file should not contain '..'" + ) file <- gsub("\\..*$", "", basename(file)) if (dir_name == ".") { diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R index 15f9f23b..0c48b86c 100644 --- a/R/is_git2rdata.R +++ b/R/is_git2rdata.R @@ -9,8 +9,9 @@ #' @export #' @family internal #' @template example_isgit2r -is_git2rdata <- function(file, root = ".", - message = c("none", "warning", "error")) { +is_git2rdata <- function( + file, root = ".", message = c("none", "warning", "error") +) { UseMethod("is_git2rdata", root) } @@ -23,8 +24,9 @@ is_git2rdata.default <- function(file, root, message) { #' @importFrom assertthat assert_that is.string #' @importFrom yaml read_yaml as.yaml #' @importFrom utils packageVersion -is_git2rdata.character <- function(file, root = ".", - message = c("none", "warning", "error")) { +is_git2rdata.character <- function( + file, root = ".", message = c("none", "warning", "error") +) { assert_that(is.string(file), is.string(root)) message <- match.arg(message) root <- normalizePath(root, winslash = "/", mustWork = TRUE) @@ -34,6 +36,13 @@ is_git2rdata.character <- function(file, root = ".", } file <- clean_data_path(root = root, file = file) + # read the metadata + meta_data <- 
read_yaml(file["meta_file"]) + file["raw_file"] <- ifelse( + meta_data[["..generic"]][["optimize"]], + file["raw_file"], + gsub("\\.tsv$", ".csv", file["raw_file"]) + ) if (!file.exists(file["raw_file"])) { msg <- "Data file missing." switch(message, error = stop(msg, call. = FALSE), @@ -41,8 +50,6 @@ is_git2rdata.character <- function(file, root = ".", return(FALSE) } - # read the metadata - meta_data <- read_yaml(file["meta_file"]) if (has_name(meta_data[["..generic"]], "split_by")) { header <- readLines( file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8" @@ -79,7 +86,10 @@ is_git2rdata.character <- function(file, root = ".", } } else { correct <- names(meta_data) - correct <- paste(correct[correct != "..generic"], collapse = "\t") + correct <- paste( + correct[correct != "..generic"], + collapse = ifelse(meta_data[["..generic"]][["optimize"]], "\t", ",") + ) header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8") if (correct != header) { msg <- paste("Corrupt data, incorrect header. Expecting:", correct) diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R index f4ed0b27..1261966e 100644 --- a/R/is_git2rmeta.R +++ b/R/is_git2rmeta.R @@ -30,68 +30,73 @@ is_git2rmeta.default <- function(file, root, #' @importFrom assertthat assert_that is.string #' @importFrom yaml read_yaml #' @importFrom utils packageVersion -is_git2rmeta.character <- function(file, root = ".", - message = c("none", "warning", "error")) { +is_git2rmeta.character <- function( + file, root = ".", message = c("none", "warning", "error") +) { assert_that(is.string(file), is.string(root)) message <- match.arg(message) root <- normalizePath(root, winslash = "/", mustWork = TRUE) file <- clean_data_path(root = root, file = file) - if (!file.exists(file["meta_file"])) { - msg <- ifelse( + check <- error_warning( + file.exists(file["meta_file"]), + msg = ifelse( file.exists(file["raw_file"]), "Metadata file missing.", "`git2rdata` object not found." 
- ) - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) + ), + message = message + ) + if (!check) { + return(check) } # read the metadata meta_data <- read_yaml(file["meta_file"]) - if (!has_name(meta_data, "..generic")) { - msg <- "No '..generic' element." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } - if (!has_name(meta_data[["..generic"]], "hash")) { - msg <- "Corrupt metadata, no hash found." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } - if (!has_name(meta_data[["..generic"]], "git2rdata")) { - msg <- "Data stored using an older version of `git2rdata`. -See `?upgrade_data()`." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } - if (package_version(meta_data[["..generic"]][["git2rdata"]]) < - package_version("0.1.0.9001")) { - msg <- "Data stored using an older version of `git2rdata`. -See `?upgrade_data()`." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } - if (!has_name(meta_data[["..generic"]], "data_hash")) { - msg <- "Corrupt metadata, no data hash found." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } + check <- error_warning( + has_name(meta_data, "..generic"), + msg = "No '..generic' element.", + message = message, previous = check + ) + + check <- error_warning( + has_name(meta_data[["..generic"]], "hash"), + msg = "Corrupt metadata, no hash found.", + message = message, previous = check + ) + + check <- error_warning( + has_name(meta_data[["..generic"]], "git2rdata"), + msg = "Data stored using an older version of `git2rdata`. 
+See `?upgrade_data()`.", + message = message, previous = check + ) + + used_version <- package_version(meta_data[["..generic"]][["git2rdata"]]) + check <- error_warning( + used_version >= package_version("0.4.0") || ( + used_version >= package_version("0.2.0") && + meta_data[["..generic"]][["optimize"]] + ), + msg = "Data stored using an older version of `git2rdata`. +See `?upgrade_data()`.", + message = message, previous = check + ) + + check <- error_warning( + has_name(meta_data[["..generic"]], "data_hash"), + msg = "Corrupt metadata, no data hash found.", + message = message, previous = check + ) + current_hash <- meta_data[["..generic"]][["hash"]] - if (current_hash != metadata_hash(meta_data)) { - msg <- "Corrupt metadata, mismatching hash." - switch(message, error = stop(msg, call. = FALSE), - warning = warning(msg, call. = FALSE)) - return(FALSE) - } + check <- error_warning( + current_hash == metadata_hash(meta_data), + msg = "Corrupt metadata, mismatching hash.", + message = message, previous = check + ) - return(TRUE) + return(check) } #' @export @@ -110,3 +115,19 @@ metadata_hash <- function(meta_data) { meta_data[["..generic"]][["data_hash"]] <- NULL hash(as.yaml(meta_data)) } + +error_warning <- function( + test, msg, message = c("none", "warning", "error"), previous = TRUE +) { + message <- match.arg(message) + if (!previous) { + return(FALSE) + } + if (!test) { + switch( + message, error = stop(msg, call. = FALSE), + warning = warning(msg, call. 
= FALSE) + ) + } + return(test) +} diff --git a/R/list_data.R b/R/list_data.R index 5dab7ae2..2e3a6793 100644 --- a/R/list_data.R +++ b/R/list_data.R @@ -28,13 +28,19 @@ list_data.character <- function(root = ".", path = ".", recursive = TRUE) { root <- normalizePath(root, winslash = "/", mustWork = TRUE) path <- normalizePath(file.path(root, path), winslash = "/", mustWork = TRUE) - data_files <- list.files(path, pattern = "\\.tsv$", recursive = recursive, + tsv_files <- list.files(path, pattern = "\\.tsv$", recursive = recursive, + full.names = TRUE) + csv_files <- list.files(path, pattern = "\\.csv$", recursive = recursive, full.names = TRUE) meta_files <- list.files(path, pattern = "\\.yml$", recursive = recursive, full.names = TRUE) - data_files <- gsub("\\.tsv$", "", data_files) + tsv_files <- gsub("\\.tsv$", "", tsv_files) + csv_files <- gsub("\\.csv$", "", csv_files) meta_files <- gsub("\\.yml$", "", meta_files) - meta_files <- meta_files[meta_files %in% data_files] + meta_files <- meta_files[meta_files %in% c(tsv_files, csv_files)] + if (length(meta_files) == 0) { + return(character(0)) + } meta_files_base <- remove_root(file = meta_files, root = root) check <- vapply(X = meta_files_base, FUN = is_git2rmeta, FUN.VALUE = NA, root = root, message = "none") @@ -43,8 +49,17 @@ list_data.character <- function(root = ".", path = ".", recursive = TRUE) { paste(meta_files_base[!check], collapse = "\n"), call. 
= FALSE) } meta_files <- meta_files[check] - data_files <- data_files[data_files %in% meta_files] - remove_root(file = data_files, root = root) + optimize <- vapply( + sprintf("%s.yml", meta_files), FUN.VALUE = logical(1), + FUN = function(x) { + read_yaml(x)[["..generic"]][["optimize"]] + } + ) + tsv_files <- sprintf("%s.tsv", tsv_files[tsv_files %in% meta_files[optimize]]) + csv_files <- sprintf( + "%s.csv", csv_files[csv_files %in% meta_files[!optimize]] + ) + remove_root(file = sort(c(tsv_files, csv_files)), root = root) } #' @export diff --git a/R/meta.R b/R/meta.R index b213e1c6..72993f63 100644 --- a/R/meta.R +++ b/R/meta.R @@ -36,15 +36,16 @@ meta <- function(x, ...) { #' @export #' @rdname meta #' @importFrom assertthat assert_that is.string noNA -meta.character <- function(x, na = "NA", ...) { +meta.character <- function(x, na = "NA", optimize = TRUE, ...) { assert_that(is.string(na), noNA(na), no_whitespace(na)) + assert_that(is.flag(optimize), noNA(optimize)) x <- enc2utf8(x) if (na %in% x) { stop("one of the strings matches the NA string ('", na, "') Please use a different NA string or consider using a factor.", call. 
= FALSE) } x <- gsub("\\\"", "\\\"\\\"", x) - to_escape <- grepl("(\"|\t|\n)", x) + to_escape <- grepl(ifelse(optimize, "(\"|\t|\n)", "(\"|,|\n)"), x) x[to_escape] <- paste0("\"", x[to_escape], "\"") x[is.na(x)] <- na m <- list(class = "character", na_string = na) diff --git a/R/prune.R b/R/prune.R index f99d2723..eeadecbf 100644 --- a/R/prune.R +++ b/R/prune.R @@ -42,9 +42,9 @@ rm_data.character <- function( if (length(to_do) == 0) { return(to_do) } - file.remove(sprintf("%s/%s.tsv", root, to_do)) + file.remove(file.path(root, to_do)) - return(invisible(paste0(to_do, ".tsv"))) + return(invisible(to_do)) } #' @export @@ -69,7 +69,6 @@ rm_data.git_repository <- function( if (length(to_do) == 0) { return(to_do) } - to_do <- paste0(to_do, ".tsv") keep <- unlist(switch(type, unmodified = status( @@ -148,6 +147,10 @@ prune_meta.character <- function( full.names = TRUE) keep <- gsub("\\.tsv$", ".yml", keep) to_do <- to_do[!to_do %in% keep] + keep <- list.files(path = path, pattern = "\\.csv$", recursive = recursive, + full.names = TRUE) + keep <- gsub("\\.csv$", ".yml", keep) + to_do <- to_do[!to_do %in% keep] to_do_base <- remove_root(file = to_do, root = root) check <- vapply(X = gsub(".yml$", "", to_do_base), FUN = is_git2rmeta, FUN.VALUE = NA, root = root, message = "none") @@ -183,18 +186,13 @@ prune_meta.git_repository <- function( assert_that(is.flag(stage)) to_do <- list.files( - path = path, - pattern = "\\.yml$", - recursive = recursive, - full.names = TRUE + path = path, pattern = "\\.yml$", recursive = recursive, full.names = TRUE ) keep <- list.files( - path = path, - pattern = "\\.tsv$", - recursive = recursive, + path = path, pattern = "\\.[ct]sv$", recursive = recursive, full.names = TRUE ) - keep <- gsub("\\.tsv$", ".yml", keep) + keep <- gsub("\\.[ct]sv$", ".yml", keep) to_do <- to_do[!to_do %in% keep] if (length(to_do) == 0) { return(invisible(NULL)) @@ -204,7 +202,9 @@ prune_meta.git_repository <- function( changed <- unlist(status( root, staged = 
FALSE, unstaged = TRUE, untracked = FALSE, ignored = FALSE )) - changed <- gsub("\\.tsv$", ".yml", file.path(root_wd, changed, fsep = "/")) + changed <- gsub( + "\\.[ct]sv$", ".yml", file.path(root_wd, changed, fsep = "/") + ) if (any(to_do %in% changed)) { stop( call. = FALSE, @@ -215,7 +215,9 @@ prune_meta.git_repository <- function( changed <- unlist(status( root, staged = TRUE, unstaged = FALSE, untracked = FALSE, ignored = FALSE )) - changed <- gsub("\\.tsv$", ".yml", file.path(root_wd, changed, fsep = "/")) + changed <- gsub( + "\\.[ct]sv$", ".yml", file.path(root_wd, changed, fsep = "/") + ) if (any(to_do %in% changed)) { warning("data removed and staged, metadata removed but unstaged", call. = FALSE) diff --git a/R/read_vc.R b/R/read_vc.R index 657a5d05..5cabdae3 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -42,14 +42,13 @@ read_vc.character <- function(file, root = ".") { stop(e$message, call. = FALSE) } ) - assert_that( - all(file.exists(file)), - msg = "raw file and/or meta file missing" - ) # read the metadata meta_data <- read_yaml(file["meta_file"]) optimize <- meta_data[["..generic"]][["optimize"]] + file["raw_file"] <- ifelse( + optimize, file["raw_file"], gsub("\\.tsv$", ".csv", file["raw_file"]) + ) col_type <- list( c( character = "character", factor = "character", integer = "integer", @@ -107,7 +106,8 @@ read_vc.character <- function(file, root = ".") { raw_data <- do.call(rbind, raw_data)[, col_names] } else { raw_data <- read.table( - file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", + file = file["raw_file"], header = TRUE, sep = ifelse(optimize, "\t", ","), + quote = "\"", dec = ".", numerals = "warn.loss", na.strings = na_string, colClasses = setNames(col_type[col_classes], col_names), comment.char = "", diff --git a/R/recent_commit.R b/R/recent_commit.R index 6cb06029..81a05b2b 100644 --- a/R/recent_commit.R +++ b/R/recent_commit.R @@ -68,12 +68,6 @@ #' # still points to the third commit as this is the latest commit in 
which the #' # data was present #' recent_commit("iris", repo, data = TRUE) -#' -#' #' clean up -#' junk <- file.remove( -#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, -#' include.dirs = TRUE, all.files = TRUE)), -#' repo_path) recent_commit <- function(file, root, data = FALSE) { UseMethod("recent_commit", root) } @@ -89,14 +83,13 @@ recent_commit.default <- function(file, root, data = FALSE) { recent_commit.git_repository <- function(file, root, data = FALSE) { assert_that(is.string(file), is.flag(data), noNA(data)) - path <- unique(dirname(file)) - if (path == ".") { - path <- "" - } + path <- ifelse(dirname(file) == ".", "", dirname(file)) if (data) { - file <- clean_data_path(root = workdir(root), file, normalize = FALSE) + bn <- gsub("\\..*$", "", basename(file)) + name <- paste(bn, c("tsv", "csv"), sep = ".") + } else { + name <- basename(file) } - name <- basename(file) blobs <- odb_blobs(root) blobs <- blobs[blobs$path == path & blobs$name %in% name, ] blobs <- blobs[blobs$when <= as.data.frame(last_commit(root))$when, ] diff --git a/R/relabel.R b/R/relabel.R index 2107c561..6cfc166f 100644 --- a/R/relabel.R +++ b/R/relabel.R @@ -61,12 +61,6 @@ #' read_vc("relabel", repo) #' # relabel() changed the metadata, not the raw data #' status(repo) -#' -#' # clean up -#' junk <- file.remove( -#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, -#' include.dirs = TRUE, all.files = TRUE)), -#' repo_path) #' @family storage relabel <- function(file, root = ".", change) { UseMethod("relabel", change) @@ -90,18 +84,12 @@ relabel.list <- function(file, root = ".", change) { assert_that(is.string(root), is.string(file)) assert_that(!is.null(names(change)), msg = "'change' has no names") root <- normalizePath(root, winslash = "/", mustWork = TRUE) - is_git2rmeta(file = file, root = root, message = "error") + is_git2rdata(file = file, root = root, message = "error") file <- clean_data_path(root = root, file = file) - assert_that( - 
all(file.exists(file)), - msg = "raw file and/or meta file missing" - ) meta_data <- read_yaml(file["meta_file"]) optimize <- meta_data[["..generic"]][["optimize"]] - if (!optimize) { - stop("relabelling factors on verbose data leads to large diffs. -Use write_vc() instead.", call. = FALSE) - } + stopifnot("relabelling factors on verbose data leads to large diffs. +Use write_vc() instead." = optimize) assert_that( all(names(change) %in% names(meta_data)), msg = "every name in 'change' must match an exisiting variable" diff --git a/R/rename_variable.R b/R/rename_variable.R index b4eb1211..d157249d 100644 --- a/R/rename_variable.R +++ b/R/rename_variable.R @@ -42,12 +42,6 @@ #' # check the changes #' read_vc("rename", repo) #' status(repo) -#' -#' # clean up -#' junk <- file.remove( -#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, -#' include.dirs = TRUE, all.files = TRUE)), -#' repo_path) #' @family storage rename_variable <- function(file, change, root = ".", ...) { UseMethod("rename_variable", root) @@ -69,6 +63,11 @@ rename_variable.character <- function(file, change, root = ".", ...) { is_git2rdata(file = file, root = root, message = "error") file <- clean_data_path(root = root, file = file) yaml <- read_yaml(file[["meta_file"]]) + file["raw_file"] <- ifelse( + yaml[["..generic"]][["optimize"]], + file["raw_file"], + gsub("\\.tsv$", ".csv", file["raw_file"]) + ) assert_that( all(change %in% names(yaml)), msg = "Not every old name in `change` present in the `git2rdata` object." diff --git a/R/upgrade_data.R b/R/upgrade_data.R index df8ec30a..f35e1abc 100644 --- a/R/upgrade_data.R +++ b/R/upgrade_data.R @@ -25,9 +25,6 @@ #' upgrade_data(file = "iris", root = root) #' # use path = "." 
to upgrade all git2rdata objects under root #' upgrade_data(path = ".", root = root) -#' -#' # clean up -#' junk <- file.remove(list.files(root, full.names = TRUE), root) upgrade_data <- function(file, root = ".", verbose, ..., path) { UseMethod("upgrade_data", root) } @@ -37,17 +34,15 @@ upgrade_data.default <- function(file, root, verbose, path, ...) { stop("a 'root' of class ", class(root), " is not supported", call. = FALSE) } -#' @importFrom assertthat assert_that is.string is.flag noNA +#' @importFrom assertthat assert_that is.string #' @importFrom yaml read_yaml write_yaml #' @importFrom utils packageVersion #' @export upgrade_data.character <- function( file, root = ".", verbose = TRUE, ..., path) { - assert_that(is.string(root), is.flag(verbose), noNA(verbose)) + assert_that(is.string(root)) root <- normalizePath(root, winslash = "/", mustWork = TRUE) if (missing(file)) { - assert_that(missing(file), - msg = "specify either 'file' or 'path'") assert_that(is.string(path)) full_path <- normalizePath(file.path(root, path), winslash = "/", mustWork = TRUE) @@ -64,7 +59,7 @@ upgrade_data.character <- function( target <- remove_root(file = file["meta_file"], root = root) target <- gsub(".yml", "", target) if (!has_name(meta_data, "..generic")) { - message(target, "is not a git2rdata object") + message(target, " is not a git2rdata object") return(target) } assert_that( @@ -72,14 +67,56 @@ upgrade_data.character <- function( msg = paste(target, "has corrupt metadata, no hash found.") ) if (has_name(meta_data[["..generic"]], "git2rdata")) { - if (package_version(meta_data[["..generic"]][["git2rdata"]]) >= - package_version("0.1.0.9001") - ) { - if (verbose) { - message(target, " already up to date") - } + current <- package_version(meta_data[["..generic"]][["git2rdata"]]) + if (current >= package_version("0.4.0")) { + display(verbose, c(target, " already up to date")) + return(target) + } + assert_that( + has_name(meta_data[["..generic"]], "optimize"), + msg = 
paste(target, "has corrupt metadata, optimize flag not found.") + ) + assert_that( + current >= package_version("0.2.0"), + msg = "Data stored with ancient version of git2rdata. +Please install version 0.3.1 and upgrade to that version first. +Then reinstall the current version and upgrade to this version. +Install version 0.3.1 with remotes::install_github('ropensci/git2rdata@v0.3.1')" + ) + if (meta_data[["..generic"]][["optimize"]]) { + display(verbose, c(target, " already up to date")) return(target) } + na_string <- meta_data[["..generic"]][["NA string"]] + details <- meta_data[names(meta_data) != "..generic"] + col_names <- names(details) + col_classes <- vapply(details, "[[", character(1), "class") + col_type <- c( + character = "character", factor = "character", integer = "integer", + numeric = "numeric", logical = "logical", Date = "Date", + POSIXct = "character", complex = "complex" + ) + old <- read.table( + file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"", + dec = ".", numerals = "warn.loss", na.strings = na_string, + colClasses = setNames(col_type[col_classes], col_names), + comment.char = "", + stringsAsFactors = FALSE, fileEncoding = "UTF-8" + ) + file.remove(file["raw_file"]) + file["raw_file"] <- gsub("\\.tsv$", ".csv", file["raw_file"]) + for (i in which(col_type[col_classes] == "character")) { + x <- gsub("\\\"", "\\\"\\\"", old[[i]]) + to_escape <- grepl("(\"|,|\n)", x) + x[to_escape] <- paste0("\"", x[to_escape], "\"") + x[is.na(x)] <- na_string + old[[i]] <- x + } + write.table( + x = old, file = file["raw_file"], + append = FALSE, quote = FALSE, sep = ",", eol = "\n", na = na_string, + dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" + ) meta_data[["..generic"]][["git2rdata"]] <- NULL meta_data[["..generic"]][["data_hash"]] <- NULL } @@ -92,9 +129,7 @@ upgrade_data.character <- function( meta_data[["..generic"]][["data_hash"]] <- datahash(file["raw_file"]) } write_yaml(meta_data, file["meta_file"], 
fileEncoding = "UTF-8") - if (verbose) { - message(file["meta_file"], " updated") - } + display(verbose, c(file["meta_file"], " updated")) return(target) } @@ -114,6 +149,10 @@ upgrade_data.git_repository <- function( if (!stage) { return(file) } - add(root, path = paste0(file, ".yml"), force = force) + + file <- gsub("^\\./", "", file) + add(root, path = sprintf("%s.csv", file), force = force) + add(root, path = sprintf("%s.tsv", file), force = force) + add(root, path = sprintf("%s.yml", file), force = force) return(file) } diff --git a/R/utils.R b/R/utils.R index b93e3a57..3c139c21 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,3 +5,17 @@ release_questions <- function() { # nocov start "Did you ran `gramr::check_project(exclude_chunks = TRUE)`" ) } # nocov end + +#' @noRd +#' @importFrom utils flush.console +#' @importFrom assertthat assert_that is.flag noNA +display <- function(verbose, message, linefeed = TRUE) { + assert_that(is.flag(verbose), noNA(verbose)) + assert_that(is.flag(linefeed), noNA(linefeed)) + + if (verbose) { + message(message, appendLF = linefeed) + flush.console() + } + return(invisible(NULL)) +} diff --git a/R/verify_vc.R b/R/verify_vc.R new file mode 100644 index 00000000..2c2c2d83 --- /dev/null +++ b/R/verify_vc.R @@ -0,0 +1,23 @@ +#' Read a file and verify the presence of variables +#' +#' Reads the file with [read_vc()]. +#' Then verifies that every variable listed in `variables` is present in the +#' data.frame. +#' @export +#' @inheritParams read_vc +#' @param variables a character vector with variable names. 
+#' @importFrom assertthat assert_that +#' @family storage +verify_vc <- function(file, root, variables) { + assert_that(is.character(variables), length(variables) > 0, noNA(variables)) + x <- read_vc(file = file, root = root) + ok <- variables %in% colnames(x) + assert_that( + all(ok), + msg = sprintf( + "variables missing from `%s`: %s", file, + paste(variables[!ok], collapse = ", ") + ) + ) + return(x) +} diff --git a/R/write_vc.R b/R/write_vc.R index 146e8007..c1bfd213 100644 --- a/R/write_vc.R +++ b/R/write_vc.R @@ -106,6 +106,11 @@ write_vc.character <- function( } } } + file["raw_file"] <- ifelse( + attr(raw_data, "meta")[["..generic"]][["optimize"]], + file["raw_file"], + gsub("\\.tsv$", ".csv", file["raw_file"]) + ) assert_that( unlink(file["raw_file"], recursive = TRUE) == 0, msg = "Failed to remove existing files." @@ -113,7 +118,10 @@ write_vc.character <- function( if (length(split_by) == 0) { write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, - sep = "\t", eol = "\n", na = na, dec = ".", row.names = FALSE, + sep = ifelse( + attr(raw_data, "meta")[["..generic"]][["optimize"]], "\t", "," + ), + eol = "\n", na = na, dec = ".", row.names = FALSE, col.names = TRUE, fileEncoding = "UTF-8" ) } else { diff --git a/README.md b/README.md index 9130cdd4..afbd0279 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,13 @@ [![CRAN status](https://www.r-pkg.org/badges/version/git2rdata)](https://cran.r-project.org/package=git2rdata) -[![Rdoc](https://www.rdocumentation.org/badges/version/git2rdata)](https://www.rdocumentation.org/packages/git2rdata) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -[![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) 
+[![lifecycle](https://img.shields.io/badge/lifecycle-stable-green.svg)](https://lifecycle.r-lib.org/articles/stages.html#stable) [![](https://badges.ropensci.org/263_status.svg)](https://github.com/ropensci/software-review/issues/263) [![Licence](https://img.shields.io/badge/licence-GPL--3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0.en.html) [![minimal R version](https://img.shields.io/badge/R%3E%3D-3.5.0-6666ff.svg)](https://cran.r-project.org/) [![DOI](https://zenodo.org/badge/147685405.svg)](https://zenodo.org/badge/latestdoi/147685405) -[![codecov](https://codecov.io/gh/ropensci/git2rdata/branch/master/graph/badge.svg)](https://codecov.io/gh/ropensci/git2rdata) +[![codecov](https://codecov.io/gh/ropensci/git2rdata/branch/master/graph/badge.svg)](https://app.codecov.io/gh/ropensci/git2rdata) ![GitHub forks](https://img.shields.io/github/forks/ropensci/git2rdata.svg?style=social) ![GitHub stars](https://img.shields.io/github/stars/ropensci/git2rdata.svg?style=social) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/ropensci/git2rdata.svg) diff --git a/_pkgdown.yml b/_pkgdown.yml index bd478a96..5f965002 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -22,11 +22,11 @@ navbar: - text: Contributing href: CONTRIBUTING.html right: - - icon: fa-github fa-lg + - icon: "fa fa-github" href: https://github.com/ropensci/git2rdata - - icon: fa-twitter fa-lg + - icon: "fa fa-twitter" href: https://twitter.com/INBOVlaanderen - - icon: fa-facebook fg-lg + - icon: "fa fa-facebook" href: https://www.facebook.com/pg/INBOVlaanderen reference: @@ -44,5 +44,5 @@ authors: Thierry Onkelinx: href: "https://www.muscardinus.be" Research Institute for Nature and Forest: - href: "https://www.inbo.be/en" + href: "https://www.vlaanderen.be/inbo/en-gb" html: "" diff --git a/checklist.yml b/checklist.yml index 2b728d93..ba0a85ae 100644 --- a/checklist.yml +++ b/checklist.yml @@ -3,3 +3,10 @@ package: yes allowed: warnings: [] notes: [] +citation_roles: +- aut 
+- cre +keywords: +- R package +- reproducible research +- version control diff --git a/cran-comments.md b/cran-comments.md index 692ec2bb..b0f7a179 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,38 +1,28 @@ ## Test environments + * local - * ubuntu 18.04.5 LTS, R 4.0.3 + * ubuntu 20.04.4 LTS, R 4.1.3 * github actions * macOS-latest, release * windows-latest, release * ubuntu 20.04, devel - * ubuntu 16.04, oldrel - * checklist package: ubuntu 20.04.1, R 4.0.3 + * ubuntu 20.04, oldrel + * checklist package: ubuntu 20.04.4 LTS, R 4.1.3 * r-hub - * Windows Server 2008 R2 SP1, R-devel, 32/64 bit - * Ubuntu Linux 16.04 LTS, R-release, GCC - * Fedora Linux, R-devel, clang, gfortran + * debian: clang-devel, gcc-devel, gcc-patched, gcc-release + * fedora: clang-devel, gcc-devel + * macos: highsierra-release-cran + * windows_x86_64: devel, oldrel, release ## R CMD check results 0 errors | 0 warnings | 0 note -r-hub gave a few false positive notes - -* Windows Server 2008 R2 SP1, R-devel, 32/64 bit - -``` -Possibly mis-spelled words in DESCRIPTION: - rdata (28:22, 31:33, 36:20, 40:48, 41:20, 43:24, 44:62, 45:62) - workflow (41:37, 44:15, 44:36) -``` - -* Fedora Linux, R-devel, clang, gfortran - -``` -Possibly mis-spelled words in DESCRIPTION: - rdata (28:22, 31:33, 36:20, 40:48, 41:20, 43:24, 44:62, 45:62) -``` +r-hub gave a false positive note -Ubuntu Linux 16.04 LTS, R-release, GCC failed on r-hub because ICU is not -available on that build. +Windows Server 2022, R-devel, 64 bit +checking for detritus in the temp directory ... NOTE +Found the following files/directories: + 'lastMiKTeXException' + diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 00000000..ba849f4f --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,13 @@ +citHeader("To cite `git2rdata` in publications please use:") +# begin checklist entry +citEntry( + entry = "Manual", + title = "git2rdata: Store and Retrieve Data.frames in a Git Repository. 
Version 0.4.0", + author = c(person(given = "Thierry", family = "Onkelinx")), + year = 2022, + url = "https://ropensci.github.io/git2rdata/", + abstract = "The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(\"plain_text\", package = \"git2rdata\"). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(\"version_control\", package = \"git2rdata\"). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(\"workflow\", package = \"git2rdata\") gives a toy example. 4) vignette(\"efficiency\", package = \"git2rdata\") provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.", + textVersion = "Onkelinx, Thierry (2022) git2rdata: Store and Retrieve Data.frames in a Git Repository. Version 0.4.0. 
https://ropensci.github.io/git2rdata/, https://github.com/ropensci/git2rdata/", + keywords = "R package, reproducible research, version control", +) +# end checklist entry diff --git a/inst/efficiency/file_timings.rds b/inst/efficiency/file_timings.rds index 48bb68a1..e4afcd8f 100644 Binary files a/inst/efficiency/file_timings.rds and b/inst/efficiency/file_timings.rds differ diff --git a/inst/efficiency/git_size.rds b/inst/efficiency/git_size.rds index 0c14df9d..57506227 100644 Binary files a/inst/efficiency/git_size.rds and b/inst/efficiency/git_size.rds differ diff --git a/inst/efficiency/read_timings.rds b/inst/efficiency/read_timings.rds index 632a38ea..a8c5c31d 100644 Binary files a/inst/efficiency/read_timings.rds and b/inst/efficiency/read_timings.rds differ diff --git a/inst/testthat/optimized_0_0_4.tsv b/inst/testthat/optimized_0_0_4.tsv new file mode 100644 index 00000000..b6e8a277 --- /dev/null +++ b/inst/testthat/optimized_0_0_4.tsv @@ -0,0 +1,48 @@ +a letters +" a" x +" +a" x +,a f +;a z +'b z +'b' w +'NA' t +"""b" p +"""b""" t +"""NA""" k +@ d +& y +# j +| r +$ p +€ a +a y +à l +"a " o +"a b" c +"a b c" g +"a +" w +"a +b" k +"a +b +c" g +a b x +a b c f +a, b +a,b e +a,b,c z +a; j +a;b w +a;b;c v +a' r +a'b t +a'b'c g +"a""" d +"a""b" h +"a""b""c" l +ç i +é m +µ v +NA w diff --git a/inst/testthat/optimized_0_0_4.yml b/inst/testthat/optimized_0_0_4.yml new file mode 100644 index 00000000..510b349e --- /dev/null +++ b/inst/testthat/optimized_0_0_4.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.0.4 + optimize: yes + NA string: NA + sorting: a + hash: b2ebf427feaafc5f4dea639777f199ffcf9f85b0 + data_hash: b6e8a277ab11f2ca500cbd3ee20427a94af4d594 +a: + class: character +letters: + class: character diff --git a/inst/testthat/optimized_0_3_1.tsv b/inst/testthat/optimized_0_3_1.tsv new file mode 100644 index 00000000..8a29fe63 --- /dev/null +++ b/inst/testthat/optimized_0_3_1.tsv @@ -0,0 +1,48 @@ +a letters +" a" x +" +a" m +,a t +;a w +'b f +'b' l +'NA' y 
+"""b" a +"""b""" z +"""NA""" y +@ z +& f +# c +| p +$ s +€ f +a j +à r +"a " b +"a b" l +"a b c" v +"a +" i +"a +b" n +"a +b +c" f +a b i +a b c o +a, c +a,b g +a,b,c y +a; p +a;b x +a;b;c l +a' u +a'b e +a'b'c t +"a""" c +"a""b" r +"a""b""c" v +ç w +é q +µ a +NA m diff --git a/inst/testthat/optimized_0_3_1.yml b/inst/testthat/optimized_0_3_1.yml new file mode 100644 index 00000000..f8129821 --- /dev/null +++ b/inst/testthat/optimized_0_3_1.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.3.1 + optimize: yes + NA string: NA + sorting: a + hash: b2ebf427feaafc5f4dea639777f199ffcf9f85b0 + data_hash: edfd19ec22a7172eb5f7201197bc56da7fade50f +a: + class: character +letters: + class: character diff --git a/inst/testthat/optimized_0_4_0.tsv b/inst/testthat/optimized_0_4_0.tsv new file mode 100644 index 00000000..8a29fe63 --- /dev/null +++ b/inst/testthat/optimized_0_4_0.tsv @@ -0,0 +1,48 @@ +a letters +" a" x +" +a" m +,a t +;a w +'b f +'b' l +'NA' y +"""b" a +"""b""" z +"""NA""" y +@ z +& f +# c +| p +$ s +€ f +a j +à r +"a " b +"a b" l +"a b c" v +"a +" i +"a +b" n +"a +b +c" f +a b i +a b c o +a, c +a,b g +a,b,c y +a; p +a;b x +a;b;c l +a' u +a'b e +a'b'c t +"a""" c +"a""b" r +"a""b""c" v +ç w +é q +µ a +NA m diff --git a/inst/testthat/optimized_0_4_0.yml b/inst/testthat/optimized_0_4_0.yml new file mode 100644 index 00000000..03bddc9f --- /dev/null +++ b/inst/testthat/optimized_0_4_0.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.4.0 + optimize: yes + NA string: NA + sorting: a + hash: b2ebf427feaafc5f4dea639777f199ffcf9f85b0 + data_hash: edfd19ec22a7172eb5f7201197bc56da7fade50f +a: + class: character +letters: + class: character diff --git a/inst/testthat/verbose_0_0_4.tsv b/inst/testthat/verbose_0_0_4.tsv new file mode 100644 index 00000000..b6e8a277 --- /dev/null +++ b/inst/testthat/verbose_0_0_4.tsv @@ -0,0 +1,48 @@ +a letters +" a" x +" +a" x +,a f +;a z +'b z +'b' w +'NA' t +"""b" p +"""b""" t +"""NA""" k +@ d +& y +# j +| r +$ p +€ a +a y +à l +"a " o +"a b" 
c +"a b c" g +"a +" w +"a +b" k +"a +b +c" g +a b x +a b c f +a, b +a,b e +a,b,c z +a; j +a;b w +a;b;c v +a' r +a'b t +a'b'c g +"a""" d +"a""b" h +"a""b""c" l +ç i +é m +µ v +NA w diff --git a/inst/testthat/verbose_0_0_4.yml b/inst/testthat/verbose_0_0_4.yml new file mode 100644 index 00000000..f49a3268 --- /dev/null +++ b/inst/testthat/verbose_0_0_4.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.0.4 + optimize: no + NA string: NA + sorting: a + hash: 3c02145886aac5a9eae5f1d2700a29f9a71b9829 + data_hash: b6e8a277ab11f2ca500cbd3ee20427a94af4d594 +a: + class: character +letters: + class: character diff --git a/inst/testthat/verbose_0_3_1.tsv b/inst/testthat/verbose_0_3_1.tsv new file mode 100644 index 00000000..8a29fe63 --- /dev/null +++ b/inst/testthat/verbose_0_3_1.tsv @@ -0,0 +1,48 @@ +a letters +" a" x +" +a" m +,a t +;a w +'b f +'b' l +'NA' y +"""b" a +"""b""" z +"""NA""" y +@ z +& f +# c +| p +$ s +€ f +a j +à r +"a " b +"a b" l +"a b c" v +"a +" i +"a +b" n +"a +b +c" f +a b i +a b c o +a, c +a,b g +a,b,c y +a; p +a;b x +a;b;c l +a' u +a'b e +a'b'c t +"a""" c +"a""b" r +"a""b""c" v +ç w +é q +µ a +NA m diff --git a/inst/testthat/verbose_0_3_1.yml b/inst/testthat/verbose_0_3_1.yml new file mode 100644 index 00000000..0a3b917c --- /dev/null +++ b/inst/testthat/verbose_0_3_1.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.3.1 + optimize: no + NA string: NA + sorting: a + hash: 3c02145886aac5a9eae5f1d2700a29f9a71b9829 + data_hash: edfd19ec22a7172eb5f7201197bc56da7fade50f +a: + class: character +letters: + class: character diff --git a/inst/testthat/verbose_0_4_0.csv b/inst/testthat/verbose_0_4_0.csv new file mode 100644 index 00000000..55b7f6f2 --- /dev/null +++ b/inst/testthat/verbose_0_4_0.csv @@ -0,0 +1,48 @@ +a,letters + a,x +" +a",m +",a",t +;a,w +'b,f +'b',l +'NA',y +"""b",a +"""b""",z +"""NA""",y +@,z +&,f +#,c +|,p +$,s +€,f +a,j +à,r +a ,b +a b,l +a b c,v +"a +",i +"a +b",n +"a +b +c",f +a b,i +a b c,o +"a,",c +"a,b",g +"a,b,c",y +a;,p +a;b,x +a;b;c,l 
+a',u +a'b,e +a'b'c,t +"a""",c +"a""b",r +"a""b""c",v +ç,w +é,q +µ,a +NA,m diff --git a/inst/testthat/verbose_0_4_0.yml b/inst/testthat/verbose_0_4_0.yml new file mode 100644 index 00000000..a4d4c3b4 --- /dev/null +++ b/inst/testthat/verbose_0_4_0.yml @@ -0,0 +1,11 @@ +..generic: + git2rdata: 0.4.0 + optimize: no + NA string: NA + sorting: a + hash: 3c02145886aac5a9eae5f1d2700a29f9a71b9829 + data_hash: f9abaee65ed6bbf187817a94a6305f4d1e577bf3 +a: + class: character +letters: + class: character diff --git a/man-roxygen/example_io.R b/man-roxygen/example_io.R index b5ae7496..a9adaed8 100644 --- a/man-roxygen/example_io.R +++ b/man-roxygen/example_io.R @@ -56,11 +56,3 @@ #' iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE #' ) #' status(repo) -#' -#' # clean up -#' junk <- file.remove( -#' list.files(root, full.names = TRUE, recursive = TRUE), root) -#' junk <- file.remove( -#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, -#' include.dirs = TRUE, all.files = TRUE)), -#' repo_path) diff --git a/man-roxygen/example_isgit2r.R b/man-roxygen/example_isgit2r.R index ac184be6..bd4e2e14 100644 --- a/man-roxygen/example_isgit2r.R +++ b/man-roxygen/example_isgit2r.R @@ -21,6 +21,3 @@ #' junk <- file.remove(file.path(root, "iris.tsv")) #' is_git2rmeta("iris", root) #' is_git2rdata("iris", root) -#' -#' # clean up -#' junk <- file.remove(list.files(root, full.names = TRUE), root) diff --git a/man-roxygen/example_prune.R b/man-roxygen/example_prune.R index 8046fc7e..e5eac14f 100644 --- a/man-roxygen/example_prune.R +++ b/man-roxygen/example_prune.R @@ -64,11 +64,3 @@ #' # check the removal #' list_data(repo) #' status(repo) -#' -#' # clean up -#' junk <- file.remove( -#' list.files(root, full.names = TRUE, recursive = TRUE), root) -#' junk <- file.remove( -#' rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, -#' include.dirs = TRUE, all.files = TRUE)), -#' repo_path) diff --git a/man/figures/background-pattern.png 
b/man/figures/background-pattern.png new file mode 100644 index 00000000..a2d9426f Binary files /dev/null and b/man/figures/background-pattern.png differ diff --git a/man/figures/flanders.woff b/man/figures/flanders.woff new file mode 100644 index 00000000..0e6a288e Binary files /dev/null and b/man/figures/flanders.woff differ diff --git a/man/figures/flanders.woff2 b/man/figures/flanders.woff2 new file mode 100644 index 00000000..a849aa31 Binary files /dev/null and b/man/figures/flanders.woff2 differ diff --git a/man/git2rdata-package.Rd b/man/git2rdata-package.Rd index 26720033..93e7ab2d 100644 --- a/man/git2rdata-package.Rd +++ b/man/git2rdata-package.Rd @@ -6,32 +6,13 @@ \alias{git2rdata-package} \title{git2rdata: Store and Retrieve Data.frames in a Git Repository} \description{ -The git2rdata package is an R package for writing and reading - dataframes as plain text files. A metadata file stores important - information. 1) Storing metadata allows to maintain the classes of - variables. By default, git2rdata optimizes the data for file storage. - The optimization is most effective on data containing factors. The - optimization makes the data less human readable. The user can turn - this off when they prefer a human readable format over smaller files. - Details on the implementation are available in vignette("plain_text", - package = "git2rdata"). 2) Storing metadata also allows smaller row - based diffs between two consecutive commits. This is a useful feature - when storing data as plain text files under version control. Details - on this part of the implementation are available in - vignette("version_control", package = "git2rdata"). Although we - envisioned git2rdata with a git workflow in mind, you can use it in - combination with other version control systems like subversion or - mercurial. 3) git2rdata is a useful tool in a reproducible and - traceable workflow. vignette("workflow", package = "git2rdata") gives - a toy example. 
4) vignette("efficiency", package = "git2rdata") - provides some insight into the efficiency of file storage, git - repository size and speed for writing and reading. Please cite using - . +The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette("plain_text", package = "git2rdata"). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette("version_control", package = "git2rdata"). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette("workflow", package = "git2rdata") gives a toy example. 4) vignette("efficiency", package = "git2rdata") provides some insight into the efficiency of file storage, git repository size and speed for writing and reading. 
} \seealso{ Useful links: \itemize{ \item \url{https://ropensci.github.io/git2rdata/} + \item \url{https://github.com/ropensci/git2rdata/} \item Report bugs at \url{https://github.com/ropensci/git2rdata/issues} } diff --git a/man/is_git2rdata.Rd b/man/is_git2rdata.Rd index 86bdf6b4..d0c18c38 100644 --- a/man/is_git2rdata.Rd +++ b/man/is_git2rdata.Rd @@ -50,9 +50,6 @@ write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") junk <- file.remove(file.path(root, "iris.tsv")) is_git2rmeta("iris", root) is_git2rdata("iris", root) - -# clean up -junk <- file.remove(list.files(root, full.names = TRUE), root) } \seealso{ Other internal: diff --git a/man/is_git2rmeta.Rd b/man/is_git2rmeta.Rd index e5e085cd..dfdd7eab 100644 --- a/man/is_git2rmeta.Rd +++ b/man/is_git2rmeta.Rd @@ -53,9 +53,6 @@ write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Length") junk <- file.remove(file.path(root, "iris.tsv")) is_git2rmeta("iris", root) is_git2rdata("iris", root) - -# clean up -junk <- file.remove(list.files(root, full.names = TRUE), root) } \seealso{ Other internal: diff --git a/man/list_data.Rd b/man/list_data.Rd index 435ddd38..a84b9d71 100644 --- a/man/list_data.Rd +++ b/man/list_data.Rd @@ -89,14 +89,6 @@ prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) - -# clean up -junk <- file.remove( - list.files(root, full.names = TRUE, recursive = TRUE), root) -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -105,6 +97,7 @@ Other storage: \code{\link{relabel}()}, \code{\link{rename_variable}()}, \code{\link{rm_data}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/meta.Rd b/man/meta.Rd index c7190b95..db797216 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -13,7 +13,7 @@ \usage{ meta(x, ...) -\method{meta}{character}(x, na = "NA", ...) 
+\method{meta}{character}(x, na = "NA", optimize = TRUE, ...) \method{meta}{factor}(x, optimize = TRUE, na = "NA", index, strict = TRUE, ...) diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd index 7d4a6dec..2026ec83 100644 --- a/man/prune_meta.Rd +++ b/man/prune_meta.Rd @@ -103,14 +103,6 @@ prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) - -# clean up -junk <- file.remove( - list.files(root, full.names = TRUE, recursive = TRUE), root) -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -119,6 +111,7 @@ Other storage: \code{\link{relabel}()}, \code{\link{rename_variable}()}, \code{\link{rm_data}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/read_vc.Rd b/man/read_vc.Rd index 69764519..03452bdf 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -86,14 +86,6 @@ write_vc( iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE ) status(repo) - -# clean up -junk <- file.remove( - list.files(root, full.names = TRUE, recursive = TRUE), root) -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -102,6 +94,7 @@ Other storage: \code{\link{relabel}()}, \code{\link{rename_variable}()}, \code{\link{rm_data}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index 2204c9c7..79fc8c70 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -82,12 +82,6 @@ list.files(repo_path) # still points to the third commit as this is the latest commit in which the # data was present recent_commit("iris", repo, data = TRUE) - -#' clean up -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - 
repo_path) } \seealso{ Other version_control: diff --git a/man/relabel.Rd b/man/relabel.Rd index 59146312..f37e0639 100644 --- a/man/relabel.Rd +++ b/man/relabel.Rd @@ -79,12 +79,6 @@ relabel("relabel", repo, change) read_vc("relabel", repo) # relabel() changed the metadata, not the raw data status(repo) - -# clean up -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -93,6 +87,7 @@ Other storage: \code{\link{read_vc}()}, \code{\link{rename_variable}()}, \code{\link{rm_data}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/rename_variable.Rd b/man/rename_variable.Rd index 4d720cd3..0c78e42c 100644 --- a/man/rename_variable.Rd +++ b/man/rename_variable.Rd @@ -77,12 +77,6 @@ rename_variable(file = "rename", change = change, root = repo) # check the changes read_vc("rename", repo) status(repo) - -# clean up -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -91,6 +85,7 @@ Other storage: \code{\link{read_vc}()}, \code{\link{relabel}()}, \code{\link{rm_data}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } \concept{storage} diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 31d4052e..863db48f 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -119,14 +119,6 @@ prune_meta(repo, path = ".") # check the removal list_data(repo) status(repo) - -# clean up -junk <- file.remove( - list.files(root, full.names = TRUE, recursive = TRUE), root) -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -135,6 +127,7 @@ Other storage: \code{\link{read_vc}()}, \code{\link{relabel}()}, \code{\link{rename_variable}()}, +\code{\link{verify_vc}()}, \code{\link{write_vc}()} } 
\concept{storage} diff --git a/man/upgrade_data.Rd b/man/upgrade_data.Rd index d8e54d44..8f90cb76 100644 --- a/man/upgrade_data.Rd +++ b/man/upgrade_data.Rd @@ -61,9 +61,6 @@ write_vc(iris[5:10, ], file = "subdir/iris", root = root, upgrade_data(file = "iris", root = root) # use path = "." to upgrade all git2rdata objects under root upgrade_data(path = ".", root = root) - -# clean up -junk <- file.remove(list.files(root, full.names = TRUE), root) } \seealso{ Other internal: diff --git a/man/verify_vc.Rd b/man/verify_vc.Rd new file mode 100644 index 00000000..022af439 --- /dev/null +++ b/man/verify_vc.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/verify_vc.R +\name{verify_vc} +\alias{verify_vc} +\title{Read a file and verify the presence of variables} +\usage{ +verify_vc(file, root, variables) +} +\arguments{ +\item{file}{the name of the git2rdata object. Git2rdata objects cannot +have dots in their name. The name may include a relative path. \code{file} is a +path relative to the \code{root}. +Note that \code{file} must point to a location within \code{root}.} + +\item{root}{The root of a project. Can be a file path or a \code{git-repository}. +Defaults to the current working directory (\code{"."}).} + +\item{variables}{a character vector with variable names.} +} +\description{ +Reads the file with \code{\link[=read_vc]{read_vc()}}. +Then verifies that every variable listed in \code{variables} is present in the +data.frame. 
+} +\seealso{ +Other storage: +\code{\link{list_data}()}, +\code{\link{prune_meta}()}, +\code{\link{read_vc}()}, +\code{\link{relabel}()}, +\code{\link{rename_variable}()}, +\code{\link{rm_data}()}, +\code{\link{write_vc}()} +} +\concept{storage} diff --git a/man/write_vc.Rd b/man/write_vc.Rd index ed92e315..095a738f 100644 --- a/man/write_vc.Rd +++ b/man/write_vc.Rd @@ -157,14 +157,6 @@ write_vc( iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE ) status(repo) - -# clean up -junk <- file.remove( - list.files(root, full.names = TRUE, recursive = TRUE), root) -junk <- file.remove( - rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, - include.dirs = TRUE, all.files = TRUE)), - repo_path) } \seealso{ Other storage: @@ -173,6 +165,7 @@ Other storage: \code{\link{read_vc}()}, \code{\link{relabel}()}, \code{\link{rename_variable}()}, -\code{\link{rm_data}()} +\code{\link{rm_data}()}, +\code{\link{verify_vc}()} } \concept{storage} diff --git a/pkgdown/extra.css b/pkgdown/extra.css index fb1e7e6b..00938dd1 100644 --- a/pkgdown/extra.css +++ b/pkgdown/extra.css @@ -1,7 +1,7 @@ body { - background-color: #efefef; + background-color: #f5f5f5; color: #5e5e5e; - background-image: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/img/inbo/background-pattern.png'); + background-image: url('reference/figures/background-pattern.png'); font-family: FlandersArtSans-Light, Verdana, Arial, sans-serif; } @@ -13,14 +13,13 @@ a { color: #c04384; } a:hover { - color: #c2c444; + color: #337ab7; } - .navbar, .label-default, .navbar-default .navbar-nav>.active>a, .navbar-default .navbar-nav>.active>a:hover, .navbar-default .navbar-nav>.active>a:focus { - background-color: #c04384; + background-color: #356196; } .navbar-default .navbar-link, @@ -41,33 +40,14 @@ a:hover { color: #5e5e5e; } -@font-face { - font-family: FlandersArtSans-Light; - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Light.eot'); - src: 
url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Light.eot?#iefix') format('embedded-opentype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Light.woff') format('woff'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Light.ttf') format('truetype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Light.svg#FlandersArtSans-Light') format('svg'); - font-weight: normal; - font-style: normal; -} -@font-face { - font-family: FlandersArtSans-Regular; - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Regular.eot'); - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Regular.eot?#iefix') format('embedded-opentype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Regular.woff') format('woff'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Regular.ttf') format('truetype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Regular.svg#FlandersArtSans-Regular') format('svg'); - font-weight: normal; - font-style: normal; -} -@font-face { - font-family: FlandersArtSans-Medium; - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Medium.eot'); - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Medium.eot?#iefix') format('embedded-opentype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Medium.woff') format('woff'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Medium.ttf') format('truetype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Medium.svg#FlandersArtSans-Medium') format('svg'); - font-weight: normal; - font-style: normal; -} -@font-face { - font-family: FlandersArtSans-Bold; - src: 
url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Bold.eot'); - src: url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Bold.eot?#iefix') format('embedded-opentype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Bold.woff') format('woff'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Bold.ttf') format('truetype'), url('https://www.inbo.be/sites/all/themes/bootstrap_inbo/fonts/FlandersArtSans-Bold.svg#FlandersArtSans-Bold') format('svg'); - font-weight: normal; - font-style: normal; +@font-face{ + font-family: inbo; + src: + url('figures/flanders.woff2') format('woff2'), + url('reference/figures/flanders.woff') format('woff'); +; + font-weight:normal; + font-style:normal; } code.sourceCode.diff span.st { @@ -80,3 +60,33 @@ code.sourceCode.diff span.va { background-color: #E4E517; font-weight: bold; } + +/*selection color*/ +::selection { + background: #c04384; + color: #fff; +} +::-moz-selection { + background: #c04384; + color: #fff; +} + +.navbar-default .navbar-nav>.open>a, .navbar-default .navbar-nav>.open>a:focus, .navbar-default .navbar-nav>.open>a:hover{ + color: #337ab7; + background: #fff; +} + +.navbar-default .navbar-nav>.active>a, .navbar-default .navbar-nav>.active>a:focus, .navbar-default .navbar-nav>.active>a:hover { + color: #337ab7; + background: #fff; +} + +.navbar-default .navbar-nav>li>a:hover{ + color: #337ab7; + background: #fff; +} + +.dropdown-menu>li>a:hover{ + color: #fff; + background: #337ab7; +} diff --git a/tests/testthat/test_a_basics.R b/tests/testthat/test_a_basics.R index 1df75e3f..7f17f049 100644 --- a/tests/testthat/test_a_basics.R +++ b/tests/testthat/test_a_basics.R @@ -1,217 +1,228 @@ -context("write_vc() and read_vc() on a file system") -expect_error(meta("NA"), "one of the strings matches the NA string") -expect_error(meta("NA", na = "abc def"), "na contains whitespace characters") 
-expect_error(meta("NA", na = "abc\tdef"), "na contains whitespace characters") -expect_error(meta("NA", na = "abc\ndef"), "na contains whitespace characters") -expect_error( - meta(factor("NA"), optimize = FALSE), - "one of the levels matches the NA string" -) -expect_error(write_vc(root = 1), "a 'root' of class numeric is not supported") -expect_error(read_vc(root = 1), "a 'root' of class numeric is not supported") -root <- tempfile(pattern = "git2rdata-basic") -dir.create(root) -expect_false(any(file.exists(git2rdata:::clean_data_path(root, "test")))) -expect_error( - git2rdata:::clean_data_path(root, "../wrong_location"), - "file should not contain '..'" -) -expect_error( - git2rdata:::clean_data_path(root, "./../wrong_location"), - "file should not contain '..'" -) -expect_is( - suppressWarnings( - output <- write_vc( - x = test_data, file = "test.txt", root = root, sorting = "test_Date" - ) - ), - "character" -) -expect_identical(length(output), 2L) -expect_identical(unname(output), c("test.tsv", "test.yml")) -expect_true(all(file.exists(git2rdata:::clean_data_path(root, "test")))) -expect_equal( - stored <- read_vc(file = "test.xls", root = root), - sorted_test_data, - check.attributes = FALSE -) -for (i in colnames(stored)) { +test_that("write_vc() and read_vc() on a file system", { + expect_error(meta("NA"), "one of the strings matches the NA string") + expect_error(meta("NA", na = "abc def"), "na contains whitespace characters") + expect_error(meta("NA", na = "abc\tdef"), "na contains whitespace characters") + expect_error(meta("NA", na = "abc\ndef"), "na contains whitespace characters") + expect_error( + meta(factor("NA"), optimize = FALSE), + "one of the levels matches the NA string" + ) + expect_error(write_vc(root = 1), "a 'root' of class numeric is not supported") + expect_error(read_vc(root = 1), "a 'root' of class numeric is not supported") + root <- tempfile(pattern = "git2rdata-basic") + dir.create(root) + 
expect_false(any(file.exists(git2rdata:::clean_data_path(root, "test")))) + expect_error( + git2rdata:::clean_data_path(root, file.path("..", "wrong_location")), + "file should not contain '..'" + ) + expect_error( + git2rdata:::clean_data_path(root, file.path(".", "..", "wrong_location")), + "file should not contain '..'" + ) + expect_is( + suppressWarnings( + output <- write_vc( + x = test_data, file = "test.txt", root = root, sorting = "test_Date" + ) + ), + "character" + ) + expect_identical(length(output), 2L) + expect_identical(unname(output), c("test.tsv", "test.yml")) + expect_true(all(file.exists(git2rdata:::clean_data_path(root, "test")))) expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), - expected.label = paste0("sorted_test_data$", i) - ) -} -expect_identical( - suppressWarnings(write_vc(x = test_data, file = "test.xls", root = root)), - output -) -expect_error( - write_vc(data.frame(junk = 5), file = "test", root = root, sorting = "junk"), - "The data was not overwritten because of the issues below." 
-) -expect_error( - suppressWarnings( - write_vc(x = test_data, file = "test", root = root, optimize = FALSE) - ), - "New data is verbose, whereas old data was optimized" -) -expect_warning( - write_vc(x = test_data, file = "test", root = root, optimize = FALSE, - strict = FALSE), - "New data is verbose, whereas old data was optimized" -) -expect_error( - write_vc( - x = test_data[, colnames(test_data) != "test_Date"], - file = "test", root = root - ), - "All sorting variables must be available" -) - -expect_false(any(file.exists(git2rdata:::clean_data_path(root, "a/verbose")))) -expect_is( - output <- + stored <- read_vc(file = "test.xls", root = root), + sorted_test_data, + check.attributes = FALSE + ) + for (i in colnames(stored)) { + expect_equal( + stored[[i]], + sorted_test_data[[i]], + label = paste0("stored$", i), + expected.label = paste0("sorted_test_data$", i) + ) + } + expect_identical( + suppressWarnings(write_vc(x = test_data, file = "test.xls", root = root)), + output + ) + expect_error( write_vc( - x = test_data, file = "a/verbose", root = root, sorting = "test_Date", - optimize = FALSE + data.frame(junk = 5), file = "test", root = root, sorting = "junk" ), - "character" -) -expect_true(all(file.exists(git2rdata:::clean_data_path(root, "a/verbose")))) -expect_equal( - stored <- read_vc(file = "a/verbose", root = root), - sorted_test_data, - check.attributes = FALSE -) -for (i in colnames(stored)) { + "The data was not overwritten because of the issues below." 
+ ) + expect_error( + suppressWarnings( + write_vc(x = test_data, file = "test", root = root, optimize = FALSE) + ), + "New data is verbose, whereas old data was optimized" + ) + expect_warning( + write_vc(x = test_data, file = "test", root = root, optimize = FALSE, + strict = FALSE), + "New data is verbose, whereas old data was optimized" + ) + expect_error( + write_vc( + x = test_data[, colnames(test_data) != "test_Date"], + file = "test", root = root + ), + "All sorting variables must be available" + ) + + expect_false( + any( + file.exists(git2rdata:::clean_data_path(root, file.path("a", "verbose"))) + ) + ) + expect_is( + output <- write_vc( + x = test_data, file = file.path("a", "verbose"), root = root, + sorting = "test_Date", optimize = FALSE + ), + "character" + ) + expect_true( + all(file.exists(file.path(root, "a", c("verbose.csv", "verbose.yml")))) + ) expect_equal( - stored[[i]], - sorted_test_data[[i]], - label = paste0("stored$", i), - expected.label = paste0("sorted_test_data$", i) - ) -} -expect_error( - write_vc(x = test_data, file = "a/verbose", root = root), - "New data is optimized, whereas old data was verbose" -) + stored <- read_vc(file = file.path("a", "verbose"), root = root), + sorted_test_data, + check.attributes = FALSE + ) + for (i in colnames(stored)) { + expect_equal( + stored[[i]], + sorted_test_data[[i]], + label = paste0("stored$", i), + expected.label = paste0("sorted_test_data$", i) + ) + } + expect_error( + write_vc(x = test_data, file = file.path("a", "verbose"), root = root), + "New data is optimized, whereas old data was verbose" + ) -expect_is( - output <- write_vc( - test_na, file = "na", root = root, - sorting = c("test_Date", "test_integer", "test_numeric") - ), - "character" -) -expect_equal( - stored <- read_vc(file = "na", root = root), - sorted_test_na, - check.attributes = FALSE -) -for (i in colnames(stored)) { + expect_is( + output <- write_vc( + test_na, file = "na", root = root, + sorting = c("test_Date", 
"test_integer", "test_numeric") + ), + "character" + ) expect_equal( - stored[[i]], - sorted_test_na[[i]], - label = paste0("stored$", i), - expected.label = paste0("sorted_test_na$", i) + stored <- read_vc(file = "na", root = root), + sorted_test_na, + check.attributes = FALSE ) -} + for (i in colnames(stored)) { + expect_equal( + stored[[i]], + sorted_test_na[[i]], + label = paste0("stored$", i), + expected.label = paste0("sorted_test_na$", i) + ) + } -expect_error( - write_vc(test_data, file = "error", root = root, sorting = 1), - "sorting is not a character vector" -) -expect_error( - write_vc(test_data, file = "error", root = root, sorting = "junk"), - "All sorting variables must be available" -) -expect_false(any(file.exists(git2rdata:::clean_data_path(root, "sorting")))) -expect_warning( - write_vc(test_data, file = "error", root = root, sorting = character(0)), - "No sorting applied" -) -expect_warning( - output <- - write_vc(test_data, file = "sorting", root = root, sorting = "test_factor"), - "Sorting on 'test_factor' results in ties" -) -expect_is(output, "character") -expect_true(all(file.exists(git2rdata:::clean_data_path(root, "sorting")))) -expect_warning( - write_vc(test_data, file = "sorting", root = root, - sorting = c("test_factor", "test_Date"), strict = FALSE), - "The sorting variables changed" -) -expect_error( - suppressWarnings( - write_vc(test_data, file = "sorting", root = root, sorting = "test_factor") - ), - "The sorting variables changed" -) -test_changed <- test_data -test_changed$junk <- test_changed$test_character -expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), - "New data has a different number of variables" -) -test_changed$test_character <- NULL -expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), - "New variables: junk" -) -test_changed <- test_data -test_changed$test_character <- factor(test_changed$test_character) -expect_error( - 
suppressWarnings(write_vc(test_changed, file = "sorting", root = root - )), - "Change in class: 'test_character' from character to factor" -) -expect_error( - suppressWarnings( - write_vc(test_data, file = "sorting", root = root, sorting = "test_logical") - ), - "The sorting variables changed" -) -test_changed <- test_data -test_changed$test_ordered <- factor( - test_changed$test_ordered, - levels = levels(test_changed$test_ordered), - ordered = FALSE -) -expect_error( - suppressWarnings(write_vc(test_changed, file = "sorting", root = root - )), - "'test_ordered' changes from ordinal to nominal" -) + expect_error( + write_vc(test_data, file = "error", root = root, sorting = 1), + "sorting is not a character vector" + ) + expect_error( + write_vc(test_data, file = "error", root = root, sorting = "junk"), + "All sorting variables must be available" + ) + expect_false(any(file.exists(git2rdata:::clean_data_path(root, "sorting")))) + expect_warning( + write_vc(test_data, file = "error", root = root, sorting = character(0)), + "No sorting applied" + ) + expect_warning( + output <- write_vc( + test_data, file = "sorting", root = root, sorting = "test_factor" + ), + "Sorting on 'test_factor' results in ties" + ) + expect_is(output, "character") + expect_true(all(file.exists(git2rdata:::clean_data_path(root, "sorting")))) + expect_warning( + write_vc(test_data, file = "sorting", root = root, + sorting = c("test_factor", "test_Date"), strict = FALSE), + "The sorting variables changed" + ) + expect_error( + suppressWarnings( + write_vc( + test_data, file = "sorting", root = root, sorting = "test_factor" + ) + ), + "The sorting variables changed" + ) + test_changed <- test_data + test_changed$junk <- test_changed$test_character + expect_error( + suppressWarnings(write_vc(test_changed, file = "sorting", root = root)), + "New data has a different number of variables" + ) + test_changed$test_character <- NULL + expect_error( + suppressWarnings(write_vc(test_changed, file = 
"sorting", root = root)), + "New variables: junk" + ) + test_changed <- test_data + test_changed$test_character <- factor(test_changed$test_character) + expect_error( + suppressWarnings(write_vc(test_changed, file = "sorting", root = root + )), + "Change in class: 'test_character' from character to factor" + ) + expect_error( + suppressWarnings( + write_vc( + test_data, file = "sorting", root = root, sorting = "test_logical" + ) + ), + "The sorting variables changed" + ) + test_changed <- test_data + test_changed$test_ordered <- factor( + test_changed$test_ordered, + levels = levels(test_changed$test_ordered), + ordered = FALSE + ) + expect_error( + suppressWarnings(write_vc(test_changed, file = "sorting", root = root + )), + "'test_ordered' changes from ordinal to nominal" + ) -test_no <- test_data -test_no$test_ordered <- NULL -expect_is( - output <- write_vc( - x = test_no, file = "no_ordered", root = root, sorting = "test_Date" - ), - "character" -) -sorted_test_no <- sorted_test_data -sorted_test_no$test_ordered <- NULL -expect_equal( - stored <- read_vc(file = "no_ordered", root = root), - sorted_test_no, - check.attributes = FALSE -) -for (i in colnames(stored)) { + test_no <- test_data + test_no$test_ordered <- NULL + expect_is( + output <- write_vc( + x = test_no, file = "no_ordered", root = root, sorting = "test_Date" + ), + "character" + ) + sorted_test_no <- sorted_test_data + sorted_test_no$test_ordered <- NULL expect_equal( - stored[[i]], - sorted_test_no[[i]], - label = paste0("stored$", i), - expected.label = paste0("sorted_test_data$", i) + stored <- read_vc(file = "no_ordered", root = root), + sorted_test_no, + check.attributes = FALSE ) -} - -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) + for (i in colnames(stored)) { + expect_equal( + stored[[i]], + sorted_test_no[[i]], + label = paste0("stored$", i), + expected.label = paste0("sorted_test_data$", i) + ) + } +}) test_that( "meta() works on complex", { @@ -286,7 +297,6 @@ 
test_that("user specified na strings work", { grep("junk", readLines(file.path(root, fn[1]), encoding = "UTF-8")), 2:4 ) - file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("write_vc() allows changes in factor levels", { @@ -314,7 +324,6 @@ test_that("write_vc() allows changes in factor levels", { write_vc(x, "factor_levels", root), "New factor labels for 'test_factor'" ) - file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("meta attributes are printed as yaml", { diff --git a/tests/testthat/test_b_is_git2rmeta.R b/tests/testthat/test_b_is_git2rmeta.R index 99eaa4f7..3aa32f41 100644 --- a/tests/testthat/test_b_is_git2rmeta.R +++ b/tests/testthat/test_b_is_git2rmeta.R @@ -167,8 +167,3 @@ test_that("is_git2rmeta handle git repositories", { expect_true(is_git2rmeta(file = file, root = root)) expect_true(is_git2rdata(file = file, root = root)) }) - -file.remove(list.files(git2r::workdir(root), recursive = TRUE, - full.names = TRUE)) -file.remove(list.files(git2r::workdir(root), recursive = TRUE, - include.dirs = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_b_prune.R b/tests/testthat/test_b_prune.R index 27173b4e..4fd70fc0 100644 --- a/tests/testthat/test_b_prune.R +++ b/tests/testthat/test_b_prune.R @@ -1,74 +1,80 @@ -context("rm_data & prune_meta") +test_that("rm_data & prune_meta", { + expect_error(rm_data(root = 1), "a 'root' of class numeric is not supported") + expect_error( + prune_meta(root = 1), "a 'root' of class numeric is not supported" + ) + expect_error( + list_data(root = 1), "a 'root' of class numeric is not supported" + ) -expect_error(rm_data(root = 1), "a 'root' of class numeric is not supported") -expect_error(prune_meta(root = 1), "a 'root' of class numeric is not supported") -expect_error(list_data(root = 1), "a 'root' of class numeric is not supported") + root <- tempfile(pattern = "git2rdata-prune") + root <- normalizePath(root, winslash = "/", mustWork = FALSE) + 
expect_error(rm_data(root, "."), root) + expect_error(prune_meta(root), root) + dir.create(root) + expect_null(prune_meta(root, path = "junk")) + write_vc(test_data, file = "test", root = root, sorting = "test_Date") + write_vc( + test_data, file = file.path("a", "verbose"), root = root, + sorting = "test_Date", optimize = FALSE + ) -root <- tempfile(pattern = "git2rdata-prune") -root <- normalizePath(root, winslash = "/", mustWork = FALSE) -expect_error(rm_data(root, "."), root) -expect_error(prune_meta(root), root) -dir.create(root) -expect_null(prune_meta(root, path = "junk")) -write_vc(test_data, file = "test", root = root, sorting = "test_Date") -write_vc( - test_data, file = "a/verbose", root = root, sorting = "test_Date", - optimize = FALSE -) + current <- list.files(root, recursive = TRUE) + expect_identical( + rm_data(root = root, path = "a"), file.path("a", "verbose.csv") + ) + expect_identical( + list.files(root, recursive = TRUE), + current[-grep("^.*/.*\\.csv", current)] + ) -current <- list.files(root, recursive = TRUE) -expect_identical(rm_data(root = root, path = "a"), "a/verbose.tsv") -expect_identical( - list.files(root, recursive = TRUE), - current[-grep("^.*/.*\\.tsv", current)] -) + current <- list.files(root, recursive = TRUE) + expect_identical( + prune_meta(root = root, path = "."), file.path("a", "verbose.yml") + ) + expect_identical( + list.files(root, recursive = TRUE), + current[-grep("^.*/.*", current)] + ) -current <- list.files(root, recursive = TRUE) -expect_identical(prune_meta(root = root, path = "."), "a/verbose.yml") -expect_identical( - list.files(root, recursive = TRUE), - current[-grep("^.*/.*", current)] -) + file.remove(file.path(root, "test.yml")) + current <- list.files(root, recursive = TRUE) + expect_identical(rm_data(root, path = "."), character(0)) + expect_identical(list.files(root, recursive = TRUE), current) -file.remove(file.path(root, "test.yml")) -current <- list.files(root, recursive = TRUE) 
-expect_identical(rm_data(root, path = "."), character(0)) -expect_identical(list.files(root, recursive = TRUE), current) - -write_vc(test_data, file = "test1", root = root, sorting = "test_Date") -junk <- write_vc(test_data, file = "test2", root = root, sorting = "test_Date") -write_vc(test_data, file = "a/test2", root = root, sorting = "test_Date") -meta_data <- yaml::read_yaml(file.path(root, junk[2])) -meta_data[["..generic"]] <- NULL -yaml::write_yaml(meta_data, file = file.path(root, junk[2])) -yaml::write_yaml(meta_data, file = file.path(root, "a", junk[2])) -expect_warning( - list_data(root = root, path = ".", recursive = FALSE), - "Invalid metadata files found.*:\ntest2" -) -expect_warning( - list_data(root = root, path = ".", recursive = TRUE), - "Invalid metadata files found.*:\na/test2\ntest2" -) -current <- list.files(root, recursive = TRUE) -expect_warning( - rm_data(root = root, path = "."), - "Invalid metadata files found.*:\na/test2\ntest2" -) -expect_identical(current[current != "test1.tsv"], - list.files(root, recursive = TRUE)) -file.remove(file.path(root, "test2.tsv")) -current <- list.files(root, recursive = TRUE) -expect_warning( - prune_meta(root = root, path = "."), - "Invalid metadata files found.*:\ntest2" -) -expect_identical(current[current != "test1.yml"], - list.files(root, recursive = TRUE)) - -file.remove( - list.files(root, recursive = TRUE, full.names = TRUE) -) -file.remove( - list.files(root, recursive = TRUE, include.dirs = TRUE, full.names = TRUE) -) + write_vc(test_data, file = "test1", root = root, sorting = "test_Date") + junk <- write_vc( + test_data, file = "test2", root = root, sorting = "test_Date" + ) + write_vc( + test_data, file = file.path("a", "test2"), root = root, + sorting = "test_Date" + ) + meta_data <- yaml::read_yaml(file.path(root, junk[2])) + meta_data[["..generic"]] <- NULL + yaml::write_yaml(meta_data, file = file.path(root, junk[2])) + yaml::write_yaml(meta_data, file = file.path(root, "a", junk[2])) + 
expect_warning( + list_data(root = root, path = ".", recursive = FALSE), + "Invalid metadata files found.*:\ntest2" + ) + expect_warning( + list_data(root = root, path = ".", recursive = TRUE), + "Invalid metadata files found.*:\na/test2\ntest2" + ) + current <- list.files(root, recursive = TRUE) + expect_warning( + rm_data(root = root, path = "."), + "Invalid metadata files found.*:\na/test2\ntest2" + ) + expect_identical(current[current != "test1.tsv"], + list.files(root, recursive = TRUE)) + file.remove(file.path(root, "test2.tsv")) + current <- list.files(root, recursive = TRUE) + expect_warning( + prune_meta(root = root, path = "."), + "Invalid metadata files found.*:\ntest2" + ) + expect_identical(current[current != "test1.yml"], + list.files(root, recursive = TRUE)) +}) diff --git a/tests/testthat/test_b_special.R b/tests/testthat/test_b_special.R index 61d0608b..bb51cd62 100644 --- a/tests/testthat/test_b_special.R +++ b/tests/testthat/test_b_special.R @@ -1,81 +1,82 @@ -context("handle special characters") -root <- tempfile(pattern = "git2rdata-special") -dir.create(root) -ds <- data.frame( - a = c( - "a", "a b", - "a\tb", "a\tb\tc", "\ta", "a\t", - "a\nb", "a\nb\nc", "\na", "a\n", - "a\"b", "a\"b\"c", "\"b", "a\"", "\"b\"", - "a'b", "a'b'c", "'b", "a'", "'b'", - "a b c", "\"NA\"", "'NA'", NA, - "\U00E9", "&", "\U00E0", "\U00B5", "\U00E7", "€", "|", "#", "@", "$" - ), - stringsAsFactors = FALSE -) -expect_is( - output <- write_vc(ds, "character", root, sorting = "a"), - "character" -) -expect_equal( - names(output)[1], - "1d135a85dc9beff3223d6c79f0d8975b559afca7" -) -old_locale <- git2rdata:::set_c_locale() -dso <- ds[order(ds$a), , drop = FALSE] # nolint -git2rdata:::set_local_locale(old_locale) -expect_equal( - junk <- read_vc("character", root), dso, check.attributes = FALSE -) -expect_identical( - names(output), - names(attr(junk, "source")) -) -expect_is( - write_vc(ds, "character2", root, sorting = "a", optimize = FALSE), - "character" -) 
-expect_equal( - junk <- read_vc("character2", root), dso, check.attributes = FALSE -) -z <- rbind(ds, "NA") -z$a <- factor(z$a) -expect_is( - suppressWarnings(write_vc(z, "factor", root, sorting = "a")), - "character" -) -expect_equal( - read_vc("factor", root), - z[order(z$a), , drop = FALSE], # nolint - check.attributes = FALSE -) +test_that("handle special characters", { + root <- tempfile(pattern = "git2rdata-special") + dir.create(root) + ds <- data.frame( + a = c( + "a", "a b", + "a\tb", "a\tb\tc", "\ta", "a\t", + "a,b", "a,b,c", ",a", "a,", + "a;b", "a;b;c", ";a", "a;", + "a\nb", "a\nb\nc", "\na", "a\n", + "a\"b", "a\"b\"c", "\"b", "a\"", "\"b\"", + "a'b", "a'b'c", "'b", "a'", "'b'", + "a b c", "\"NA\"", "'NA'", NA, + "\U00E9", "&", "\U00E0", "\U00B5", "\U00E7", "€", "|", "#", "@", "$" + ), + stringsAsFactors = FALSE + ) + expect_is( + output <- write_vc(ds, "character", root, sorting = "a"), + "character" + ) + expect_equal( + names(output)[1], + "e8a6734d740941f347bbc21e3227b4a6392b6562" + ) + old_locale <- git2rdata:::set_c_locale() + dso <- ds[order(ds$a), , drop = FALSE] # nolint + git2rdata:::set_local_locale(old_locale) + expect_equal( + junk <- read_vc("character", root), dso, check.attributes = FALSE + ) + expect_identical( + names(output), + names(attr(junk, "source")) + ) + expect_is( + write_vc(ds, "character2", root, sorting = "a", optimize = FALSE), + "character" + ) + expect_equal( + junk <- read_vc("character2", root), dso, check.attributes = FALSE + ) + z <- rbind(ds, "NA") + z$a <- factor(z$a) + expect_is( + suppressWarnings(write_vc(z, "factor", root, sorting = "a")), + "character" + ) + expect_equal( + read_vc("factor", root), + z[order(z$a), , drop = FALSE], # nolint + check.attributes = FALSE + ) -old_locale <- git2rdata:::set_c_locale() -ds$a <- factor(ds$a) -git2rdata:::set_local_locale(old_locale) -expect_is( - output <- write_vc(ds, "factor2", root, sorting = "a", optimize = FALSE), - "character" -) -expect_equal( - junk <- 
read_vc("factor2", root), - ds[order(ds$a), , drop = FALSE], # nolint - check.attributes = FALSE -) -expect_equal( - names(output)[1], - "1d135a85dc9beff3223d6c79f0d8975b559afca7" -) -expect_identical( - names(output), - names(attr(junk, "source")) -) + old_locale <- git2rdata:::set_c_locale() + ds$a <- factor(ds$a) + git2rdata:::set_local_locale(old_locale) + expect_is( + output <- write_vc(ds, "factor2", root, sorting = "a", optimize = FALSE), + "character" + ) + expect_equal( + junk <- read_vc("factor2", root), + ds[order(ds$a), , drop = FALSE], # nolint + check.attributes = FALSE + ) + expect_equal( + names(output)[1], + "5fd788c095d847d8e1a8386f621ee11fc69cd9a5" + ) + expect_identical( + names(output), + names(attr(junk, "source")) + ) -yaml_file <- yaml::read_yaml(file.path(root, "factor2.yml")) -yaml_file[["..generic"]][["data_hash"]] <- "zzz" -yaml::write_yaml(yaml_file, file.path(root, "factor2.yml")) -expect_warning(read_vc("factor2", root = root), - "Mismatching data hash. Data altered outside of git2rdata.") - -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) + yaml_file <- yaml::read_yaml(file.path(root, "factor2.yml")) + yaml_file[["..generic"]][["data_hash"]] <- "zzz" + yaml::write_yaml(yaml_file, file.path(root, "factor2.yml")) + expect_warning(read_vc("factor2", root = root), + "Mismatching data hash. 
Data altered outside of git2rdata.") +}) diff --git a/tests/testthat/test_b_verify_vc.R b/tests/testthat/test_b_verify_vc.R new file mode 100644 index 00000000..41f35f6f --- /dev/null +++ b/tests/testthat/test_b_verify_vc.R @@ -0,0 +1,21 @@ +test_that("verify_vc", { + root <- tempfile(pattern = "git2rdata-verify-vc") + dir.create(root) + write_vc( + x = test_data, file = "test.txt", root = root, sorting = "test_integer" + ) + expect_s3_class( + verify_vc("test.txt", root = root, variables = "test_integer"), + "data.frame" + ) + expect_s3_class( + verify_vc( + "test.txt", root = root, variables = c("test_numeric", "test_logical") + ), + "data.frame" + ) + expect_error( + verify_vc("test.txt", root = root, variables = c("test_integer", "junk")), + "variables missing.*junk" + ) +}) diff --git a/tests/testthat/test_c_git.R b/tests/testthat/test_c_git.R index efa45d89..c4dd7518 100644 --- a/tests/testthat/test_c_git.R +++ b/tests/testthat/test_c_git.R @@ -1,4 +1,4 @@ -context("write_vc() and read_vc() on a git-repository") +test_that("write_vc() and read_vc() on a git-repository", { root <- tempfile(pattern = "git2rdata-git") dir.create(root) root <- git2r::init(root) @@ -83,8 +83,8 @@ for (i in colnames(stored)) { } forced <- write_vc( - test_data, file = "forced/force", root = root, sorting = "test_Date", - stage = TRUE, force = TRUE + test_data, file = file.path("forced", "force"), root = root, + sorting = "test_Date", stage = TRUE, force = TRUE ) expect_equal( status(root, ignored = TRUE), @@ -97,7 +97,7 @@ expect_equal( check.attributes = FALSE ) expect_equal( - stored <- read_vc(file = "forced/force", root = root), + stored <- read_vc(file = file.path("forced", "force"), root = root), sorted_test_data, check.attributes = FALSE ) @@ -173,11 +173,11 @@ staged <- write_vc( current <- list.files(git2r::workdir(root), recursive = TRUE) expect_identical( rm_data(root = root, path = "."), - "forced/force.tsv" + file.path("forced", "force.tsv") ) expect_identical( 
current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - "forced/force.tsv" + file.path("forced", "force.tsv") ) expect_error( prune_meta(root = root, path = ".", stage = TRUE), @@ -185,16 +185,16 @@ expect_error( ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - "forced/force.tsv" + file.path("forced", "force.tsv") ) expect_null(rm_data(root, path = ".")) expect_identical( prune_meta(root = root, path = ".", stage = FALSE), - "forced/force.yml" + file.path("forced", "force.yml") ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "forced/force.yml") + file.path("forced", c("force.tsv", "force.yml")) ) expect_null(prune_meta(root, path = ".")) git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") @@ -205,20 +205,22 @@ staged <- write_vc( ) expect_identical( rm_data(root = root, path = ".", type = "m"), - c("forced/force.tsv", "staged.tsv") + c(file.path("forced", "force.tsv"), "staged.tsv") ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "staged.tsv") + c(file.path("forced", "force.tsv"), "staged.tsv") ) expect_warning( removed <- prune_meta(root = root, path = ".", stage = FALSE), "data removed and staged, metadata removed but unstaged" ) -expect_identical(removed, c("forced/force.yml", "staged.yml")) +expect_identical(removed, c(file.path("forced", "force.yml"), "staged.yml")) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "forced/force.yml", "staged.tsv", "staged.yml") + c( + file.path("forced", c("force.tsv", "force.yml")), "staged.tsv", "staged.yml" + ) ) git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") @@ -228,20 +230,22 @@ staged <- write_vc( ) expect_identical( rm_data(root = root, path = ".", type = "i", stage = TRUE), - c("forced/force.tsv", 
"ignore.tsv", "staged.tsv") + c(file.path("forced", "force.tsv"), "ignore.tsv", "staged.tsv") ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "ignore.tsv", "staged.tsv") + c(file.path("forced", "force.tsv"), "ignore.tsv", "staged.tsv") ) expect_identical( prune_meta(root = root, path = ".", stage = TRUE), - c("forced/force.yml", "ignore.yml", "staged.yml") + c(file.path("forced", "force.yml"), "ignore.yml", "staged.yml") ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "forced/force.yml", "ignore.tsv", "ignore.yml", - "staged.tsv", "staged.yml") + c( + file.path("forced", c("force.tsv", "force.yml")), "ignore.tsv", + "ignore.yml", "staged.tsv", "staged.yml" + ) ) git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") @@ -254,19 +258,31 @@ staged <- write_vc( ) expect_identical( rm_data(root = root, path = ".", type = "all", stage = TRUE), - c("forced/force.tsv", "ignore.tsv", "staged.tsv", "untracked.tsv") + c( + file.path("forced", "force.tsv"), "ignore.tsv", "staged.tsv", + "untracked.tsv" + ) ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "ignore.tsv", "staged.tsv", "untracked.tsv") + c( + file.path("forced", "force.tsv"), "ignore.tsv", "staged.tsv", + "untracked.tsv" + ) ) expect_identical( prune_meta(root = root, path = ".", stage = TRUE), - c("forced/force.yml", "ignore.yml", "staged.yml", "untracked.yml") + c( + file.path("forced", "force.yml"), "ignore.yml", "staged.yml", + "untracked.yml" + ) ) expect_identical( current[!current %in% list.files(git2r::workdir(root), recursive = TRUE)], - c("forced/force.tsv", "forced/force.yml", "ignore.tsv", "ignore.yml", - "staged.tsv", "staged.yml", "untracked.tsv", "untracked.yml") + c( + file.path("forced", c("force.tsv", "force.yml")), "ignore.tsv", + "ignore.yml", "staged.tsv", 
"staged.yml", "untracked.tsv", "untracked.yml" + ) ) git2r::reset(git2r::last_commit(root), reset_type = "hard", path = ".") +}) diff --git a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R index e5fe9090..38e92370 100644 --- a/tests/testthat/test_d_recent_commit.R +++ b/tests/testthat/test_d_recent_commit.R @@ -20,8 +20,8 @@ write_vc( commit_1 <- commit(root, "initial commit") write_vc( - test_data[3:4, ], file = "junk/test1", root = root, stage = TRUE, - sorting = "test_Date" + test_data[3:4, ], file = file.path("junk", "test1"), root = root, + stage = TRUE, sorting = "test_Date" ) commit_2 <- commit(root, "second file") @@ -55,7 +55,7 @@ expect_identical( ) ) expect_identical( - recent_commit(file = "junk/test1", root, data = TRUE), + recent_commit(file = file.path("junk", "test1"), root, data = TRUE), data.frame( commit = commit_2$sha, author = commit_2$author$name, diff --git a/tests/testthat/test_e_empty_label.R b/tests/testthat/test_e_empty_label.R index b3a66fa8..43d4fccc 100644 --- a/tests/testthat/test_e_empty_label.R +++ b/tests/testthat/test_e_empty_label.R @@ -99,5 +99,3 @@ test_that("relabel handles empty labels", { relabel(file = file, root = root, change = change) expect_is(mydfr <- read_vc(file = file, root = root), "data.frame") }) - -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_e_non_ascii.R b/tests/testthat/test_e_non_ascii.R index 4d9c7b33..39cd3f55 100644 --- a/tests/testthat/test_e_non_ascii.R +++ b/tests/testthat/test_e_non_ascii.R @@ -1,4 +1,3 @@ -context("check writing non ASCII characters") root <- tempfile("git2rdata-empty-label") dir.create(root) characters <- data.frame(a = c("€$£ @&#§µ^ ()[]{}|²³<>/\\*+- ,;:.?!~", @@ -40,5 +39,3 @@ test_that("special character are written properly as verbose factor", { ) expect_equivalent(read_vc(file = file, root = root), characters) }) - -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git 
a/tests/testthat/test_e_upgrade.R b/tests/testthat/test_e_upgrade.R index 6df7bf1d..79a6dc95 100644 --- a/tests/testthat/test_e_upgrade.R +++ b/tests/testthat/test_e_upgrade.R @@ -1,150 +1,134 @@ -context("upgrade to new version") root <- tempfile("git2rdata-upgrade") dir.create(root) +origin <- system.file("testthat", package = "git2rdata") +file.copy(origin, root, recursive = TRUE) +path <- file.path(root, "testthat") + test_that("read_vc() checks version", { - file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") - correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.3" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) expect_error( - read_vc(file = file, root = root), - "Data stored using an older version of `git2rdata`." + read_vc("optimized_0_0_4", path), "Data stored using an older version" ) - - junk_yaml[["..generic"]][["git2rdata"]] <- NULL - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) expect_error( - read_vc(file = file, root = root), - "Data stored using an older version of `git2rdata`." + read_vc("verbose_0_0_4", path), "Data stored using an older version" ) -}) - -test_that("relabel() checks version", { - file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") - new_labels <- list(test_factor = list(a = "xyz")) - correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.3" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_is(read_vc("optimized_0_3_1", path), "data.frame") expect_error( - relabel(file = file, root = root, change = new_labels), - "Data stored using an older version of `git2rdata`." 
+ read_vc("verbose_0_3_1", path), "Data stored using an older version" ) +}) - junk_yaml[["..generic"]][["git2rdata"]] <- NULL - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_error( - relabel(file = file, root = root, change = new_labels), - "Data stored using an older version of `git2rdata`." + +test_that("upgrade_data() works on single files", { + expect_message( + z <- upgrade_data(file = "optimized_0_4_0", root = path), + "already up to date" ) -}) + expect_is(z, "character") + expect_silent( + upgrade_data(file = "optimized_0_4_0", root = path, verbose = FALSE) + ) + expect_is(read_vc("optimized_0_4_0", path), "data.frame") -test_that("upgrade_data() validates metadata", { - file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") - expect_error( - upgrade_data(file = file, root = pi), - "a 'root' of class numeric is not supported" + expect_message( + z <- upgrade_data(file = "verbose_0_4_0", root = path), + "already up to date" + ) + expect_is(z, "character") + expect_silent( + upgrade_data(file = "verbose_0_4_0", root = path, verbose = FALSE) ) + expect_is(read_vc("verbose_0_4_0", path), "data.frame") - correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.4" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_identical( - unname(upgrade_data(file = file, root = root)), - file + expect_message( + z <- upgrade_data(file = "optimized_0_3_1", root = path), + "already up to date" ) - junk_yaml[["..generic"]][["hash"]] <- NULL - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_error( - upgrade_data(file = file, root = root), - "corrupt metadata, no hash found." 
+ expect_is(z, "character") + expect_silent( + upgrade_data(file = "optimized_0_3_1", root = path, verbose = FALSE) ) - junk_yaml[["..generic"]] <- NULL - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) + expect_is(read_vc("optimized_0_3_1", path), "data.frame") + expect_message( - junk <- upgrade_data(file = file, root = root), - "is not a git2rdata object" + z <- upgrade_data(file = "verbose_0_3_1", root = path), "updated" ) - expect_equivalent(file, junk) + expect_true(file_test("-f", file.path(path, "verbose_0_3_1.csv"))) + expect_false(file_test("-f", file.path(path, "verbose_0_3_1.tsv"))) + expect_is(read_vc("verbose_0_3_1", path), "data.frame") - file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date", - optimize = FALSE) - correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - junk_yaml <- correct_yaml - junk_yaml[["..generic"]][["git2rdata"]] <- "0.0.5" - yaml::write_yaml(junk_yaml, file.path(root, junk[2])) - expect_identical( - unname(upgrade_data(file = file, root = root)), - file + expect_error( + upgrade_data(file = "optimized_0_0_4", root = path), "ancient" ) }) -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) +root <- tempfile("git2rdata-upgrade") +dir.create(root) +origin <- system.file("testthat", package = "git2rdata") +file.copy(origin, root, recursive = TRUE) +path <- file.path(root, "testthat") +file.remove( + list.files(path, pattern = "0_0_4", full.names = TRUE) +) +test_that("upgrade_data() works on paths", { + expect_message(z <- upgrade_data(root = root, path = ".")) + expect_is(z, "character") + expect_silent(upgrade_data(root = root, path = ".", verbose = FALSE)) + expect_is(read_vc("optimized_0_4_0", path), "data.frame") + expect_is(read_vc("verbose_0_4_0", path), "data.frame") + expect_is(read_vc("optimized_0_3_1", path), "data.frame") + expect_is(read_vc("verbose_0_3_1", path), "data.frame") +}) +file.remove( + list.files(root, recursive = 
TRUE, full.names = TRUE) +) -test_that("upgrade_data() works from 0.0.3 to 0.0.4", { - file <- basename(tempfile(tmpdir = root)) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") - correct_yaml <- yaml::read_yaml(file.path(root, junk[2])) - old_yaml <- correct_yaml - old_yaml[["..generic"]][["git2rdata"]] <- NULL - old_yaml[["..generic"]][["data_hash"]] <- NULL - yaml::write_yaml(old_yaml, file.path(root, junk[2])) - expect_message( - files <- upgrade_data(file = file, root = root, verbose = TRUE), - paste0(file, ".yml updated") - ) - expect_message( - files <- upgrade_data(file = file, root = root, verbose = TRUE), - paste(file, "already up to date") - ) - expect_equivalent(read_vc(file = file, root = root), sorted_test_data) +root <- tempfile("git2rdata-upgrade") +dir.create(root) +origin <- system.file("testthat", package = "git2rdata") +file.copy(origin, root, recursive = TRUE) +path <- file.path(root, "testthat") +file.remove(list.files(path, pattern = "0_0_4", full.names = TRUE)) +repo <- git2r::init(root) +git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") +git2r::add(repo, list.files(root, recursive = TRUE)) +git2r::commit(repo, message = "initial commit") - root <- git2r::init(root) - git2r::config(root, user.name = "Alice", user.email = "alice@example.org") - yaml::write_yaml(old_yaml, file.path(git2r::workdir(root), junk[2])) - git2r::add(root, paste0(file, c(".tsv", ".yml"))) - initial_commit <- commit(root, "initial commit", all = TRUE) - expect_message( - files <- upgrade_data(file = file, root = root, verbose = TRUE), - paste0(file, ".yml updated") +test_that("upgrade_data() works on a git repository", { + expect_message(z <- upgrade_data(root = repo, path = ".")) + expect_is(z, "character") + expect_silent(upgrade_data(root = repo, path = ".", verbose = FALSE)) + expect_is(read_vc("optimized_0_4_0", path), "data.frame") + expect_is(read_vc("verbose_0_4_0", path), "data.frame") + 
expect_is(read_vc("optimized_0_3_1", path), "data.frame") + expect_is(read_vc("verbose_0_3_1", path), "data.frame") + expect_identical( + vapply(status(repo), length, integer(1)), + c(staged = 0L, unstaged = 2L, untracked = 1L) ) - expect_equal( - status(root), - list(staged = list(), unstaged = list(paste0(files, ".yml")), - untracked = list()), - check.attributes = FALSE + expect_silent( + upgrade_data(root = repo, path = ".", verbose = FALSE, stage = TRUE) ) - expect_message( - files <- upgrade_data(file = file, root = root, verbose = TRUE, - stage = TRUE), - paste(file, "already up to date") - ) - expect_equal( - status(root), - list( - staged = list(paste0(files, ".yml")), unstaged = list(), - untracked = list() - ), - check.attributes = FALSE + expect_identical( + vapply(status(repo), length, integer(1)), + c(staged = 3L, unstaged = 0L, untracked = 0L) ) +}) - file <- basename(tempfile(tmpdir = git2r::workdir(root))) - junk <- write_vc(test_data, file = file, root = root, sorting = "test_Date") +test_that("validation", { + root <- tempfile("git2rdata-upgrade") + dir.create(root) + origin <- system.file("testthat", package = "git2rdata") + file.copy(origin, root, recursive = TRUE) + path <- file.path(root, "testthat") expect_error( - upgrade_data(file = file, path = ".", root = root, verbose = TRUE), - "specify either 'file' or 'path'" + upgrade_data(root = 1), "a 'root' of class numeric is not supported" ) - expect_is( - upgrade_data(path = ".", root = root, verbose = TRUE), - "character" + yml <- read_yaml(file.path(path, "verbose_0_0_4.yml")) + write_yaml( + yml[names(yml) != "..generic"], file.path(path, "verbose_0_0_4.yml") + ) + expect_message( + upgrade_data(file = "verbose_0_0_4", root = path), + "is not a git2rdata object" ) }) - -file.remove(list.files(git2r::workdir(root), recursive = TRUE, - full.names = TRUE)) diff --git a/tests/testthat/test_e_validate_metadata.R b/tests/testthat/test_e_validate_metadata.R index 748f702f..4a0e38c2 100644 --- 
a/tests/testthat/test_e_validate_metadata.R +++ b/tests/testthat/test_e_validate_metadata.R @@ -61,5 +61,3 @@ test_that("write_vc() checks existing metadata", { "Existing metadata file is invalid" ) }) - -file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) diff --git a/tests/testthat/test_f_split_by.R b/tests/testthat/test_f_split_by.R index 789770f3..921bce5f 100644 --- a/tests/testthat/test_f_split_by.R +++ b/tests/testthat/test_f_split_by.R @@ -101,6 +101,4 @@ test_that("write_vc() handles the split_by argument", { suppressWarnings(is_git2rdata("sorted", root, "warning")), "Corrupt data, incorrect header in index.tsv" ) - - file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) diff --git a/tests/testthat/test_g_rename_variable.R b/tests/testthat/test_g_rename_variable.R index 4d85bd0d..78dc202b 100644 --- a/tests/testthat/test_g_rename_variable.R +++ b/tests/testthat/test_g_rename_variable.R @@ -75,8 +75,6 @@ test_that("rename_variable() handles single files", { expect_identical(colnames(changed_df)[updated], names(change)) expect_equivalent(sorted_test_data[, change], changed_df[, names(change)]) git2r::reset(cm, "hard") - - file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("rename_variable() handles split_by files", { @@ -190,8 +188,6 @@ test_that("rename_variable() handles split_by files", { changed_df[, names(change)] ) git2r::reset(cm, "hard") - - file.remove(list.files(root, recursive = TRUE, full.names = TRUE)) }) test_that("rename_variable() handles wrong type of root", { diff --git a/vignettes/efficiency.Rmd b/vignettes/efficiency.Rmd index 56380af9..50a4cef3 100644 --- a/vignettes/efficiency.Rmd +++ b/vignettes/efficiency.Rmd @@ -1,7 +1,7 @@ --- title: "Efficiency Relative to Storage and Time" author: "Thierry Onkelinx" -output: +output: rmarkdown::html_vignette: fig_caption: yes vignette: > @@ -21,94 +21,111 @@ opts_chunk$set( comment = "#>" ) library(ggplot2) -inbo_colours <- c("#959B38", 
"#729BB7", "#E87837", "#BDDDD7", "#E4E517", +inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517", "#843860", "#C04384", "#C2C444", "#685457") theme_inbo <- function(base_size = 12, base_family = "") { - rect.bg <- "white" - legend.bg <- "white" - panel.bg <- "#F3F3F3" - panel.grid <- "white" - plot.bg <- "white" + rect_bg <- "white" + legend_bg <- "white" + panel_bg <- "#F3F3F3" + panel_grid <- "white" + plot_bg <- "white" half_line <- base_size / 2 - theme( - line = element_line(colour = "black", size = 0.5, linetype = 1, + ggplot2::theme( + line = ggplot2::element_line(colour = "black", size = 0.5, linetype = 1, lineend = "butt"), - rect = element_rect(fill = rect.bg, colour = "black", size = 0.5, + rect = ggplot2::element_rect(fill = rect_bg, colour = "black", size = 0.5, linetype = 1), - text = element_text(family = base_family, face = "plain", - colour = "#843860", size = base_size, hjust = 0.5, - vjust = 0.5, angle = 0, lineheight = 0.9, - margin = margin(), debug = FALSE), - axis.line = element_blank(), - axis.line.x = element_blank(), - axis.line.y = element_blank(), - axis.text = element_text(size = rel(0.8)), - axis.text.x = element_text(margin = margin(t = 0.8 * half_line / 2), - vjust = 1), + text = ggplot2::element_text(family = base_family, face = "plain", + colour = "#843860", size = base_size, hjust = 0.5, + vjust = 0.5, angle = 0, lineheight = 0.9, + margin = ggplot2::margin(), debug = FALSE), + axis.line = ggplot2::element_blank(), + axis.line.x = ggplot2::element_blank(), + axis.line.y = ggplot2::element_blank(), + axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)), + axis.text.x = ggplot2::element_text( + margin = ggplot2::margin(t = 0.8 * half_line / 2), vjust = 1 + ), axis.text.x.top = NULL, - axis.text.y = element_text(margin = margin(r = 0.8 * half_line / 2), - hjust = 1), + axis.text.y = ggplot2::element_text( + margin = ggplot2::margin(r = 0.8 * half_line / 2), hjust = 1 + ), axis.text.y.right = NULL, - 
axis.ticks = element_line(), - axis.ticks.length = unit(0.15, "cm"), - axis.title = element_text(colour = "black"), - axis.title.x = element_text( - margin = margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) + axis.ticks = ggplot2::element_line(), + axis.ticks.length = ggplot2::unit(0.15, "cm"), + axis.title = ggplot2::element_text(colour = "black"), + axis.title.x = ggplot2::element_text( + margin = ggplot2::margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) ), axis.title.x.top = NULL, - axis.title.y = element_text( - margin = margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), + axis.title.y = ggplot2::element_text( + margin = ggplot2::margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), angle = 90 ), axis.title.y.right = NULL, - legend.background = element_rect(colour = NA, fill = legend.bg), - legend.key = element_rect(fill = panel.bg, colour = NA), - legend.key.size = unit(1.2, "lines"), + legend.background = ggplot2::element_rect(colour = NA, fill = legend_bg), + legend.key = ggplot2::element_rect(fill = panel_bg, colour = NA), + legend.key.size = ggplot2::unit(1.2, "lines"), legend.key.height = NULL, legend.key.width = NULL, legend.margin = NULL, - legend.spacing = unit(0.2, "cm"), + legend.spacing = ggplot2::unit(0.2, "cm"), legend.spacing.x = NULL, legend.spacing.y = NULL, - legend.text = element_text(size = rel(0.8)), + legend.text = ggplot2::element_text(size = ggplot2::rel(0.8)), legend.text.align = NULL, - legend.title = element_text(size = rel(0.8), face = "bold", hjust = 0, - colour = "black"), + legend.title = ggplot2::element_text( + size = ggplot2::rel(0.8), face = "bold", hjust = 0, colour = "black" + ), legend.title.align = NULL, legend.position = "right", legend.direction = NULL, legend.justification = "center", legend.box = NULL, - legend.box.margin = margin(t = half_line, r = half_line, b = half_line, - l = half_line), - legend.box.background = element_rect(colour = NA, fill = legend.bg), - legend.box.spacing = unit(0.2, "cm"), - 
panel.background = element_rect(fill = panel.bg, colour = NA), - panel.border = element_blank(), - panel.grid = element_line(colour = panel.grid), - panel.grid.minor = element_line(colour = panel.grid, size = 0.25), - panel.spacing = unit(half_line, "pt"), + legend.box.margin = ggplot2::margin( + t = half_line, r = half_line, b = half_line, l = half_line + ), + legend.box.background = ggplot2::element_rect( + colour = NA, fill = legend_bg + ), + legend.box.spacing = ggplot2::unit(0.2, "cm"), + panel.background = ggplot2::element_rect(fill = panel_bg, colour = NA), + panel.border = ggplot2::element_blank(), + panel.grid = ggplot2::element_line(colour = panel_grid), + panel.grid.minor = ggplot2::element_line(colour = panel_grid, size = 0.25), + panel.spacing = ggplot2::unit(half_line, "pt"), panel.spacing.x = NULL, panel.spacing.y = NULL, panel.ontop = FALSE, - strip.background = element_rect(fill = "#8E9DA7", colour = NA), - strip.text = element_text(size = rel(0.8), colour = "#F3F3F3"), - strip.text.x = element_text(margin = margin(t = half_line, b = half_line)), - strip.text.y = element_text(margin = margin(r = half_line, l = half_line), - angle = -90), - strip.switch.pad.grid = unit(0.1, "cm"), - strip.switch.pad.wrap = unit(0.1, "cm"), + strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA), + strip.text = ggplot2::element_text( + size = ggplot2::rel(0.8), colour = "#F3F3F3" + ), + strip.text.x = ggplot2::element_text( + margin = ggplot2::margin(t = half_line, b = half_line) + ), + strip.text.y = ggplot2::element_text( + margin = ggplot2::margin(r = half_line, l = half_line), angle = -90 + ), + strip.switch.pad.grid = ggplot2::unit(0.1, "cm"), + strip.switch.pad.wrap = ggplot2::unit(0.1, "cm"), strip.placement = "outside", - plot.background = element_rect(colour = NA, fill = plot.bg), - plot.title = element_text(size = rel(1.2), - margin = margin(0, 0, half_line, 0)), - plot.subtitle = element_text(size = rel(1), - margin = margin(0, 0, 
half_line, 0)), - plot.caption = element_text(size = rel(0.6), - margin = margin(0, 0, half_line, 0)), - plot.margin = margin(t = half_line, r = half_line, b = half_line, - l = half_line), - plot.tag = element_text(size = rel(1.2), hjust = 0.5, vjust = 0.5), + plot.background = ggplot2::element_rect(colour = NA, fill = plot_bg), + plot.title = ggplot2::element_text( + size = ggplot2::rel(1.2), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.subtitle = ggplot2::element_text( + size = ggplot2::rel(1), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.caption = ggplot2::element_text( + size = ggplot2::rel(0.6), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.margin = ggplot2::margin( + t = half_line, r = half_line, b = half_line, l = half_line + ), + plot.tag = ggplot2::element_text( + size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5 + ), plot.tag.position = "topleft", complete = TRUE ) @@ -121,9 +138,11 @@ update_geom_defaults("boxplot", list(colour = "#356196")) ## Introduction -This vignette compares storage and retrieval of data by `git2rdata` with other standard R functionality. We consider `write.table()` and `read.table()` for data stored in a plain text format. `saveRDS()` and `readRDS()` use a compressed binary format. +This vignette compares storage and retrieval of data by `git2rdata` with other standard R functionality. +We consider `write.table()` and `read.table()` for data stored in a plain text format. +`saveRDS()` and `readRDS()` use a compressed binary format. -To get some meaningful results, we will use the `nassCDS` dataset from the [DAAG](https://www.rdocumentation.org/packages/DAAG/versions/1.22/topics/nassCDS) package. +To get some meaningful results, we will use the `nassCDS` dataset from the [DAAG](https://www.rdocumentation.org/packages/DAAG/versions/1.22/topics/nassCDS) package. We'll avoid the dependency on the package by directly downloading the data. 
```{r download_data, eval = system.file("efficiency", "airbag.rds", package = "git2rdata") == ""} @@ -150,11 +169,12 @@ if (system.file("efficiency", "airbag.rds", package = "git2rdata") == "") { str(airbag) ``` -## Data Storage +## Data Storage ### On a File System -We start by writing the dataset as is with `write.table()`, `saveRDS()`, `write_vc()` and `write_vc()` without storage optimization. Note that `write_vc()` uses optimization by default. Since `write_vc()` creates two files for each data set, we take their combined file size into account. +We start by writing the dataset as is with `write.table()`, `saveRDS()`, `write_vc()` and `write_vc()` without storage optimization. +Note that `write_vc()` uses optimization by default. Since `write_vc()` creates two files for each data set, we take their combined file size into account. ```{r set_tmp_dir} library(git2rdata) @@ -176,29 +196,30 @@ fn <- write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE) verbose_size <- sum(file.size(file.path(root, fn))) ``` -Since the data is highly compressible, `saveRDS()` yields the smallest file at the cost of having a binary file format. Both `write_vc()` formats yield smaller files than `write.table()`. -Partly because `write_vc()` doesn't store row names and doesn't use quotes unless needed. -The difference between the optimized and verbose version of `write_vc()` is, in this case, solely due to the way `write_vc()` stores factors in the data (`tsv`) file. -The optimized version stores the indices of the factor whereas the verbose version stores the levels. -For example: `airbag$dvcat` has 5 levels with short labels (on average 5 character), storing the index requires 1 character. +Since the data is highly compressible, `saveRDS()` yields the smallest file at the cost of having a binary file format. Both `write_vc()` formats yield smaller files than `write.table()`. 
+Partly because `write_vc()` doesn't store row names and doesn't use quotes unless needed. +The difference between the optimized and verbose version of `write_vc()` is, in this case, solely due to the way `write_vc()` stores factors in the data (`tsv`) file. +The optimized version stores the indices of the factor whereas the verbose version stores the levels. +For example: `airbag$dvcat` has 5 levels with short labels (on average 5 characters), storing the index requires 1 character. This results in more compact files. ```{r table_file_size, echo = FALSE} kable( data.frame( - method = c("saveRDS()", "write_vc(), optimized", "write_vc(), verbose", + method = c("saveRDS()", "write_vc(), optimized", "write_vc(), verbose", "write.table()"), file_size = c(rds_size, optim_size, verbose_size, base_size) / 2 ^ 10, relative = c(rds_size, optim_size, verbose_size, base_size) / base_size ), - caption = "Resulting file sizes (in kB) and file sizes relative to the size of write.table().", + caption = "Resulting file sizes (in kB) and file sizes relative to the size of + write.table().", digits = 2 ) ``` -The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. -The figure below illustrates the strong gain as soon as the level labels contain more than two characters. -The gain is less pronounced when the factor has a large number of levels. +The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. +The figure below illustrates the strong gain as soon as the level labels contain more than two characters. +The gain is less pronounced when the factor has a large number of levels. The optimization fails in extreme cases with short factor labels and a high number of levels. 
```{r factor_label_length, echo = FALSE, fig.cap = "Effect of the label length on the efficiency of storing factor optimized, assuming 1000 observations", warning = FALSE} @@ -240,7 +261,7 @@ ggplot(f_ratio, aes(x = label_length, y = ratio, colour = levels)) + geom_hline(yintercept = 1, linetype = 2) + geom_line() + scale_x_continuous("label length (characters)") + - scale_y_continuous("optimized bytes / verbose bytes", + scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "), breaks = seq(0, 1.25, by = 0.25)) + scale_colour_manual("number of \nlevels", values = inbo_colours) ``` @@ -276,34 +297,34 @@ ggplot(f_ratio, aes(x = observations, y = ratio, colour = levels)) + geom_hline(yintercept = 1, linetype = 2) + geom_line() + scale_x_log10() + - scale_y_continuous("optimized bytes / verbose bytes", + scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "), breaks = seq(0, 1.25, by = 0.25)) + scale_colour_manual("number of \nlevels", values = inbo_colours) ``` ### In Git Repositories -Here we will simulate how much space the data requires to store the history in a git repository. -We will create a git repository for each method and store different subsets of the same data. -Each commit contains a new version of the data. Each version is a random sample containing 90% of the observations of the `airbag` data. +Here we will simulate how much space the data requires to store the history in a git repository. +We will create a git repository for each method and store different subsets of the same data. +Each commit contains a new version of the data. Each version is a random sample containing 90% of the observations of the `airbag` data. Two consecutive versions of the subset will have about 90% of the observations in common. -After writing each version, we commit the file, perform garbage collection (`git gc`) on the git repository and then calculate the size of the git history (`git count-objects -v`). 
+After writing each version, we commit the file, perform garbage collection (`git gc`) on the git repository and then calculate the size of the git history (`git count-objects -v`). ```{r git_size, eval = system.file("efficiency", "git_size.rds", package = "git2rdata") == ""} library(git2r) tmp_repo <- function() { root <- tempfile("git2rdata-efficient-git") dir.create(root) - repo <- init(root) - config(repo, user.name = "me", user.email = "me@me.com") + repo <- git2r::init(root) + git2r::config(repo, user.name = "me", user.email = "me@me.com") return(repo) } commit_and_size <- function(repo, filename) { add(repo, filename) commit(repo, "test", session = TRUE) git_size <- system( - sprintf("cd %s\ngit gc\ngit count-objects -v", dirname(repo$path)), + sprintf("cd %s\ngit gc\ngit count-objects -v", dirname(repo$path)), intern = TRUE ) git_size <- git_size[grep("size-pack", git_size)] @@ -320,7 +341,7 @@ repo_size <- replicate( 100, { observed_subset <- rbinom(nrow(airbag), size = 1, prob = 0.9) == 1 this <- airbag[ - sample(which(observed_subset)), + sample(which(observed_subset)), sample(ncol(airbag)) ] this_sorted <- airbag[observed_subset, ] @@ -337,8 +358,8 @@ repo_size <- replicate( c( write.table = commit_and_size(repo_wt, fn_wt), write.table.sorted = commit_and_size(repo_wts, fn_wts), - saveRDS = commit_and_size(repo_rds, fn_rds), - write_vc.optimized = commit_and_size(repo_wvco, fn_wvco), + saveRDS = commit_and_size(repo_rds, fn_rds), + write_vc.optimized = commit_and_size(repo_wvco, fn_wvco), write_vc.verbose = commit_and_size(repo_wvcv, fn_wvcv) ) }) @@ -354,12 +375,12 @@ if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") { } ``` -Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest possible diff. We also test `write.table()` with a stable ordering of the observations and variables. 
+Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest possible diff. We also test `write.table()` with a stable ordering of the observations and variables. -The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. `saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repository size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. -Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because git compresses its history. -The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. -Storage size is a lot smaller when using `write_vc()` with optimization. +The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. 
`saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repository size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. +Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because git compresses its history. +The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. +Storage size is a lot smaller when using `write_vc()` with optimization. The verbose option of `write_vc()` has little the gain in storage size. Another advantage is that `write_vc()` stores metadata. 
@@ -369,28 +390,30 @@ rs <- lapply( function(x) { if (x == "saveRDS") { fun <- "saveRDS" - optimized = "yes" + optimized <- "yes" } else if (x == "write_vc.optimized") { fun <- "write_vc" - optimized = "yes" + optimized <- "yes" } else if (x == "write_vc.verbose") { fun <- "write_vc" - optimized = "no" + optimized <- "no" } else if (x == "write.table") { fun <- "write.table" - optimized = "no" + optimized <- "no" } else if (x == "write.table.sorted") { fun <- "write.table" - optimized = "yes" + optimized <- "yes" } - data.frame(commit = seq_along(repo_size[x, ]), size = repo_size[x, ], + data.frame(commit = seq_along(repo_size[x, ]), size = repo_size[x, ], rel_size = repo_size[x, ] / repo_size["write.table.sorted", ], fun = fun, optimized = optimized, stringsAsFactors = FALSE) } ) rs <- do.call(rbind, rs) rs$optimized <- factor(rs$optimized, levels = c("yes", "no")) -ggplot(rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized)) + +ggplot( + rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized) +) + geom_line() + scale_y_continuous("repo size (in MiB)") + scale_colour_manual("function", values = inbo_colours) @@ -399,13 +422,14 @@ ggplot(rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized)) ```{r plot_rel_git_size, echo = FALSE, fig.cap = "Relative size of the git repository when compared to write.table()."} ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) + geom_line() + - scale_y_continuous("size relative to sorted write.table()", breaks = 0:10) + + scale_y_continuous("size relative to sorted write.table()", breaks = 0:10) + scale_colour_manual("function", values = inbo_colours) ``` ## Timings -The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hundred times and yields a distribution of timings for each expression. +The code below runs a microbenchmark on the four methods. 
+A microbenchmark runs the code a hundred times and yields a distribution of timings for each expression. ### Writing Data @@ -415,14 +439,16 @@ mb <- microbenchmark( write.table = write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t"), saveRDS = saveRDS(airbag, file.path(root, "base_R.rds")), write_vc.optim = write_vc(airbag, "airbag_optimize", root, sorting = "X"), - write_vc.verbose = write_vc(airbag, "airbag_verbose", root, sorting = "X", + write_vc.verbose = write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE) ) mb$time <- mb$time / 1e6 ``` ```{r store_file_timings, echo = FALSE} -if (system.file("efficiency", "file_timings.rds", package = "git2rdata") == "") { +if ( + system.file("efficiency", "file_timings.rds", package = "git2rdata") == "" +) { saveRDS(mb, file.path("..", "inst", "efficiency", "file_timings.rds")) } else { mb <- readRDS( @@ -433,14 +459,13 @@ if (system.file("efficiency", "file_timings.rds", package = "git2rdata") == "") ```{r median_write, echo = FALSE} median_time <- aggregate(time ~ expr, data = mb, FUN = median) -write_ratio <- 100 * median_time$time / +write_ratio <- 100 * median_time$time / median_time$time[median_time$expr == "write.table"] names(write_ratio) <- median_time$expr ``` - -`write_vc()` takes `r paste(sprintf("%.0f%%", -100 + write_ratio[grep("write_vc", names(write_ratio))]), collapse = " to ")` more time than `write.table()` because it needs to prepare the metadata and sort the observations and variables. -When overwriting existing data, `write_vc()` checks the new data against the existing metadata. +`write_vc()` takes `r paste(sprintf("%.0f%%", -100 + write_ratio[grep("write_vc", names(write_ratio))]), collapse = " to ")` more time than `write.table()` because it needs to prepare the metadata and sort the observations and variables. +When overwriting existing data, `write_vc()` checks the new data against the existing metadata. 
`saveRDS()` requires `r sprintf("%.0f%%", write_ratio["saveRDS"])` of the time that `write.table()` needs. ```{r plot_file_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for the different methods."} @@ -449,14 +474,14 @@ levels(mb$expr) <- gsub("write_vc\\.", "write_vc\n", levels(mb$expr)) ggplot(mb, aes(x = expr, y = time)) + geom_boxplot() + scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) + - theme(axis.title.x = element_blank()) + theme(axis.title.x = ggplot2::element_blank()) ``` ### Reading Data ```{r get_read_timings, eval = system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""} mb <- microbenchmark( - read.table = read.table(file.path(root, "base_R.tsv"), header = TRUE, + read.table = read.table(file.path(root, "base_R.tsv"), header = TRUE, sep = "\t"), readRDS = readRDS(file.path(root, "base_R.rds")), read_vc.optim = read_vc("airbag_optimize", root), @@ -466,7 +491,9 @@ mb$time <- mb$time / 1e6 ``` ```{r store_read_timings, echo = FALSE} -if (system.file("efficiency", "read_timings.rds", package = "git2rdata") == "") { +if ( + system.file("efficiency", "read_timings.rds", package = "git2rdata") == "" +) { saveRDS(mb, file.path("..", "inst", "efficiency", "read_timings.rds")) } else { mb <- readRDS( @@ -477,7 +504,7 @@ if (system.file("efficiency", "read_timings.rds", package = "git2rdata") == "") ```{r median_read, echo = FALSE} median_time <- aggregate(time ~ expr, data = mb, FUN = median) -read_ratio <- 100 * median_time$time / +read_ratio <- 100 * median_time$time / median_time$time[median_time$expr == "read.table"] names(read_ratio) <- median_time$expr ``` @@ -486,12 +513,12 @@ The timings on reading the data is another story. 
Reading the binary format take ```{r plot_read_timings, echo = FALSE, fig.cap = "Boxplots for the read timings for the different methods."} mb$expr <- factor( - mb$expr, + mb$expr, levels = c("readRDS", "read.table", "read_vc.optim", "read_vc.verbose") ) levels(mb$expr) <- gsub("read_vc\\.", "read_vc\n", levels(mb$expr)) ggplot(mb, aes(x = expr, y = time)) + geom_boxplot() + scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) + - theme(axis.title.x = element_blank()) + theme(axis.title.x = ggplot2::element_blank()) ``` diff --git a/vignettes/plain_text.Rmd b/vignettes/plain_text.Rmd index e6f6b4ce..379efeab 100644 --- a/vignettes/plain_text.Rmd +++ b/vignettes/plain_text.Rmd @@ -132,14 +132,22 @@ print_file("first_test.yml", path) ## Storing Verbose -Adding `optimize = FALSE` to `write_vc()` will keep the raw data in a human readable format. The metadata file is slightly different. The most obvious is the `optimize: no` tag and the different hash. Another difference is the metadata for POSIXct and Date classes. They will no longer have an origin tag but a format tag. +Adding `optimize = FALSE` to `write_vc()` will keep the raw data in a human readable format. +The metadata file is slightly different. +The most obvious is the `optimize: no` tag and the different hash. +Another difference is the metadata for POSIXct and Date classes. +They will no longer have an origin tag but a format tag. + +Another important difference is that we store the data file as comma separated values instead of tab separated values. +We noticed that the csv file format is more easily recognised by a larger audience as a data file. 
+ ```{r write_verbose} write_vc(x = x, file = "verbose", root = path, optimize = FALSE, strict = FALSE) ``` ```{r manual_verbose_data} -print_file("verbose.tsv", path, 10) +print_file("verbose.csv", path, 10) print_file("verbose.yml", path) ``` diff --git a/vignettes/split_by.Rmd b/vignettes/split_by.Rmd index 90cea152..243ccfed 100644 --- a/vignettes/split_by.Rmd +++ b/vignettes/split_by.Rmd @@ -30,85 +30,105 @@ theme_inbo <- function(base_size = 12, base_family = "") { panel_grid <- "white" plot_bg <- "white" half_line <- base_size / 2 - theme( - line = element_line(colour = "black", size = 0.5, linetype = 1, - lineend = "butt"), - rect = element_rect(fill = rect_bg, colour = "black", size = 0.5, - linetype = 1), - text = element_text(family = base_family, face = "plain", - colour = "#843860", size = base_size, hjust = 0.5, - vjust = 0.5, angle = 0, lineheight = 0.9, - margin = margin(), debug = FALSE), - axis.line = element_blank(), - axis.line.x = element_blank(), - axis.line.y = element_blank(), - axis.text = element_text(size = rel(0.8)), - axis.text.x = element_text(margin = margin(t = 0.8 * half_line / 2), - vjust = 1), + ggplot2::theme( + line = ggplot2::element_line( + colour = "black", size = 0.5, linetype = 1, lineend = "butt" + ), + rect = ggplot2::element_rect( + fill = rect_bg, colour = "black", size = 0.5, linetype = 1 + ), + text = ggplot2::element_text( + family = base_family, face = "plain", colour = "#843860", + size = base_size, hjust = 0.5, vjust = 0.5, angle = 0, lineheight = 0.9, + margin = ggplot2::margin(), debug = FALSE + ), + axis.line = ggplot2::element_blank(), + axis.line.x = ggplot2::element_blank(), + axis.line.y = ggplot2::element_blank(), + axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)), + axis.text.x = ggplot2::element_text( + margin = ggplot2::margin(t = 0.8 * half_line / 2), vjust = 1 + ), axis.text.x.top = NULL, - axis.text.y = element_text(margin = margin(r = 0.8 * half_line / 2), - hjust = 1), + axis.text.y = 
ggplot2::element_text( + margin = ggplot2::margin(r = 0.8 * half_line / 2), hjust = 1 + ), axis.text.y.right = NULL, - axis.ticks = element_line(), - axis.ticks.length = unit(0.15, "cm"), - axis.title = element_text(colour = "black"), - axis.title.x = element_text( - margin = margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) + axis.ticks = ggplot2::element_line(), + axis.ticks.length = ggplot2::unit(0.15, "cm"), + axis.title = ggplot2::element_text(colour = "black"), + axis.title.x = ggplot2::element_text( + margin = ggplot2::margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) ), axis.title.x.top = NULL, - axis.title.y = element_text( - margin = margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), + axis.title.y = ggplot2::element_text( + margin = ggplot2::margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), angle = 90 ), axis.title.y.right = NULL, - legend.background = element_rect(colour = NA, fill = legend_bg), - legend.key = element_rect(fill = panel_bg, colour = NA), - legend.key.size = unit(1.2, "lines"), + legend.background = ggplot2::element_rect(colour = NA, fill = legend_bg), + legend.key = ggplot2::element_rect(fill = panel_bg, colour = NA), + legend.key.size = ggplot2::unit(1.2, "lines"), legend.key.height = NULL, legend.key.width = NULL, legend.margin = NULL, - legend.spacing = unit(0.2, "cm"), + legend.spacing = ggplot2::unit(0.2, "cm"), legend.spacing.x = NULL, legend.spacing.y = NULL, - legend.text = element_text(size = rel(0.8)), + legend.text = ggplot2::element_text(size = ggplot2::rel(0.8)), legend.text.align = NULL, - legend.title = element_text(size = rel(0.8), face = "bold", hjust = 0, - colour = "black"), + legend.title = ggplot2::element_text( + size = ggplot2::rel(0.8), face = "bold", hjust = 0, colour = "black" + ), legend.title.align = NULL, legend.position = "right", legend.direction = NULL, legend.justification = "center", legend.box = NULL, - legend.box.margin = margin(t = half_line, r = half_line, b = half_line, - l = 
half_line), - legend.box.background = element_rect(colour = NA, fill = legend_bg), - legend.box.spacing = unit(0.2, "cm"), - panel.background = element_rect(fill = panel_bg, colour = NA), - panel.border = element_blank(), - panel.grid = element_line(colour = panel_grid), - panel.grid.minor = element_line(colour = panel_grid, size = 0.25), - panel.spacing = unit(half_line, "pt"), + legend.box.margin = ggplot2::margin( + t = half_line, r = half_line, b = half_line, l = half_line + ), + legend.box.background = ggplot2::element_rect( + colour = NA, fill = legend_bg + ), + legend.box.spacing = ggplot2::unit(0.2, "cm"), + panel.background = ggplot2::element_rect(fill = panel_bg, colour = NA), + panel.border = ggplot2::element_blank(), + panel.grid = ggplot2::element_line(colour = panel_grid), + panel.grid.minor = ggplot2::element_line(colour = panel_grid, size = 0.25), + panel.spacing = ggplot2::unit(half_line, "pt"), panel.spacing.x = NULL, panel.spacing.y = NULL, panel.ontop = FALSE, - strip.background = element_rect(fill = "#8E9DA7", colour = NA), - strip.text = element_text(size = rel(0.8), colour = "#F3F3F3"), - strip.text.x = element_text(margin = margin(t = half_line, b = half_line)), - strip.text.y = element_text(margin = margin(r = half_line, l = half_line), - angle = -90), - strip.switch.pad.grid = unit(0.1, "cm"), - strip.switch.pad.wrap = unit(0.1, "cm"), + strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA), + strip.text = ggplot2::element_text( + size = ggplot2::rel(0.8), colour = "#F3F3F3" + ), + strip.text.x = ggplot2::element_text( + margin = ggplot2::margin(t = half_line, b = half_line) + ), + strip.text.y = ggplot2::element_text( + margin = ggplot2::margin(r = half_line, l = half_line), angle = -90 + ), + strip.switch.pad.grid = ggplot2::unit(0.1, "cm"), + strip.switch.pad.wrap = ggplot2::unit(0.1, "cm"), strip.placement = "outside", - plot.background = element_rect(colour = NA, fill = plot_bg), - plot.title = element_text(size = 
rel(1.2), - margin = margin(0, 0, half_line, 0)), - plot.subtitle = element_text(size = rel(1), - margin = margin(0, 0, half_line, 0)), - plot.caption = element_text(size = rel(0.6), - margin = margin(0, 0, half_line, 0)), - plot.margin = margin(t = half_line, r = half_line, b = half_line, - l = half_line), - plot.tag = element_text(size = rel(1.2), hjust = 0.5, vjust = 0.5), + plot.background = ggplot2::element_rect(colour = NA, fill = plot_bg), + plot.title = ggplot2::element_text( + size = ggplot2::rel(1.2), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.subtitle = ggplot2::element_text( + size = ggplot2::rel(1), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.caption = ggplot2::element_text( + size = ggplot2::rel(0.6), margin = ggplot2::margin(0, 0, half_line, 0) + ), + plot.margin = ggplot2::margin( + t = half_line, r = half_line, b = half_line, l = half_line + ), + plot.tag = ggplot2::element_text( + size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5 + ), plot.tag.position = "topleft", complete = TRUE ) @@ -202,7 +222,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) + geom_line() + facet_wrap(~ paste("r =", r)) + scale_x_continuous( - expression(b~{"="}~N[s]~{"/"}~N), + expression(b~{"="}~N[s]~{"/"}~N), # nolint labels = function(x) { paste0(100 * x, "%") } @@ -214,7 +234,7 @@ ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) + } ) + scale_colour_manual( - "a = s / r", + paste("a = s", "r", sep = " / "), values = inbo_colours, labels = c("1/4", "1/2", "1", "2", "4") ) diff --git a/vignettes/version_control.Rmd b/vignettes/version_control.Rmd index 92ed48c4..15155465 100644 --- a/vignettes/version_control.Rmd +++ b/vignettes/version_control.Rmd @@ -226,7 +226,7 @@ Let's add an observation with a new factor level. 
If we store the updated datafr ```{r factor2} updated <- data.frame( - color = c("red", "green", "blue"), + color = c("red", "green", "blue"), stringsAsFactors = TRUE ) write_vc(updated, "factor2", root, sorting = "color") @@ -291,7 +291,7 @@ write_vc(old, "relabel", root, sorting = "color") relabel("relabel", root, change = list(color = c(red = "rood", blue = "blauw"))) print_file("relabel.yml", root) relabel( - "relabel", root, + "relabel", root, change = data.frame( factor = "color", old = "blauw", new = "blue", stringsAsFactors = TRUE ) diff --git a/vignettes/workflow.Rmd b/vignettes/workflow.Rmd index b6f0b904..38895eb1 100644 --- a/vignettes/workflow.Rmd +++ b/vignettes/workflow.Rmd @@ -61,7 +61,10 @@ writeLines("*extra*", file.path(path, ".gitignore")) git2r::add(init_repo, ".gitignore", force = TRUE) git2r::commit(init_repo, message = "Initial commit") # push initial commit to remote -git2r::push(init_repo, "origin", "refs/heads/master") +branch_name <- git2r::branches(init_repo)[[1]]$name +git2r::push( + init_repo, "origin", file.path("refs", "heads", branch_name, fsep = "/") +) rm(init_repo) ``` @@ -131,7 +134,7 @@ Sys.sleep(1.2) ```{r} status(repo, ignored = TRUE) -fn <- write_vc(beaver2, "extra_beaver", repo, sorting = "time", stage = TRUE, +fn <- write_vc(beaver2, "extra_beaver", repo, sorting = "time", stage = TRUE, force = TRUE) status(repo) cm2 <- commit(repo, message = "Second commit") @@ -154,7 +157,7 @@ Sys.sleep(1.2) beaver1$beaver <- 1 beaver2$beaver <- 2 beaver <- rbind(beaver1, beaver2) -fn <- write_vc(beaver, "beaver", repo, sorting = c("beaver", "time"), +fn <- write_vc(beaver, "beaver", repo, sorting = c("beaver", "time"), strict = FALSE, stage = TRUE) file.remove(list.files(path, "extra", full.names = TRUE)) status(repo) @@ -185,7 +188,7 @@ Below is an example script recreating the "beaver" git2rdata object from the [th library(git2rdata) # step 1: setup the repository and data path repo <- repository(".") -data_path <- "data/beaver" 
+data_path <- file.path("data", "beaver") # step 1b: sync the repository with the remote pull(repo = repo) # step 2: remove all existing data files @@ -195,7 +198,7 @@ rm_data(root = repo, path = data_path, stage = TRUE) beaver1$beaver <- 1 beaver2$beaver <- 2 body_temp <- rbind(beaver1, beaver2) -fn <- write_vc(x = body_temp, file = file.path(data_path, "body_temperature"), +fn <- write_vc(x = body_temp, file = file.path(data_path, "body_temperature"), root = repo, sorting = c("beaver", "time"), stage = TRUE) # step 4: remove any dangling metadata files @@ -233,26 +236,26 @@ Consider running the import from the command line. e.g. `Rscript -e 'mypackage:: import_body_temp <- function(path) { # step 1: setup the repository and data path repo <- repository(path) - data_path <- "data/beaver" + data_path <- file.path("data", "beaver") # step 1b: sync the repository with the remote pull(repo = repo) # step 2: remove all existing data files rm_data(root = repo, path = data_path, stage = TRUE) - + # step 3: write all relevant git2rdata objects to the data path beaver1$beaver <- 1 beaver2$beaver <- 2 body_temp <- rbind(beaver1, beaver2) - fn <- write_vc(x = body_temp, file = file.path(data_path, "body_temperature"), + write_vc(x = body_temp, file = file.path(data_path, "body_temperature"), root = repo, sorting = c("beaver", "time"), stage = TRUE) - + # step 4: remove any dangling metadata files prune_meta(root = repo, path = data_path, stage = TRUE) - + # step 5: commit the changes - cm <- commit(repo = repo, message = "import", session = TRUE) + commit(repo = repo, message = "import", session = TRUE) # step 5b: sync the repository with the remote - push(repo = repo) + push(object = repo) } ``` @@ -274,7 +277,7 @@ analysis <- function(ds_name, repo) { report <- function(x) { knitr::kable( coef(summary(x$model)), - caption = sprintf("**dataset:** %s \n**commit:** %s \n**repository:** %s", + caption = sprintf("**dataset:** %s \n**commit:** %s \n**repository:** %s", 
x$dataset, x$commit$commit, x$repository) ) }