diff --git a/.Rbuildignore b/.Rbuildignore index 672c633..d8347fd 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,5 @@ ^sticker$ ^tic\.R$ ^.*\.Rproj$ +^cran-comments\.md$ +^CRAN-RELEASE$ diff --git a/.travis.yml b/.travis.yml index 1b27489..7284610 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ deploy: script: R -q -e 'tic::deploy()' on: branch: master + dist: xenial condition: - $TRAVIS_PULL_REQUEST = false - $TRAVIS_EVENT_TYPE != cron @@ -28,18 +29,32 @@ after_script: R -q -e 'tic::after_script()' # Header language: r -r: - - oldrel - - release - - devel +latex: false +matrix: + include: + - dist: trusty + r: oldrel + addons: + apt: + packages: + - libgit2-dev + - dist: xenial + r: release + addons: + apt: + packages: + - libgit2-dev + - dist: xenial + r: devel + addons: + apt: + packages: + - libgit2-dev + - os: osx + r: release + brew_packages: openssl sudo: false -dist: trusty cache: packages -addons: - apt: - packages: - - libgit2-dev -latex: false #env env: @@ -47,5 +62,10 @@ env: - _R_CHECK_FORCE_SUGGESTS_=false - MAKEFLAGS="-j 2" +notifications: + email: + on_success: change + on_failure: change + #services services: diff --git a/DESCRIPTION b/DESCRIPTION index 84cd963..f51146a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Imports: methods, yaml Suggests: + spelling, ggplot2, knitr, microbenchmark, @@ -52,3 +53,4 @@ Collate: 'relabel.R' 'upgrade_data.R' VignetteBuilder: knitr +Language: en-GB diff --git a/NEWS.md b/NEWS.md index 6b9563b..c9b0325 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,7 +37,7 @@ git2rdata 0.0.4 (2019-05-16) * Use a faster algorithm to detect duplicates (suggestion by @brodieG). * Improve documentation. * Fix typo's in documentation, vignettes and README. - * Add a ROpenSci review badge to the README. + * Add a rOpenSci review badge to the README. * The README mentions on upper bound on the size of dataframes. * Set lifecycle to "maturing" and repo status to "active". 
* The functions handle `root` containing regex expressions. @@ -62,11 +62,11 @@ git2rdata 0.0.2 (2019-02-26) ### NEW FEATURES - * Vignette on [efficiency](../articles/efficiency.html) added (#2). + * Vignette on [efficiency](https://ropensci.github.io/git2rdata/articles/efficiency.html) added (#2). * Three separate vignettes instead of one large vignette. - * Focus on the [plain text format](../arsticles/plain_text.html). - * Focus on [version control](../articles/version_control.html). - * Focus on [workflows](../articles/workflow.html). + * Focus on the [plain text format](https://ropensci.github.io/git2rdata/articles/plain_text.html). + * Focus on [version control](https://ropensci.github.io/git2rdata/articles/version_control.html). + * Focus on [workflows](https://ropensci.github.io/git2rdata/articles/workflow.html). * S3 methods replace the old S4 methods (#8). * Optimized factors use stable indices. Adding or removing levels result in smaller diffs (#13). * Use `relabel()` to alter factor levels without changing their index (#13). @@ -82,7 +82,7 @@ git2rdata 0.0.2 (2019-02-26) * Each helpfile contains a working example (#11). * README updated (#12). * Updated the rationale with links to the vignettes. - * `git2rdata` has a hexsticker logo. + * `git2rdata` has a hexagon sticker logo. * Add the [![DOI](https://zenodo.org/badge/147685405.svg)](https://zenodo.org/badge/latestdoi/147685405). * The installation instructions use `remotes` and build the vignettes. * We removed `auto_commit()` because of limited extra functionality over `git2r::commit()`. diff --git a/R/meta.R b/R/meta.R index 309c3b4..86173bd 100644 --- a/R/meta.R +++ b/R/meta.R @@ -4,7 +4,7 @@ #' Prepares a vector for storage. When relevant, `meta()`optimizes the object #' for storage by changing the format to one which needs less characters. The #' metadata stored in the `meta` attribute, contains all required information to -#' backtransform the optimized format into the original format. 
+#' back-transform the optimized format into the original format. #' @param x the vector. #' @param ... further arguments to the methods. #' @return the optimized vector `x` with `meta` attribute. @@ -185,7 +185,7 @@ meta.Date <- function(x, optimize = TRUE, ...){ #' #' \code{\link{write_vc}} uses this function to prepare a dataframe for storage. #' Existing metadata is passed through the optional `old` argument. This -#' argument intendent for internal use. +#' argument intended for internal use. #' @rdname meta #' @inheritParams write_vc meta.data.frame <- function(x, optimize = TRUE, na = "NA", sorting, ...) { diff --git a/R/prune.R b/R/prune.R index d717042..88836ad 100644 --- a/R/prune.R +++ b/R/prune.R @@ -14,7 +14,7 @@ #' @param path the directory in which to clean all the data files. The directory #' is relative to `root`. #' @param recursive remove files in subdirectories too. -#' @return returns invisibily a vector of removed files names. The paths are +#' @return returns invisibly a vector of removed files names. The paths are #' relative to `root`. #' @inheritParams write_vc #' @export @@ -108,7 +108,7 @@ rm_data.git_repository <- function( #' `vignette("workflow", package = "git2rdata")` for some examples on how to use #' this. #' @inheritParams rm_data -#' @return returns invisibily a vector of removed files names. The paths are +#' @return returns invisibly a vector of removed files names. The paths are #' relative to `root`. #' @inheritParams write_vc #' @export diff --git a/R/read_vc.R b/R/read_vc.R index ddee500..f888823 100644 --- a/R/read_vc.R +++ b/R/read_vc.R @@ -3,7 +3,7 @@ #' @description #' `read_vc()` handles git2rdata objects stored by `write_vc()`. It reads and #' verifies the metadata file (`.yml`). Then it reads and verifies the raw data. 
-#' The last step is backtransforming any transformation done by `meta()` to +#' The last step is back-transforming any transformation done by `meta()` to #' return the `data.frame` as stored by `write_vc()`. #' #' `read_vc()` is an S3 generic on `root` which currently handles `"character"` diff --git a/R/recent_commit.R b/R/recent_commit.R index 90048a1..792bea6 100644 --- a/R/recent_commit.R +++ b/R/recent_commit.R @@ -8,7 +8,7 @@ #' Use this information to document the current version of file or git2rdata #' object in an analysis. Since it refers to the most recent change of this #' file, it remains unchanged by committing changes to other files. You can -#' also use it to track if data got updated, requirering an analysis to +#' also use it to track if data got updated, requiring an analysis to #' be rerun. See `vignette("workflow", package = "git2rdata")`. #' @inheritParams write_vc #' @param root The root of a project. Can be a file path or a `git-repository`. diff --git a/R/reexport.R b/R/reexport.R index f482284..cd94dff 100644 --- a/R/reexport.R +++ b/R/reexport.R @@ -1,4 +1,4 @@ -#' Reexported Function From `git2r` +#' Re-exported Function From `git2r` #' #' See \code{\link[git2r]{repository}} in `git2r`. #' @name repository @@ -7,7 +7,7 @@ #' @export NULL -#' Reexported Function From `git2r` +#' Re-exported Function From `git2r` #' #' See \code{\link[git2r]{status}} in `git2r`. #' @name status @@ -16,7 +16,7 @@ NULL #' @export NULL -#' Reexported Function From `git2r` +#' Re-exported Function From `git2r` #' #' See \code{\link[git2r]{commit}} in `git2r`. #' @name commit @@ -25,7 +25,7 @@ NULL #' @export NULL -#' Reexported Function From `git2r` +#' Re-exported Function From `git2r` #' #' See \code{\link[git2r]{pull}} in `git2r`. #' @name pull @@ -34,7 +34,7 @@ NULL #' @export NULL -#' Reexported Function From `git2r` +#' Re-exported Function From `git2r` #' #' See \code{\link[git2r]{push}} in `git2r`. 
#' @name push diff --git a/R/relabel.R b/R/relabel.R index 230bc42..5c65b93 100644 --- a/R/relabel.R +++ b/R/relabel.R @@ -5,7 +5,7 @@ #' the factor indices and the metadata contains the link between the factor #' index and the corresponding label. See #' `vignette("version_control", package = "git2rdata")`. In such a case, -#' relabeling a factor can be fast and lightweight by updating the metadata. +#' relabelling a factor can be fast and lightweight by updating the metadata. #' @inheritParams write_vc #' @param change either a `list` or a `data.frame`. In case of a `list` is a #' named `list` with named `vectors`. The names of list elements must match the @@ -93,7 +93,7 @@ relabel.list <- function(file, root = ".", change) { meta_data <- read_yaml(file["meta_file"]) optimize <- meta_data[["..generic"]][["optimize"]] if (!optimize) { - stop("relabeling factors on verbose data leads to large diffs. + stop("relabelling factors on verbose data leads to large diffs. Use write_vc() instead.") } assert_that( @@ -117,7 +117,7 @@ Use write_vc() instead.") meta_data[[id]][["labels"]] <- unname(meta_data[[id]][["labels"]]) assert_that( anyDuplicated(meta_data[[id]][["labels"]]) == 0, - msg = sprintf("relabeling '%s' leads to duplicated labels", id) + msg = sprintf("relabelling '%s' leads to duplicated labels", id) ) } meta_data[["..generic"]][["hash"]] <- metadata_hash(meta_data) diff --git a/README.md b/README.md index 1524d6e..58df735 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # The `git2rdata` package +[![CRAN status](https://www.r-pkg.org/badges/version/git2rdata)](https://cran.r-project.org/package=git2rdata) +[![Rdoc](https://www.rdocumentation.org/badges/version/git2rdata)](https://www.rdocumentation.org/packages/git2rdata) + [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) 
[![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) [![](https://badges.ropensci.org/263_status.svg)](https://github.com/ropensci/software-review/issues/263) @@ -46,7 +49,13 @@ The `git2rdata` package is an R package for writing and reading dataframes as pl ## Installation -Install the development version +Install from CRAN + +```r +install.packages("git2rdata") +``` + +Install the development version from GitHub ```r # installation requires the "remotes" package @@ -91,7 +100,7 @@ read_vc(file = "rel_path/filename", root = repo) Please read `vignette("version_control", package = "git2rdata")` for more details on using git2rdata in combination with version control. -## What data sizes can `git2rdata` handle? +## What Data Sizes Can Git2rdata Handle? The recommendation for git repositories is to use files smaller than 100 MiB, an overall repository size less than 1 GiB and less than 25k files. The individual file size is the limiting factor. Storing the airbag dataset ([`DAAG::nassCDS`](https://cran.r-project.org/package=DAAG)) with `write_vc()` requires on average 68 (optimized) or 97 (verbose) byte per record. The 100 MiB file limit for this data is reached after about 1.5 million (optimize) or 1 million (verbose) observations. @@ -130,6 +139,6 @@ git2rdata ## Contributions -Contributions to `git2rdata` are welcome. Please read our [Contributing guidelines](.github/CONTRIBUTING.md) first. The `git2rdata` project is released with a [Contributor Code of Conduct](.github/CODE_OF_CONDUCT.md). By contributing to this project, you agree to abide by its terms. +Contributions to `git2rdata` are welcome. Please read our [Contributing guidelines](https://github.com/ropensci/git2rdata/blob/master/.github/CONTRIBUTING.md) first. The `git2rdata` project is released with a [Contributor Code of Conduct](https://github.com/ropensci/git2rdata/blob/master/.github/CODE_OF_CONDUCT.md). 
By contributing to this project, you agree to abide by its terms. -[![ropensci_footer](http://ropensci.org/public_images/github_footer.png)](https://ropensci.org) +[![rOpenSci footer](http://ropensci.org/public_images/github_footer.png)](https://ropensci.org) diff --git a/codemeta.json b/codemeta.json index 4a04d6e..2b4c034 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,10 @@ "description": "Make versioning of data.frame easy and efficient using git repositories.", "name": "git2rdata: Store and Retrieve Data.frames in a Git Repository", "codeRepository": "https://github.com/ropensci/git2rdata", - "relatedLink": "https://doi.org/10.5281/zenodo.1485309", + "relatedLink": [ + "https://doi.org/10.5281/zenodo.1485309", + "https://CRAN.R-project.org/package=git2rdata" + ], "issueTracker": "https://github.com/ropensci/git2rdata/issues", "license": "https://spdx.org/licenses/GPL-3.0", "version": "0.1", @@ -68,6 +71,18 @@ } ], "softwareSuggestions": [ + { + "@type": "SoftwareApplication", + "identifier": "spelling", + "name": "spelling", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=spelling" + }, { "@type": "SoftwareApplication", "identifier": "ggplot2", @@ -179,9 +194,9 @@ "sameAs": "https://CRAN.R-project.org/package=yaml" } ], - "releaseNotes": "https://github.com/inbo/git2rdata/blob/master/NEWS.md", + "releaseNotes": "https://github.com/ropensci/git2rdata/blob/master/NEWS.md", "readme": "https://github.com/ropensci/git2rdata/blob/master/README.md", - "fileSize": "363.628KB", + "fileSize": "363.637KB", "contIntegration": [ "https://travis-ci.org/inbo/git2rdata", "https://ci.appveyor.com/project/ThierryO/git2rdata/branch/master", @@ -197,5 +212,11 @@ "r-package", "version-control", "reproducible-research" - ] + ], + "provider": { + "@id": "https://cran.r-project.org", + "@type": 
"Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + } } diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..608e1ee --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,19 @@ +## Test environments +* local + * ubuntu 18.04, R 3.6.0 +* travis-ci + * trusty, oldrel + * xenial, release and devel + * osx, release +* AppVeyor + * Windows Server 2012, R 3.6.0 Patched +* r-hub + * Windows Server 2008 R2 SP1, R-devel, 32/64 bit + * Ubuntu Linux 16.04 LTS, R-release, GCC + * Fedora Linux, R-devel, clang, gfortran + +## R CMD check results + +0 errors | 0 warnings | 1 note + +* This is a new release. diff --git a/inst/WORDLIST b/inst/WORDLIST new file mode 100644 index 0000000..d204283 --- /dev/null +++ b/inst/WORDLIST @@ -0,0 +1,19 @@ +AppVeyor +Bitbucket +codecov +DAAG +ersion +Gitlab +kiB +lifecycle +microbenchmark +ontrol +POSIXct +rdata +regex +rOpenSci +Roxygen +sensu +testthat +tsv +YAML diff --git a/man/commit.Rd b/man/commit.Rd index 9a8c656..55f2914 100644 --- a/man/commit.Rd +++ b/man/commit.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/reexport.R \name{commit} \alias{commit} -\title{Reexported Function From \code{git2r}} +\title{Re-exported Function From \code{git2r}} \description{ See \code{\link[git2r]{commit}} in \code{git2r}. } diff --git a/man/meta.Rd b/man/meta.Rd index 4bb6bc9..36c5f89 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -53,7 +53,7 @@ the optimized vector \code{x} with \code{meta} attribute. Prepares a vector for storage. When relevant, \code{meta()}optimizes the object for storage by changing the format to one which needs less characters. The metadata stored in the \code{meta} attribute, contains all required information to -backtransform the optimized format into the original format. +back-transform the optimized format into the original format. In case of a data.frame, \code{meta()} applies itself to each of the columns. 
The \code{meta} attribute becomes a named list containing the metadata for each column @@ -62,7 +62,7 @@ the metadata and not allowed as column name in a \code{data.frame}. \code{\link{write_vc}} uses this function to prepare a dataframe for storage. Existing metadata is passed through the optional \code{old} argument. This -argument intendent for internal use. +argument intended for internal use. } \examples{ meta(c(NA, "'NA'", '"NA"', "abc\\tdef", "abc\\ndef")) diff --git a/man/prune_meta.Rd b/man/prune_meta.Rd index c8b8b73..677ad4f 100644 --- a/man/prune_meta.Rd +++ b/man/prune_meta.Rd @@ -24,7 +24,7 @@ is relative to \code{root}.} \item{stage}{stage the changes after removing the files. Defaults to \code{FALSE}.} } \value{ -returns invisibily a vector of removed files names. The paths are +returns invisibly a vector of removed files names. The paths are relative to \code{root}. } \description{ diff --git a/man/pull.Rd b/man/pull.Rd index f5a6511..78ba631 100644 --- a/man/pull.Rd +++ b/man/pull.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/reexport.R \name{pull} \alias{pull} -\title{Reexported Function From \code{git2r}} +\title{Re-exported Function From \code{git2r}} \description{ See \code{\link[git2r]{pull}} in \code{git2r}. } diff --git a/man/push.Rd b/man/push.Rd index c02d752..5d4c426 100644 --- a/man/push.Rd +++ b/man/push.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/reexport.R \name{push} \alias{push} -\title{Reexported Function From \code{git2r}} +\title{Re-exported Function From \code{git2r}} \description{ See \code{\link[git2r]{push}} in \code{git2r}. } diff --git a/man/read_vc.Rd b/man/read_vc.Rd index 92b18a1..f23ae09 100644 --- a/man/read_vc.Rd +++ b/man/read_vc.Rd @@ -20,7 +20,7 @@ The \code{data.frame} with the file names and hashes as attributes. \description{ \code{read_vc()} handles git2rdata objects stored by \code{write_vc()}. It reads and verifies the metadata file (\code{.yml}). Then it reads and verifies the raw data. 
-The last step is backtransforming any transformation done by \code{meta()} to +The last step is back-transforming any transformation done by \code{meta()} to return the \code{data.frame} as stored by \code{write_vc()}. \code{read_vc()} is an S3 generic on \code{root} which currently handles \code{"character"} diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index 9e5ce25..8e193d5 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -28,7 +28,7 @@ ignores the deletion of files. Use this information to document the current version of file or git2rdata object in an analysis. Since it refers to the most recent change of this file, it remains unchanged by committing changes to other files. You can -also use it to track if data got updated, requirering an analysis to +also use it to track if data got updated, requiring an analysis to be rerun. See \code{vignette("workflow", package = "git2rdata")}. } \examples{ diff --git a/man/relabel.Rd b/man/relabel.Rd index 41d49a0..2728291 100644 --- a/man/relabel.Rd +++ b/man/relabel.Rd @@ -31,7 +31,7 @@ have stored it with \code{write_vc(optimize = TRUE)}. The raw data file contains the factor indices and the metadata contains the link between the factor index and the corresponding label. See \code{vignette("version_control", package = "git2rdata")}. In such a case, -relabeling a factor can be fast and lightweight by updating the metadata. +relabelling a factor can be fast and lightweight by updating the metadata. } \examples{ diff --git a/man/repository.Rd b/man/repository.Rd index 9e653d8..a6e8cad 100644 --- a/man/repository.Rd +++ b/man/repository.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/reexport.R \name{repository} \alias{repository} -\title{Reexported Function From \code{git2r}} +\title{Re-exported Function From \code{git2r}} \description{ See \code{\link[git2r]{repository}} in \code{git2r}. 
} diff --git a/man/rm_data.Rd b/man/rm_data.Rd index db49c08..ab9069e 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -33,7 +33,7 @@ listed in a \code{.gitignore} file. Selecting \code{modified} will remove both visible data files, including \code{untracked} files.} } \value{ -returns invisibily a vector of removed files names. The paths are +returns invisibly a vector of removed files names. The paths are relative to \code{root}. } \description{ diff --git a/man/status.Rd b/man/status.Rd index 8b372eb..372a51d 100644 --- a/man/status.Rd +++ b/man/status.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/reexport.R \name{status} \alias{status} -\title{Reexported Function From \code{git2r}} +\title{Re-exported Function From \code{git2r}} \description{ See \code{\link[git2r]{status}} in \code{git2r}. } diff --git a/tests/testthat/test_d_recent_commit.R b/tests/testthat/test_d_recent_commit.R index 251fe57..52dcfc2 100644 --- a/tests/testthat/test_d_recent_commit.R +++ b/tests/testthat/test_d_recent_commit.R @@ -65,17 +65,22 @@ expect_identical( ) target <- file.path(git2r::workdir(root), "subsecond.txt") -write.table(test_data[11, ], file = target) -git2r::add(root, target) -commit_6 <- commit(root, "first subsecond") -write.table(test_data[12, ], file = target) -git2r::add(root, target) -commit_7 <- commit(root, "second subsecond") -write.table(test_data[13, ], file = target) -git2r::add(root, target) -commit_8 <- commit(root, "third subsecond") +while (TRUE) { + writeLines(sample(letters), con = target) + git2r::add(root, target) + cm_1 <- commit(root, "first subsecond") + writeLines(sample(letters), con = target) + git2r::add(root, target) + cm_2 <- commit(root, "second subsecond") + output <- suppressWarnings( + recent_commit(file = "subsecond.txt", root) + ) + if (nrow(output) > 1) { + break + } +} +expect_true(all(output$commit %in% c(cm_1$sha, cm_2$sha))) expect_warning( - output <- recent_commit(file = "subsecond.txt", root), + recent_commit(file = 
"subsecond.txt", root), "More than one commit within the same second" ) -expect_true(all(output$commit %in% c(commit_6$sha, commit_7$sha, commit_8$sha))) diff --git a/tests/testthat/test_d_relabel.R b/tests/testthat/test_d_relabel.R index dbfd7ce..a365c7e 100644 --- a/tests/testthat/test_d_relabel.R +++ b/tests/testthat/test_d_relabel.R @@ -54,7 +54,7 @@ test_that("relabel handles a data.frame of changes", { test_that("relabel only works on optimized files", { write_vc(ds, "relabel_verbose", root, sorting = "b", optimize = FALSE) expect_error(relabel("relabel_verbose", root, new_labels), - "relabeling factors on verbose data leads to large diffs") + "relabelling factors on verbose data leads to large diffs") }) test_that("relabel handles git repositories", { diff --git a/vignettes/efficiency.Rmd b/vignettes/efficiency.Rmd index 4878821..cdff72f 100644 --- a/vignettes/efficiency.Rmd +++ b/vignettes/efficiency.Rmd @@ -173,7 +173,7 @@ fn <- write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE) verbose_size <- sum(file.size(file.path(root, fn))) ``` -Since the data is highly compressable, `saveRDS()` yields the smallest file at the cost of having a binary file format. Both `write_vc()` formats yield smaller files than `write.table()`. Partly because `write_vc()` doesn't store row names and only uses quotes when needed. The difference between the optimized and verbose version of `write_vc()` is, in this case, solely due to the way factors are stored in the data (tsv) file. The optimized version stores the indices of the factor whereas the verbose version stores the levels. For example: `airbag$dvcat` has 5 levels with fairly short labels (on average 5 character), however storing the index requires only 1 character. Resulting in more compact files. +Since the data is highly compressible, `saveRDS()` yields the smallest file at the cost of having a binary file format. Both `write_vc()` formats yield smaller files than `write.table()`. 
Partly because `write_vc()` doesn't store row names and only uses quotes when needed. The difference between the optimized and verbose version of `write_vc()` is, in this case, solely due to the way factors are stored in the data (tsv) file. The optimized version stores the indices of the factor whereas the verbose version stores the levels. For example: `airbag$dvcat` has 5 levels with fairly short labels (on average 5 characters), however storing the index requires only 1 character. Resulting in more compact files. ```{r table_file_size, echo = FALSE} kable( @@ -340,9 +340,9 @@ if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") { } ``` -Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest posibble diff. We also test `write.table()` with a stable ordering of the observations and variables. +Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest possible diff. We also test `write.table()` with a stable ordering of the observations and variables. -The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. `saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repostory size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because the git repository will be compressed too. 
The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. There is a clear gain when using `write_vc()` with optimization in terms of storage size and the availability of metadata. The verbose option of `write_vc()` lacks the gain in terms of storage size but still has the metadata advantage. +The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. `saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repository size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because the git repository will be compressed too. 
The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. There is a clear gain when using `write_vc()` with optimization in terms of storage size and the availability of metadata. The verbose option of `write_vc()` lacks the gain in terms of storage size but still has the metadata advantage. ```{r plot_git_size, echo = FALSE, fig.cap = "Size of the git history using the different storage methods."} rs <- lapply( @@ -386,7 +386,7 @@ ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) + ## Timings -The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hunderd times and yields a distribution of timings for each expression. +The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hundred times and yields a distribution of timings for each expression. ### Writing Data diff --git a/vignettes/workflow.Rmd b/vignettes/workflow.Rmd index 6c767f2..4bfbf1c 100644 --- a/vignettes/workflow.Rmd +++ b/vignettes/workflow.Rmd @@ -180,7 +180,7 @@ push(repo = repo) We recommend a two repository set-up in case of recurring analyses. These are relative stable analyses which have to run with some frequency on updated data (e.g. once a month). Then it is worthwhile to convert the analyses into an R package. Long scripts can be converted into a set of shorter functions which are much easier to document and maintain. 
An R package offers lots of [functionality](http://r-pkgs.had.co.nz/check.html) out of the box to check the quality of your code. -The example below converts the import script above into a function. We illustrate how you can use Roxygen2 (see `vignette("roxygen2", package = "roxygen2")`) tags to document the function and to list its dependencies. Note that we added `session = TRUE` to `commit()`. This will append the `sessionInfo()` at the time of the commit to the commit message. Thus documenting all loaded R packages and their version. This documents to code used to create the git2rdatad object since your analysis code resides in a dedicated package with its own version number. We strongly recommend to run the import from a fresh R session. Then the `sessionInfo()` at commit time is limited to those packages with are strictly required for the import. Consider running the import from the command line. e.g. `Rscript -e 'mypackage::import_body_temp("path/to/root")'`. +The example below converts the import script above into a function. We illustrate how you can use Roxygen2 (see `vignette("roxygen2", package = "roxygen2")`) tags to document the function and to list its dependencies. Note that we added `session = TRUE` to `commit()`. This will append the `sessionInfo()` at the time of the commit to the commit message. Thus documenting all loaded R packages and their version. This documents the code used to create the git2rdata object since your analysis code resides in a dedicated package with its own version number. We strongly recommend to run the import from a fresh R session. Then the `sessionInfo()` at commit time is limited to those packages which are strictly required for the import. Consider running the import from the command line. e.g. `Rscript -e 'mypackage::import_body_temp("path/to/root")'`. ```{r eval = FALSE} #' Import the beaver body temperature data