ropensci · ThierryO · Jan 24, 2025 · Oct 11, 2024 · Oct 11, 2024 · Nov 26, 2024
diff --git a/.github/workflows/pr_title.yml b/.github/workflows/pr_title.yml
@@ -0,0 +1,20 @@
+name: 'PR Title Checker'
+on:
+  pull_request:
+    types: [edited, opened, synchronize, reopened]
+    branches:
+    - main
+    - master
+
+jobs:
+  title-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: naveenk1223/action-pr-title@master
+        with:
+          regex: 'Version [0-9]+\.[0-9]+(\.[0-9]+)?$' # Regex the title should match.
+          allowed_prefixes: ':bookmark:' # title should start with the given prefix
+          prefix_case_sensitive: true # title prefix is case sensitive
+          min_length: 11 # Min length of the title
+          max_length: -1 # Max length of the title
+          name: Check PR title
diff --git a/.zenodo.json b/.zenodo.json
@@ -1,6 +1,6 @@
 {
   "title": "git2rdata: Store and Retrieve Data.frames in a Git Repository",
-  "version": "0.4.1",
+  "version": "0.5.0",
   "license": "GPL-3.0",
   "upload_type": "software",
   "description": "<p>The git2rdata package is an R package for writing and reading dataframes as plain text files. A metadata file stores important information. 1) Storing metadata allows to maintain the classes of variables. By default, git2rdata optimizes the data for file storage. The optimization is most effective on data containing factors. The optimization makes the data less human readable. The user can turn this off when they prefer a human readable format over smaller files. Details on the implementation are available in vignette(“plain_text”, package = “git2rdata”). 2) Storing metadata also allows smaller row based diffs between two consecutive commits. This is a useful feature when storing data as plain text files under version control. Details on this part of the implementation are available in vignette(“version_control”, package = “git2rdata”). Although we envisioned git2rdata with a git workflow in mind, you can use it in combination with other version control systems like subversion or mercurial. 3) git2rdata is a useful tool in a reproducible and traceable workflow. vignette(“workflow”, package = “git2rdata”) gives a toy example. 4) vignette(“efficiency”, package = “git2rdata”) provides some insight into the efficiency of file storage, git repository size and speed for writing and reading.<\/p>",

diff --git a/CITATION.cff b/CITATION.cff
@@ -40,4 +40,4 @@ identifiers:
   value: 10.5281/zenodo.1485309
 - type: url
   value: https://ropensci.github.io/git2rdata/
-version: 0.4.1
+version: 0.5.0
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: git2rdata
 Title: Store and Retrieve Data.frames in a Git Repository
-Version: 0.4.1
+Version: 0.5.0
 Authors@R: c(
     person("Thierry", "Onkelinx", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-8804-4216", affiliation = "Research Institute for Nature and Forest (INBO)")),
@@ -38,14 +38,15 @@ URL: https://ropensci.github.io/git2rdata/,
     https://doi.org/10.5281/zenodo.1485309
 BugReports: https://github.com/ropensci/git2rdata/issues
 Depends: 
-    R (>= 3.5.0)
+    R (>= 4.1.0)
 Imports:
     assertthat,
     git2r (>= 0.23.0),
     methods,
     yaml
 Suggests:
     ggplot2,
+    jsonlite,
     knitr,
     microbenchmark,
     rmarkdown,
@@ -60,6 +61,7 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
 Collate:
     'clean_data_path.R'
+    'data_package.R'
     'datahash.R'
     'display_metadata.R'
     'git2rdata_package.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -51,6 +51,7 @@ S3method(write_vc,character)
 S3method(write_vc,default)
 S3method(write_vc,git_repository)
 export(commit)
+export(data_package)
 export(display_metadata)
 export(is_git2rdata)
 export(is_git2rmeta)
@@ -74,6 +75,7 @@ importFrom(assertthat,"on_failure<-")
 importFrom(assertthat,assert_that)
 importFrom(assertthat,has_attr)
 importFrom(assertthat,has_name)
+importFrom(assertthat,is.count)
 importFrom(assertthat,is.flag)
 importFrom(assertthat,is.string)
 importFrom(assertthat,noNA)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# git2rdata 0.5.0
+
+* `read_vc()` handles empty datasets stored with `split_by`. 
+* `write_vc()` and `meta()` gain a `digits` argument.
+  The argument specifies the number of significant digits to store for numeric
+  values.
+
 # git2rdata 0.4.1
 
 * Add `update_metadata()` to update the description of a `git2rdata` object.

diff --git a/R/data_package.R b/R/data_package.R
@@ -0,0 +1,107 @@
+#' Create a Data Package for a directory of CSV files
+#'
+#' @description
+#' Create a `datapackage.json` file for a directory of CSV files.
+#' The function looks for all `.csv` files in the directory and its
+#' subdirectories and keeps those which `is_git2rdata()` accepts.
+#' It then writes a `datapackage.json` file with the metadata of each of these
+#' CSV files.
+#' The function stops with an error when `path` is not a directory or when none
+#' of the CSV files is accepted.
+#' The `jsonlite` package must be installed.
+#'
+#' @param path the directory in which to create the `datapackage.json` file.
+#' @return The path to the `datapackage.json` file, i.e.
+#' `file.path(path, "datapackage.json")`.
+#' @family storage
+#' @export
+#' @importFrom assertthat assert_that is.string noNA
+data_package <- function(path = ".") {
+  assert_that(
+    is.string(path), noNA(path), requireNamespace("jsonlite", quietly = TRUE)
+  )
+  stopifnot("`path` is not a directory" = file_test("-d", path))
+
+  # `pattern` is a regular expression: escape the dot so that only a literal
+  # `.csv` extension matches (an unescaped dot matches any character)
+  data_files <- list.files(path, pattern = "\\.csv$", recursive = TRUE)
+  relevant <- vapply(
+    data_files, FUN = is_git2rdata, FUN.VALUE = logical(1), root = path
+  )
+  stopifnot(
+    "no non-optimized git2rdata objects found at `path`" = any(relevant)
+  )
+  data_files <- data_files[relevant]
+
+  # data_resource() wraps every resource in a list of length one, which lets
+  # vapply() collect the resources into a single list
+  resources <- vapply(
+    data_files, path = path, FUN = data_resource,
+    FUN.VALUE = vector(mode = "list", length = 1)
+  )
+  json_file <- file.path(path, "datapackage.json")
+  # drop the file names so `resources` is written as a JSON array
+  list(resources = unname(resources)) |>
+    jsonlite::toJSON(pretty = TRUE, auto_unbox = TRUE) |>
+    writeLines(json_file)
+  return(json_file)
+}
+
+# Build the data package resource description of a single CSV file from the
+# metadata file located by clean_data_path().
+# Returns the resource wrapped in a list of length one.
+#' @importFrom assertthat assert_that is.string noNA
+#' @importFrom yaml read_yaml
+data_resource <- function(file, path = ".") {
+  assert_that(
+    is.string(file), is.string(path), noNA(file), noNA(path)
+  )
+  stopifnot("`path` is not a directory" = file_test("-d", path))
+
+  # the second element of clean_data_path() is the file read as YAML metadata
+  meta_file <- clean_data_path(root = path, file = file)[2]
+  metadata <- read_yaml(meta_file)
+  generic <- metadata[["..generic"]]
+
+  # every entry except the first one is handled as a field of the data
+  fields <- vapply(
+    names(metadata)[-1], FUN = field_schema, metadata = metadata,
+    FUN.VALUE = vector(mode = "list", length = 1)
+  )
+  missing_values <- list(
+    c(value = generic[["NA string"]], label = "missing")
+  )
+  resource <- list(
+    name = coalesce(generic[["name"]], file), path = file,
+    encoding = "utf-8", format = "csv", media_type = "text/csv",
+    hash = paste0("sha1:", generic[["data_hash"]]),
+    schema = list(fields = unname(fields), missingValues = missing_values)
+  )
+
+  # prepend the optional generic metadata when available
+  optional <- c("title", "description")
+  optional <- optional[optional %in% names(generic)]
+  list(c(generic[optional], resource))
+}
+
+# Translate the metadata of variable `x` into a field description for the data
+# package schema.
+# Returns the description wrapped in a list of length one.
+# Stops when the class stored in the metadata is not supported.
+field_schema <- function(x, metadata) {
+  field <- metadata[[x]]
+  # field type matching the stored class
+  type <- switch(
+    field$class,
+    "character" = "string",
+    "Date" = "date",
+    "logical" = "boolean",
+    "factor" = "string",
+    "integer" = "integer",
+    "numeric" = "number",
+    "POSIXct" = "datetime",
+    stop("field_schema() can't handle ", field$class)
+  )
+  # additional properties which only some classes need
+  extras <- switch(
+    field$class,
+    "logical" = list(
+      trueValues = c("TRUE", "true"), falseValues = c("FALSE", "false")
+    ),
+    "factor" = list(
+      categories = field[["labels"]], categoriesOrdered = field[["ordered"]]
+    ),
+    "POSIXct" = list(format = "%Y-%m-%dT%H:%M:%SZ"),
+    list()
+  )
+  schema <- c(list(name = x, type = type), extras)
+  if ("description" %in% names(field)) {
+    schema$description <- field[["description"]]
+  }
+  list(schema)
+}
+
+# Return the first argument which is not NULL.
+# Returns NULL when there are no arguments or when all arguments are NULL.
+coalesce <- function(...) {
+  for (candidate in list(...)) {
+    if (!is.null(candidate)) {
+      return(candidate)
+    }
+  }
+  NULL
+}
diff --git a/R/meta.R b/R/meta.R
@@ -14,7 +14,7 @@
 #' @examples
 #' meta(c(NA, "'NA'", '"NA"', "abc\tdef", "abc\ndef"))
 #' meta(1:3)
-#' meta(seq(1, 3, length = 4))
+#' meta(seq(1, 3, length = 4), digits = 6)
 #' meta(factor(c("b", NA, "NA"), levels = c("NA", "b", "c")))
 #' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c")), optimize = FALSE)
 #' meta(factor(c("b", NA, "a"), levels = c("a", "b", "c"), ordered = TRUE))
@@ -29,7 +29,7 @@
 #' meta(as.POSIXct("2019-02-01 10:59:59", tz = "CET"), optimize = FALSE)
 #' meta(as.Date("2019-02-01"))
 #' meta(as.Date("2019-02-01"), optimize = FALSE)
-meta <- function(x, ...) {
+meta <- function(x, ..., digits) {
   UseMethod("meta", x)
 }
 
@@ -63,8 +63,11 @@ meta.integer <- function(x, ...) {
 }
 
 #' @export
-meta.numeric <- function(x, ...) {
-  list(class = "numeric") -> m
+#' @importFrom assertthat assert_that is.count
+meta.numeric <- function(x, ..., digits) {
+  stopifnot("`digits` must be a strict positive integer" = is.count(digits))
+  x <- signif(x, digits = digits)
+  list(class = "numeric", digits = as.integer(digits)) -> m
   class(m) <- "meta_detail"
   attr(x, "meta") <- m
   return(x)
@@ -218,7 +221,7 @@ meta.Date <- function(x, optimize = TRUE, ...) {
 #' @inheritParams write_vc
 meta.data.frame <- function(# nolint
   x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
-  split_by = character(0), ...
+  split_by = character(0), ..., digits
 ) {
   assert_that(
     !has_name(x, "..generic"),
@@ -237,13 +240,46 @@ meta.data.frame <- function(# nolint
   )
 
   dots <- list(...)
+  float <- vapply(x, is.numeric, logical(1)) &
+    !vapply(x, is.integer, logical(1))
   if (has_name(dots, "old")) {
     old <- dots$old
     assert_that(inherits(old, "meta_list"))
     if (missing(sorting)) {
       sorting <- old[["..generic"]][["sorting"]]
     }
+    if (any(float) && missing(digits)) {
+      old_numeric <- vapply(
+        old, FUN.VALUE = logical(1),
+        FUN = function(x) {
+          has_name(x, "class") && x$class == "numeric" && has_name(x, "digits")
+        }
+      )
+      digits <- vapply(
+        old[old_numeric], FUN.VALUE = numeric(1),
+        FUN = function(x) {
+          x[["digits"]]
+        }
+      )
+      relevant <- names(float)[float][!names(float)[float] %in% names(digits)]
+      rep(6L, length(relevant)) -> digits[relevant]
+    }
+  }
+  if (any(float) && missing(digits)) {
+    digits <- 6L
+    warning("`digits` was not set. Setting is automatically to 6. See ?meta")
   }
+  if (any(float) && is.null(names(digits))) {
+    stopifnot(
+      "`digits` must be either named or have length 1" = length(digits) == 1
+    )
+    digits <- rep(digits, sum(float))
+    names(digits) <- names(float)[float]
+  }
+  stopifnot(
+    "`digits` must contain all numeric variables of `x`" =
+      all(!float) || all(names(float)[float] %in% names(digits))
+  )
 
   # apply sorting
   if (missing(sorting) || is.null(sorting) || !length(sorting)) {
@@ -271,12 +307,13 @@ Add extra sorting variables to ensure small diffs.", sorted)
   if (length(split_by) > 0) {
     generic <- c(generic, split_by = list(split_by))
   }
+
   # calculate meta for each column
   if (!has_name(dots, "old")) {
     z <- lapply(
       colnames(x),
       function(id, optimize, na) {
-        meta(x[[id]], optimize = optimize, na = na)
+        meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]])
       },
       optimize = optimize, na = na
     )
@@ -290,7 +327,7 @@ Add extra sorting variables to ensure small diffs.", sorted)
           meta(
             x[[id]], optimize = optimize, na = na,
             index = setNames(old[[id]][["index"]], old[[id]][["labels"]]),
-            strict = strict
+            strict = strict, digits = digits[[id]]
           )
         },
         optimize = old[["..generic"]][["optimize"]],
@@ -305,7 +342,7 @@ Add extra sorting variables to ensure small diffs.", sorted)
       z_new <- lapply(
         new,
         function(id, optimize, na) {
-          meta(x[[id]], optimize = optimize, na = na)
+          meta(x[[id]], optimize = optimize, na = na, digits = digits[[id]])
         },
         optimize = optimize, na = na
       )

diff --git a/R/read_vc.R b/R/read_vc.R
@@ -83,29 +83,38 @@
       comment.char = "",
       stringsAsFactors = FALSE, fileEncoding = "UTF-8"
     )
-    raw_data <- vapply(
-      seq_len(nrow(index)),
-      function(i) {
-        rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
-        raw_data <- read.table(
-          file = rf, header = TRUE, sep = "\t", quote = "\"",
-          dec = ".", numerals = "warn.loss", na.strings = na_string,
-          colClasses = setNames(
-            col_type[col_classes[!which_split_by]],
-            col_names[!which_split_by]
-          ),
-          comment.char = "",
-          stringsAsFactors = FALSE, fileEncoding = "UTF-8"
-        )
-        raw_data <- cbind(
-          index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
-          raw_data
-        )
-        return(list(raw_data))
-      },
-      vector(mode = "list", length = 1)
-    )
-    raw_data <- do.call(rbind, raw_data)[, col_names]
+    if (nrow(index) == 0) {
+      list(
+        character = character(0), factor = character(0), integer = integer(0),
+        numeric = numeric(0)
+      )[col_classes] |>
+        setNames(col_names) |>
+        as.data.frame() -> raw_data
+    } else {
+      raw_data <- vapply(
+        seq_len(nrow(index)),
+        function(i) {
+          rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
+          raw_data <- read.table(
+            file = rf, header = TRUE, sep = "\t", quote = "\"",
+            dec = ".", numerals = "warn.loss", na.strings = na_string,
+            colClasses = setNames(
+              col_type[col_classes[!which_split_by]],
+              col_names[!which_split_by]
+            ),
+            comment.char = "",
+            stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+          )
+          raw_data <- cbind(
+            index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
+            raw_data
+          )
+          return(list(raw_data))
+        },
+        vector(mode = "list", length = 1)
+      )
+      raw_data <- do.call(rbind, raw_data)[, col_names]
+    }
   } else {
     raw_data <- read.table(
       file = file["raw_file"], header = TRUE, sep = ifelse(optimize, "\t", ","),