ropensci · ThierryO · Jan 13, 2021 · Sep 6, 2020 · Sep 15, 2020 · Sep 15, 2020
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -13,3 +13,5 @@
 ^codecov.yml$
 ^LICENSE.md$
 ^\.httr-oauth$
+^doc$
+^Meta$
diff --git a/.github/workflows/check_on_different_r_os.yml b/.github/workflows/check_on_different_r_os.yml
@@ -18,9 +18,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macOS-latest,   r: 'devel'}
           - {os: macOS-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-20.04,   r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
           - {os: ubuntu-16.04,   r: 'oldrel',  rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 
     env:

diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,5 @@
 inst/doc
 docs
 .httr-oauth
+doc
+Meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: git2rdata
 Title: Store and Retrieve Data.frames in a Git Repository
-Version: 0.2.2
+Version: 0.3.0
 Authors@R: 
     c(person(given = "Thierry",
              family = "Onkelinx",
@@ -66,5 +66,6 @@ Collate:
     'recent_commit.R'
     'reexport.R'
     'relabel.R'
+    'rename_variable.R'
     'upgrade_data.R'
     'utils.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -33,6 +33,9 @@ S3method(recent_commit,git_repository)
 S3method(relabel,data.frame)
 S3method(relabel,default)
 S3method(relabel,list)
+S3method(rename_variable,character)
+S3method(rename_variable,default)
+S3method(rename_variable,git_repository)
 S3method(rm_data,character)
 S3method(rm_data,default)
 S3method(rm_data,git_repository)
@@ -53,6 +56,7 @@ export(push)
 export(read_vc)
 export(recent_commit)
 export(relabel)
+export(rename_variable)
 export(repository)
 export(rm_data)
 export(status)
@@ -67,7 +71,6 @@ importFrom(assertthat,noNA)
 importFrom(git2r,add)
 importFrom(git2r,commit)
 importFrom(git2r,hash)
-importFrom(git2r,hashfile)
 importFrom(git2r,last_commit)
 importFrom(git2r,odb_blobs)
 importFrom(git2r,pull)
@@ -77,6 +80,7 @@ importFrom(git2r,status)
 importFrom(git2r,workdir)
 importFrom(methods,setOldClass)
 importFrom(stats,setNames)
+importFrom(utils,file_test)
 importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,write.table)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,17 @@
+# git2rdata 0.3.0
+
+## New features
+
+* `write_vc()` gains an optional `split_by` argument.
+  See `vignette("split_by")` for more details.
+* `rename_variable()` efficiently renames variables in a stored `git2rdata`
+  object.
+
+## Bugfixes
+
+* `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
+  when both the data and metadata are missing.
+
 # git2rdata 0.2.2
 
 * Use the [checklist](https://inbo.github.io/checklist) package for CI.

diff --git a/R/datahash.R b/R/datahash.R
@@ -8,30 +8,40 @@
 #' @family internal
 #' @importFrom assertthat assert_that
 #' @importFrom git2r hash
+#' @importFrom utils file_test
 datahash <- function(file) {
-  chunk_size <- 1e4
-  hashes <- character(chunk_size + 1)
-  i <- 0
-  rawdata <- scan(
-    file = file, what = character(), nmax = -1, sep = "\n", quote = "",
-    skip = i * chunk_size, nlines = chunk_size, na.strings = "",
-    flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
-    blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
-    encoding = "UTF-8", skipNul = FALSE
-  )
-  while (length(rawdata)) {
-    hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
-    i <- i + 1
-    if (i  %% chunk_size == 0) {
-      hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
-    }
+  if (file_test("-f", file)) { # `file` is a regular file: hash its lines chunk by chunk
+    chunk_size <- 1e4 # number of lines requested per scan() call
+    hashes <- character(chunk_size + 1) # slots 1..chunk_size hold chunk hashes, the last slot a roll-up
+    i <- 0 # number of chunks hashed so far
     rawdata <- scan(
       file = file, what = character(), nmax = -1, sep = "\n", quote = "",
       skip = i * chunk_size, nlines = chunk_size, na.strings = "",
       flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
       blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
       encoding = "UTF-8", skipNul = FALSE
     )
+    while (length(rawdata)) { # stop once scan() returns no more lines
+      hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n")) # hash of the newline-joined hash(rawdata) values (presumably one per line)
+      i <- i + 1 # also advances the `skip` offset of the next scan()
+      if (i  %% chunk_size == 0) { # every chunk_size chunks the slots wrap around
+        hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
+      }
+      rawdata <- scan( # read the next chunk, skipping the i * chunk_size lines already hashed
+        file = file, what = character(), nmax = -1, sep = "\n", quote = "",
+        skip = i * chunk_size, nlines = chunk_size, na.strings = "",
+        flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
+        blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
+        encoding = "UTF-8", skipNul = FALSE
+      )
+    }
+  } else { # not a regular file: treated as a directory (presumably a split_by data folder)
+    hashes <- sapply( # NOTE(review): sapply() return type depends on its input; vapply(..., character(1)) would be stricter
+      list.files(
+        file, pattern = "(index|[[:xdigit:]]{20}\\.tsv$)", full.names = TRUE # NOTE(review): `index` is unanchored, so any file name containing it matches
+      ),
+      datahash # recursive call: each listed file is hashed on its own
+    )
   }
   hash(paste(hashes, collapse = ""))
 }

diff --git a/R/is_git2rdata.R b/R/is_git2rdata.R
@@ -43,17 +43,51 @@ is_git2rdata.character <- function(file, root = ".",
 
   # read the metadata
   meta_data <- read_yaml(file["meta_file"])
-
-  correct <- names(meta_data)
-  correct <- paste(correct[correct != "..generic"], collapse = "\t")
-  header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
-  if (correct != header) {
-    msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
-    switch(message, error = stop(msg, call. = FALSE),
-           warning = warning(msg, call. = FALSE))
-    return(FALSE)
+  if (has_name(meta_data[["..generic"]], "split_by")) {
+    header <- readLines(
+      file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8"
+    )
+    correct <- paste(
+      c(meta_data[["..generic"]][["split_by"]], "..hash"),
+      collapse = "\t"
+    )
+    if (correct != header) {
+      msg <- paste(
+        "Corrupt data, incorrect header in index.tsv. Expecting:", correct
+      )
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
+    correct <- names(meta_data)
+    keep <- !correct %in% c("..generic", meta_data[["..generic"]][["split_by"]])
+    correct <- paste(correct[keep], collapse = "\t")
+    header <- vapply(
+      list.files(file["raw_file"], pattern = "[[:xdigit:]]{20}\\.tsv"),
+      function(z) {
+        readLines(
+          file.path(file["raw_file"], z), n = 1, encoding = "UTF-8"
+        )
+      },
+      character(1)
+    )
+    if (any(header != correct)) {
+      msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
+  } else {
+    correct <- names(meta_data)
+    correct <- paste(correct[correct != "..generic"], collapse = "\t")
+    header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
+    if (correct != header) {
+      msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
+      switch(message, error = stop(msg, call. = FALSE),
+             warning = warning(msg, call. = FALSE))
+      return(FALSE)
+    }
   }
-
   return(TRUE)
 }
 

diff --git a/R/is_git2rmeta.R b/R/is_git2rmeta.R
@@ -38,7 +38,11 @@ is_git2rmeta.character <- function(file, root = ".",
   file <- clean_data_path(root = root, file = file)
 
   if (!file.exists(file["meta_file"])) {
-    msg <- "Metadata file missing."
+    msg <- ifelse(
+      file.exists(file["raw_file"]),
+      "Metadata file missing.",
+      "`git2rdata` object not found."
+    )
     switch(message, error = stop(msg, call. = FALSE),
            warning = warning(msg, call. = FALSE))
     return(FALSE)

diff --git a/R/meta.R b/R/meta.R
@@ -211,12 +211,24 @@ meta.Date <- function(x, optimize = TRUE, ...) {
 #' @rdname meta
 #' @inheritParams write_vc
 meta.data.frame <- function(# nolint
-  x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...
+  x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
+  split_by = character(0), ...
 ) {
   assert_that(
     !has_name(x, "..generic"),
     msg = "'..generic' is a reserved name and not allowed as column name")
+  assert_that(
+    !has_name(x, "..hash"),
+    msg = "'..hash' is a reserved name and not allowed as column name")
   generic <- list(optimize = optimize, "NA string" = na)
+  assert_that(is.character(split_by))
+  assert_that(
+    all(split_by %in% colnames(x)),
+    msg = "All split_by variables must be available in the data.frame")
+  assert_that(
+    any(!colnames(x) %in% split_by),
+    msg = "No remaining variables after splitting"
+  )
 
   dots <- list(...)
   if (has_name(dots, "old")) {
@@ -236,6 +248,7 @@ Sorting is strongly recommended in combination with version control.")
     assert_that(
       all(sorting %in% colnames(x)),
       msg = "All sorting variables must be available in the data.frame")
+    sorting <- unique(c(split_by, sorting))
     if (nrow(x) > 1) {
       old_locale <- set_c_locale()
       x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint
@@ -249,6 +262,9 @@ Add extra sorting variables to ensure small diffs.", sorted)
     }
     generic <- c(generic, sorting = list(sorting))
   }
+  if (length(split_by) > 0) {
+    generic <- c(generic, split_by = list(split_by))
+  }
   # calculate meta for each column
   if (has_name(dots, "old")) {
     common <- names(old)[names(old) %in% colnames(x)]

diff --git a/R/read_vc.R b/R/read_vc.R
@@ -30,7 +30,6 @@ read_vc.default <- function(file, root) {
 #' @importFrom yaml read_yaml
 #' @importFrom utils read.table
 #' @importFrom stats setNames
-#' @importFrom git2r hashfile
 read_vc.character <- function(file, root = ".") {
   assert_that(is.string(file), is.string(root))
   root <- normalizePath(root, winslash = "/", mustWork = TRUE)
@@ -69,14 +68,54 @@ read_vc.character <- function(file, root = ".") {
   col_classes <- vapply(details, "[[", character(1), "class")
 
   # read the raw data and check the data hash
-  raw_data <- read.table(
-    file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
-    dec = ".", numerals = "warn.loss", na.strings = na_string,
-    colClasses = setNames(col_type[col_classes], col_names), comment.char = "",
-    stringsAsFactors = FALSE, fileEncoding = "UTF-8"
-  )
-
+  if (has_name(meta_data[["..generic"]], "split_by")) {
+    split_by <- meta_data[["..generic"]][["split_by"]]
+    which_split_by <- col_names %in% split_by
+    index <- read.table(
+      file = file.path(file["raw_file"], "index.tsv"),
+      header = TRUE, sep = "\t", quote = "\"",
+      dec = ".", numerals = "warn.loss", na.strings = na_string,
+      colClasses = setNames(
+        col_type[col_classes[which_split_by]],
+        col_names[which_split_by]
+      ),
+      comment.char = "",
+      stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+    )
+    raw_data <- vapply(
+      seq_len(nrow(index)),
+      function(i) {
+        rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
+        raw_data <- read.table(
+          file = rf, header = TRUE, sep = "\t", quote = "\"",
+          dec = ".", numerals = "warn.loss", na.strings = na_string,
+          colClasses = setNames(
+            col_type[col_classes[!which_split_by]],
+            col_names[!which_split_by]
+          ),
+          comment.char = "",
+          stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+        )
+        raw_data <- cbind(
+          index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
+          raw_data
+        )
+        return(list(raw_data))
+      },
+      vector(mode = "list", length = 1)
+    )
+    raw_data <- do.call(rbind, raw_data)[, col_names]
+  } else {
+    raw_data <- read.table(
+      file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
+      dec = ".", numerals = "warn.loss", na.strings = na_string,
+      colClasses = setNames(col_type[col_classes], col_names),
+      comment.char = "",
+      stringsAsFactors = FALSE, fileEncoding = "UTF-8"
+    )
+  }
   dh <- datahash(file["raw_file"])
+
   if (meta_data[["..generic"]][["data_hash"]] != dh) {
     meta_data[["..generic"]][["data_hash"]] <- dh
     warning("Mismatching data hash. Data altered outside of git2rdata.",
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,5 @@
 inst/doc
 docs
 .httr-oauth
+doc
+Meta