Skip to content

Commit

Permalink
Merge pull request #62 from ropensci/split_table
Browse files Browse the repository at this point in the history
Split table
  • Loading branch information
ThierryO authored Jan 13, 2021
2 parents bad8a4c + 2ed454e commit 20762c5
Show file tree
Hide file tree
Showing 32 changed files with 1,215 additions and 95 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@
^codecov.yml$
^LICENSE.md$
^\.httr-oauth$
^doc$
^Meta$
2 changes: 1 addition & 1 deletion .github/workflows/check_on_different_r_os.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ jobs:
fail-fast: false
matrix:
config:
- {os: macOS-latest, r: 'devel'}
- {os: macOS-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
- {os: ubuntu-16.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}

env:
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
inst/doc
docs
.httr-oauth
doc
Meta
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: git2rdata
Title: Store and Retrieve Data.frames in a Git Repository
Version: 0.2.2
Version: 0.3.0
Authors@R:
c(person(given = "Thierry",
family = "Onkelinx",
Expand Down Expand Up @@ -66,5 +66,6 @@ Collate:
'recent_commit.R'
'reexport.R'
'relabel.R'
'rename_variable.R'
'upgrade_data.R'
'utils.R'
6 changes: 5 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ S3method(recent_commit,git_repository)
S3method(relabel,data.frame)
S3method(relabel,default)
S3method(relabel,list)
S3method(rename_variable,character)
S3method(rename_variable,default)
S3method(rename_variable,git_repository)
S3method(rm_data,character)
S3method(rm_data,default)
S3method(rm_data,git_repository)
Expand All @@ -53,6 +56,7 @@ export(push)
export(read_vc)
export(recent_commit)
export(relabel)
export(rename_variable)
export(repository)
export(rm_data)
export(status)
Expand All @@ -67,7 +71,6 @@ importFrom(assertthat,noNA)
importFrom(git2r,add)
importFrom(git2r,commit)
importFrom(git2r,hash)
importFrom(git2r,hashfile)
importFrom(git2r,last_commit)
importFrom(git2r,odb_blobs)
importFrom(git2r,pull)
Expand All @@ -77,6 +80,7 @@ importFrom(git2r,status)
importFrom(git2r,workdir)
importFrom(methods,setOldClass)
importFrom(stats,setNames)
importFrom(utils,file_test)
importFrom(utils,packageVersion)
importFrom(utils,read.table)
importFrom(utils,write.table)
Expand Down
14 changes: 14 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# git2rdata 0.3.0

## New features

* `write_vc()` gains an optional `split_by` argument.
See `vignette("split_by")` for more details.
* `rename_variable()` efficiently renames variables in a stored `git2rdata`
object.

## Bugfixes

* `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
when both the data and metadata are missing.

# git2rdata 0.2.2

* Use the [checklist](https://inbo.github.io/checklist) package for CI.
Expand Down
42 changes: 26 additions & 16 deletions R/datahash.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,40 @@
#' @family internal
#' @importFrom assertthat assert_that
#' @importFrom git2r hash
#' @importFrom utils file_test
datahash <- function(file) {
# Hash the content found at `file` with git2r's hash(). A regular file is
# hashed in chunks of lines; any other path is handled as a directory whose
# matching files are each hashed by a recursive call. The collected hashes
# are pasted together and hashed once more to give the return value.
chunk_size <- 1e4
hashes <- character(chunk_size + 1)
i <- 0
rawdata <- scan(
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
encoding = "UTF-8", skipNul = FALSE
)
while (length(rawdata)) {
hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
i <- i + 1
if (i %% chunk_size == 0) {
hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
}
# file_test("-f", ...) is TRUE for an existing path that is not a directory.
if (file_test("-f", file)) {
# `hashes` holds one slot per chunk plus one extra slot (chunk_size + 1)
# that accumulates earlier slots once they have all been used.
chunk_size <- 1e4
hashes <- character(chunk_size + 1)
i <- 0
# Read chunk `i`: skip the lines of the previous chunks, then read at most
# `chunk_size` lines, one string per line (sep = "\n").
rawdata <- scan(
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
encoding = "UTF-8", skipNul = FALSE
)
# Keep going until a read returns no lines.
while (length(rawdata)) {
# Hash every line, join the line hashes and hash that string; store the
# result in the slot of this chunk (slots are reused every `chunk_size`
# chunks).
hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
i <- i + 1
# All regular slots are filled: fold the whole vector into the extra slot
# before the regular slots get overwritten by the next chunks.
if (i %% chunk_size == 0) {
hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
}
# Read the next chunk.
rawdata <- scan(
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
encoding = "UTF-8", skipNul = FALSE
)
}
} else {
# Not a regular file: list the matching files inside `file` and hash each
# of them with a recursive call.
# NOTE(review): the `index` alternative of the pattern is neither anchored
# nor tied to the `.tsv` extension, so any file name containing "index"
# matches -- confirm this is intended.
# NOTE(review): sapply() does not guarantee a character vector (it returns
# an empty list when no file matches); vapply(..., character(1)) would.
hashes <- sapply(
list.files(
file, pattern = "(index|[[:xdigit:]]{20}\\.tsv$)", full.names = TRUE
),
datahash
)
}
# Combine the per-chunk or per-file hashes into a single hash.
hash(paste(hashes, collapse = ""))
}
Expand Down
54 changes: 44 additions & 10 deletions R/is_git2rdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,51 @@ is_git2rdata.character <- function(file, root = ".",

# read the metadata
meta_data <- read_yaml(file["meta_file"])

correct <- names(meta_data)
correct <- paste(correct[correct != "..generic"], collapse = "\t")
header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
if (correct != header) {
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
switch(message, error = stop(msg, call. = FALSE),
warning = warning(msg, call. = FALSE))
return(FALSE)
if (has_name(meta_data[["..generic"]], "split_by")) {
header <- readLines(
file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8"
)
correct <- paste(
c(meta_data[["..generic"]][["split_by"]], "..hash"),
collapse = "\t"
)
if (correct != header) {
msg <- paste(
"Corrupt data, incorrect header in index.tsv. Expecting:", correct
)
switch(message, error = stop(msg, call. = FALSE),
warning = warning(msg, call. = FALSE))
return(FALSE)
}
correct <- names(meta_data)
keep <- !correct %in% c("..generic", meta_data[["..generic"]][["split_by"]])
correct <- paste(correct[keep], collapse = "\t")
header <- vapply(
list.files(file["raw_file"], pattern = "[[:xdigit:]]{20}\\.tsv"),
function(z) {
readLines(
file.path(file["raw_file"], z), n = 1, encoding = "UTF-8"
)
},
character(1)
)
if (any(header != correct)) {
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
switch(message, error = stop(msg, call. = FALSE),
warning = warning(msg, call. = FALSE))
return(FALSE)
}
} else {
correct <- names(meta_data)
correct <- paste(correct[correct != "..generic"], collapse = "\t")
header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
if (correct != header) {
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
switch(message, error = stop(msg, call. = FALSE),
warning = warning(msg, call. = FALSE))
return(FALSE)
}
}

return(TRUE)
}

Expand Down
6 changes: 5 additions & 1 deletion R/is_git2rmeta.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ is_git2rmeta.character <- function(file, root = ".",
file <- clean_data_path(root = root, file = file)

if (!file.exists(file["meta_file"])) {
msg <- "Metadata file missing."
msg <- ifelse(
file.exists(file["raw_file"]),
"Metadata file missing.",
"`git2rdata` object not found."
)
switch(message, error = stop(msg, call. = FALSE),
warning = warning(msg, call. = FALSE))
return(FALSE)
Expand Down
18 changes: 17 additions & 1 deletion R/meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,24 @@ meta.Date <- function(x, optimize = TRUE, ...) {
#' @rdname meta
#' @inheritParams write_vc
meta.data.frame <- function(# nolint
x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...
x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
split_by = character(0), ...
) {
assert_that(
!has_name(x, "..generic"),
msg = "'..generic' is a reserved name and not allowed as column name")
assert_that(
!has_name(x, "..hash"),
msg = "'..hash' is a reserved name and not allowed as column name")
generic <- list(optimize = optimize, "NA string" = na)
assert_that(is.character(split_by))
assert_that(
all(split_by %in% colnames(x)),
msg = "All split_by variables must be available in the data.frame")
assert_that(
any(!colnames(x) %in% split_by),
msg = "No remaining variables after splitting"
)

dots <- list(...)
if (has_name(dots, "old")) {
Expand All @@ -236,6 +248,7 @@ Sorting is strongly recommended in combination with version control.")
assert_that(
all(sorting %in% colnames(x)),
msg = "All sorting variables must be available in the data.frame")
sorting <- unique(c(split_by, sorting))
if (nrow(x) > 1) {
old_locale <- set_c_locale()
x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint
Expand All @@ -249,6 +262,9 @@ Add extra sorting variables to ensure small diffs.", sorted)
}
generic <- c(generic, sorting = list(sorting))
}
if (length(split_by) > 0) {
generic <- c(generic, split_by = list(split_by))
}
# calculate meta for each column
if (has_name(dots, "old")) {
common <- names(old)[names(old) %in% colnames(x)]
Expand Down
55 changes: 47 additions & 8 deletions R/read_vc.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ read_vc.default <- function(file, root) {
#' @importFrom yaml read_yaml
#' @importFrom utils read.table
#' @importFrom stats setNames
#' @importFrom git2r hashfile
read_vc.character <- function(file, root = ".") {
assert_that(is.string(file), is.string(root))
root <- normalizePath(root, winslash = "/", mustWork = TRUE)
Expand Down Expand Up @@ -69,14 +68,54 @@ read_vc.character <- function(file, root = ".") {
col_classes <- vapply(details, "[[", character(1), "class")

# read the raw data and check the data hash
raw_data <- read.table(
file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
dec = ".", numerals = "warn.loss", na.strings = na_string,
colClasses = setNames(col_type[col_classes], col_names), comment.char = "",
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
)

if (has_name(meta_data[["..generic"]], "split_by")) {
split_by <- meta_data[["..generic"]][["split_by"]]
which_split_by <- col_names %in% split_by
index <- read.table(
file = file.path(file["raw_file"], "index.tsv"),
header = TRUE, sep = "\t", quote = "\"",
dec = ".", numerals = "warn.loss", na.strings = na_string,
colClasses = setNames(
col_type[col_classes[which_split_by]],
col_names[which_split_by]
),
comment.char = "",
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
)
raw_data <- vapply(
seq_len(nrow(index)),
function(i) {
rf <- file.path(file["raw_file"], paste0(index[i, "..hash"], ".tsv"))
raw_data <- read.table(
file = rf, header = TRUE, sep = "\t", quote = "\"",
dec = ".", numerals = "warn.loss", na.strings = na_string,
colClasses = setNames(
col_type[col_classes[!which_split_by]],
col_names[!which_split_by]
),
comment.char = "",
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
)
raw_data <- cbind(
index[rep(i, nrow(raw_data)), split_by, drop = FALSE],
raw_data
)
return(list(raw_data))
},
vector(mode = "list", length = 1)
)
raw_data <- do.call(rbind, raw_data)[, col_names]
} else {
raw_data <- read.table(
file = file["raw_file"], header = TRUE, sep = "\t", quote = "\"",
dec = ".", numerals = "warn.loss", na.strings = na_string,
colClasses = setNames(col_type[col_classes], col_names),
comment.char = "",
stringsAsFactors = FALSE, fileEncoding = "UTF-8"
)
}
dh <- datahash(file["raw_file"])

if (meta_data[["..generic"]][["data_hash"]] != dh) {
meta_data[["..generic"]][["data_hash"]] <- dh
warning("Mismatching data hash. Data altered outside of git2rdata.",
Expand Down
Loading

0 comments on commit 20762c5

Please sign in to comment.