Merge pull request #93 from ShawHahnLab/release-0.4.1

Release 0.4.1
ShawHahnLab · Jan 10, 2023 · 646f52e · 646f52e
2 parents 229ffbd + 73aca8f
commit 646f52e
Show file tree

Hide file tree

Showing 131 changed files with 2,873 additions and 2,400 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -80,4 +80,4 @@ workflows:
       - build:
           matrix:
             parameters:
-              rversion: ["3.6.3", "4.2.1"]
+              rversion: ["3.6.3", "4.2.1", "4.2.2"]
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chiimp
 Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
-Version: 0.4.0
+Version: 0.4.1
 Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
 Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
 Depends: R (>= 3.6)
@@ -33,7 +33,7 @@ Imports:
 Remotes:
   github::sherrillmix/dnar,
   github::sherrillmix/dnaplotr
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.0
 Suggests:
   testthat,
   roxygen2
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,19 @@
+# chiimp 0.4.1
+
+ * Overhauled unit tests with more modular saved inputs and expected outputs
+   ([#92])
+ * Made warnings encountered in the chiimp script display as they occur, rather
+   than at the end ([#91])
+ * Made `load_dataset` warn about repeated Sample+Replicate+Locus entries
+   across rows ([#90])
+ * Made `load_config` warn about any unrecognized configuration file entries
+   ([#88])
+
+[#92]: https://github.com/ShawHahnLab/chiimp/pull/92
+[#91]: https://github.com/ShawHahnLab/chiimp/pull/91
+[#90]: https://github.com/ShawHahnLab/chiimp/pull/90
+[#88]: https://github.com/ShawHahnLab/chiimp/pull/88
+
 # chiimp 0.4.0
 
  * Fixed desktop icon drag-and-drop for recent Linux distributions ([#85])

diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R
@@ -54,24 +54,24 @@
 #'
 #' @export
 analyze_dataset <- function(
-  dataset,
-  locus_attrs,
-  nrepeats,
-  stutter.count.ratio_max = config.defaults$seq_analysis$
-    stutter.count.ratio_max,
-  artifact.count.ratio_max = config.defaults$seq_analysis$
-    artifact.count.ratio_max,
-  use_reverse_primers = config.defaults$seq_analysis$
-    use_reverse_primers,
-  reverse_primer_r1 = config.defaults$seq_analysis$
-    reverse_primer_r1,
-  ncores = 0,
-  analysis_opts,
-  summary_opts,
-  analysis_function=analyze_sample,
-  summary_function=summarize_sample,
-  known_alleles=NULL,
-  name_args=list()) {
+    dataset,
+    locus_attrs,
+    nrepeats,
+    stutter.count.ratio_max = config.defaults$seq_analysis$
+      stutter.count.ratio_max,
+    artifact.count.ratio_max = config.defaults$seq_analysis$
+      artifact.count.ratio_max,
+    use_reverse_primers = config.defaults$seq_analysis$
+      use_reverse_primers,
+    reverse_primer_r1 = config.defaults$seq_analysis$
+      reverse_primer_r1,
+    ncores = 0,
+    analysis_opts,
+    summary_opts,
+    analysis_function = analyze_sample,
+    summary_function = summarize_sample,
+    known_alleles = NULL,
+    name_args = list()) {
   if (! all(dataset$Locus %in% locus_attrs$Locus)) {
     rogue_loci <- unique(dataset$Locus[! dataset$Locus %in% locus_attrs$Locus])
     msg <- paste("ERROR: Locus names in dataset not in attributes table:",
@@ -241,8 +241,8 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
 #'   name.
 name_known_sequences <- function(results, known_alleles, name_args) {
   # Name all of the called alleles across samples
-  results$summary <- name_alleles_in_table(results$summary, known_alleles,
-                                           name_args)
+  results$summary <- name_alleles_in_table(
+    results$summary, known_alleles, name_args)
 
   # Create table of allele names for current dataset
   a1 <- results$summary[, c("Locus", "Allele1Seq", "Allele1Name")]

diff --git a/R/analyze_sample.R b/R/analyze_sample.R
@@ -170,3 +170,20 @@ analyze_sample_naive <- function(seq_data, sample.attrs, fraction.min) {
     Category[is.na(Category)] <- "Prominent"
   })
 }
+
+#' Check Sample Data for Potential Allele Matches
+#'
+#' Check the entries in a processed sample data frame for potential matches to a
+#' given locus.
+#'
+#' @param sample_data data frame of processed data for sample as produced by
+#'   \code{\link{analyze_seqs}}.
+#' @param locus.name character name of locus to match against.
+#'
+#' @return logical vector of entries for potential alleles.
+allele_match <- function(sample_data, locus.name) {
+  with(sample_data,
+       as.character(MatchingLocus) == locus.name &
+         MotifMatch &
+         LengthMatch)
+}
diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R
@@ -66,14 +66,14 @@
 #'                          locus_attrs,
 #'                          num_adjacent_repeats)
 analyze_seqs <- function(
-  seqs, locus_attrs, nrepeats,
-  stutter.count.ratio_max = config.defaults$seq_analysis$
-    stutter.count.ratio_max,
-  artifact.count.ratio_max = config.defaults$seq_analysis$
-    artifact.count.ratio_max,
-  use_reverse_primers = config.defaults$seq_analysis$
-    use_reverse_primers,
-  reverse_primer_r1 = config.defaults$seq_analysis$reverse_primer_r1) {
+    seqs, locus_attrs, nrepeats,
+    stutter.count.ratio_max = config.defaults$seq_analysis$
+      stutter.count.ratio_max,
+    artifact.count.ratio_max = config.defaults$seq_analysis$
+      artifact.count.ratio_max,
+    use_reverse_primers = config.defaults$seq_analysis$
+      use_reverse_primers,
+    reverse_primer_r1 = config.defaults$seq_analysis$reverse_primer_r1) {
   # Dereplicate sequences
   tbl <- table(seqs)
   count <- as.integer(tbl)
@@ -117,8 +117,9 @@ analyze_seqs <- function(
   # subsets of the original sample data.
   data$FractionOfTotal <- data$Count / sum(data$Count)
   data$FractionOfLocus <- with(data, {
-    total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc)
-      sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE)))
+    total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc) {
+      sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE)
+      }))
     names(total_per_locus) <- levels(MatchingLocus)
     Count / total_per_locus[MatchingLocus]
   })
@@ -138,8 +139,8 @@ analyze_seqs <- function(
 #'   should be reverse complemented before comparing.
 #'
 #' @return factor of locus names corresponding the matched primer sequences.
-find_matching_primer <- function(sample.data, locus_attrs,
-                                 forward=TRUE, reverse_primer_r1=TRUE) {
+find_matching_primer <- function(
+    sample.data, locus_attrs, forward = TRUE, reverse_primer_r1 = TRUE) {
   seqs <- sample.data$Seq
   if (forward) {
     primercol <- "Primer"
@@ -232,7 +233,7 @@ check_length <- function(sample.data, locus_attrs) {
 #'   entry that may have produced each entry as a stutter band.
 find_stutter <- function(
   sample.data, locus_attrs,
-  count.ratio_max=config.defaults$seq_analysis$stutter.count.ratio_max) {
+  count.ratio_max = config.defaults$seq_analysis$stutter.count.ratio_max) {
 
   stutter <- integer(nrow(sample.data)) * NA
 
@@ -281,8 +282,8 @@ find_stutter <- function(
 #' @return integer vector specifying, for each entry, the row index for another
 #'   entry that may have produced each entry as an artifactual sequence.
 find_artifact <- function(
-  sample.data, locus_attrs,
-  count.ratio_max=config.defaults$seq_analysis$artifact.count.ratio_max) {
+    sample.data, locus_attrs,
+    count.ratio_max = config.defaults$seq_analysis$artifact.count.ratio_max) {
 
   artifact <- integer(nrow(sample.data)) * NA
 

diff --git a/R/categorize.R b/R/categorize.R
@@ -29,7 +29,7 @@ match_known_genotypes <- function(results_summary, genotypes.known) {
                        CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
                        stringsAsFactors = FALSE)
   # Ensure ordering within pairs matches samples, if possible.
-  for (i in 1:nrow(result)) {
+  for (i in seq_len(nrow(result))) {
     a <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
     kg <- result[i, ]
     idx <- match(a, kg)

diff --git a/R/chiimp.R b/R/chiimp.R
@@ -149,7 +149,7 @@
 #' }
 #'
 #' @export
-full_analysis <- function(config, dataset=NULL) {
+full_analysis <- function(config, dataset = NULL) { # nolint: cyclocomp_linter.
   # Overaly explicit configuration onto the default settings
   config_full <- utils::modifyList(config.defaults, config)
 
@@ -191,23 +191,20 @@ full_analysis <- function(config, dataset=NULL) {
   allele.names <- NULL
   if (!is.null(cfg$fp_allele_names))
     allele.names <- load_allele_names(cfg$fp_allele_names)
-  results <- analyze_dataset(dataset, locus_attrs,
-                             nrepeats = cfg$seq_analysis$nrepeats,
-                             stutter.count.ratio_max = cfg$seq_analysis$
-                               stutter.count.ratio_max,
-                             artifact.count.ratio_max = cfg$seq_analysis$
-                               artifact.count.ratio_max,
-                             use_reverse_primers = cfg$seq_analysis$
-                               use_reverse_primers,
-                             reverse_primer_r1 = cfg$seq_analysis$
-                               reverse_primer_r1,
-                             ncores = cfg$dataset_analysis$ncores,
-                             analysis_opts = cfg$sample_analysis_opts,
-                             summary_opts = cfg$sample_summary_opts,
-                             analysis_function = sample_analysis_func,
-                             summary_function = sample_summary_func,
-                             known_alleles = allele.names,
-                             name_args = cfg$dataset_analysis$name_args)
+  results <- analyze_dataset(
+    dataset, locus_attrs,
+    nrepeats = cfg$seq_analysis$nrepeats,
+    stutter.count.ratio_max = cfg$seq_analysis$stutter.count.ratio_max,
+    artifact.count.ratio_max = cfg$seq_analysis$artifact.count.ratio_max,
+    use_reverse_primers = cfg$seq_analysis$use_reverse_primers,
+    reverse_primer_r1 = cfg$seq_analysis$reverse_primer_r1,
+    ncores = cfg$dataset_analysis$ncores,
+    analysis_opts = cfg$sample_analysis_opts,
+    summary_opts = cfg$sample_summary_opts,
+    analysis_function = sample_analysis_func,
+    summary_function = sample_summary_func,
+    known_alleles = allele.names,
+    name_args = cfg$dataset_analysis$name_args)
   results$allele.names <- allele.names
   results$locus_attrs <- locus_attrs
   if (cfg$verbose) logmsg("Summarizing results...")
@@ -216,9 +213,10 @@ full_analysis <- function(config, dataset=NULL) {
     genotypes.known <- load_genotypes(cfg$fp_genotypes_known)
   results <- summarize_dataset(results, genotypes.known)
   if (!is.null(cfg$fp_genotypes_known))
-    results$closest_matches <- find_closest_matches(results$dist_mat_known,
-                                                range = cfg$report.dist_range,
-                                                maximum = cfg$report.dist_max)
+    results$closest_matches <- find_closest_matches(
+      results$dist_mat_known,
+      range = cfg$report.dist_range,
+      maximum = cfg$report.dist_max)
   results$config <- config_full
   if (cfg$verbose) logmsg("Saving output files...")
     save_data(results, results$config)
@@ -260,7 +258,7 @@ full_analysis <- function(config, dataset=NULL) {
 #' }
 #'
 #' @export
-main <- function(args=NULL) {
+main <- function(args = NULL) {
   if (missing(args))
     args <- commandArgs(trailingOnly = TRUE)
   desc <- "Identify microsatellite alleles fasta/fastq files"

diff --git a/R/histogram.R b/R/histogram.R
@@ -28,7 +28,7 @@ histogram <- function(seq_data,
   # mark top edges of allele sequences (to handle the same-length case)
   # label bars$topknown by name? positioning may get tricky.
   bars <- str_hist_setup(seq_data, sample_data)
-  if (render & nrow(seq_data) > 0) {
+  if (render && nrow(seq_data) > 0) {
     if (is.null(cutoff_fraction)) {
       cutoff_fraction <- attr(sample_data, "fraction.min")
     }
@@ -47,7 +47,7 @@ histogram <- function(seq_data,
 #' @return list of data frames for the sets of counts-versus-length bars drawn
 #'   in the plot, split by category.
 str_hist_setup <- function(seq_data, sample_data = NULL) {
-  vec_to_df <- function(data, cols=c("Length", "Count")) {
+  vec_to_df <- function(data, cols = c("Length", "Count")) {
     if (length(data) == 0) {
       df <- data.frame(Length = as.integer(NA), Count = as.integer(NA))[0, ]
     } else {
@@ -86,9 +86,9 @@ str_hist_setup <- function(seq_data, sample_data = NULL) {
                                                   "Count"])
   }
   bars <- lapply(bars, function(b) {
-      rownames(b) <- NULL
-      b
-    })
+    rownames(b) <- NULL
+    b
+  })
   bars
 }
 
@@ -108,25 +108,27 @@ str_hist_render <- function(bars, main, xlim, cutoff_fraction) {
   categories <- str_hist_setup_legend(bars)
 
   ylim <- range(bars$orig$Count)
-  graphics::plot(c(),
-                 c(),
-                 main = main,
-                 xlab = "Length (nt)",
-                 ylab = "Sequence Count",
-                 xlim = xlim,
-                 ylim = ylim)
+  graphics::plot(
+    c(),
+    c(),
+    main = main,
+    xlab = "Length (nt)",
+    ylab = "Sequence Count",
+    xlim = xlim,
+    ylim = ylim)
 
   # How wide should each bar be, in pixels, to be flush with the adjacent bars?
   lwd <- max(1, get_px_width()) # at least one pixel
 
   for (nm in names(bars)) {
     if (nrow(bars[[nm]]) > 0) {
-      graphics::points(bars[[nm]]$Length,
-                       bars[[nm]]$Count,
-                       type = "h",
-                       col = categories[nm, "col"],
-                       lend = 1,
-                       lwd = lwd)
+      graphics::points(
+        bars[[nm]]$Length,
+        bars[[nm]]$Count,
+        type = "h",
+        col = categories[nm, "col"],
+        lend = 1,
+        lwd = lwd)
     }
   }
 
@@ -144,20 +146,18 @@ str_hist_render <- function(bars, main, xlim, cutoff_fraction) {
     # Draw domain of sample data
     ymax <- max(bars$orig$Count)
     xlim_filt <- range(as.integer(bars$filt$Length))
-    graphics::polygon(x = rep(xlim_filt, each = 2),
-                      y = c(0, ymax, ymax, 0),
-                      col = "#0000001E",
-                      border = NA)
+    graphics::polygon(
+      x = rep(xlim_filt, each = 2),
+      y = c(0, ymax, ymax, 0),
+      col = "#0000001E",
+      border = NA)
   }
 
   # Draw legend
   leg <- categories[categories$Render, c("legend", "col", "pch", "lty")]
   filt <- sapply(leg, function(x) all(is.na(x)))
   leg <- leg[! filt]
-  do.call(graphics::legend,
-          c(list(x = "topright",
-                 bty = "n"),
-            leg))
+  do.call(graphics::legend, c(list(x = "topright", bty = "n"), leg))
 }
 
 #' Setup display attributes for STR histogram
@@ -181,9 +181,8 @@ str_hist_setup_legend <- function(bars) {
     col = c("black", "gray", "pink", "blue", "red", "#00000080"),
     pch = c(15, 15, 15, 15, 15, NA),
     lty = c(NA, NA, NA, NA, NA, 1),
-    stringsAsFactors = FALSE
-  )
-  categories$Render[1:length(bars)] <- TRUE
+    stringsAsFactors = FALSE)
+  categories$Render[seq_len(length(bars))] <- TRUE
   rownames(categories) <- categories$Name
   categories
 }
@@ -195,8 +194,8 @@ str_hist_setup_legend <- function(bars) {
 get_px_width <- function() {
   # https://stackoverflow.com/questions/17213293/how-to-get-r-plot-window-size
   px_width_fig <- grDevices::dev.size("px")[1] # width of whole figure in pixels
-  px_width_plt <- diff(graphics::par("plt")[1:2] * px_width_fig) # just the plot region
-  width_plt <- diff(graphics::par("usr")[1:2]) # width of plot region in plot units
+  px_width_plt <- diff(graphics::par("plt")[1:2] * px_width_fig) # plt region
+  width_plt <- diff(graphics::par("usr")[1:2]) # plot region (in plot units)
   step_width <- px_width_plt / width_plt # pixels per plot unit increment
   step_width
 }