Merge branch 'release-0.2.2'
ressy committed Dec 14, 2018
2 parents 9bb8c02 + d896926 commit 1d08387
Showing 47 changed files with 944 additions and 203 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -11,3 +11,4 @@ script:
- R CMD build .
- R CMD check *tar.gz
- ./inst/bin/demo.sh
- ./inst/bin/demo_empty.sh
82 changes: 71 additions & 11 deletions .utils/lint.R
@@ -3,15 +3,75 @@
# Lint the package that contains this file's directory, minus some lint
# categories that just annoy me.


# Configure Linters -------------------------------------------------------


# This could be done via a lintr config file if I took the time to figure out
# the syntax.

linters <- list(
  # Linters to add to default list.
  # T_and_F_symbol is quite new as of 2018-12-10.
  yes = c(
    "T_and_F_symbol" # "use TRUE and FALSE, not T and F"
  ),
  # Linters to remove from default list.
  # Some of these are from older versions.
  no = c(
    "multiple_dots", # "Don't use dots in names"
    "camel_case", # "Don't capitalize stuff"
    "object_name", # "Don't use dots in names, don't capitalize"
    "object_usage" # "I don't see that variable"
  )
)


# Detect Package Path -----------------------------------------------------


# If run as a script
args <- commandArgs()
f <- gsub("^--file=", "", args[grep("^--file=", args)])
f <- normalizePath(f)
path <- dirname(dirname(f))

linters_no <- c("multiple_dots", # "Don't use dots in names"
"camel_case", # "Don't capitalize stuff"
"object_usage") # "I don't see that variable"
linters_no <- paste0(linters_no, "_linter")
linters <- lintr::default_linters[-match(linters_no,
names(lintr::default_linters))]
lintr::lint_package(path = path, linters = linters)
path_script <- gsub("^--file=", "", args[grep("^--file=", args)])
path_script <- normalizePath(path_script)
path_pkg <- dirname(dirname(path_script))

# If run in Rstudio for example
if (! length(path_pkg)) {
  # https://stackoverflow.com/a/16046056
  if (length(sys.frames())) {
    path_pkg <- dirname(dirname(sys.frame(1)$ofile))
  } else {
    # Last fallback, for example run in separate code chunks
    path_pkg <- getwd()
  }
}


# Run lintr ---------------------------------------------------------------

library(lintr)
linters$combined <- lintr::default_linters

# Remove linters
linters$no <- paste0(linters$no, "_linter")
idx <- match(linters$no, names(lintr::default_linters))
idx <- idx[! is.na(idx)]
linters$combined <- lintr::default_linters[-idx]

# Add linters
linters$yes <- paste0(linters$yes, "_linter")
linters$yes <- linters$yes[linters$yes %in% ls("package:lintr")]
.names <- names(linters$yes)
linters$yes <- get(linters$yes, "package:lintr")
names(linters$yes) <- .names
linters$combined <- c(linters$combined, linters$yes)

# Run
results <- lintr::lint_package(path = path_pkg, linters = linters$combined)
results
if (length(path_script) == 1) {
  if (length(results) > 0) {
    quit(status = 1)
  }
}
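The comment near the top of .utils/lint.R notes that this setup could instead live in a lintr config file. A rough sketch of that alternative, as a `.lintr` file at the package root, assuming the `with_defaults()` helper and the `T_and_F_symbol_linter` are available in the installed lintr (not part of this commit, untested):

linters: with_defaults(
    T_and_F_symbol_linter = T_and_F_symbol_linter,
    multiple_dots_linter = NULL,
    camel_case_linter = NULL,
    object_name_linter = NULL,
    object_usage_linter = NULL
  )

With a config file like that in place, lintr::lint_package() would apply the same additions and removals without the list bookkeeping the script does by hand.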
17 changes: 13 additions & 4 deletions .utils/prep_release.sh
@@ -1,11 +1,17 @@
#!/usr/bin/env bash

# NOTE: assumes working directory is the project directory

set -e

VERSION=$1

chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'

# Run lint script
echo "Running lint check"
./.utils/lint.R

# Update version in download link in README
VER_MSG="The most recent released version is"
TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
@@ -26,7 +32,10 @@ zip -r chiimp-v${VERISON}.zip chiimp/*
tar czvf chiimp-v${VERSION}.tgz chiimp/*
popd

# TODO show reminder of checks before tagging a release:
# * full test on all three platforms
# * make sure NEWS.md contains all updates under a heading matching this version
# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct
echo
echo "REMINDER BEFORE TAGGING RELEASE $VERSION:"
echo
echo " * Run full test on Mac OS, Windows, and Linux"
echo " * Update NEWS.md with all updates under a heading matching this version"
echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct"
echo
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: chiimp
Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
Version: 0.2.1
Version: 0.2.2
Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
Depends: R (>= 3.2.3)
17 changes: 10 additions & 7 deletions GUIDE.Rmd
@@ -4,7 +4,7 @@

title: "CHIIMP User Guide"
author: "Jesse Connell"
date: "2018/07/23"
date: "2018/11/07"
output:
pdf_document:
toc: true
@@ -42,18 +42,21 @@ are installed, follow the specific instructions below for your operating system.
On Windows, double-click the `install_windows.cmd` script. This will install
the package and R dependencies, and create a desktop shortcut.

### Mac OS

On Mac OS, double-click the `install_mac.command` shell script to automatically
install the package along with R dependencies and create a desktop alias. If
the install script won't open because of a security warning, you can
right-click (control+click) and select "Open" in the menu that appears.
Apple has specific instructions [here](https://support.apple.com/kb/PH25088?locale=en_US)
about these security settings.

### Linux

On Linux, run the `install_linux.sh` shell script to automatically install the
package along with R dependencies. An icon for the program is created at
`$HOME/Desktop/CHIIMP`.

### Mac OS

On Mac OS, run the `install_mac.command` shell script to automatically install
the package along with R dependencies. An icon for the program is created at
`$HOME/Desktop/CHIIMP`.

## Input Data Organization

The information CHIIMP uses during analysis is:
1 change: 1 addition & 0 deletions NAMESPACE
@@ -51,3 +51,4 @@ export(summarize_sample)
export(summarize_sample_guided)
export(tabulate_allele_names)
export(tally_cts_per_locus)
export(test_data)
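The newly exported `test_data` object (added to the NAMESPACE just above) is the same bundled dataset the new roxygen example in R/analyze_seqs.R below draws on. A quick interactive look, assuming the package is installed; the structure shown here is inferred from that example rather than from separate documentation:

library(chiimp)
# Locus attributes table bundled with the package's test data
test_data$locus_attrs
# Character vector of reads for locus A of the first test sample
str(test_data$seqs1$A)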
16 changes: 16 additions & 0 deletions NEWS.md
@@ -1,3 +1,19 @@
# chiimp 0.2.2

* Fixed heatmap plotting via updated `plot_heatmap` for cases with blank
results and only one unique non-blank value ([#22]).
* Added check in `analyze_dataset` for locus name mismatches between dataset
table and locus attributes table ([#21]).
* Added check in `prepare_dataset` for missing data directory ([#20]).
* Added check in `prepare_dataset` for zero-detected-files case.
* Added check in `load_dataset` for missing data files.
* Added check in `full_analysis` to warn if any input data files are
completely empty.

[#22]: https://github.com/ShawHahnLab/chiimp/issues/22
[#21]: https://github.com/ShawHahnLab/chiimp/issues/21
[#20]: https://github.com/ShawHahnLab/chiimp/issues/20

# chiimp 0.2.1

* Minor improvements to release process ([#14]).
17 changes: 16 additions & 1 deletion R/analyze_dataset.R
@@ -11,7 +11,8 @@
#' in the summary data frame will be sorted according to the ordering of loci in
#' \code{locus_attrs} and by the sample attributes. Processed files are stored
#' separately (as there may be multiple samples per file) and named by input
#' file path.
#' file path. An error is thrown if any locus entries in the given dataset are
#' not found in the locus attributes data frame.
#'
#' @param dataset data frame of sample details as produced by
#' \code{\link{prepare_dataset}}.
@@ -53,6 +54,12 @@ analyze_dataset <- function(dataset,
summary_function=summarize_sample,
known_alleles=NULL,
name_args=list()) {
if (! all(dataset$Locus %in% locus_attrs$Locus)) {
rogue_loci <- unique(dataset$Locus[! dataset$Locus %in% locus_attrs$Locus])
msg <- paste("ERROR: Locus names in dataset not in attributes table:",
paste(rogue_loci, collapse = ", "))
stop(msg)
}
if (ncores == 0) {
ncores <- max(1, as.integer(parallel::detectCores() / 2) - 1)
}
@@ -132,6 +139,14 @@ summary_function = summary_function,
summary_function = summary_function,
analyzed_files = analyzed_files)
}

# Check if any of the raw data files had no reads to start with.
empties <- sum(sapply(analyzed_files, nrow) == 0)
if (empties) {
logmsg(paste("WARNING: Zero reads for", empties, "of",
length(analyzed_files), "data files"))
}

# Recombined results into a summary data frame and a list of full sample data.
results <- tidy_analyzed_dataset(dataset, raw.results)
results$files <- analyzed_files
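The locus-name check added near the top of analyze_dataset above is easy to exercise on its own. A minimal sketch with toy data frames; the filenames and locus names here are made up, and only the Locus columns matter to the check:

# Toy stand-ins for a sample dataset table and a locus attributes table
dataset <- data.frame(Filename = c("s1.fastq", "s2.fastq"),
                      Locus = c("A", "Z"), # "Z" is not a known locus
                      stringsAsFactors = FALSE)
locus_attrs <- data.frame(Locus = c("A", "B"),
                          stringsAsFactors = FALSE)
if (! all(dataset$Locus %in% locus_attrs$Locus)) {
  rogue_loci <- unique(dataset$Locus[! dataset$Locus %in% locus_attrs$Locus])
  stop(paste("ERROR: Locus names in dataset not in attributes table:",
             paste(rogue_loci, collapse = ", ")))
}
# Error: ERROR: Locus names in dataset not in attributes table: Z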
6 changes: 2 additions & 4 deletions R/analyze_sample.R
@@ -108,9 +108,8 @@ analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min)

switch(length(expected_lengths) + 1,
# Zero expected lengths: analyze as usual
analyze_sample(seq_data, sample.attrs, fraction.min),
analyze_sample(seq_data, sample.attrs, fraction.min), {
# One expected length: may be homozygous or heterozygous.
{
# Find rows of interest, matching expected length.
idxl <- chunk$Length == expected_lengths
within(chunk, {
Expand All @@ -128,10 +127,9 @@ analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) {
# And that's it. We make no comment on the remaining entries and
# leave them as NA.
})
},
}, {
# Two expected lengths: definitely heterozygous. No need to consider
# fractions here.
{
within(chunk, {
Category <- factor(, levels = categories)
# Exclude ambiguous sequences first.
25 changes: 20 additions & 5 deletions R/analyze_seqs.R
@@ -40,6 +40,21 @@
#' @return data frame of dereplicated sequences with added annotations.
#'
#' @export
#'
#' @examples
#' # Starting from non-locus-specific sequences,
#' # a locus attributes table, and requiring
#' # three side-by-side motif repeats to register
#' # as a motif match for a locus,
#' raw_seq_vector <- c(test_data$seqs1$A, test_data$seqs1$B)
#' locus_attrs <- test_data$locus_attrs
#' num_adjacent_repeats <- 3
#' # Convert the character vector of sequences
#' # into a data frame with one row per
#' # unique sequence.
#' seq_data <- analyze_seqs(raw_seq_vector,
#' locus_attrs,
#' num_adjacent_repeats)
analyze_seqs <- function(seqs, locus_attrs, nrepeats) {
# Dereplicate sequences
tbl <- table(seqs)
@@ -52,7 +67,7 @@ analyze_seqs <- function(seqs, locus_attrs, nrepeats) {
Count = count,
Length = nchar(seqs),
stringsAsFactors = FALSE)
data <- data[order(data$Count, decreasing = T), ]
data <- data[order(data$Count, decreasing = TRUE), ]
rownames(data) <- NULL
# Label rows with the apparent locus by checking primer sequences. Note that
# this uses the first matching locus for each row.
@@ -74,7 +89,7 @@ data$FractionOfTotal <- data$Count / sum(data$Count)
data$FractionOfTotal <- data$Count / sum(data$Count)
data$FractionOfLocus <- with(data, {
total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc)
sum(data[MatchingLocus == loc, "Count"], na.rm = T)))
sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE)))
names(total_per_locus) <- levels(MatchingLocus)
Count / total_per_locus[MatchingLocus]
})
@@ -96,7 +111,7 @@ find_matching_primer <- function(sample.data, locus_attrs) {
matches <- do.call(cbind, lapply(rownames(locus_attrs), function(locus_name) {
primer <- as.character(locus_attrs[locus_name, "Primer"])
result <- grepl(primer, substr(sample.data$Seq, 1, nchar(primer) + 10))
c(locus_name)[as.numeric( (! result) + 1)]
c(locus_name)[as.numeric((! result) + 1)]
}))
# Collapse that set down to just the first match for each entry.
first.matches <- apply(matches, 1, function(m) m[match(TRUE, !is.na(m))])
@@ -172,7 +187,7 @@ find_stutter <- function(sample.data, locus_attrs,
# across rows for this locus,
for (locus_name in rownames(locus_attrs)) {
idxl_main <- sample.data$MatchingLocus == locus_name
idxl_main[is.na(idxl_main)] <- F
idxl_main[is.na(idxl_main)] <- FALSE
d <- sample.data[idxl_main, ]
motif.len <- nchar(as.character(locus_attrs[locus_name, "Motif"]))
# Any given index in d could be stutter.
@@ -221,7 +236,7 @@ find_artifact <- function(sample.data, locus_attrs,
# across rows for this locus,
for (locus_name in rownames(locus_attrs)) {
idxl_main <- sample.data$MatchingLocus == locus_name
idxl_main[is.na(idxl_main)] <- F
idxl_main[is.na(idxl_main)] <- FALSE
d <- sample.data[idxl_main, ]
# for each length it's the index of the first (highest-count) entry with
# that length.
