Merge pull request #15 from ShawHahnLab/release-0.2.1

Release 0.2.1
ShawHahnLab · Jul 24, 2018 · 9bb8c02 · 9bb8c02
2 parents b0217d4 + 2c00319
commit 9bb8c02
Show file tree

Hide file tree

Showing 23 changed files with 656 additions and 70 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,12 +1,13 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^\.travis.yml$
+.utils
 environment.yml
 install_linux_conda.sh
 install_linux.sh
 install_windows.cmd
+install_windows.R
 install_mac.command
 README.md
 GUIDE.Rmd
 GUIDE.pdf
-prep_release.sh
diff --git a/.utils/lint.R b/.utils/lint.R
@@ -0,0 +1,17 @@
+#!/usr/bin/env Rscript
+
+# Lint the package that contains this file's directory, minus some lint
+# categories that just annoy me.
+
+args <- commandArgs()
+f <- normalizePath(sub("^--file=", "", grep("^--file=", args, value = TRUE)))
+path <- dirname(dirname(f))
+
+linters_no <- c("multiple_dots", # "Don't use dots in names"
+                "camel_case",    # "Don't capitalize stuff"
+                "object_usage")  # "I don't see that variable"
+linters_no <- paste0(linters_no, "_linter")
+# Filter by name so an excluded linter that lintr doesn't have is just ignored.
+linters <- lintr::default_linters[
+  ! names(lintr::default_linters) %in% linters_no]
+lintr::lint_package(path = path, linters = linters)
diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Usage: prep_release.sh VERSION  (file paths are relative to the working dir)
+set -e
+
+VERSION=${1:?usage: prep_release.sh VERSION}  # abort early if no version given
+
+chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'
+
+# Update version in download link in README
+VER_MSG="The most recent released version is"
+TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
+SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
+sed -i -r "$SED_README" README.md
+
+# Update version in DESCRIPTION and NEWS.md
+sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
+sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+
+R --slave --vanilla -e "$chiimp_check"
+R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"
+
+# Create bundled ZIP and TGZ versions without hidden top level files (such as
+# the git and travis stuff) and with the GUIDE.pdf.
+pushd ..
+zip -r chiimp-v${VERSION}.zip chiimp/*
+tar czvf chiimp-v${VERSION}.tgz chiimp/*
+popd
+
+# TODO show reminder of checks before tagging a release:
+# * full test on all three platforms
+# * make sure NEWS.md contains all updates under a heading matching this version
+# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chiimp
 Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
-Version: 0.2.0
+Version: 0.2.1
 Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
 Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
 Depends: R (>= 3.2.3)

diff --git a/GUIDE.Rmd b/GUIDE.Rmd
@@ -4,7 +4,7 @@
 
 title: "CHIIMP User Guide"
 author: "Jesse Connell"
-date: "2018/03/26"
+date: "2018/07/23"
 output:
   pdf_document:
     toc: true
@@ -290,9 +290,13 @@ For inter-sample comparisons, the alleles identified across samples for each
 locus are aligned to one another.  The genotypes for each sample are clustered 
 by number of matching alleles, showing similarity between samples.  If a 
 spreadsheet of known genotypes was given, the sample genotypes are also compared
-to the known genotypes, with any close matches reported.  A single report 
-document summarizes the genotyping and these other details.  See the Output Data
-Organization section below for more information on the output.
+to the known genotypes, with any close matches reported.  If a Name column was
+provided with the sample definition table as well as a known genotypes
+spreadsheet, the known-correct genotypes will be paired with applicable samples
+and a column tracking the result of the genotyping (Correct, Incorrect, Blank,
+or Dropped Allele) will be added.  A single report document summarizes the
+genotyping and these other details.  See the Output Data Organization section
+below for more information on the output.
 
 These steps are handled by the `full_analysis` function in the R package.
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(analyze_sample_guided)
 export(analyze_sample_naive)
 export(analyze_seqs)
 export(calc_genotype_distance)
+export(categorize_genotype_results)
 export(config.defaults)
 export(find_closest_matches)
 export(full_analysis)
@@ -20,6 +21,7 @@ export(load_seqs)
 export(main)
 export(make_dist_mat)
 export(make_dist_mat_known)
+export(match_known_genotypes)
 export(plot_alignment)
 export(plot_cts_per_locus)
 export(plot_dist_mat)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,24 @@
+# chiimp 0.2.1
+
+ * Minor improvements to release process ([#14]).
+ * Fixed install script for Mac OS ([#13]).
+ * Fixed file-saving on Windows ([#12]).
+ * Fixed installation on Windows for usernames with spaces ([#11]).
+ * Added automatic categorization of genotyping results for samples from known
+ individuals ([#8]).
+   * Added function to pair samples with known correct genotypes,
+   `match_known_genotypes`.
+   * Added function to categorize results of genotyping for known individuals,
+   `categorize_genotype_results`.
+   * Enabled categorization features in `summarize_dataset` when Name column is
+   supplied in results summary data frame.
+
+[#14]: https://github.com/ShawHahnLab/chiimp/issues/14
+[#13]: https://github.com/ShawHahnLab/chiimp/issues/13
+[#12]: https://github.com/ShawHahnLab/chiimp/issues/12
+[#11]: https://github.com/ShawHahnLab/chiimp/issues/11
+[#8]: https://github.com/ShawHahnLab/chiimp/issues/8
+
 # chiimp 0.2.0
 
  * Restructured code to avoid analyzing multiplexed samples more than once ([#3]).

diff --git a/R/categorize.R b/R/categorize.R
@@ -0,0 +1,130 @@
+# Interpret genotyping results for samples with known identity.
+
+#' Associate known genotypes with samples
+#'
+#' Using the Name column of the given results summary data frame, pair each
+#' called genotype with the known alleles.  A data frame with two columns,
+#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are
+#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at
+#' this point the two allele entries should match up directly for genotypes that
+#' were called correctly.
+#'
+#' @param results_summary cross-sample summary data frame as produced by
+#'   \code{\link{analyze_dataset}}.
+#' @param genotypes.known data frame of known genotypes that should be compared
+#'   to the observed genotypes in the results, as loaded by
+#'   \code{\link{load_genotypes}}.
+#'
+#' @return data frame with two columns for the two correct alleles, and one row
+#'   per row of the input summary table (zero rows in gives zero rows out).
+#'
+#' @export
+match_known_genotypes <- function(results_summary, genotypes.known) {
+  # match name/locus combos with genotypes
+  id_tbl <- paste(results_summary$Name, results_summary$Locus)
+  id_kg <- paste(genotypes.known$Name, genotypes.known$Locus)
+  idx <- match(id_tbl, id_kg)
+  # Build data frame of correct allele sequences
+  result <- data.frame(CorrectAllele1Seq = genotypes.known[idx, "Allele1Seq"],
+                       CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
+                       stringsAsFactors = FALSE)
+  # Ensure ordering within pairs matches samples, if possible (no-op if empty).
+  for (i in seq_len(nrow(result))) {
+    a <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
+    kg <- result[i, ]
+    pos <- match(a, kg)
+    if (pos[1] %in% 2 || pos[2] %in% 1) # %in% treats an NA position as FALSE
+      result[i, ] <- rev(kg)
+  }
+  result
+}
+
+#' Categorize genotyping results
+#'
+#' For a given results summary data frame that has CorrectAllele1Seq and
+#' CorrectAllele2Seq columns (such as produced by
+#' \code{\link{match_known_genotypes}}) added, create a factor labeling every
+#' row of the input data frame by its genotyping outcome.
+#'
+#' @details
+#' Levels in the returned factor, in order:
+#'
+#' * Correct: one/two alleles match.
+#' * Incorrect: at least one allele does not match.
+#' * Blank: No alleles were called in the analysis even though known genotypes
+#'    were supplied.
+#' * Dropped Allele: One called allele is correct for a heterozygous individual,
+#'   but no second allele was called.
+#'
+#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
+#' both set to NA, map to NA in the returned factor.
+#' @md
+#'
+#' @param results_summary cross-sample summary data frame as produced by
+#'   \code{\link{analyze_dataset}} with extra columns as produced by
+#'   \code{\link{match_known_genotypes}}.
+#'
+#' @return factor defining genotyping result category for every row of the input
+#'   data frame.
+#'
+#' @export
+categorize_genotype_results <- function(results_summary) {
+  # Each called allele (A) is compared against its correct allele (C).
+  # All five possibilities for a single allele check:
+  #   0: Both non-NA, simple mismatch
+  #   1: A not NA, C NA (no correct allele matched this one)
+  #   2: A NA, C not NA (we missed a correct allele and left this blank)
+  #   3: A NA, C NA (correctly did not report an allele)
+  #   4: Both non-NA, match
+  check_allele <- function(allele, ref) {
+    a <- is.na(allele) * 2 + is.na(ref) # NA: 1, not NA: 0
+    a[a == 0 & allele == ref] <- 4 # special distinction for one case
+    a
+  }
+
+  # Now, combine for both alleles to have all possible outcomes, and offset by
+  # one to account for R's indexing.
+  a1 <- check_allele(results_summary$Allele1Seq,
+                     results_summary$CorrectAllele1Seq)
+  a2 <- check_allele(results_summary$Allele2Seq,
+                     results_summary$CorrectAllele2Seq)
+  a <- a1 * 5 + a2 + 1
+
+  # Here's all the possible outcomes, categorized.  Cases that should never come
+  # up for correctly-labeled genotypes will evaluate to NA.
+  lvls <- c(
+    # A1 0: first allele simple mismatch.  Whatever A2 is, this is Incorrect.
+    "Incorrect", # both mismatch
+    "Incorrect", # extra allele, mismatch
+    "Incorrect", # drop
+    "Incorrect", # correctly missing
+    "Incorrect", # second correct
+    # A1 1: first allele called, but no correct allele listed.  Still Incorrect.
+    "Incorrect", # simple mismatch
+    NA,          # second allele also not present?? weird case
+    "Incorrect", # second allele left blank though a correct one is listed
+    NA,          # no correct allele listed for second either?? weird case
+    "Incorrect", # second is correct but first was wrong
+    # A1 2: first allele incorrectly blank.
+    "Incorrect", # simple mismatch
+    "Incorrect", # wrong
+    "Blank",     # second allele also incorrectly blank
+    "Incorrect", # though this *was* homozygous; we at least got that right.
+    "Dropped Allele", # Got one right, but missed A1.
+    # A1 3: first allele correctly blank (expecting true homozygote).
+    "Incorrect", # simple mismatch
+    NA,          # but C2 also NA? weird case
+    "Blank",     # A2 also blank
+    NA,          # A2 NA but C2 also NA? weird case
+    "Correct",   # correct homozygote
+    # A1 4: first allele correct.
+    "Incorrect", # but second wrong.
+    "Incorrect", # second wrongly given when should be blank.
+    "Dropped Allele", # Got one right, but missed A2.
+    "Correct",   # correctly did not report a second allele (homozygote)
+    "Correct"    # correctly did report a second allele (heterozygote)
+  )
+
+  # Map the integers for each case to text categories and create factor.
+  factor(lvls[a], levels = c("Correct", "Dropped Allele", "Blank", "Incorrect"))
+}
diff --git a/R/summarize_dataset.R b/R/summarize_dataset.R
@@ -15,6 +15,19 @@
 #'   * dist_mat_known: if genotypes.known is given, this distance matrix of
 #'     sample-to-individual values will be present, from
 #'     \code{\link{make_dist_mat_known}}.
+#'
+#' If genotypes.known is given *and* a Name column is present in
+#' \code{results$summary}, samples will be matched with the genotypes in
+#' genotypes.known and additional columns will be present in the summary data
+#' frame:
+#'   * CorrectAllele1Seq: One correct allele sequence for the individual.  The
+#'   order of this and \code{CorrectAllele2Seq} will be matched to
+#'   \code{Allele1Seq} and \code{Allele2Seq} if possible.  See
+#'   \code{\link{match_known_genotypes}}.
+#'   * CorrectAllele2Seq: A second correct allele sequence, as above.
+#'   * GenotypeResult: Categorization for each entry as Correct, Incorrect,
+#'   Blank, or Dropped Allele.  See \code{\link{categorize_genotype_results}}.
+#'
 #' @md
 #'
 #' @param results list containing summary data frame and sample-specific data
@@ -35,6 +48,12 @@ summarize_dataset <- function(results, genotypes.known=NULL) {
     results$dist_mat_known <- make_dist_mat_known(results$summary,
                                                   genotypes.known)
     results$genotypes.known <- genotypes.known
+    if ("Name" %in% colnames(results$summary)) {
+      results$summary <- cbind(results$summary,
+                match_known_genotypes(results$summary, results$genotypes.known))
+      results$summary$GenotypeResult <- categorize_genotype_results(
+        results$summary)
+    }
   }
   return(results)
 }

diff --git a/R/util.R b/R/util.R
@@ -156,14 +156,16 @@ name_alleles_in_table <- function(data, known_alleles=NULL, name_args=list()) {
 #' Remove shared path from file paths
 #'
 #' For the given character vector of file paths, create a modified version with
-#' any common prefix path removed.
+#' any common prefix path removed.  Forward slashes are used as the path
+#' separator on all platforms.
 #'
 #' @param fps_full character vector of file paths.
 #'
 #' @return character vector of same length as input, with any common directory
 #'   structure trimmed off.
 remove_shared_root_dir <- function(fps_full) {
-  fps <- normalizePath(fps_full, mustWork = FALSE)
+  fps <- gsub("\\\\", "/", fps_full)
+  fps <- normalizePath(fps, mustWork = FALSE, winslash = "/")
   chunks <- lapply(strsplit(fps, "/"), function(segs) segs[segs != ""])
   minlen <- min(sapply(chunks, length))
   dirs <- do.call(rbind, lapply(chunks, "[", 1:minlen))

diff --git a/README.md b/README.md
@@ -8,5 +8,5 @@ high-throughput sequencing datasets.
 
 For automated installation and program usage see GUIDE.pdf in a
 [released version](https://github.com/ShawHahnLab/chiimp/releases).
-The most recent released version is [0.2.0](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.0).
+The most recent released version is [0.2.1](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.1).
 For usage as an R package also see the built-in package documentation.
diff --git a/install_mac.command b/install_mac.command
diff --git a/install_windows.R b/install_windows.R
@@ -0,0 +1,65 @@
+# Install CHIIMP on Windows.
+
+# Find the path to the directory containing this script.  We need this for
+# package testing and installation below.
+args <- commandArgs()
+f <- gsub("^--file=", "", args[grep("^--file=", args)])
+f <- normalizePath(f)
+path <- dirname(f)
+
+UPROF <- Sys.getenv("USERPROFILE")
+
+# If no library paths are writeable, try creating a user library.
+if (! any(file.access(.libPaths(), 2) == 0)) {
+  # This is the directory I see RStudio create automatically on first start,
+  # and the command-line R also detects it.
+  ver <- paste(version$major, sub("\\..*", "", version$minor), sep = ".")
+  dp <- file.path(UPROF, "Documents", "R", "win-library", ver)
+  dir.create(dp, recursive = TRUE)
+  # On a second run through this will get picked up automatically,
+  # but if we want it right now we have to add it to the list manually.
+  .libPaths(dp)
+}
+
+cat("\n")
+cat("### Installing devtools\n")
+cat("\n")
+install.packages("devtools", repos = "https://cloud.r-project.org")
+
+cat("\n")
+cat("### Installing Bioconductor and MSA\n")
+cat("\n")
+source("https://bioconductor.org/biocLite.R")
+biocLite("msa")
+
+cat("\n")
+cat("### Installing dependencies\n")
+cat("\n")
+devtools::install_deps(path, dependencies = TRUE)
+
+cat("\n")
+cat("### Testing CHIIMP\n")
+cat("\n")
+status <- sum(as.data.frame(devtools::test(path))$failed)
+if (status > 0) {  # summed across all tests, so the total may exceed 1
+  cat("\n")
+  cat("\n")
+  cat("    Warning: Tests indicated failures.\n")
+  cat("\n")
+  cat("\n")
+}
+
+cat("\n")
+cat("### Installing CHIIMP\n")
+cat("\n")
+devtools::install(path)
+
+shortcut_path <- file.path(UPROF, "Desktop", "CHIIMP.lnk")
+chiimp_path <- system.file("bin", "chiimp.cmd", package = "chiimp")
+# Desktop shortcut via PowerShell: https://stackoverflow.com/a/30029955/6073858
+args <- c(paste0("$s=(New-Object -COM WScript.Shell).CreateShortcut('",
+                 shortcut_path,
+                 "');"),
+          paste0("$s.TargetPath='", chiimp_path, "';"),
+          "$s.Save();")
+system2("powershell", args)