Skip to content

Commit

Permalink
Merge pull request #15 from ShawHahnLab/release-0.2.1
Browse files Browse the repository at this point in the history
Release 0.2.1
  • Loading branch information
ressy authored Jul 24, 2018
2 parents b0217d4 + 2c00319 commit 9bb8c02
Show file tree
Hide file tree
Showing 23 changed files with 656 additions and 70 deletions.
3 changes: 2 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.travis.yml$
.utils
environment.yml
install_linux_conda.sh
install_linux.sh
install_windows.cmd
install_windows.R
install_mac.command
README.md
GUIDE.Rmd
GUIDE.pdf
prep_release.sh
17 changes: 17 additions & 0 deletions .utils/lint.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env Rscript

# Lint the package that contains this file's directory, minus some lint
# categories that just annoy me.

# Rscript passes the script location as a "--file=" argument.  Strip the prefix
# to get the script path, then go up two levels (script -> its directory ->
# the package directory containing that directory).
args <- commandArgs()
f <- gsub("^--file=", "", args[grep("^--file=", args)])
f <- normalizePath(f)
path <- dirname(dirname(f))

linters_no <- c("multiple_dots", # "Don't use dots in names"
                "camel_case",    # "Don't capitalize stuff"
                "object_usage")  # "I don't see that variable"
linters_no <- paste0(linters_no, "_linter")

# Select by name with %in% instead of negative match() indexes: match() gives
# NA for any name that is not present in default_linters, and NA values cannot
# be mixed with negative subscripts, so the old form failed outright whenever
# one of the excluded linters was absent from the installed lintr.
keep <- !names(lintr::default_linters) %in% linters_no
linters <- lintr::default_linters[keep]
lintr::lint_package(path = path, linters = linters)
32 changes: 32 additions & 0 deletions .utils/prep_release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Prepare a release: stamp the given version into README.md, DESCRIPTION and
# NEWS.md, run the package check, render GUIDE.pdf, and bundle ZIP and TGZ
# archives in the parent directory.
#
# Usage: prep_release.sh VERSION
#
# The relative paths below (README.md, DESCRIPTION, ...) and the "chiimp/*"
# archive paths mean this is expected to be run from the top of a package
# directory that is itself named "chiimp".

set -e

VERSION=$1

# Without a version the sed commands below would write empty version strings
# into the tracked files, so refuse to continue.
if [ -z "$VERSION" ]; then
	echo "Usage: $0 VERSION" >&2
	exit 1
fi

# R one-liner whose exit status is the number of check errors plus warnings, so
# "set -e" aborts the release on any of them.
chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'

# Update version in download link in README
VER_MSG="The most recent released version is"
TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
sed -i -r "$SED_README" README.md

# Update version in DESCRIPTION and NEWS.md
sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md

R --slave --vanilla -e "$chiimp_check"
R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"

# Create bundled ZIP and TGZ versions without hidden top level files (such as
# the git and travis stuff) and with the GUIDE.pdf.
pushd ..
# (This previously referenced a misspelled variable, which expanded to nothing
# and always produced "chiimp-v.zip".)
zip -r "chiimp-v${VERSION}.zip" chiimp/*
tar czvf "chiimp-v${VERSION}.tgz" chiimp/*
popd

# TODO show reminder of checks before tagging a release:
# * full test on all three platforms
# * make sure NEWS.md contains all updates under a heading matching this version
# * make sure GUIDE.Rmd is up-to-date and the rendered GUIDE.pdf is correct
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: chiimp
Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
Version: 0.2.0
Version: 0.2.1
Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
Depends: R (>= 3.2.3)
Expand Down
12 changes: 8 additions & 4 deletions GUIDE.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

title: "CHIIMP User Guide"
author: "Jesse Connell"
date: "2018/03/26"
date: "2018/07/23"
output:
pdf_document:
toc: true
Expand Down Expand Up @@ -290,9 +290,13 @@ For inter-sample comparisons, the alleles identified across samples for each
locus are aligned to one another. The genotypes for each sample are clustered
by number of matching alleles, showing similarity between samples. If a
spreadsheet of known genotypes was given, the sample genotypes are also compared
to the known genotypes, with any close matches reported. A single report
document summarizes the genotyping and these other details. See the Output Data
Organization section below for more information on the output.
to the known genotypes, with any close matches reported. If a Name column was
provided with the sample definition table as well as a known genotypes
spreadsheet, the known-correct genotypes will be paired with applicable samples
and a column tracking the result of the genotyping (Correct, Incorrect, Blank,
or Dropped Allele) will be added. A single report document summarizes the
genotyping and these other details. See the Output Data Organization section
below for more information on the output.

These steps are handled by the `full_analysis` function in the R package.

Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export(analyze_sample_guided)
export(analyze_sample_naive)
export(analyze_seqs)
export(calc_genotype_distance)
export(categorize_genotype_results)
export(config.defaults)
export(find_closest_matches)
export(full_analysis)
Expand All @@ -20,6 +21,7 @@ export(load_seqs)
export(main)
export(make_dist_mat)
export(make_dist_mat_known)
export(match_known_genotypes)
export(plot_alignment)
export(plot_cts_per_locus)
export(plot_dist_mat)
Expand Down
21 changes: 21 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
# chiimp 0.2.1

* Minor improvements to release process ([#14]).
* Fixed install script for Mac OS ([#13]).
* Fixed file-saving on Windows ([#12]).
* Fixed installation on Windows for usernames with spaces ([#11]).
* Added automatic categorization of genotyping results for samples from known
individuals ([#8]).
* Added function to pair samples with known correct genotypes,
`match_known_genotypes`.
* Added function to categorize results of genotyping for known individuals,
`categorize_genotype_results`.
* Enabled categorization features in `summarize_dataset` when Name column is
supplied in results summary data frame.

[#14]: https://github.com/ShawHahnLab/chiimp/issues/14
[#13]: https://github.com/ShawHahnLab/chiimp/issues/13
[#12]: https://github.com/ShawHahnLab/chiimp/issues/12
[#11]: https://github.com/ShawHahnLab/chiimp/issues/11
[#8]: https://github.com/ShawHahnLab/chiimp/issues/8

# chiimp 0.2.0

* Restructured code to avoid analyzing multiplexed samples more than once ([#3]).
Expand Down
130 changes: 130 additions & 0 deletions R/categorize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Interpret genotyping results for samples with known identity.

#' Associate known genotypes with samples
#'
#' Using the Name column of the given results summary data frame, pair each
#' called genotype with the known alleles.  A data frame with two columns,
#' CorrectAllele1Seq and CorrectAllele2Seq, is returned.  If matching entries
#' are found in Allele1Seq and/or Allele2Seq the order will be preserved, and
#' at this point the two allele entries should match up directly for genotypes
#' that were called correctly.
#'
#' Rows are paired on the combination of Name and Locus.  Rows of the summary
#' with no matching Name/Locus entry in the known genotypes get NA for both
#' correct alleles.  A summary with zero rows gives a result with zero rows.
#'
#' @param results_summary cross-sample summary data frame as produced by
#'   \code{\link{analyze_dataset}}.  The columns Name, Locus, Allele1Seq, and
#'   Allele2Seq are used.
#' @param genotypes.known data frame of known genotypes that should be compared
#'   to the observed genotypes in the results, as loaded by
#'   \code{\link{load_genotypes}}.  The columns Name, Locus, Allele1Seq, and
#'   Allele2Seq are used.
#'
#' @return data frame with two columns for the two correct alleles, and rows
#'   matching the input summary table.
#'
#' @export
match_known_genotypes <- function(results_summary, genotypes.known) {
  # match name/locus combos with genotypes
  id_tbl <- paste(results_summary$Name, results_summary$Locus)
  id_kg <- paste(genotypes.known$Name, genotypes.known$Locus)
  idx <- match(id_tbl, id_kg)
  # Build data frame of correct allele sequences
  result <- data.frame(CorrectAllele1Seq = genotypes.known[idx, "Allele1Seq"],
                       CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
                       stringsAsFactors = FALSE)
  # Ensure ordering within pairs matches samples, if possible.  seq_len() is
  # used so that a zero-row input skips the loop entirely; 1:nrow() would
  # iterate over c(1, 0) and tack a spurious all-NA row onto the result.
  for (i in seq_len(nrow(result))) {
    called <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
    kg <- result[i, ]
    # Position of each called allele within the known pair (NA if absent).
    pos <- match(called, kg)
    # If the first called allele is the second known allele, or the second
    # called allele is the first known allele, swap the known pair.  %in% keeps
    # the test FALSE (rather than NA) when there was no match.
    if (pos[1] %in% 2 || pos[2] %in% 1) {
      result[i, ] <- rev(kg)
    }
  }
  result
}

#' Categorize genotyping results
#'
#' Label every row of a results summary data frame with the outcome of its
#' genotyping, by comparing the called alleles (Allele1Seq and Allele2Seq) with
#' the known-correct alleles (CorrectAllele1Seq and CorrectAllele2Seq, such as
#' produced by \code{\link{match_known_genotypes}}).
#'
#' @details
#' Levels in the returned factor, in order:
#'
#'  * Correct: both called alleles match the correct alleles, or one matches
#'    and the other is absent from both the called and the correct genotype.
#'  * Dropped Allele: one called allele matches, but the other was left
#'    uncalled even though a correct allele is listed for it.
#'  * Blank: neither allele was called even though known alleles were supplied.
#'  * Incorrect: at least one called allele does not match.
#'
#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
#' both set to NA, map to NA in the returned factor.
#' @md
#'
#' @param results_summary cross-sample summary data frame as produced by
#'   \code{\link{analyze_dataset}} with extra columns as produced by
#'   \code{\link{match_known_genotypes}}.
#'
#' @return factor defining genotyping result category for every row of the
#'   input data frame.
#'
#' @export
categorize_genotype_results <- function(results_summary) {
  # Encode one called/correct allele pair per row as a code from 0 to 4:
  #   0: both present, but different (simple mismatch)
  #   1: called present, correct NA (no correct allele to match this one)
  #   2: called NA, correct present (a correct allele was missed)
  #   3: called NA, correct NA (correctly did not report an allele)
  #   4: both present and equal (match)
  encode_pair <- function(called, correct) {
    code <- 2 * is.na(called) + is.na(correct)
    # Code 0 means neither value is NA, so the comparison is only consulted
    # where it is well-defined; elsewhere FALSE & NA is still FALSE.
    is_match <- code == 0 & called == correct
    code[is_match] <- 4
    code
  }

  first <- encode_pair(results_summary$Allele1Seq,
                       results_summary$CorrectAllele1Seq)
  second <- encode_pair(results_summary$Allele2Seq,
                        results_summary$CorrectAllele2Seq)

  # Outcome for every combination of codes: one row per first-allele code (0
  # through 4, top to bottom) and one column per second-allele code (0 through
  # 4, left to right).  Combinations that should never come up for
  # correctly-labeled genotypes are NA.
  # NOTE(review): first code 2 with second code 3 (nothing called, a correct
  # allele listed only for the first position) is "Incorrect", whereas the
  # mirror-image case (first 3, second 2) is "Blank" -- confirm this asymmetry
  # is intended.
  outcomes <- rbind(
    # First allele mismatched: Incorrect whatever the second allele is.
    c("Incorrect", "Incorrect", "Incorrect",      "Incorrect", "Incorrect"),
    # First allele called with no correct allele listed.
    c("Incorrect", NA,          "Incorrect",      NA,          "Incorrect"),
    # First allele incorrectly blank.
    c("Incorrect", "Incorrect", "Blank",          "Incorrect", "Dropped Allele"),
    # First allele blank and no correct allele listed for it either.
    c("Incorrect", NA,          "Blank",          NA,          "Correct"),
    # First allele correct.
    c("Incorrect", "Incorrect", "Dropped Allele", "Correct",   "Correct")
  )

  # Transposing puts the table in row-major order, so it can be indexed as a
  # flat vector with five entries per first-allele code (plus one for R's
  # 1-based indexing).
  labels <- t(outcomes)[first * 5 + second + 1]
  factor(labels, levels = c("Correct", "Dropped Allele", "Blank", "Incorrect"))
}
19 changes: 19 additions & 0 deletions R/summarize_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
#' * dist_mat_known: if genotypes.known is given, this distance matrix of
#' sample-to-individual values will be present, from
#' \code{\link{make_dist_mat_known}}.
#'
#' If genotypes.known is given *and* a Name column is present in
#' \code{results$summary}, samples will be matched with the genotypes in
#' genotypes.known and additional columns will be present in the summary data
#' frame:
#' * CorrectAllele1Seq: One correct allele sequence for the individual. The
#' order of this and \code{CorrectAllele2Seq} will be matched to
#' \code{Allele1Seq} and \code{Allele2Seq} if possible. See
#' \code{\link{match_known_genotypes}}.
#' * CorrectAllele2Seq: A second correct allele sequence, as above.
#' * GenotypeResult: Categorization for each entry as Correct, Incorrect,
#' Blank, or Dropped Allele. See \code{\link{categorize_genotype_results}}.
#'
#' @md
#'
#' @param results list containing summary data frame and sample-specific data
Expand All @@ -35,6 +48,12 @@ summarize_dataset <- function(results, genotypes.known=NULL) {
results$dist_mat_known <- make_dist_mat_known(results$summary,
genotypes.known)
results$genotypes.known <- genotypes.known
if ("Name" %in% colnames(results$summary)) {
results$summary <- cbind(results$summary,
match_known_genotypes(results$summary, results$genotypes.known))
results$summary$GenotypeResult <- categorize_genotype_results(
results$summary)
}
}
return(results)
}
Expand Down
6 changes: 4 additions & 2 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,16 @@ name_alleles_in_table <- function(data, known_alleles=NULL, name_args=list()) {
#' Remove shared path from file paths
#'
#' For the given character vector of file paths, create a modified version with
#' any common prefix path removed.
#' any common prefix path removed. Forward slashes are used as the path
#' separator on all platforms.
#'
#' @param fps_full character vector of file paths.
#'
#' @return character vector of same length as input, with any common directory
#' structure trimmed off.
remove_shared_root_dir <- function(fps_full) {
fps <- normalizePath(fps_full, mustWork = FALSE)
fps <- gsub("\\\\", "/", fps_full)
fps <- normalizePath(fps, mustWork = FALSE, winslash = "/")
chunks <- lapply(strsplit(fps, "/"), function(segs) segs[segs != ""])
minlen <- min(sapply(chunks, length))
dirs <- do.call(rbind, lapply(chunks, "[", 1:minlen))
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ high-throughput sequencing datasets.

For automated installation and program usage see GUIDE.pdf in a
[released version](https://github.com/ShawHahnLab/chiimp/releases).
The most recent released version is [0.2.0](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.0).
The most recent released version is [0.2.1](https://github.com/ShawHahnLab/chiimp/releases/tag/0.2.1).
For usage as an R package also see the built-in package documentation.
Empty file modified install_mac.command
100644 → 100755
Empty file.
65 changes: 65 additions & 0 deletions install_windows.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Install CHIIMP on Windows.
#
# Installs devtools, Bioconductor's msa package, and the package dependencies,
# runs the package tests, installs the package itself, and creates a desktop
# shortcut to the installed chiimp.cmd.

# Find the path to the directory containing this script.  We need this for
# package testing and installation below.
args <- commandArgs()
f <- gsub("^--file=", "", args[grep("^--file=", args)])
f <- normalizePath(f)
path <- dirname(f)

UPROF <- Sys.getenv("USERPROFILE")

# If no library paths are writeable, try creating a user library.
if (! any(file.access(.libPaths(), 2) == 0)) {
  # This is the directory I see RStudio create automatically on first start,
  # and the command-line R also detects it.
  # The directory name uses major.minor only (e.g. "3.5"), so trim the patch
  # level off of version$minor.
  ver <- paste(version$major, sub("\\..*", "", version$minor), sep = ".")
  dp <- file.path(UPROF, "Documents", "R", "win-library", ver)
  dir.create(dp, recursive = TRUE)
  # On a second run through this will get picked up automatically,
  # but if we want it right now we have to add it to the list manually.
  .libPaths(dp)
}

cat("\n")
cat("### Installing devtools\n")
cat("\n")
install.packages("devtools", repos = "https://cloud.r-project.org")

cat("\n")
cat("### Installing Bioconductor and MSA\n")
cat("\n")
source("https://bioconductor.org/biocLite.R")
biocLite("msa")

cat("\n")
cat("### Installing dependencies\n")
cat("\n")
devtools::install_deps(path, dependencies = TRUE)

cat("\n")
cat("### Testing CHIIMP\n")
cat("\n")
# Total of the "failed" column across all test results.
status <- sum(as.data.frame(devtools::test(path))$failed)
# Warn on any nonzero total; comparing against exactly 1 stayed silent when
# more than one failure was counted.
if (status > 0) {
  cat("\n")
  cat("\n")
  cat(" Warning: Tests indicated failures.\n")
  cat("\n")
  cat("\n")
}

cat("\n")
cat("### Installing CHIIMP\n")
cat("\n")
devtools::install(path)

# Create a desktop shortcut pointing at the installed chiimp.cmd, by way of
# the WScript.Shell COM object in PowerShell.
shortcut_path <- file.path(UPROF, "Desktop", "CHIIMP.lnk")
chiimp_path <- system.file("bin", "chiimp.cmd", package = "chiimp")
# https://stackoverflow.com/a/30029955/6073858
args <- c(paste0("$s=(New-Object -COM WScript.Shell).CreateShortcut('",
                 shortcut_path,
                 "');"),
          paste0("$s.TargetPath='", chiimp_path, "';"),
          "$s.Save();")
system2("powershell", args)
Loading

0 comments on commit 9bb8c02

Please sign in to comment.