Skip to content

Commit

Permalink
Merge pull request #93 from ShawHahnLab/release-0.4.1
Browse files Browse the repository at this point in the history
Release 0.4.1
  • Loading branch information
ressy authored Jan 10, 2023
2 parents 229ffbd + 73aca8f commit 646f52e
Show file tree
Hide file tree
Showing 131 changed files with 2,873 additions and 2,400 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,4 @@ workflows:
- build:
matrix:
parameters:
rversion: ["3.6.3", "4.2.1"]
rversion: ["3.6.3", "4.2.1", "4.2.2"]
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: chiimp
Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
Version: 0.4.0
Version: 0.4.1
Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
Depends: R (>= 3.6)
Expand Down Expand Up @@ -33,7 +33,7 @@ Imports:
Remotes:
github::sherrillmix/dnar,
github::sherrillmix/dnaplotr
RoxygenNote: 7.1.1
RoxygenNote: 7.2.0
Suggests:
testthat,
roxygen2
16 changes: 16 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# chiimp 0.4.1

* Overhauled unit tests with more modular saved inputs and expected outputs
([#92])
* Made warnings encountered in the chiimp script display as they occur, rather
than at the end ([#91])
* Made `load_dataset` warn about repeated Sample+Replicate+Locus entries
across rows ([#90])
* Made `load_config` warn about any unrecognized configuration file entries
([#88])

[#92]: https://github.com/ShawHahnLab/chiimp/pull/92
[#91]: https://github.com/ShawHahnLab/chiimp/pull/91
[#90]: https://github.com/ShawHahnLab/chiimp/pull/90
[#88]: https://github.com/ShawHahnLab/chiimp/pull/88

# chiimp 0.4.0

* Fixed desktop icon drag-and-drop for recent Linux distributions ([#85])
Expand Down
40 changes: 20 additions & 20 deletions R/analyze_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,24 +54,24 @@
#'
#' @export
analyze_dataset <- function(
dataset,
locus_attrs,
nrepeats,
stutter.count.ratio_max = config.defaults$seq_analysis$
stutter.count.ratio_max,
artifact.count.ratio_max = config.defaults$seq_analysis$
artifact.count.ratio_max,
use_reverse_primers = config.defaults$seq_analysis$
use_reverse_primers,
reverse_primer_r1 = config.defaults$seq_analysis$
reverse_primer_r1,
ncores = 0,
analysis_opts,
summary_opts,
analysis_function=analyze_sample,
summary_function=summarize_sample,
known_alleles=NULL,
name_args=list()) {
dataset,
locus_attrs,
nrepeats,
stutter.count.ratio_max = config.defaults$seq_analysis$
stutter.count.ratio_max,
artifact.count.ratio_max = config.defaults$seq_analysis$
artifact.count.ratio_max,
use_reverse_primers = config.defaults$seq_analysis$
use_reverse_primers,
reverse_primer_r1 = config.defaults$seq_analysis$
reverse_primer_r1,
ncores = 0,
analysis_opts,
summary_opts,
analysis_function = analyze_sample,
summary_function = summarize_sample,
known_alleles = NULL,
name_args = list()) {
if (! all(dataset$Locus %in% locus_attrs$Locus)) {
rogue_loci <- unique(dataset$Locus[! dataset$Locus %in% locus_attrs$Locus])
msg <- paste("ERROR: Locus names in dataset not in attributes table:",
Expand Down Expand Up @@ -241,8 +241,8 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
#' name.
name_known_sequences <- function(results, known_alleles, name_args) {
# Name all of the called alleles across samples
results$summary <- name_alleles_in_table(results$summary, known_alleles,
name_args)
results$summary <- name_alleles_in_table(
results$summary, known_alleles, name_args)

# Create table of allele names for current dataset
a1 <- results$summary[, c("Locus", "Allele1Seq", "Allele1Name")]
Expand Down
17 changes: 17 additions & 0 deletions R/analyze_sample.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,20 @@ analyze_sample_naive <- function(seq_data, sample.attrs, fraction.min) {
Category[is.na(Category)] <- "Prominent"
})
}

#' Check Sample Data for Potential Allele Matches
#'
#' Check the entries in a processed sample data frame for potential matches to a
#' given locus.
#'
#' @param sample_data data frame of processed data for sample as produced by
#' \code{\link{analyze_seqs}}.
#' @param locus.name character name of locus to match against.
#'
#' @return logical vector of entries for potential alleles.
allele_match <- function(sample_data, locus.name) {
with(sample_data,
as.character(MatchingLocus) == locus.name &
MotifMatch &
LengthMatch)
}
31 changes: 16 additions & 15 deletions R/analyze_seqs.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,14 @@
#' locus_attrs,
#' num_adjacent_repeats)
analyze_seqs <- function(
seqs, locus_attrs, nrepeats,
stutter.count.ratio_max = config.defaults$seq_analysis$
stutter.count.ratio_max,
artifact.count.ratio_max = config.defaults$seq_analysis$
artifact.count.ratio_max,
use_reverse_primers = config.defaults$seq_analysis$
use_reverse_primers,
reverse_primer_r1 = config.defaults$seq_analysis$reverse_primer_r1) {
seqs, locus_attrs, nrepeats,
stutter.count.ratio_max = config.defaults$seq_analysis$
stutter.count.ratio_max,
artifact.count.ratio_max = config.defaults$seq_analysis$
artifact.count.ratio_max,
use_reverse_primers = config.defaults$seq_analysis$
use_reverse_primers,
reverse_primer_r1 = config.defaults$seq_analysis$reverse_primer_r1) {
# Dereplicate sequences
tbl <- table(seqs)
count <- as.integer(tbl)
Expand Down Expand Up @@ -117,8 +117,9 @@ analyze_seqs <- function(
# subsets of the original sample data.
data$FractionOfTotal <- data$Count / sum(data$Count)
data$FractionOfLocus <- with(data, {
total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc)
sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE)))
total_per_locus <- unlist(lapply(levels(MatchingLocus), function(loc) {
sum(data[MatchingLocus == loc, "Count"], na.rm = TRUE)
}))
names(total_per_locus) <- levels(MatchingLocus)
Count / total_per_locus[MatchingLocus]
})
Expand All @@ -138,8 +139,8 @@ analyze_seqs <- function(
#' should be reverse complemented before comparing.
#'
#' @return factor of locus names corresponding the matched primer sequences.
find_matching_primer <- function(sample.data, locus_attrs,
forward=TRUE, reverse_primer_r1=TRUE) {
find_matching_primer <- function(
sample.data, locus_attrs, forward = TRUE, reverse_primer_r1 = TRUE) {
seqs <- sample.data$Seq
if (forward) {
primercol <- "Primer"
Expand Down Expand Up @@ -232,7 +233,7 @@ check_length <- function(sample.data, locus_attrs) {
#' entry that may have produced each entry as a stutter band.
find_stutter <- function(
sample.data, locus_attrs,
count.ratio_max=config.defaults$seq_analysis$stutter.count.ratio_max) {
count.ratio_max = config.defaults$seq_analysis$stutter.count.ratio_max) {

stutter <- integer(nrow(sample.data)) * NA

Expand Down Expand Up @@ -281,8 +282,8 @@ find_stutter <- function(
#' @return integer vector specifying, for each entry, the row index for another
#' entry that may have produced each entry as an artifactual sequence.
find_artifact <- function(
sample.data, locus_attrs,
count.ratio_max=config.defaults$seq_analysis$artifact.count.ratio_max) {
sample.data, locus_attrs,
count.ratio_max = config.defaults$seq_analysis$artifact.count.ratio_max) {

artifact <- integer(nrow(sample.data)) * NA

Expand Down
2 changes: 1 addition & 1 deletion R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ match_known_genotypes <- function(results_summary, genotypes.known) {
CorrectAllele2Seq = genotypes.known[idx, "Allele2Seq"],
stringsAsFactors = FALSE)
# Ensure ordering within pairs matches samples, if possible.
for (i in 1:nrow(result)) {
for (i in seq_len(nrow(result))) {
a <- results_summary[i, c("Allele1Seq", "Allele2Seq")]
kg <- result[i, ]
idx <- match(a, kg)
Expand Down
42 changes: 20 additions & 22 deletions R/chiimp.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@
#' }
#'
#' @export
full_analysis <- function(config, dataset=NULL) {
full_analysis <- function(config, dataset = NULL) { # nolint: cyclocomp_linter.
# Overaly explicit configuration onto the default settings
config_full <- utils::modifyList(config.defaults, config)

Expand Down Expand Up @@ -191,23 +191,20 @@ full_analysis <- function(config, dataset=NULL) {
allele.names <- NULL
if (!is.null(cfg$fp_allele_names))
allele.names <- load_allele_names(cfg$fp_allele_names)
results <- analyze_dataset(dataset, locus_attrs,
nrepeats = cfg$seq_analysis$nrepeats,
stutter.count.ratio_max = cfg$seq_analysis$
stutter.count.ratio_max,
artifact.count.ratio_max = cfg$seq_analysis$
artifact.count.ratio_max,
use_reverse_primers = cfg$seq_analysis$
use_reverse_primers,
reverse_primer_r1 = cfg$seq_analysis$
reverse_primer_r1,
ncores = cfg$dataset_analysis$ncores,
analysis_opts = cfg$sample_analysis_opts,
summary_opts = cfg$sample_summary_opts,
analysis_function = sample_analysis_func,
summary_function = sample_summary_func,
known_alleles = allele.names,
name_args = cfg$dataset_analysis$name_args)
results <- analyze_dataset(
dataset, locus_attrs,
nrepeats = cfg$seq_analysis$nrepeats,
stutter.count.ratio_max = cfg$seq_analysis$stutter.count.ratio_max,
artifact.count.ratio_max = cfg$seq_analysis$artifact.count.ratio_max,
use_reverse_primers = cfg$seq_analysis$use_reverse_primers,
reverse_primer_r1 = cfg$seq_analysis$reverse_primer_r1,
ncores = cfg$dataset_analysis$ncores,
analysis_opts = cfg$sample_analysis_opts,
summary_opts = cfg$sample_summary_opts,
analysis_function = sample_analysis_func,
summary_function = sample_summary_func,
known_alleles = allele.names,
name_args = cfg$dataset_analysis$name_args)
results$allele.names <- allele.names
results$locus_attrs <- locus_attrs
if (cfg$verbose) logmsg("Summarizing results...")
Expand All @@ -216,9 +213,10 @@ full_analysis <- function(config, dataset=NULL) {
genotypes.known <- load_genotypes(cfg$fp_genotypes_known)
results <- summarize_dataset(results, genotypes.known)
if (!is.null(cfg$fp_genotypes_known))
results$closest_matches <- find_closest_matches(results$dist_mat_known,
range = cfg$report.dist_range,
maximum = cfg$report.dist_max)
results$closest_matches <- find_closest_matches(
results$dist_mat_known,
range = cfg$report.dist_range,
maximum = cfg$report.dist_max)
results$config <- config_full
if (cfg$verbose) logmsg("Saving output files...")
save_data(results, results$config)
Expand Down Expand Up @@ -260,7 +258,7 @@ full_analysis <- function(config, dataset=NULL) {
#' }
#'
#' @export
main <- function(args=NULL) {
main <- function(args = NULL) {
if (missing(args))
args <- commandArgs(trailingOnly = TRUE)
desc <- "Identify microsatellite alleles fasta/fastq files"
Expand Down
61 changes: 30 additions & 31 deletions R/histogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ histogram <- function(seq_data,
# mark top edges of allele sequences (to handle the same-length case)
# label bars$topknown by name? positioning may get tricky.
bars <- str_hist_setup(seq_data, sample_data)
if (render & nrow(seq_data) > 0) {
if (render && nrow(seq_data) > 0) {
if (is.null(cutoff_fraction)) {
cutoff_fraction <- attr(sample_data, "fraction.min")
}
Expand All @@ -47,7 +47,7 @@ histogram <- function(seq_data,
#' @return list of data frames for the sets of counts-versus-length bars drawn
#' in the plot, split by category.
str_hist_setup <- function(seq_data, sample_data = NULL) {
vec_to_df <- function(data, cols=c("Length", "Count")) {
vec_to_df <- function(data, cols = c("Length", "Count")) {
if (length(data) == 0) {
df <- data.frame(Length = as.integer(NA), Count = as.integer(NA))[0, ]
} else {
Expand Down Expand Up @@ -86,9 +86,9 @@ str_hist_setup <- function(seq_data, sample_data = NULL) {
"Count"])
}
bars <- lapply(bars, function(b) {
rownames(b) <- NULL
b
})
rownames(b) <- NULL
b
})
bars
}

Expand All @@ -108,25 +108,27 @@ str_hist_render <- function(bars, main, xlim, cutoff_fraction) {
categories <- str_hist_setup_legend(bars)

ylim <- range(bars$orig$Count)
graphics::plot(c(),
c(),
main = main,
xlab = "Length (nt)",
ylab = "Sequence Count",
xlim = xlim,
ylim = ylim)
graphics::plot(
c(),
c(),
main = main,
xlab = "Length (nt)",
ylab = "Sequence Count",
xlim = xlim,
ylim = ylim)

# How wide should each bar be, in pixels, to be flush with the adjacent bars?
lwd <- max(1, get_px_width()) # at least one pixel

for (nm in names(bars)) {
if (nrow(bars[[nm]]) > 0) {
graphics::points(bars[[nm]]$Length,
bars[[nm]]$Count,
type = "h",
col = categories[nm, "col"],
lend = 1,
lwd = lwd)
graphics::points(
bars[[nm]]$Length,
bars[[nm]]$Count,
type = "h",
col = categories[nm, "col"],
lend = 1,
lwd = lwd)
}
}

Expand All @@ -144,20 +146,18 @@ str_hist_render <- function(bars, main, xlim, cutoff_fraction) {
# Draw domain of sample data
ymax <- max(bars$orig$Count)
xlim_filt <- range(as.integer(bars$filt$Length))
graphics::polygon(x = rep(xlim_filt, each = 2),
y = c(0, ymax, ymax, 0),
col = "#0000001E",
border = NA)
graphics::polygon(
x = rep(xlim_filt, each = 2),
y = c(0, ymax, ymax, 0),
col = "#0000001E",
border = NA)
}

# Draw legend
leg <- categories[categories$Render, c("legend", "col", "pch", "lty")]
filt <- sapply(leg, function(x) all(is.na(x)))
leg <- leg[! filt]
do.call(graphics::legend,
c(list(x = "topright",
bty = "n"),
leg))
do.call(graphics::legend, c(list(x = "topright", bty = "n"), leg))
}

#' Setup display attributes for STR histogram
Expand All @@ -181,9 +181,8 @@ str_hist_setup_legend <- function(bars) {
col = c("black", "gray", "pink", "blue", "red", "#00000080"),
pch = c(15, 15, 15, 15, 15, NA),
lty = c(NA, NA, NA, NA, NA, 1),
stringsAsFactors = FALSE
)
categories$Render[1:length(bars)] <- TRUE
stringsAsFactors = FALSE)
categories$Render[seq_len(length(bars))] <- TRUE
rownames(categories) <- categories$Name
categories
}
Expand All @@ -195,8 +194,8 @@ str_hist_setup_legend <- function(bars) {
get_px_width <- function() {
# https://stackoverflow.com/questions/17213293/how-to-get-r-plot-window-size
px_width_fig <- grDevices::dev.size("px")[1] # width of whole figure in pixels
px_width_plt <- diff(graphics::par("plt")[1:2] * px_width_fig) # just the plot region
width_plt <- diff(graphics::par("usr")[1:2]) # width of plot region in plot units
px_width_plt <- diff(graphics::par("plt")[1:2] * px_width_fig) # plt region
width_plt <- diff(graphics::par("usr")[1:2]) # plot region (in plot units)
step_width <- px_width_plt / width_plt # pixels per plot unit increment
step_width
}
Loading

0 comments on commit 646f52e

Please sign in to comment.