Merge pull request #31 from ShawHahnLab/release-0.2.3

Release 0.2.3
ShawHahnLab · Mar 14, 2019 · c519b98 · c519b98
2 parents 1d08387 + 3491d23
commit c519b98
Show file tree

Hide file tree

Showing 45 changed files with 447 additions and 330 deletions.
diff --git a/.utils/prep_release.sh b/.utils/prep_release.sh
@@ -5,37 +5,55 @@
 set -e
 
 VERSION=$1
+SEP="==="
 
 chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'
 
+echo "$SEP Running spell check"
+./.utils/spellcheck.R
+
 # Run lint script
-echo "Running lint check"
+echo "$SEP Running lint check"
 ./.utils/lint.R
 
-# Update version in download link in README
-VER_MSG="The most recent released version is"
-TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
-SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
-sed -i -r "$SED_README" README.md
+if [[ $VERSION != "" ]]; then
+	# Update version in download link in README
+	VER_MSG="The most recent released version is"
+	TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
+	SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
+	sed -i -r "$SED_README" README.md
 
-# Update version in DESCRIPTION and NEWS.md
-sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
-sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+	# Update version in DESCRIPTION and NEWS.md
+	sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
+	sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
+fi
 
+echo "$SEP Running devtools::check()"
 R --slave --vanilla -e "$chiimp_check"
+
+echo "$SEP Rendering user guide"
 R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"
 
 # Create bundled ZIP and TGZ versions without hidden top level files (such as
 # the git and travis stuff) and with the GUIDE.pdf.
-pushd ..
-zip -r chiimp-v${VERISON}.zip chiimp/*
-tar czvf chiimp-v${VERSION}.tgz chiimp/*
-popd
+if [[ $VERSION != "" ]]; then
+	# Archives are only built for a real release; both names embed $VERSION.
+	echo "$SEP Creating release archives"
+	pushd ..
+	zip -r chiimp-v${VERSION}.zip chiimp/*
+	tar czvf chiimp-v${VERSION}.tgz chiimp/*
+	popd
+fi
 
 echo
 echo "REMINDER BEFORE TAGGING RELEASE $VERSION:"
 echo
 echo " * Run full test on Mac OS, Windows, and Linux"
 echo " * Update NEWS.md with all updates under a heading matching this version"
+echo " * Check README.md for link to this version"
 echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct"
 echo
+echo "ALSO:"
+echo " * Draft release from tag on github including archive files with bundled"
+echo "   GUIDE.pdf"
+echo " * Merge release-### into master, dev, and gh-pages"
+echo
diff --git a/.utils/spellcheck.R b/.utils/spellcheck.R
@@ -0,0 +1,12 @@
+#!/usr/bin/env Rscript
+
+# Spell-check the package documentation files.  Note that the documentation
+# has to be updated first, e.g. with devtools::document().
+
+ignore <- read.table(".utils/wordlist.txt",  # path relative to working dir
+                     header = FALSE,
+                     stringsAsFactors = FALSE)[, 1]  # first column: words to skip
+results <- devtools::spell_check(ignore = ignore)
+if (length(results) > 0) {  # only show results when there is something to show
+  results
+}
diff --git a/.utils/wordlist.txt b/.utils/wordlist.txt
@@ -0,0 +1,26 @@
+ABCD
+ACTG
+artifactual
+autocalculated
+Autogenerate
+CHIIMP
+CHIIMP's
+config
+Connell
+dereplicated
+Dereplicates
+FASTA
+FASTQ
+genotype
+Genotype
+heterozygous
+Heterozygous
+homozygous
+Homozygous
+MSA
+pandoc
+Pandoc
+PPI
+seqs
+STR
+YAML
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chiimp
 Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
-Version: 0.2.2
+Version: 0.2.3
 Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
 Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
 Depends: R (>= 3.2.3)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,12 @@
+# chiimp 0.2.3
+
+ * Fixed package checks and testing on latest R development releases ([#27]).
+ * Fixed test behavior on Windows and improved test organization ([#16]).
+ * Added documentation corrections and improvements.
+
+[#27]: https://github.com/ShawHahnLab/chiimp/issues/27
+[#16]: https://github.com/ShawHahnLab/chiimp/issues/16
+
 # chiimp 0.2.2
 
  * Fixed heatmap plotting via updated `plot_heatmap` for cases with blank

diff --git a/R/analyze_dataset.R b/R/analyze_dataset.R
@@ -189,9 +189,9 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
 #' For the given results list (pair of summary data frame and list of per-sample
 #' data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns
 #' to all data frames defining names for recognized sequences.  For the summary
-#' data frame this will be Allele1Name and Allele2Name.  For each sample data
-#' frame this will be SeqName, defined for any sequences represented in the
-#' summary or in a given known alleles set.
+#' data frame this will be \code{Allele1Name} and \code{Allele2Name}.  For each
+#' sample data frame this will be \code{SeqName}, defined for any sequences
+#' represented in the summary or in a given known alleles set.
 #'
 #' @param results results list as produced by
 #'   \code{\link{tidy_analyzed_dataset}}.
@@ -202,10 +202,11 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
 #'   \code{\link{make_allele_name}}.
 #'
 #' @return list of results, with \code{summary} set to the single summary data
-#'   frame and \code{data} the per-sample data frames.  A "SeqName" column in
-#'   sample data frames and "Allele1Name" and "Allele2Name" columns in the
-#'   summary data frame will associate any sequence matching a known allele (for
-#'   either the given table or the current dataset) with a text name.
+#'   frame and \code{data} the per-sample data frames.  A \code{SeqName} column
+#'   in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns
+#'   in the summary data frame will associate any sequence matching a known
+#'   allele (for either the given table or the current dataset) with a text
+#'   name.
 name_known_sequences <- function(results, known_alleles, name_args) {
   # Name all of the called alleles across samples
   results$summary <- name_alleles_in_table(results$summary, known_alleles,

diff --git a/R/analyze_sample.R b/R/analyze_sample.R
@@ -76,12 +76,13 @@ analyze_sample <- function(seq_data, sample.attrs, fraction.min) {
 }
 
 #' @describeIn analyze_sample version of sample analysis guided by expected
-#'   sequence length values.  Additional items ExpectedLength1 and optionally
-#'   ExpectedLength2 can be supplied in the \code{sample.attrs} list.  If NA or
-#'   missing the behavior will match \code{analyze_sample}.  If two expected
-#'   lengths are given, the fraction.min argument is ignored.  If at least one
-#'   expected length is given, the stutter/artifact filtering is disabled.  From
-#'   here use \code{\link{summarize_sample_guided}}.
+#'   sequence length values.  Additional items \code{ExpectedLength1} and
+#'   optionally \code{ExpectedLength2} can be supplied in the
+#'   \code{sample.attrs} list.  If NA or missing the behavior will match
+#'   \code{analyze_sample}.  If two expected lengths are given, the fraction.min
+#'   argument is ignored.  If at least one expected length is given, the
+#'   stutter/artifact filtering is disabled.  From here use
+#'   \code{\link{summarize_sample_guided}}.
 #'
 #' @export
 analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) {

diff --git a/R/analyze_seqs.R b/R/analyze_seqs.R
@@ -9,27 +9,27 @@
 #'
 #' @details
 #' Columns in the returned data frame:
-#'  * Seq: sequence text for each unique sequence
-#'  * Count: integer count of occurrences of this exact sequence
-#'  * Length: integer sequence length
-#'  * MatchingLocus: factor for the name of the locus matching each sequence,
-#'    by checking the primer
-#'  * MotifMatch: logical: are there are least \code{nrepeats} perfect
+#'  * \code{Seq}: sequence text for each unique sequence
+#'  * \code{Count}: integer count of occurrences of this exact sequence
+#'  * \code{Length}: integer sequence length
+#'  * \code{MatchingLocus}: factor for the name of the locus matching each
+#'  sequence, by checking the primer
+#'  * \code{MotifMatch}: logical: are there at least \code{nrepeats} perfect
 #'    adjacent repeats of the STR motif for the matching locus?
-#'  * LengthMatch: logical: is the sequence length within the expected range
-#'    for the matching locus?
-#'  * Ambiguous: logical: are there unexpected characters in the sequence
+#'  * \code{LengthMatch}: logical: is the sequence length within the expected
+#'  range for the matching locus?
+#'  * \code{Ambiguous}: logical: are there unexpected characters in the sequence
 #'  content?
-#'  * Stutter: integer: for any sequence that looks like potential PCR stutter,
-#'    the index of the row that may be the source of the stutter band.
-#'  * Artifact: integer: for any sequence that looks like potential PCR artifact
-#'  (other than stutter), the index of the row that may be the source of the
-#'  stutter band.
-#'  * FractionOfTotal: numeric fraction of the number of sequences
+#'  * \code{Stutter}: integer: for any sequence that looks like potential PCR
+#'  stutter, the index of the row that may be the source of the stutter band.
+#'  * \code{Artifact}: integer: for any sequence that looks like potential PCR
+#'  artifact (other than stutter), the index of the row that may be the source
+#'  of the artifact.
+#'  * \code{FractionOfTotal}: numeric fraction of the number of sequences
 #'    represented by each unique sequence compared to the total.
-#'  * FractionOfLocus: numeric fraction of the number of sequences represented
-#'    by each unique sequence compared to the total for that particular
-#'    matching locus.
+#'  * \code{FractionOfLocus}: numeric fraction of the number of sequences
+#'  represented by each unique sequence compared to the total for that
+#'  particular matching locus.
 #' @md
 #'
 #' @param seqs character vector containing sequences.
@@ -214,8 +214,8 @@ find_stutter <- function(sample.data, locus_attrs,
 #' Searches a processed STR sample for entries that may be PCR artifacts, other
 #' than stutter, from another entry in the sample.  Potential artifacts are
 #' sequences with counts lower than another sequence by a given ratio and
-#' sequence length within 1 bp of the other sequence.  This only considers
-#' STR-labeled rows and requires a given entry to have counts at most
+#' sequence length within 1 nucleotide of the other sequence.  This only
+#' considers STR-labeled rows and requires a given entry to have counts at most
 #' \code{count.ratio_max} compared to the candidate "source" entry to be
 #' considered an artifact.  Sequence content is not currently considered, just
 #' relative sequence lengths and counts.

diff --git a/R/categorize.R b/R/categorize.R
@@ -4,10 +4,10 @@
 #'
 #' Using the Name column of the given results summary data frame, pair each
 #' called genotype with the known alleles.  A data frame with two columns,
-#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are
-#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at
-#' this point the two allele entries should match up directly for genotypes that
-#' were called correctly.
+#' \code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If
+#' matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the
+#' order will be preserved, and at this point the two allele entries should
+#' match up directly for genotypes that were called correctly.
 #'
 #' @param results_summary cross-sample summary data frame as produced by
 #'   \code{\link{analyze_dataset}}.
@@ -41,10 +41,10 @@ match_known_genotypes <- function(results_summary, genotypes.known) {
 
 #' Categorize genotyping results
 #'
-#' For a given results summary data frame that has CorrectAllele1Seq and Correct
-#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}})
-#' added, create a factor labeling every row of the input data frame by its
-#' genotyping outcome.
+#' For a given results summary data frame that has \code{CorrectAllele1Seq} and
+#' \code{CorrectAllele2Seq} columns (such as produced by
+#' \code{\link{match_known_genotypes}}) added, create a factor labeling every
+#' row of the input data frame by its genotyping outcome.
 #'
 #' @details
 #' Levels in the returned factor, in order:
@@ -56,8 +56,8 @@ match_known_genotypes <- function(results_summary, genotypes.known) {
 #' * Dropped Allele: One called allele is correct for a heterozygous individual,
 #'   but no second allele was called.
 #'
-#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
-#' both set to NA, map to NA in the returned factor.
+#' Cases that should not occur, such as \code{CorrectAllele1Seq} and
+#' \code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor.
 #' @md
 #'
 #' @param results_summary cross-sample summary data frame as produced by

diff --git a/R/chiimp.R b/R/chiimp.R
@@ -67,48 +67,49 @@
 #' The workflow above outlines CHIIMP's behavior when called as a standalone
 #' program, where \code{\link{main}} loads a configuration file into a nested
 #' list of options and calls \code{\link{full_analysis}}.  The public functions
-#' linked above can also be used idependently; see the documentation and code
+#' linked above can also be used independently; see the documentation and code
 #' examples for the individual functions for more information.
 #'
 #'
 #' **The Package structure of the source files, grouped by topic:**
 #'  * Main Interface:
-#'    * chiimp.R: Main entry point for command-line usage (\code{\link{main}})
-#'      and R usage (\code{\link{full_analysis}}).
+#'    * \code{chiimp.R}: Main entry point for command-line usage
+#'      (\code{\link{main}}) and R usage (\code{\link{full_analysis}}).
 #'  * Data Analysis:
-#'    * analyze_dataset.R: High-level interface to analyze all samples across a
-#'      given dataset (\code{\link{analyze_dataset}}); used by
+#'    * \code{analyze_dataset.R}: High-level interface to analyze all samples
+#'      across a given dataset (\code{\link{analyze_dataset}}); used by
 #'      \code{\link{full_analysis}} to manage the main part of the processing.
-#'    * summarize_dataset.R: High-level interface to provide inter-sample and
-#'      inter-locus analyses (\code{\link{summarize_dataset}}); used by
+#'    * \code{summarize_dataset.R}: High-level interface to provide inter-sample
+#'      and inter-locus analyses (\code{\link{summarize_dataset}}); used by
 #'      \code{\link{full_analysis}} to manage the second stage of the
 #'      processing.
-#'    * analyze_seqs.R: Low-level interface to convert raw sequence input to a
-#'      data frame of unique sequences (\code{\link{analyze_seqs}}); used by
-#'      \code{\link{analyze_dataset}}.
-#'    * analyze_sample.R: Low-level interface to extract per-locus details from
-#'      a data frame of unique sequences (\code{\link{analyze_sample}}); used by
-#'      \code{\link{analyze_dataset}}.
-#'    * summarize_sample.R: Low-level interface to condense each sample data
-#'      frame into a a concise list of consistent attributes, suitable for
+#'    * \code{analyze_seqs.R}: Low-level interface to convert raw sequence input
+#'      to a data frame of unique sequences (\code{\link{analyze_seqs}}); used
+#'      by \code{\link{analyze_dataset}}.
+#'    * \code{analyze_sample.R}: Low-level interface to extract per-locus
+#'      details from a data frame of unique sequences
+#'      (\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}.
+#'    * \code{summarize_sample.R}: Low-level interface to condense each sample
+#'      data frame into a concise list of consistent attributes, suitable for
 #'      binding together across samples for a dataset
 #'      (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}.
-#'    * categorize.R: Low-level helper functions used by
+#'    * \code{categorize.R}: Low-level helper functions used by
 #'      \code{\link{summarize_dataset}} for samples with known identity.
 #'  * Plotting and reporting:
-#'    * report.R: Various plotting and summarizing functions used when rendering
-#'      a report in \code{\link{full_analysis}}.
-#'    * histogram.R: Sequence histogram plotting tools (\code{\link{histogram}})
-#'      as used during \code{\link{full_analysis}}.
-#'    * markdown.R: Various helper functions for adding tables and plots to an R
-#'      Markdown report as used in \code{\link{full_analysis}}.
+#'    * \code{report.R}: Various plotting and summarizing functions used when
+#'      rendering a report in \code{\link{full_analysis}}.
+#'    * \code{histogram.R}: Sequence histogram plotting tools
+#'      (\code{\link{histogram}}) as used during \code{\link{full_analysis}}.
+#'    * \code{markdown.R}: Various helper functions for adding tables and plots
+#'      to an R Markdown report as used in \code{\link{full_analysis}}.
 #'  * Utility Functions and Configuration:
-#'    * configuration.R: The default configuration options
-#'    (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}.
-#'    * io.R: various helper input/output functions used loading and saving
-#'    sequence data files, spreadsheets, and plots used in multiple parts of the
-#'    package.
-#'    * util.R: Various helper functions used in multiple parts of the package.
+#'    * \code{configuration.R}: The default configuration options
+#'      (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}.
+#'    * \code{io.R}: Various helper input/output functions used for loading and
+#'      saving sequence data files, spreadsheets, and plots used in multiple
+#'      parts of the package.
+#'    * \code{util.R}: Various helper functions used in multiple parts of the
+#'      package.
 #'
 #' @md
 #'

diff --git a/R/configuration.R b/R/configuration.R
@@ -8,18 +8,18 @@
 #' itself to see all of the build-time defaults.
 #'
 #' Notable Options:
-#'   * dataset_opts:
-#'     * dp: directory path to input sequence files
-#'     * pattern: regular expression for the input filename pattern
-#'     * ord: order of fields Replicate, Sample, and Locus in in the input
-#'     filename pattern.  For example, if Locus is the first field followed by
-#'     Replicate and Sample, set \code{ord=c(3, 1, 2)}.
-#'   * output:
-#'     * dp: directory path for saving output data
-#'   * fp_dataset: file path to table of sample attributes to use, rather than
-#'     detecting via dataset_opts
-#'   * fp_locus_attrs: file path to locus attributes CSV file
-#'   * fp_genotypes_known: file path to known genotypes CSV file
+#'   * \code{dataset_opts}:
+#'     * \code{dp}: directory path to input sequence files
+#'     * \code{pattern}: regular expression for the input filename pattern
+#'     * \code{ord}: order of fields Replicate, Sample, and Locus in the
+#'     input filename pattern.  For example, if Locus is the first field
+#'     followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}.
+#'   * \code{output}:
+#'     * \code{dp}: directory path for saving output data
+#'   * \code{fp_dataset}: file path to table of sample attributes to use, rather
+#'     than detecting via dataset_opts
+#'   * \code{fp_locus_attrs}: file path to locus attributes CSV file
+#'   * \code{fp_genotypes_known}: file path to known genotypes CSV file
 #' @md
 #'
 #' @export