Skip to content

Commit

Permalink
Merge pull request #31 from ShawHahnLab/release-0.2.3
Browse files Browse the repository at this point in the history
Release 0.2.3
  • Loading branch information
ressy authored Mar 14, 2019
2 parents 1d08387 + 3491d23 commit c519b98
Show file tree
Hide file tree
Showing 45 changed files with 447 additions and 330 deletions.
44 changes: 31 additions & 13 deletions .utils/prep_release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,55 @@
set -e

VERSION=$1
SEP="==="

chiimp_check='x<-devtools::check();quit(save="no",status=length(c(x$errors,x$warnings)))'

echo "$SEP Running spell check"
./.utils/spellcheck.R

# Run lint script
echo "Running lint check"
echo "$SEP Running lint check"
./.utils/lint.R

# Update version in download link in README
VER_MSG="The most recent released version is"
TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
sed -i -r "$SED_README" README.md
if [[ $VERSION != "" ]]; then
# Update version in download link in README
VER_MSG="The most recent released version is"
TAG_URL="https\\://github.com/ShawHahnLab/chiimp/releases/tag"
SED_README="s:$VER_MSG \\[[0-9.]+\\]\\($TAG_URL/[0-9.]+\\)\\.:$VER_MSG [$VERSION]($TAG_URL/$VERSION).:"
sed -i -r "$SED_README" README.md

# Update version in DESCRIPTION and NEWS.md
sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
# Update version in DESCRIPTION and NEWS.md
sed -i "s/Version: .*$/Version: $VERSION/" DESCRIPTION
sed -i "s/# chiimp dev/# chiimp $VERSION/" NEWS.md
fi

echo "$SEP Running devtools::check()"
R --slave --vanilla -e "$chiimp_check"

echo "$SEP Rendering user guide"
R --slave --vanilla -e "rmarkdown::render('GUIDE.Rmd', output_file = 'GUIDE.pdf', quiet = TRUE)"

# Create bundled ZIP and TGZ versions without hidden top level files (such as
# the git and travis stuff) and with the GUIDE.pdf.
pushd ..
zip -r chiimp-v${VERISON}.zip chiimp/*
tar czvf chiimp-v${VERSION}.tgz chiimp/*
popd
# Only build release archives when a version was given on the command line.
if [[ $VERSION != "" ]]; then
  echo "$SEP Creating release archives"
  # Archive from the parent directory so paths inside the archives start with
  # chiimp/. The chiimp/* glob is left unquoted on purpose: it is what leaves
  # out the hidden top-level files.
  pushd ..
  # Both archive names must use $VERSION. The ZIP name previously referenced a
  # misspelt variable, which expanded to nothing and produced chiimp-v.zip.
  zip -r "chiimp-v${VERSION}.zip" chiimp/*
  tar czvf "chiimp-v${VERSION}.tgz" chiimp/*
  popd
fi

echo
echo "REMINDER BEFORE TAGGING RELEASE $VERSION:"
echo
echo " * Run full test on Mac OS, Windows, and Linux"
echo " * Update NEWS.md with all updates under a heading matching this version"
echo " * Check README.md for link to this version"
echo " * Make sure GUIDE.Rmd is up-to-date and rendered GUIDE.pdf is correct"
echo
echo "ALSO:"
echo " * Draft release from tag on github including archive files with bundled"
echo " GUIDE.pdf"
echo " * Merge release-### into master, dev, and gh-pages"
echo
12 changes: 12 additions & 0 deletions .utils/spellcheck.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env Rscript

# Spell-check the package documentation files. The documentation may need to
# be regenerated first (e.g. with devtools::document()) so that the check sees
# the current text.

# First column of the word list file holds the words to accept as correct.
wordlist <- read.table(".utils/wordlist.txt",
                       header = FALSE,
                       stringsAsFactors = FALSE)
ignore_words <- wordlist[, 1]

# Report whatever the spell checker flags, if anything.
misspelled <- devtools::spell_check(ignore = ignore_words)
if (length(misspelled) > 0) {
  print(misspelled)
}
26 changes: 26 additions & 0 deletions .utils/wordlist.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
ABCD
ACTG
artifactual
autocalculated
Autogenerate
CHIIMP
CHIIMP's
config
Connell
dereplicated
Dereplicates
FASTA
FASTQ
genotype
Genotype
heterozygous
Heterozygous
homozygous
Homozygous
MSA
pandoc
Pandoc
PPI
seqs
STR
YAML
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: chiimp
Title: Computational, High-throughput Individual Identification through Microsatellite Profiling
Version: 0.2.2
Version: 0.2.3
Authors@R: person("Jesse", "Connell", email = "[email protected]", role = c("aut", "cre"))
Description: An R package to analyze microsatellites in high-throughput sequencing datasets.
Depends: R (>= 3.2.3)
Expand Down
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# chiimp 0.2.3

* Fixed package checks and testing on latest R development releases ([#27]).
* Fixed test behavior on Windows and improved test organization ([#16]).
* Added documentation corrections and improvements.

[#27]: https://github.com/ShawHahnLab/chiimp/issues/27
[#16]: https://github.com/ShawHahnLab/chiimp/issues/16

# chiimp 0.2.2

* Fixed heatmap plotting via updated `plot_heatmap` for cases with blank
Expand Down
15 changes: 8 additions & 7 deletions R/analyze_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,9 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
#' For the given results list (pair of summary data frame and list of per-sample
#' data frames as produced by \code{\link{tidy_analyzed_dataset}}), add columns
#' to all data frames defining names for recognized sequences. For the summary
#' data frame this will be Allele1Name and Allele2Name. For each sample data
#' frame this will be SeqName, defined for any sequences represented in the
#' summary or in a given known alleles set.
#' data frame this will be \code{Allele1Name} and \code{Allele2Name}. For each
#' sample data frame this will be \code{SeqName}, defined for any sequences
#' represented in the summary or in a given known alleles set.
#'
#' @param results results list as produced by
#' \code{\link{tidy_analyzed_dataset}}.
Expand All @@ -202,10 +202,11 @@ tidy_analyzed_dataset <- function(dataset, raw.results) {
#' \code{\link{make_allele_name}}.
#'
#' @return list of results, with \code{summary} set to the single summary data
#' frame and \code{data} the per-sample data frames. A "SeqName" column in
#' sample data frames and "Allele1Name" and "Allele2Name" columns in the
#' summary data frame will associate any sequence matching a known allele (for
#' either the given table or the current dataset) with a text name.
#' frame and \code{data} the per-sample data frames. A \code{SeqName} column
#' in sample data frames and \code{Allele1Name} and \code{Allele2Name} columns
#' in the summary data frame will associate any sequence matching a known
#' allele (for either the given table or the current dataset) with a text
#' name.
name_known_sequences <- function(results, known_alleles, name_args) {
# Name all of the called alleles across samples
results$summary <- name_alleles_in_table(results$summary, known_alleles,
Expand Down
13 changes: 7 additions & 6 deletions R/analyze_sample.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,13 @@ analyze_sample <- function(seq_data, sample.attrs, fraction.min) {
}

#' @describeIn analyze_sample version of sample analysis guided by expected
#' sequence length values. Additional items ExpectedLength1 and optionally
#' ExpectedLength2 can be supplied in the \code{sample.attrs} list. If NA or
#' missing the behavior will match \code{analyze_sample}. If two expected
#' lengths are given, the fraction.min argument is ignored. If at least one
#' expected length is given, the stutter/artifact filtering is disabled. From
#' here use \code{\link{summarize_sample_guided}}.
#' sequence length values. Additional items \code{ExpectedLength1} and
#' optionally \code{ExpectedLength2} can be supplied in the
#' \code{sample.attrs} list. If NA or missing the behavior will match
#' \code{analyze_sample}. If two expected lengths are given, the fraction.min
#' argument is ignored. If at least one expected length is given, the
#' stutter/artifact filtering is disabled. From here use
#' \code{\link{summarize_sample_guided}}.
#'
#' @export
analyze_sample_guided <- function(seq_data, sample.attrs, fraction.min) {
Expand Down
40 changes: 20 additions & 20 deletions R/analyze_seqs.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,27 @@
#'
#' @details
#' Columns in the returned data frame:
#' * Seq: sequence text for each unique sequence
#' * Count: integer count of occurrences of this exact sequence
#' * Length: integer sequence length
#' * MatchingLocus: factor for the name of the locus matching each sequence,
#' by checking the primer
#' * MotifMatch: logical: are there are least \code{nrepeats} perfect
#' * \code{Seq}: sequence text for each unique sequence
#' * \code{Count}: integer count of occurrences of this exact sequence
#' * \code{Length}: integer sequence length
#' * \code{MatchingLocus}: factor for the name of the locus matching each
#' sequence, by checking the primer
#' * \code{MotifMatch}: logical: are there at least \code{nrepeats} perfect
#' adjacent repeats of the STR motif for the matching locus?
#' * LengthMatch: logical: is the sequence length within the expected range
#' for the matching locus?
#' * Ambiguous: logical: are there unexpected characters in the sequence
#' * \code{LengthMatch}: logical: is the sequence length within the expected
#' range for the matching locus?
#' * \code{Ambiguous}: logical: are there unexpected characters in the sequence
#' content?
#' * Stutter: integer: for any sequence that looks like potential PCR stutter,
#' the index of the row that may be the source of the stutter band.
#' * Artifact: integer: for any sequence that looks like potential PCR artifact
#' (other than stutter), the index of the row that may be the source of the
#' stutter band.
#' * FractionOfTotal: numeric fraction of the number of sequences
#' * \code{Stutter}: integer: for any sequence that looks like potential PCR
#' stutter, the index of the row that may be the source of the stutter band.
#' * \code{Artifact}: integer: for any sequence that looks like potential PCR
#' artifact (other than stutter), the index of the row that may be the source
#' of the artifact.
#' * \code{FractionOfTotal}: numeric fraction of the number of sequences
#' represented by each unique sequence compared to the total.
#' * FractionOfLocus: numeric fraction of the number of sequences represented
#' by each unique sequence compared to the total for that particular
#' matching locus.
#' * \code{FractionOfLocus}: numeric fraction of the number of sequences
#' represented by each unique sequence compared to the total for that
#' particular matching locus.
#' @md
#'
#' @param seqs character vector containing sequences.
Expand Down Expand Up @@ -214,8 +214,8 @@ find_stutter <- function(sample.data, locus_attrs,
#' Searches a processed STR sample for entries that may be PCR artifacts, other
#' than stutter, from another entry in the sample. Potential artifacts are
#' sequences with counts lower than another sequence by a given ratio and
#' sequence length within 1 bp of the other sequence. This only considers
#' STR-labeled rows and requires a given entry to have counts at most
#' sequence length within 1 nucleotide of the other sequence. This only
#' considers STR-labeled rows and requires a given entry to have counts at most
#' \code{count.ratio_max} compared to the candidate "source" entry to be
#' considered an artifact. Sequence content is not currently considered, just
#' relative sequence lengths and counts.
Expand Down
20 changes: 10 additions & 10 deletions R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
#'
#' Using the Name column of the given results summary data frame, pair each
#' called genotype with the known alleles. A data frame with two columns,
#' CorrectAllele1Seq and CorrectAllele2Seq, is returned. If matching entries are
#' found in Allele1Seq and/or Allele2Seq the order will be preserved, and at
#' this point the two allele entries should match up directly for genotypes that
#' were called correctly.
#' \code{CorrectAllele1Seq} and \code{CorrectAllele2Seq}, is returned. If
#' matching entries are found in \code{Allele1Seq} and/or \code{Allele2Seq} the
#' order will be preserved, and at this point the two allele entries should
#' match up directly for genotypes that were called correctly.
#'
#' @param results_summary cross-sample summary data frame as produced by
#' \code{\link{analyze_dataset}}.
Expand Down Expand Up @@ -41,10 +41,10 @@ match_known_genotypes <- function(results_summary, genotypes.known) {

#' Categorize genotyping results
#'
#' For a given results summary data frame that has CorrectAllele1Seq and Correct
#' Allele2Seq columns (such as produced by \code{\link{match_known_genotypes}})
#' added, create a factor labeling every row of the input data frame by its
#' genotyping outcome.
#' For a given results summary data frame that has \code{CorrectAllele1Seq} and
#' \code{CorrectAllele2Seq} columns (such as produced by
#' \code{\link{match_known_genotypes}}) added, create a factor labeling every
#' row of the input data frame by its genotyping outcome.
#'
#' @details
#' Levels in the returned factor, in order:
Expand All @@ -56,8 +56,8 @@ match_known_genotypes <- function(results_summary, genotypes.known) {
#' * Dropped Allele: One called allele is correct for a heterozygous individual,
#' but no second allele was called.
#'
#' Cases that should not occur, such as CorrectAllele1Seq and CorrectAllele2Seq
#' both set to NA, map to NA in the returned factor.
#' Cases that should not occur, such as \code{CorrectAllele1Seq} and
#' \code{CorrectAllele2Seq} both set to NA, map to NA in the returned factor.
#' @md
#'
#' @param results_summary cross-sample summary data frame as produced by
Expand Down
57 changes: 29 additions & 28 deletions R/chiimp.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,48 +67,49 @@
#' The workflow above outlines CHIIMP's behavior when called as a standalone
#' program, where \code{\link{main}} loads a configuration file into a nested
#' list of options and calls \code{\link{full_analysis}}. The public functions
#' linked above can also be used idependently; see the documentation and code
#' linked above can also be used independently; see the documentation and code
#' examples for the individual functions for more information.
#'
#'
#' **The Package structure of the source files, grouped by topic:**
#' * Main Interface:
#' * chiimp.R: Main entry point for command-line usage (\code{\link{main}})
#' and R usage (\code{\link{full_analysis}}).
#' * \code{chiimp.R}: Main entry point for command-line usage
#' (\code{\link{main}}) and R usage (\code{\link{full_analysis}}).
#' * Data Analysis:
#' * analyze_dataset.R: High-level interface to analyze all samples across a
#' given dataset (\code{\link{analyze_dataset}}); used by
#' * \code{analyze_dataset.R}: High-level interface to analyze all samples
#' across a given dataset (\code{\link{analyze_dataset}}); used by
#' \code{\link{full_analysis}} to manage the main part of the processing.
#' * summarize_dataset.R: High-level interface to provide inter-sample and
#' inter-locus analyses (\code{\link{summarize_dataset}}); used by
#' * \code{summarize_dataset.R}: High-level interface to provide inter-sample
#' and inter-locus analyses (\code{\link{summarize_dataset}}); used by
#' \code{\link{full_analysis}} to manage the second stage of the
#' processing.
#' * analyze_seqs.R: Low-level interface to convert raw sequence input to a
#' data frame of unique sequences (\code{\link{analyze_seqs}}); used by
#' \code{\link{analyze_dataset}}.
#' * analyze_sample.R: Low-level interface to extract per-locus details from
#' a data frame of unique sequences (\code{\link{analyze_sample}}); used by
#' \code{\link{analyze_dataset}}.
#' * summarize_sample.R: Low-level interface to condense each sample data
#' frame into a a concise list of consistent attributes, suitable for
#' * \code{analyze_seqs.R}: Low-level interface to convert raw sequence input
#' to a data frame of unique sequences (\code{\link{analyze_seqs}}); used
#' by \code{\link{analyze_dataset}}.
#' * \code{analyze_sample.R}: Low-level interface to extract per-locus
#' details from a data frame of unique sequences
#' (\code{\link{analyze_sample}}); used by \code{\link{analyze_dataset}}.
#' * \code{summarize_sample.R}: Low-level interface to condense each sample
#' data frame into a concise list of consistent attributes, suitable for
#' binding together across samples for a dataset
#' (\code{\link{summarize_sample}}); used by \code{\link{analyze_dataset}}.
#' * categorize.R: Low-level helper functions used by
#' * \code{categorize.R}: Low-level helper functions used by
#' \code{\link{summarize_dataset}} for samples with known identity.
#' * Plotting and reporting:
#' * report.R: Various plotting and summarizing functions used when rendering
#' a report in \code{\link{full_analysis}}.
#' * histogram.R: Sequence histogram plotting tools (\code{\link{histogram}})
#' as used during \code{\link{full_analysis}}.
#' * markdown.R: Various helper functions for adding tables and plots to an R
#' Markdown report as used in \code{\link{full_analysis}}.
#' * \code{report.R}: Various plotting and summarizing functions used when
#' rendering a report in \code{\link{full_analysis}}.
#' * \code{histogram.R}: Sequence histogram plotting tools
#' (\code{\link{histogram}}) as used during \code{\link{full_analysis}}.
#' * \code{markdown.R}: Various helper functions for adding tables and plots
#' to an R Markdown report as used in \code{\link{full_analysis}}.
#' * Utility Functions and Configuration:
#' * configuration.R: The default configuration options
#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}.
#' * io.R: various helper input/output functions used loading and saving
#' sequence data files, spreadsheets, and plots used in multiple parts of the
#' package.
#' * util.R: Various helper functions used in multiple parts of the package.
#' * \code{configuration.R}: The default configuration options
#' (\code{\link{config.defaults}}) used by \code{\link{full_analysis}}.
#' * \code{io.R}: various helper input/output functions used for loading
#' and saving sequence data files, spreadsheets, and plots used in multiple
#' parts of the package.
#' * \code{util.R}: Various helper functions used in multiple parts of the
#' package.
#'
#' @md
#'
Expand Down
24 changes: 12 additions & 12 deletions R/configuration.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
#' itself to see all of the build-time defaults.
#'
#' Notable Options:
#' * dataset_opts:
#' * dp: directory path to input sequence files
#' * pattern: regular expression for the input filename pattern
#' * ord: order of fields Replicate, Sample, and Locus in in the input
#' filename pattern. For example, if Locus is the first field followed by
#' Replicate and Sample, set \code{ord=c(3, 1, 2)}.
#' * output:
#' * dp: directory path for saving output data
#' * fp_dataset: file path to table of sample attributes to use, rather than
#' detecting via dataset_opts
#' * fp_locus_attrs: file path to locus attributes CSV file
#' * fp_genotypes_known: file path to known genotypes CSV file
#' * \code{dataset_opts}:
#' * \code{dp}: directory path to input sequence files
#' * \code{pattern}: regular expression for the input filename pattern
#' * \code{ord}: order of fields Replicate, Sample, and Locus in the
#' input filename pattern. For example, if Locus is the first field
#' followed by Replicate and Sample, set \code{ord=c(3, 1, 2)}.
#' * \code{output}:
#' * \code{dp}: directory path for saving output data
#' * \code{fp_dataset}: file path to table of sample attributes to use, rather
#' than detecting via dataset_opts
#' * \code{fp_locus_attrs}: file path to locus attributes CSV file
#' * \code{fp_genotypes_known}: file path to known genotypes CSV file
#' @md
#'
#' @export
Expand Down
Loading

0 comments on commit c519b98

Please sign in to comment.