Skip to content

Commit

Permalink
Merge pull request #22 from ShawHahnLab/release-0.2.0
Browse files Browse the repository at this point in the history
Version 0.2.0
  • Loading branch information
ressy authored Feb 15, 2022
2 parents 3e121fe + 41972c4 commit ea96c91
Show file tree
Hide file tree
Showing 63 changed files with 3,138 additions and 260 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
# Changelog

## 0.2.0 - 2022-02-15

### Added

* Human germline FASTAs from IMGT ([#21])
* support for FASTA/FASTQ/CSV/TSV query inputs for the igblast and related
commands ([#18], [#19])
* convert command for FASTA/FASTQ/CSV/TSV file conversion, in place of the
more limited tab2seq command ([#14], [#16])
* Rhesus germline HV and HJ allele FASTAs from
[10.4049/jimmunol.1800342](https://doi.org/10.4049/jimmunol.1800342) ([#13])

[#21]: https://github.com/ShawHahnLab/igseq/pull/21
[#19]: https://github.com/ShawHahnLab/igseq/pull/19
[#18]: https://github.com/ShawHahnLab/igseq/pull/18
[#16]: https://github.com/ShawHahnLab/igseq/pull/16
[#14]: https://github.com/ShawHahnLab/igseq/pull/14
[#13]: https://github.com/ShawHahnLab/igseq/pull/13

## 0.1.1 - 2021-12-07

### Changed
Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html
{% set version = "0.1.1" %}
{% set version = "0.2.0" %}
{% set build = "0" %}

package:
Expand Down
120 changes: 84 additions & 36 deletions igseq/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from . import summarize
from . import vdj_gather
from . import vdj_match
from . import tab2seq
from . import convert
from . import show
from .util import IgSeqError
from .version import __version__
Expand All @@ -38,6 +38,22 @@ def wrap(txt):
chunks = txt.strip().split("\n\n")
return "\n\n".join([wrap(chunk) for chunk in chunks])

def args_to_colmap(args):
    """Make dictionary of column name mappings from cmd-line arguments.

    This is used for commands that work with tabular inputs/outputs.  Each
    argument whose name starts with "col_" and whose value is not None becomes
    one entry: the "col_" prefix is dropped and the abbreviated words "seq",
    "desc" and "qual" are spelled out, so "col_seq_id" maps to "sequence_id".

    args: parsed argument namespace (any object that vars() accepts).
    Returns a dict of long-form column key -> column name supplied by the user.
    """
    longer = {"desc": "description", "seq": "sequence", "qual": "quality"}
    colmap = {}
    for key, val in vars(args).items():
        # Require the whole "col_" prefix so that an unrelated argument that
        # merely begins with "col" (e.g. "color") is not mistaken for a column
        # option and stored under an empty-string key.
        if not key.startswith("col_") or val is None:
            continue
        # convert arguments like "col_seq_id" to "sequence_id"
        words = key.split("_")[1:]
        colmap["_".join(longer.get(word, word) for word in words)] = val
    return colmap

def main(arglist=None):
"""Command-line interface.
Expand Down Expand Up @@ -138,22 +154,28 @@ def _main_list(args):
show.list_files(text_items=args.text)

def _main_igblast(args, extra_igblastn_args=None):
    """Dispatch the parsed "igblast" arguments to igblast.igblast.

    extra_igblastn_args is forwarded unchanged as the extra_args keyword.
    """
    column_names = args_to_colmap(args)
    options = {
        "query_path": args.query,
        "ref_paths": args.reference,
        "db_path": args.database,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "extra_args": extra_igblastn_args,
        "dry_run": args.dry_run,
        "threads": args.threads,
    }
    igblast.igblast(**options)

def _main_summarize(args):
    """Dispatch the parsed "summarize" arguments to summarize.summarize."""
    column_names = args_to_colmap(args)
    options = {
        "ref_paths": args.reference,
        "query": args.query,
        "output": args.output,
        "showtxt": args.show,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "dry_run": args.dry_run,
    }
    summarize.summarize(**options)

def _main_vdj_gather(args):
Expand All @@ -163,24 +185,27 @@ def _main_vdj_gather(args):
dry_run=args.dry_run)

def _main_vdj_match(args):
    """Dispatch the parsed "vdj-match" arguments to vdj_match.vdj_match."""
    column_names = args_to_colmap(args)
    options = {
        "ref_paths": args.reference,
        "query": args.query,
        "output": args.output,
        "showtxt": args.show,
        "species": args.species,
        "fmt_in": args.input_format,
        "colmap": column_names,
        "dry_run": args.dry_run,
    }
    vdj_match.vdj_match(**options)

def _main_tab2seq(args):
    """Dispatch the parsed "tab2seq" arguments to tab2seq.tab2seq."""
    options = {
        "tab_path_in": args.input,
        "seq_path_out": args.output,
        "seq_col": args.seq_col,
        "seq_id_col": args.seq_id_col,
        "seq_desc_col": args.seq_desc_col,
        "qual_col": args.seq_qual_col,
        "tab_fmt": args.tab_fmt,
        "seq_fmt": args.seq_fmt,
    }
    tab2seq.tab2seq(**options)
def _main_convert(args):
    """Dispatch the parsed "convert" arguments to convert.convert."""
    column_names = args_to_colmap(args)
    options = {
        "path_in": args.input,
        "path_out": args.output,
        "fmt_in": args.input_format,
        "fmt_out": args.output_format,
        "colmap": column_names,
        "dummyqual": args.dummy_qual,
        "dry_run": args.dry_run,
    }
    convert.convert(**options)

def _setup_log(verbose, quiet, prefix):
# Handle warnings via logging
Expand Down Expand Up @@ -240,9 +265,9 @@ def __setup_arg_parser():
help="Find closest-matching germline VDJ sequences",
description=rewrap(vdj_match.__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter)
p_tab2seq = subps.add_parser("tab2seq",
help="Convert CSV/TSV to FASTA/FASTQ",
description=rewrap(tab2seq.__doc__),
p_convert = subps.add_parser("convert",
help="Convert FASTA/FASTQ/CSV/TSV",
description=rewrap(convert.__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter)
p_show = subps.add_parser("show",
help="show file contents",
Expand Down Expand Up @@ -347,13 +372,20 @@ def __setup_arg_parser():

__add_common_args(p_igblast)
p_igblast.add_argument("-Q", "--query", required=True,
help="query FASTA")
help="query input")
p_igblast.add_argument("-r", "--reference", nargs="+",
help="one or more FASTA/directory/builtin names pointing to V/D/J FASTA files")
p_igblast.add_argument("-d", "--database",
help="optional persistent database directory name (default: use temp directory)")
p_igblast.add_argument("-S", "--species",
help="species to use (human or rhesus). Default: infer from database if possible")
p_igblast.add_argument("--input-format",
help="format of query input "
"(default: detect from input filename if possible)")
p_igblast.add_argument("--col-seq-id",
help="Name of column containing sequence IDs (for tabular query input)")
p_igblast.add_argument("--col-seq",
help="Name of column containing sequences (for tabular query input)")
p_igblast.add_argument("-t", "--threads", type=int, default=1,
help="number of threads for parallel processing (default: 1)")
p_igblast.set_defaults(func=_main_igblast)
Expand All @@ -365,6 +397,13 @@ def __setup_arg_parser():
help="query FASTA")
p_summarize.add_argument("-S", "--species",
help="species to use (human or rhesus). Default: infer from database if possible")
p_summarize.add_argument("--input-format",
help="format of query input "
"(default: detect from input filename if possible)")
p_summarize.add_argument("--col-seq-id",
help="Name of column containing sequence IDs (for tabular query input)")
p_summarize.add_argument("--col-seq",
help="Name of column containing sequences (for tabular query input)")
p_summarize.add_argument("-o", "--output",
help="Output filename")
p_summarize.add_argument("--show", action=argparse.BooleanOptionalAction,
Expand All @@ -386,34 +425,43 @@ def __setup_arg_parser():
help="query FASTA")
p_vdj_match.add_argument("-S", "--species",
help="species to use (human or rhesus). Default: infer from database if possible")
p_vdj_match.add_argument("--input-format",
help="format of query input "
"(default: detect from input filename if possible)")
p_vdj_match.add_argument("--col-seq-id",
help="Name of column containing sequence IDs (for tabular query input)")
p_vdj_match.add_argument("--col-seq",
help="Name of column containing sequences (for tabular query input)")
p_vdj_match.add_argument("-o", "--output",
help="Output filename")
p_vdj_match.add_argument("--show", action=argparse.BooleanOptionalAction,
help="Explicitly enable/disable showing the results directly on standard output "
"(default: disabled if using file output, enabled otherwise)")
p_vdj_match.set_defaults(func=_main_vdj_match)

__add_common_args(p_tab2seq)
p_tab2seq.add_argument("input",
help="one CSV or TSV file path, or a literal '-' for standard input")
p_tab2seq.add_argument("output",
help="one FASTA or FASTQ file path, or a literal '-' for standard output")
p_tab2seq.add_argument("--seq-col", required=True,
help="name of table column containing sequences")
p_tab2seq.add_argument("--seq-id-col", required=True,
help="name of table column containing sequence IDs")
p_tab2seq.add_argument("--seq-desc-col",
help="name of table column containing sequence descriptions (optional)")
p_tab2seq.add_argument("--seq-qual-col",
help="name of table column containing sequence quality "
"scores as PHRED+33 (for FASTQ output only)")
p_tab2seq.add_argument("--tab-fmt",
help="Format of input: tsv or csv. "
"default is detected from input filename if possible")
p_tab2seq.add_argument("--seq-fmt",
help="Format of output: fasta or fastq. "
"default is detected from output filename if possible")
p_tab2seq.set_defaults(func=_main_tab2seq)
__add_common_args(p_convert)
p_convert.add_argument("input",
help="input file path, or a literal '-' for standard input")
p_convert.add_argument("output",
help="output file path, or a literal '-' for standard output")
p_convert.add_argument("--input-format",
help="format of input "
"(default: detect from input filename if possible)")
p_convert.add_argument("--output-format",
help="format of output "
"(default: detect from output filename if possible)")
p_convert.add_argument("--col-seq-id",
help="Name of column containing sequence IDs (for tabular input/output)")
p_convert.add_argument("--col-seq",
help="Name of column containing sequences (for tabular input/output)")
p_convert.add_argument("--col-seq-qual",
help="Name of column containing sequence qualities (for tabular input/output)")
p_convert.add_argument("--col-seq-desc",
help="Name of column containing sequence descriptions (for tabular input/output)")
p_convert.add_argument("-d", "--dummy-qual",
help="Quality score to use for all bases for applicable output types, "
'as text (e.g. use "I" for 40)')
p_convert.set_defaults(func=_main_convert)

return parser

Expand Down
23 changes: 23 additions & 0 deletions igseq/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Convert between various sequence and tabular file formats.
Input and output formats are by default inferred from filenames but can be
given explicitly if needed. The formats are:
fa: FASTA
fagz: gzipped FASTA
fq: FASTQ
fqgz: gzipped FASTQ
csv: comma-separated values
csvgz: gzipped comma-separated values
tsv: tab-separated values
tsvgz: gzipped tab-separated values
"""

from .record import RecordReader, RecordWriter

def convert(path_in, path_out, fmt_in=None, fmt_out=None, colmap=None, dummyqual=None, dry_run=False):
    """Copy every record from path_in to path_out, one record at a time.

    Records are read with a RecordReader built from path_in, fmt_in and
    colmap, and each one is handed to a RecordWriter built from path_out,
    fmt_out, colmap and dummyqual.  dry_run is passed to both objects.
    """
    source = RecordReader(path_in, fmt_in, colmap, dry_run=dry_run)
    with source as reader:
        sink = RecordWriter(path_out, fmt_out, colmap, dummyqual=dummyqual, dry_run=dry_run)
        with sink as writer:
            for entry in reader:
                writer.write(entry)
22 changes: 22 additions & 0 deletions igseq/data/examples/convert.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# Example usage of "igseq convert".  EXAMPLES may be set by the caller;
# otherwise it defaults to the examples directory inside the igseq package
# data.  Every expansion of $EXAMPLES is quoted so that a data path containing
# spaces is still passed to igseq as a single argument.
[ -v EXAMPLES ] || EXAMPLES=$(python -c 'import igseq.util; print(igseq.util.DATA)')/examples

# converting FASTA to FASTA just unwraps it
igseq convert "$EXAMPLES/inputs/convert/wrapped.fasta" unwrapped.fasta

# or, convert to CSV/TSV
igseq convert "$EXAMPLES/inputs/convert/wrapped.fasta" unwrapped.csv

# or .fastq.gz to .fasta
igseq convert "$EXAMPLES/inputs/convert/unwrapped.fastq.gz" unwrapped2.fasta

# a - can be used for stdin/stdout, but the format has to be given explicitly:
igseq convert --input-format fa --output-format fa - - < "$EXAMPLES/inputs/convert/wrapped.fasta" > unwrapped3.fasta

# other table formats can be converted to FASTA or FASTQ if the column names to
# use are specified. the default would find sequence_id and sequence columns
# from AIRR:
igseq convert "$EXAMPLES/inputs/convert/airr.tsv" from_airr.fasta
# or maybe we want the junctions instead:
igseq convert --col-seq junction "$EXAMPLES/inputs/convert/airr.tsv" from_airr_junctions.fasta
12 changes: 12 additions & 0 deletions igseq/data/examples/igblast.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# An arbitrary antibody sequence pulled from one of our datasets that looks
# complete and in-frame
QUERY=$EXAMPLES/inputs/igblast/query.fasta
# A .fastq.gz version, to show off flexibility in query formats
QUERY_FQGZ=$EXAMPLES/inputs/igblast/query.fastq.gz
QUERY_CSV=$EXAMPLES/inputs/igblast/query.csv

# using the built-in Rhesus germline reference from IMGT and using the default
# text output
Expand All @@ -30,3 +33,12 @@ igseq igblast -r rhesus -Q $QUERY -outfmt 19 | cut -f 10,62
# The -num_alignments_V argument clashes with igseq's -n, so we need to use --
# to clarify. igseq will remove the extra - when calling igblastn.
igseq igblast -r rhesus -Q $QUERY -outfmt 7 --num_alignments_V 5

# like the first example, except giving fastq.gz as the query. It'll
# automatically be converted to FASTA while being passed to the igblastn
# command.
igseq igblast -r rhesus/imgt -Q $QUERY_FQGZ

# or using tabular (CSV/TSV) input, and specifying which columns have the IDs
# and sequences
igseq igblast -r rhesus/imgt -Q $QUERY_CSV --col-seq-id SeqID --col-seq Seq -outfmt 19 -out igblast2.tsv
Loading

0 comments on commit ea96c91

Please sign in to comment.