Merge pull request #29 from ShawHahnLab/release-0.3.0
Version 0.3.0

ressy authored Jul 14, 2022
2 parents ea96c91 + 0adb1eb commit 72257a6

Showing 15 changed files with 1,028 additions and 908 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -2,7 +2,7 @@ version: 2.1
jobs:
build:
machine:
image: circleci/classic:latest
image: ubuntu-2004:202201-02
steps:
- checkout:
path: igseq
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,27 @@
# Changelog

## 0.3.0 - 2022-07-14

### Added

* Support for .afa (as FASTA) and .fq (as FASTQ) in input/output handling
([#26])

### Changed

* Replaced Rhesus germline reference "bernat2021" with "kimdb" version 1.1,
containing additions and corrections since the original publication, and
added a CSV of germline reference information ([#28])

### Fixed

* vdj-match now skips references that are missing gene segments but were only
included implicitly rather than directly named ([#25])

[#28]: https://github.com/ShawHahnLab/igseq/pull/28
[#26]: https://github.com/ShawHahnLab/igseq/pull/26
[#25]: https://github.com/ShawHahnLab/igseq/pull/25

## 0.2.0 - 2022-02-15

### Added
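The .afa/.fq support noted in the 0.3.0 changelog entry above hooks into the existing format inference, so the effect is visible just from constructing a record handler on a path. A small illustration based on the tests added further down (the file names are hypothetical; format inference only looks at the extension, so the files need not exist):

```python
from pathlib import Path
from igseq import record

# .afa now maps to FASTA and .fq to FASTQ ([#26]); .gz still stacks on top.
print(record.RecordHandler(Path("alignment.afa")).fmt)   # -> "fa"
print(record.RecordHandler(Path("reads.fq")).fmt)        # -> "fq"
print(record.RecordHandler(Path("reads.fastq.gz")).fmt)  # -> "fqgz"
```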
2 changes: 1 addition & 1 deletion conda/meta.yaml
@@ -1,5 +1,5 @@
# https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html
{% set version = "0.2.0" %}
{% set version = "0.3.0" %}
{% set build = "0" %}

package:
6 changes: 6 additions & 0 deletions igseq/data/germ.csv
@@ -0,0 +1,6 @@
Ref,DOI,URL,Notes
rhesus/zhang2019,10.4049/jimmunol.1800342,https://doi.org/10.4049/jimmunol.1800342,IGHV and IGHJ only
rhesus/kimdb,10.1016/j.immuni.2020.12.018,http://kimdb.gkhlab.se/datasets,IGH only; KIMDB v1.1 (updated since publication)
rhesus/sonarramesh,10.3389/fimmu.2017.01407,https://github.com/scharch/SONAR/tree/master/germDB,Trimmed subset of full published sequences
rhesus/imgt,,https://www.imgt.org/download/V-QUEST/IMGT_V-QUEST_reference_directory/Macaca_mulatta/IG,
human/imgt,,https://www.imgt.org/download/V-QUEST/IMGT_V-QUEST_reference_directory/Homo_sapiens/IG,
Original file line number Diff line number Diff line change
@@ -39,13 +39,13 @@ AGGATATTGTAGTGGTGGTGTCTGCTATGCC
>IGHD2-9*01
AGGATATTGTACTAGTACTACTTGCTATGCC
>IGHD3-15*01
GTATTACGAGGATGATTACGGTTACTATTACACCCACAGCGT
GTATTACGAGGATGATTACGGTTACTATTACACC
>IGHD3-16*01
GTATTACTATAGTGGTAGTTATTACTACCACAGTGT
GTATTACTATAGTGGTAGTTATTACTAC
>IGHD3-17*01
GTACTGGGGTGATTATTATGAC
>IGHD3-18*01
GTATTACTATGGTAGTGGTTATTACACCCACAGCGT
GTATTACTATGGTAGTGGTTATTACACC
>IGHD3-19*02_S4720
GTACTGGAGTGATTATTATGAC
>IGHD3-20*01
File renamed without changes.

Large diffs are not rendered by default.

65 changes: 35 additions & 30 deletions igseq/record.py
@@ -17,6 +17,25 @@
DEFAULT_DUMMY_QUAL = "I"
DEFAULT_COLUMNS = {k: k for k in ["sequence_id", "sequence", "sequence_quality", "sequence_description"]}

# Mapping from file extensions to file format names used here
FMT_EXT_MAP = {
".csv": "csv",
".tsv": "tsv",
".tab": "tsv",
".fa": "fa",
".fasta": "fa",
".afa": "fa",
".fq": "fq",
".fastq": "fq"}

# Mapping from file format names to functions that take file handles and give
# format-specific record iterators.
READERS = {
"csv": lambda hndl: csv.DictReader(hndl),
"tsv": lambda hndl: csv.DictReader(hndl, delimiter="\t"),
"fa": lambda hndl: SeqIO.parse(hndl, "fasta"),
"fq": lambda hndl: SeqIO.parse(hndl, "fastq")}

class RecordHandler:
"""Abstract class for shared generic record reading and writing
functionality. This behaves as a context manager to handle file opening
@@ -63,28 +82,7 @@ def close(self):

def infer_fmt(self, fmt=None):
"""Guess file format from our path, with optional default."""
try:
path = Path(self.pathlike)
except TypeError:
path = Path(str(self.pathlike.name))
fmt_inferred = None
if path.suffix.lower() in [".csv"]:
fmt_inferred = "csv"
elif path.suffix.lower() in [".tsv", ".tab"]:
fmt_inferred = "tsv"
elif path.suffix.lower() in [".fa", ".fasta"]:
fmt_inferred = "fa"
elif path.suffix.lower() in [".fq", ".fastq"]:
fmt_inferred = "fq"
elif path.suffix.lower() == ".gz":
if Path(path.stem).suffix.lower() in [".fa", ".fasta"]:
fmt_inferred = "fagz"
elif Path(path.stem).suffix.lower() in [".fq", ".fastq"]:
fmt_inferred = "fqgz"
elif Path(path.stem).suffix.lower() in [".csv"]:
fmt_inferred = "csvgz"
elif Path(path.stem).suffix.lower() in [".tsv", ".tab"]:
fmt_inferred = "tsvgz"
fmt_inferred = self._infer_fmt(self.pathlike)
LOGGER.info("given format: %s", fmt)
LOGGER.info("inferred format: %s", fmt_inferred)
if not fmt:
@@ -95,6 +93,20 @@ def infer_fmt(self, fmt=None):
fmt = fmt_inferred
return fmt

@staticmethod
def _infer_fmt(path):
try:
path = Path(path)
except TypeError:
path = Path(str(path.name))
ext = path.suffix.lower()
ext2 = Path(path.stem).suffix.lower()
fmt2 = FMT_EXT_MAP.get(ext2)
if ext == ".gz" and fmt2:
return fmt2 + "gz"
else:
return FMT_EXT_MAP.get(ext)

def encode_record(self, record):
"""Convert record dictionary into a SeqRecord object."""
seq = record[self.colmap["sequence"]]
@@ -160,14 +172,7 @@ def open(self):
else:
LOGGER.info("reading from text")
self.handle = open(self.pathlike, "rt", encoding="ascii")
if self.fmt in ["csv", "csvgz"]:
self.reader = csv.DictReader(self.handle)
elif self.fmt in ["tsv", "tsvgz"]:
self.reader = csv.DictReader(self.handle, delimiter="\t")
elif self.fmt in ["fa", "fagz"]:
self.reader = SeqIO.parse(self.handle, "fasta")
elif self.fmt in ["fq", "fqgz"]:
self.reader = SeqIO.parse(self.handle, "fastq")
self.reader = READERS[self.fmt.removesuffix("gz")](self.handle)

def __iter__(self):
return self
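The refactored `infer_fmt` above replaces the earlier if/elif chain with a lookup in `FMT_EXT_MAP`, treating a trailing `.gz` as a modifier on the inner extension. A self-contained sketch of just that lookup, reusing only the extension map shown in the diff (not the rest of the igseq API):

```python
from pathlib import Path

# Extension-to-format table as in igseq/record.py
FMT_EXT_MAP = {
    ".csv": "csv", ".tsv": "tsv", ".tab": "tsv",
    ".fa": "fa", ".fasta": "fa", ".afa": "fa",
    ".fq": "fq", ".fastq": "fq"}

def infer_fmt(pathlike):
    """Guess a format name ("csv", "fa", "fqgz", ...) from a path or file-like object."""
    try:
        path = Path(pathlike)
    except TypeError:
        # file handles and the like: fall back to their .name attribute
        path = Path(str(pathlike.name))
    ext = path.suffix.lower()
    inner = FMT_EXT_MAP.get(Path(path.stem).suffix.lower())
    if ext == ".gz" and inner:
        return inner + "gz"
    return FMT_EXT_MAP.get(ext)

# infer_fmt("reads.afa") -> "fa"; infer_fmt("reads.fastq.gz") -> "fqgz";
# infer_fmt("notes.txt") -> None (the caller decides whether that's an error)
```

Keeping the extension and reader tables at module level also lets `open()` below pick a reader with a single dictionary lookup (`READERS[self.fmt.removesuffix("gz")]`) instead of another if/elif chain.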
29 changes: 23 additions & 6 deletions igseq/vdj_match.py
@@ -36,17 +36,34 @@ def vdj_match(ref_paths, query, output=None, showtxt=None, species=None, fmt_in=
species_det = {s for s in species_det if s}
organism = igblast.detect_organism(species_det, species)

vdj_files_grouped = vdj.group(
attrs_list,
lambda x: f"{x['species']}/{x['ref']}" if x["type"] == "internal" else x["input"])
# Group references into sets with V/D/J. If a segment is missing (note
# that D is always needed by igblast even for light chains), we'll remove
# the whole reference with a warning *if* it was found internally from a
# partial text match, but will fail outright if it was selected explicitly
# or supplied from external files.
def groupkey(row):
return(f"{row['species']}/{row['ref']}" if row["type"] == "internal" else row["input"])
vdj_files_grouped = vdj.group(attrs_list, groupkey)
# This is sloppy because it'll just take whichever is the last matching row
# (with its segment-specific stuff included), but it'll do for now
orig_attrs = {groupkey(row): row for row in attrs_list}
skips = set()
for key, trio in vdj_files_grouped.items():
LOGGER.info("detected V FASTA from %s: %d", key, len(trio["V"]))
LOGGER.info("detected D FASTA from %s: %d", key, len(trio["D"]))
LOGGER.info("detected J FASTA from %s: %d", key, len(trio["J"]))
for segment, attrs_group in trio.items():
if not attrs_group:
LOGGER.critical("No FASTA for %s from %s", segment, key)
raise util.IgSeqError("Missing VDJ input for database")
if key not in skips and not attrs_group:
# Only case where we'll just warn and continue is: it's an
# internal ref, and it was found implicitly (not by name).
if orig_attrs[key]["type"] == "internal" and orig_attrs[key]["input"] != key:
LOGGER.warning("No FASTA for %s from %s; skipping", segment, key)
skips.add(key)
else:
LOGGER.critical("No FASTA for %s from %s", segment, key)
raise util.IgSeqError("Missing VDJ input for database")
for key in skips:
del vdj_files_grouped[key]

if not dry_run:
results = []
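The new comments above describe the whole change: group each reference's V/D/J FASTA lists under one key, and for any group missing a segment either warn and drop it (if it was an internal reference matched implicitly) or fail (if it was named explicitly or supplied as external files). A simplified sketch of that decision, with hypothetical grouped data standing in for `vdj.group` output and plain `print`/`RuntimeError` standing in for the module's logger and `util.IgSeqError`:

```python
# Hypothetical grouping: key -> {"V": [...], "D": [...], "J": [...]} FASTA lists
grouped = {
    "rhesus/zhang2019": {"V": ["V.fasta"], "D": [], "J": ["J.fasta"]},
    "rhesus/kimdb": {"V": ["V.fasta"], "D": ["D.fasta"], "J": ["J.fasta"]}}
# Hypothetical per-key attributes: internal vs. external, and which input matched it
orig_attrs = {
    "rhesus/zhang2019": {"type": "internal", "input": "rhesus"},
    "rhesus/kimdb": {"type": "internal", "input": "rhesus"}}

skips = set()
for key, trio in grouped.items():
    for segment, fastas in trio.items():
        if key not in skips and not fastas:
            # Implicit internal match: warn and skip the whole reference.
            if orig_attrs[key]["type"] == "internal" and orig_attrs[key]["input"] != key:
                print(f"No FASTA for {segment} from {key}; skipping")
                skips.add(key)
            # Explicitly named or external: treat the missing segment as fatal.
            else:
                raise RuntimeError("Missing VDJ input for database")
for key in skips:
    del grouped[key]
# grouped now holds only references complete enough to build an IgBLAST database
```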
2 changes: 1 addition & 1 deletion igseq/version.py
@@ -4,4 +4,4 @@
See https://www.python.org/dev/peps/pep-0396/
"""

__version__ = "0.2.0"
__version__ = "0.3.0"
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
query cdr_lens v_call v_ident_pct v_ident_nt d_call d_ident_pct d_ident_nt j_call j_ident_pct j_ident_nt
query 8.8.14 IGHV4-117*01_rhesus/bernat2021,IGHV4-173*01_rhesus/imgt,VH4.11A*01abc_rhesus/zhang2019 99.324% 294/296 of 296 IGHD4-22*01_rhesus/sonarramesh,IGHD4-23*01_rhesus/bernat2021,IGHD4-23*01_rhesus/imgt 100.000% 9/9 of 16 IGHJ4*01_rhesus/imgt,IGHJ4*01_rhesus/sonarramesh,IGHJ4*01ac_rhesus/zhang2019 100.000% 45/45 of 48
query cdr_lens v_call v_ident_pct v_ident_nt d_call d_ident_pct d_ident_nt j_call j_ident_pct j_ident_nt
query 8.8.14 IGHV4-117*01_rhesus/kimdb,IGHV4-173*01_rhesus/imgt,VH4.11A*01abc_rhesus/zhang2019 99.324% 294/296 of 296 IGHD4-22*01_rhesus/sonarramesh,IGHD4-23*01_rhesus/imgt,IGHD4-23*01_rhesus/kimdb 100.000% 9/9 of 16 IGHJ4*01_rhesus/imgt,IGHJ4*01_rhesus/sonarramesh,IGHJ4*01ac_rhesus/zhang2019 100.000% 45/45 of 48
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
query reference segment call length identity
query rhesus/imgt V IGHV4-173*01 296 99.324
query rhesus/imgt D IGHD4-23*01,IGHD4-41*01 9 100.000
query rhesus/imgt J IGHJ4*01 45 100.000
query rhesus/kimdb V IGHV4-117*01 296 99.324
query rhesus/kimdb D IGHD4-23*01 9 100.000
query rhesus/kimdb J IGHJ4-3*01 45 100.000
query rhesus/sonarramesh V IGHV4-AFI*01 294 95.960
query rhesus/sonarramesh D IGHD4-22*01,IGHD4-37*01 9 100.000
query rhesus/sonarramesh J IGHJ4*01 45 100.000
13 changes: 13 additions & 0 deletions test_igseq/test_record.py
@@ -1,6 +1,7 @@
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from igseq import record
from igseq.util import IgSeqError
from .util import TestBase

class TestRecordHandler(TestBase):
@@ -17,6 +18,18 @@ def test_infer_fmt(self):
self.assertEqual(self.handler.fmt, "fa")
fmt = self.handler.infer_fmt("fq")

def test_all_types(self):
# The files don't get opened so they don't have to actually exist here
ext_fmts = {"tab": "tsv", "fasta": "fa", "afa": "fa", "fastq": "fq"}
for ext in ["csv", "tsv", "tab", "fa", "fasta", "afa", "fastq", "fq", "fastq"]:
handler = record.RecordHandler(self.path/f"example.{ext}")
self.assertEqual(handler.fmt, ext_fmts.get(ext, ext))
# an unrecognized extension raises an error, but the format can be specified manually
with self.assertRaises(IgSeqError):
record.RecordHandler(self.path/"example.xyz")
handler = record.RecordHandler(self.path/"example.xyz", "fa")
self.assertEqual(handler.fmt, "fa")

def test_encode_record(self):
obj = self.handler.encode_record({"sequence_id": "id", "sequence": "ACTG"})
self.assertEqual(obj.id, "id")
23 changes: 23 additions & 0 deletions test_igseq/test_vdj_match.py
@@ -16,6 +16,29 @@ def test_vdj_match(self):
stdout_exp = f_in.read()
self.assertEqual(stdout_exp, stdout)

def test_vdj_match_multi(self):
# with all matching rhesus refs selected, we should get a block of
# output rows for each ref found. But the zhang2019 ref only has V
# and J, so it's not usable as a standalone IgBLAST DB. We'll get a
# warning about skipping that one, and output rows for imgt/
# sonarramesh/kimdb.
stdout, stderr = self.redirect_streams(
lambda: vdj_match.vdj_match(["rhesus"], self.path/"input/query.fasta"))
self.assertEqual(stderr, 'No FASTA for D from rhesus/zhang2019; skipping\n')
with open(self.path/"output/stdout_multi.txt") as f_in:
stdout_exp = f_in.read()
self.assertEqual(stdout_exp, stdout)

def test_vdj_match_missing(self):
# If we explicitly request a reference that's missing segments, though,
# that warning becomes an error.
def wrapper():
with self.assertRaisesRegex(IgSeqError, "Missing VDJ input for database"):
vdj_match.vdj_match(["rhesus/zhang2019"], self.path/"input/query.fasta")
stdout, stderr = self.redirect_streams(wrapper)
self.assertEqual(stdout, "")
self.assertEqual(stderr, 'No FASTA for D from rhesus/zhang2019\n')

def test_vdj_match_csv(self):
# CSV input should be supported
stdout, stderr = self.redirect_streams(
2 changes: 1 addition & 1 deletion tools/condabuild.sh
@@ -4,4 +4,4 @@
# https://medium.com/analytics-vidhya/publish-a-python-package-to-conda-b352eb0bcb2e
conda build -c bioconda -c conda-forge -c defaults conda --python=3.9 --output-folder conda-out
# Then just:
# anaconda upload conda-out/linux-64/igseq-0.0.1-py39_0.tar.bz2
# anaconda upload -u ShawHahnLab conda-out/linux-64/igseq-0.0.1-py39_0.tar.bz2
