Skip to content

Commit

Permalink
Merge pull request #487 from sunbeam-labs/485-fix-filter_reads-and-re…
Browse files Browse the repository at this point in the history
…move_low_complexity-memory-inefficiency

485 fix filter reads and remove low complexity memory inefficiency
  • Loading branch information
Ulthran authored Apr 23, 2024
2 parents a67f9be + 940d041 commit 4ff9bd9
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ jobs:
with:
token: ${{ secrets.GITHUB_TOKEN }}
id: ${{ github.event.release.id }}
body: "### sunbeamlabs/sunbeam\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n### sunbeamlabs/sunbeam:slim\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n### sunbeamlabs/cutadapt\n${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n### sunbeamlabs/komplexity\n${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n### sunbeamlabs/qc\n${{ needs.push-dockerhub.outputs.qc_package_versions }}\n### sunbeamlabs/reports\n${{ needs.push-dockerhub.outputs.reports_package_versions }}"
body: "**sunbeamlabs/sunbeam**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n**sunbeamlabs/sunbeam:slim**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n**sunbeamlabs/cutadapt**: ${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n**sunbeamlabs/komplexity**: ${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n**sunbeamlabs/qc**: ${{ needs.push-dockerhub.outputs.qc_package_versions }}\n**sunbeamlabs/reports**: ${{ needs.push-dockerhub.outputs.reports_package_versions }}"
replacebody: false

run-integration-tests:
Expand Down
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

__conda_url=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.5.2; fi) # <--- Update this on each version release
__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.6.0; fi) # <--- Update this on each version release
__version_tag="${__version_tag:1}" # Remove the 'v' prefix

read -r -d '' __usage <<-'EOF'
Expand Down
9 changes: 3 additions & 6 deletions src/sunbeamlib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def write_fasta(record: Tuple[str, str], f: TextIO) -> None:


def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:
for g in grouper(f.readlines(), 4):
for g in grouper(f, 4):
header_str = g[0][1:].strip()
seq_str = g[1].strip()
plus_str = g[2].strip()
Expand All @@ -58,11 +58,8 @@ def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:


def write_fastq(record: Tuple[str, str, str, str], f: TextIO) -> None:
for i, l in enumerate(record):
if i == 0:
f.write(f"@{l}\n")
else:
f.write(f"{l}\n")
s = f"@{record[0]}\n{record[1]}\n{record[2]}\n{record[3]}\n"
f.write(s)


def write_many_fastq(record_list: List[Tuple[str, str, str, str]], f: TextIO) -> None:
Expand Down
31 changes: 13 additions & 18 deletions src/sunbeamlib/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,20 @@
"""

import gzip
import sys
from pathlib import Path
from sunbeamlib.parse import parse_fastq, write_fastq
from typing import List, TextIO
from typing import Set, TextIO


from typing import List, TextIO
from pathlib import Path
import gzip


def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
def filter_ids(fp_in: Path, fp_out: Path, ids: Set[str], log: TextIO) -> None:
"""
Filter FASTQ records based on a list of IDs.
Filter FASTQ records based on a set of IDs to remove.
Args:
fp_in (Path): Path to the input FASTQ file.
fp_out (Path): Path to the output FASTQ file.
ids (List[str]): List of IDs to filter.
ids (Set[str]): Set of IDs to filter.
log (TextIO): TextIO object to write log messages.
Returns:
Expand All @@ -31,15 +27,14 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
"""
with gzip.open(fp_in, "rt") as f_in, gzip.open(fp_out, "wt") as f_out:
ids_set = set(ids)
num_ids = len(ids_set)
num_ids = len(ids)
counter = 0
counter_kept = 0
for record in parse_fastq(f_in):
counter += 1
record = (remove_pair_id(record[0], log), record[1], record[2], record[3])
if record[0] in ids_set:
ids_set.remove(record[0])
if record[0] in ids:
ids.remove(record[0])
else:
counter_kept += 1
write_fastq(record, f_out)
Expand All @@ -48,19 +43,19 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
log.write(
f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})\n"
)
log.write(f"IDs not found: {ids_set}\n")
log.write(f"IDs not found: {ids}\n")
assert (
False
), f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})"

if len(ids_set) > 0:
log.write(f"WARNING: {len(ids_set)} ids not found in FASTQ\n")
log.write(f"IDs not found: {ids_set}\n")
if len(ids) > 0:
log.write(f"WARNING: {len(ids)} ids not found in FASTQ\n")
log.write(f"IDs not found: {ids}\n")
else:
log.write("IDs list empty, finished filtering\n")


def remove_pair_id(id: str, log: TextIO) -> str:
def remove_pair_id(id: str, log: TextIO = sys.stdout) -> str:
"""
Removes the pair identifier from the given ID.
Expand Down
26 changes: 13 additions & 13 deletions workflow/scripts/filter_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,20 @@
from sunbeamlib.parse import parse_fastq, write_fastq


def count_host_reads(fp: str, hostdict: dict, net_hostlist: set):
def count_host_reads(fp: str, hostdict: dict) -> set:
hostname = os.path.basename(os.path.dirname(fp))
hostcts = int(sp.getoutput("cat {} | wc -l".format(fp)).strip())
hostdict[hostname] = hostcts

with open(fp) as f:
for l in f.readlines():
net_hostlist.add(l) # Only adds unique ids
return set(l.strip() for l in f.readlines())


def calculate_counts(fp: str, net_hostlist: set) -> tuple:
def calculate_counts(fp: str, len_hostlist: int) -> tuple:
original = int(str(sp.getoutput("zcat {} | wc -l".format(fp))).strip()) // 4
host = len(net_hostlist)
nonhost = int(original - host)
nonhost = int(original - len_hostlist)

return host, nonhost
return len_hostlist, nonhost


def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
Expand All @@ -39,28 +37,30 @@ def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
done = False
net_hostlist = set()
for hostid in sorted(snakemake.input.hostids):
count_host_reads(hostid, hostdict, net_hostlist)
net_hostlist.update(count_host_reads(hostid, hostdict))

host, nonhost = calculate_counts(snakemake.input.reads, net_hostlist)
host, nonhost = calculate_counts(snakemake.input.reads, len(net_hostlist))

# Check for empty host reads file
with open(snakemake.input.hostreads) as f:
if not f.readlines():
# TODO: Remove aggregate_reads rule and just handle the host ids files here
if not f.readline():
s = f"WARNING: {snakemake.input.hostreads} is empty, skipping...\n"
l.write(s)
sys.stderr.write(s)
shutil.copyfile(snakemake.input.reads, snakemake.output.reads)
done = True

# Perform filtering if host reads file is not empty
if not done:
with gzip.open(snakemake.input.reads, "rt") as f_in, gzip.open(
snakemake.output.reads, "wt"
) as f_out, open(snakemake.input.hostreads) as f_ids:
ids = {k.strip(): 1 for k in f_ids.readlines()}
) as f_out:
for header_str, seq_str, plus_str, quality_str in parse_fastq(f_in):
parsed_header = (
header_str.split(" ")[0].replace("/1", "").replace("/2", "")
)
if not parsed_header in ids:
if not parsed_header in net_hostlist:
write_fastq([header_str, seq_str, plus_str, quality_str], f_out)

# Check that the output file is about the right size given the number of ids removed
Expand Down
3 changes: 1 addition & 2 deletions workflow/scripts/remove_low_complexity.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from sunbeamlib.qc import filter_ids, remove_pair_id

with open(snakemake.log[0], "w") as log:
ids = []
with open(snakemake.input.ids) as f:
ids = [remove_pair_id(id, log) for id in f.readlines()]
ids = set(remove_pair_id(id, log) for id in f.readlines())
log.write(f"Num Komplexity IDs to be filtered: {len(ids)}\n")

filter_ids(snakemake.input.reads, snakemake.output[0], ids, log)

0 comments on commit 4ff9bd9

Please sign in to comment.