Skip to content

Commit

Permalink
Adding script to collapse and aggregate information at different taxo…
Browse files Browse the repository at this point in the history
…nomic levels
  • Loading branch information
skchronicles committed Jan 27, 2025
1 parent aabf808 commit 603d8c5
Showing 1 changed file with 243 additions and 0 deletions.
243 changes: 243 additions & 0 deletions workflow/scripts/collapse_on_taxa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Author: Skyler Kuhn

# Standard Library
from __future__ import print_function
import sys, os
import textwrap

# 3rd party packages,
# Install from pypi
import argparse # added in python/3.0


_help = textwrap.dedent("""collapse_on_taxa.py:
@Usage:
$ ./collapse_on_taxa.py [-h] [--version] \\
--collapse-on-taxa {family,genus,species} \\
--output OUTPUT \\
--input FILE
@About:
Given an annotated viral table file, an output
file name, and a taxa column name to collapse
on, this script aggregate counts and coverage
information for a given sample.
@Required Arguments:
--input FILE
An annotated input viral table file.
This file should contain column the
following columns:
• contig
• cov
• count
• taxid
• genus, family, or species
Taxon IDs were annotated with taxonkit.
--output OUTPUT
Output file name. The file name of
the resulting matrix file. Please
note that is resulting output file
will be tab-delimited.
--collapse-on-taxa family,genus,species
Taxa column name to collapse info on.
If more than one contig is annotated
to the same taxonomy, its counts and
cov will be summated. Please note the
value provided to this option must
exist as a column in the `--input`
file.
@Options
--h, --help Shows this help message and exits.
"""
)


def err(*message, **kwargs):
"""Prints any provided args to standard error.
kwargs can be provided to modify print functions
behavior.
@param message <any>:
Values printed to standard error
@params kwargs <print()>
Key words to modify print function behavior
"""
print(*message, file=sys.stderr, **kwargs)



def fatal(*message, **kwargs):
"""Prints any provided args to standard error
and exits with an exit code of 1.
@param message <any>:
Values printed to standard error
@params kwargs <print()>
Key words to modify print function behavior
"""
err(*message, **kwargs)
sys.exit(1)


def index_taxa(file, collapse_on_taxa, collapse_features = ['contig', 'count', 'cov', 'taxid']):
"""Indexes taxa information into a dictionary, where
each key is a taxa name and its value is a nested dictionary
containing 1:M information of the following fields:
contig, count, cov, taxid
@param file <str>:
Path to input file with values to extract/collapse via `collapse_on_taxa`.
@param collapse_on_taxa <str>:
Column name to collapse taxanomic information on.
Examples: family or genus or species, etc
@params collapse_features list[<str>]:
Features to aggregate for collapsing.
Default: ['contig', 'count', 'cov', 'taxid']
@return collapsed_dict dict[`taxa_name`][`collapse_feature`] = [val1,val2,...]
Example:
{'Betabaculovirus':
{'contig': ['k141_1852_flag=0_multi=1.0000_len=316', 'k142_1853_flag=0_multi=1.0000_len=316'],
'count': ['63', '123'],
'cov': ['1.0000', '2.0000'],
'taxid': ['56947', '56947']}, ...}
"""
# Parse the input file features
# of interest
collapsed_dict = {}
with open(file, 'r') as fh:
header = fh.readline().lstrip().rstrip().split('\t')
taxa_idx = header.index(collapse_on_taxa)
for line in fh:
linelist = line.lstrip().rstrip().split('\t')
# Taxa name to collapse info on
taxa_name = linelist[taxa_idx]
if taxa_name not in collapsed_dict:
# Initialize nested dict
collapsed_dict[taxa_name] = {}
for f in collapse_features:
collapsed_dict[taxa_name][f] = []
# Append collapsed feature values to list
for f in collapse_features:
collapsed_dict[taxa_name][f].append(linelist[header.index(f)])
return collapsed_dict


def main():
"""Collect command line args and collape info per-sample."""
# Parse command-line arguments
# Create a top-level parser
parser = argparse.ArgumentParser(
usage = argparse.SUPPRESS,
formatter_class = argparse.RawDescriptionHelpFormatter,
description = _help,
add_help = False
)

# Required Positional Arguments
# List of input files
parser.add_argument(
'--input',
required = True,
type = str,
help = argparse.SUPPRESS
)
# Output file name
parser.add_argument(
'--output',
required = True,
type = str,
help = argparse.SUPPRESS
)
# Collapse on taxa column name
parser.add_argument(
'--collapse-on-taxa',
required = True,
type = str,
help = argparse.SUPPRESS,
choices = [
'kingdom',
'phylum',
'class',
'order',
'family',
'genus',
'species',
'strain'
]
)

# Add custom help message
parser.add_argument(
'-h', '--help',
action='help',
help=argparse.SUPPRESS
)

# Collect parsed arguments
args = parser.parse_args()

# Sanity check for usage
if len(sys.argv) == 1:
# Nothing was provided
fatal('Invalid usage: collapse_on_taxa.py [-h] ...')

# Get indices of relevent columns
# for later parsing
file = args.input
with open(file, 'r') as fh:
header = fh.readline().lstrip().rstrip().split('\t')

# Sanity check to see if required
# columns are present in input file
collapse_features = ["cov", "count", "taxid", "contig"]
required_columns = [args.collapse_on_taxa] + collapse_features
try:
for c in required_columns:
header.index(c)
except ValueError:
# Mising a required column
err('Error: Missing one of the follow columns: {0}'.format(c))
err('in the following --input {} file'.format(file))
fatal('Please check that each file contains each column name!')

# Index taxa information
# key is name within a given
# taxa and its value is a dict
# with collapsed concat(contig),
# sum(count), sum(cov), and
# concat(taxid)
taxa_idx = index_taxa(
file = file,
collapse_on_taxa = args.collapse_on_taxa,
collapse_features = collapse_features
)

with open(args.output, 'w') as ofh:
# Add header to output file
ofh.write("{0}\n".format('\t'.join([args.collapse_on_taxa] + collapse_features)))
for taxa_name, aggr_features in taxa_idx.items():
collapsed_contigs = ','.join(aggr_features['contig']) # concat string
collapsed_taxids = ','.join(aggr_features['taxid']) # concat string
sum_count = sum([ int(c) for c in aggr_features['count'] ]) # summate counts
sum_cov = sum([ float(c) for c in aggr_features['cov'] ]) # summate cov values
# Write collapse taxa information
# to the output file
ofh.write(
"{0}\t{1}\t{2}\t{3}\t{4}\n".format(
taxa_name,
sum_cov,
sum_count,
collapsed_taxids,
collapsed_contigs
)
)


if __name__ == '__main__':
# Collapse information on the given taxa level
try:
main()
except BrokenPipeError:
err("WARNING: Caught except related to pipefail! Did you use control-C?")

0 comments on commit 603d8c5

Please sign in to comment.