-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
605 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
"""This script provides a command line tool for generating Xena metadata for a | ||
Xena matrix. | ||
Supported types of data are genomic data, phenotype data and survival data. | ||
Please check help message with "-h" option for details. | ||
""" | ||
|
||
# Ensure Python 2 and 3 compatibility | ||
from __future__ import print_function | ||
|
||
import argparse | ||
import os | ||
import sys | ||
import time | ||
|
||
import jinja2 | ||
|
||
|
||
def metadata(matrix, xena_dtypes): | ||
"""Generating Xena metadata for a Xena matrix. | ||
Args: | ||
matrix (str): The path, including the filename, of the Xena matrix. | ||
xena_dtypes (str): One data type code indication the data type in | ||
matrices to be merged. Supported data type codes include (without | ||
quotation marks): "htseq_counts", "htseq_fpkm", "htseq_fpkm-uq", | ||
"mirna", "masked_cnv", "muse_snv", "mutect2_snv", | ||
"somaticsniper_snv", "varscan2_snv", "GDC_phenotype", "survival", | ||
"methylation27". | ||
""" | ||
|
||
# Map xena_dtype to corresponding metadata template. | ||
meta_templates = {'htseq_counts': 'template.rna.meta.json', | ||
'htseq_fpkm': 'template.rna.meta.json', | ||
'htseq_fpkm-uq': 'template.rna.meta.json', | ||
'mirna': 'template.mirna.meta.json', | ||
'mirna_isoform': 'template.mirna_isoform.meta.json', | ||
'cnv': 'template.cnv.meta.json', | ||
'masked_cnv': 'template.cnv.meta.json', | ||
'muse_snv': 'template.snv.meta.json', | ||
'mutect2_snv': 'template.snv.meta.json', | ||
'somaticsniper_snv': 'template.snv.meta.json', | ||
'varscan2_snv': 'template.snv.meta.json', | ||
'GDC_phenotype': 'template.phenotype.meta.json', | ||
'survival': 'template.survival.meta.json', | ||
'methylation27': 'template.methylation.meta.json', | ||
'methylation450': 'template.methylation.meta.json'} | ||
meta_templates_dir = os.path.join( | ||
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), | ||
'Resources') | ||
meta_templates = { | ||
k: os.path.join(meta_templates_dir, v) | ||
for k, v in meta_templates.items() | ||
} | ||
# Jinja2 template variables for corresponding "xena_dtype". | ||
meta_vars = { | ||
'htseq_counts': {'gdc_type': 'HTSeq - Counts',}, | ||
'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM', | ||
'unit': 'fpkm'}, | ||
'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ', | ||
'unit': 'fpkm-uq'}, | ||
'mirna': {'gdc_type': 'miRNA Expression Quantification'}, | ||
'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'}, | ||
'cnv': {'gdc_type': 'Copy Number Segment'}, | ||
'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'}, | ||
'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'}, | ||
'mutect2_snv': { | ||
'gdc_type': 'MuTect2 Variant Aggregation and Masking' | ||
}, | ||
'somaticsniper_snv': { | ||
'gdc_type': 'SomaticSniper Variant Aggregation and Masking' | ||
}, | ||
'varscan2_snv': { | ||
'gdc_type': 'VarScan2 Variant Aggregation and Masking' | ||
}, | ||
'methylation27': {'platform_num': '27'}, | ||
'methylation450': {'platform_num': '450'} | ||
} | ||
|
||
# Generate metadata. | ||
print('Creating metadata file ...', end='') | ||
sys.stdout.flush() | ||
template_json = meta_templates[xena_dtypes] | ||
file_dir = os.path.dirname(template_json) | ||
file_name = os.path.basename(template_json) | ||
jinja2_env = jinja2.Environment( | ||
loader=jinja2.FileSystemLoader(file_dir) | ||
) | ||
metadata_template = jinja2_env.get_template(file_name) | ||
matrix_date = time.strftime("%m-%d-%Y", | ||
time.gmtime(os.path.getmtime(matrix))) | ||
variables = { | ||
'project_id': 'GDC-PANCAN', | ||
'date': matrix_date, | ||
'gdc_release': 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90', | ||
'xena_cohort': 'GDC Pan-Cancer (PANCAN)' | ||
} | ||
try: | ||
variables.update(meta_vars[xena_dtypes]) | ||
except KeyError: | ||
pass | ||
outmetadata = matrix + '.json' | ||
with open(outmetadata, 'w') as f: | ||
f.write(metadata_template.render(**variables)) | ||
print('\rMetadata JSON is saved at {}.'.format(outmetadata)) | ||
|
||
|
||
def main(): | ||
valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna', | ||
'masked_cnv', 'muse_snv', 'mutect2_snv', | ||
'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype', | ||
'survival', 'methylation27', 'methylation450'] | ||
parser = argparse.ArgumentParser( | ||
description='Generating Xena metadata for a Xena matrix.' | ||
) | ||
parser.add_argument('-m', '--matrix', type=str, required=True, | ||
help='The path, including the filename, of the Xena ' | ||
'matrix.') | ||
parser.add_argument('-t', '--datatype', type=str, required=True, | ||
help='One data type code indication the data type in ' | ||
'matrices to be merged. Supported data type ' | ||
'codes include: {}'.format(str(valid_dtype))) | ||
args = parser.parse_args() | ||
metadata(args.matrix, args.datatype) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
"""This script provides a command line tool for testing the equality of 2 Xena | ||
matrices. | ||
""" | ||
|
||
# Ensure Python 2 and 3 compatibility | ||
from __future__ import print_function | ||
|
||
import argparse | ||
|
||
import pandas as pd | ||
from pandas.util.testing import assert_frame_equal | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser( | ||
description='Test the equality of 2 Xena matrices.' | ||
) | ||
parser.add_argument('df1', type=str, | ||
help='Directory for the first matrix.') | ||
parser.add_argument('df2', type=str, | ||
help='Directory for the second matrix.') | ||
args = parser.parse_args() | ||
df1 = pd.read_table(args.df1, header=0, | ||
index_col=0).sort_index(axis=0).sort_index(axis=1) | ||
df2 = pd.read_table(args.df2, header=0, | ||
index_col=0).sort_index(axis=0).sort_index(axis=1) | ||
try: | ||
assert_frame_equal(df1, df2) | ||
print('Equal.') | ||
except: # appeantly AssertionError doesn't catch all | ||
print('Not equal.') | ||
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/bin/bash | ||
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146 | ||
|
||
set -e | ||
|
||
usage () { | ||
echo 'Combine Xena matrices by shared column (row name), i.e. grow horizontally.' | ||
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]' | ||
echo $'\npositional arguments:' | ||
echo ' file matrix(es) to be joined' | ||
echo $'\noptional arguments:' | ||
echo ' -h, --help show this help message and exit' | ||
echo ' -o, --output OUTPUT path to output file, including filename. Directory must' | ||
echo ' exist and file must not exist (no overwritting).' | ||
exit 0 | ||
} | ||
|
||
# Get the list of matrices to be joined | ||
IFS=$'\n' # allows spaces in path | ||
matrixlist=() | ||
while [[ $# -gt 0 ]] | ||
do | ||
key="$1" | ||
case $key in | ||
-o|--output) | ||
if [ -e "$2" ]; then | ||
echo "Output file $2 exist! Overwrite is not supported."$'\n' | ||
usage | ||
fi | ||
outdir=$(dirname "$2") | ||
if [ ! -d "$outdir" ]; then | ||
echo "Output directory $outdir doesn't exist!"$'\n' | ||
usage | ||
fi | ||
output="$2" | ||
shift # past argument | ||
shift # past value | ||
;; | ||
-h|--help) | ||
usage | ||
shift # past argument | ||
;; | ||
*) # unknown option; should be positional argument | ||
for path in "$1" | ||
do | ||
matrixlist+=("$path") | ||
done | ||
shift | ||
;; | ||
esac | ||
done | ||
|
||
# Setup files | ||
touch "$output" | ||
temp="$output.$RANDOM.tmp" | ||
sorted=$(mktemp -p "$outdir") | ||
trap "{ rm -f $sorted; }" EXIT | ||
|
||
# Real outer join one by one | ||
for file in ${matrixlist[@]} | ||
do | ||
echo "Merging $file ..." | ||
#LC_ALL=C sort -k 1b,1 "$file" > "$sorted" | ||
(head -n 1 "$file" && tail -n +2 "$file" | LC_ALL=C sort -k 1b,1) > "$sorted" | ||
LC_ALL=C join -t $'\t' -a1 -a2 -o auto --header --check-order "$output" "$sorted" > "$temp" | ||
mv "$temp" "$output" | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#!/bin/bash | ||
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146 | ||
|
||
set -e | ||
|
||
usage () { | ||
echo 'Combine Xena matrices by shared header row (1st row only; column name), i.e. grow vertically.' | ||
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]' | ||
echo $'\npositional arguments:' | ||
echo ' file matrix(es) to be joined' | ||
echo $'\noptional arguments:' | ||
echo ' -h, --help show this help message and exit' | ||
echo ' -o, --output OUTPUT path to output file, including filename. Directory must' | ||
echo ' exist and file must not exist (no overwritting).' | ||
exit 0 | ||
} | ||
|
||
colorder=`head -1 -q "$@" | awk ' | ||
BEGIN { | ||
ncol = 0 | ||
FS = "\t" | ||
RS = "\n|\r\n" | ||
} | ||
{ | ||
for (i = 1; i <= NF; i++) { | ||
if (! ($i in allfields)) { | ||
ncol++ | ||
colorder[ncol] = $i | ||
allfields[$i] = "" | ||
} | ||
} | ||
} | ||
END { | ||
printf "%s", colorder[1] | ||
for (i = 2; i <= ncol; i++) { | ||
printf "\t%s", colorder[i] | ||
} | ||
} | ||
'` | ||
|
||
awk -v c="$colorder" ' | ||
BEGIN { | ||
ncol = split(c, colorder, "\t") | ||
print c | ||
FS = "\t" | ||
RS = "\n|\r\n" | ||
} | ||
FNR==1 { | ||
for (i = 1; i <= NF; i++) { | ||
thisfields[i] = $i | ||
} | ||
next | ||
} | ||
{ | ||
for (i in colorder) { | ||
output[colorder[i]] = "" | ||
} | ||
for (i = 1; i <= NF; i++) { | ||
output[thisfields[i]] = $i | ||
} | ||
printf "%s", output[colorder[1]] | ||
for (i = 2; i <= ncol; i++) { | ||
printf "\t%s", output[colorder[i]] | ||
} | ||
print "" | ||
} | ||
' "$@" |
Oops, something went wrong.