Skip to content

Commit

Permalink
Update and add scripts facilitating ETL
Browse files Browse the repository at this point in the history
  • Loading branch information
yunhailuo committed Mar 12, 2019
1 parent 6ec5684 commit 23c5dcb
Show file tree
Hide file tree
Showing 7 changed files with 605 additions and 31 deletions.
131 changes: 131 additions & 0 deletions Resources/make_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""This script provides a command line tool for generating Xena metadata for a
Xena matrix.
Supported types of data are genomic data, phenotype data and survival data.
Please check help message with "-h" option for details.
"""

# Ensure Python 2 and 3 compatibility
from __future__ import print_function

import argparse
import os
import sys
import time

import jinja2


def metadata(matrix, xena_dtypes):
"""Generating Xena metadata for a Xena matrix.
Args:
matrix (str): The path, including the filename, of the Xena matrix.
xena_dtypes (str): One data type code indication the data type in
matrices to be merged. Supported data type codes include (without
quotation marks): "htseq_counts", "htseq_fpkm", "htseq_fpkm-uq",
"mirna", "masked_cnv", "muse_snv", "mutect2_snv",
"somaticsniper_snv", "varscan2_snv", "GDC_phenotype", "survival",
"methylation27".
"""

# Map xena_dtype to corresponding metadata template.
meta_templates = {'htseq_counts': 'template.rna.meta.json',
'htseq_fpkm': 'template.rna.meta.json',
'htseq_fpkm-uq': 'template.rna.meta.json',
'mirna': 'template.mirna.meta.json',
'mirna_isoform': 'template.mirna_isoform.meta.json',
'cnv': 'template.cnv.meta.json',
'masked_cnv': 'template.cnv.meta.json',
'muse_snv': 'template.snv.meta.json',
'mutect2_snv': 'template.snv.meta.json',
'somaticsniper_snv': 'template.snv.meta.json',
'varscan2_snv': 'template.snv.meta.json',
'GDC_phenotype': 'template.phenotype.meta.json',
'survival': 'template.survival.meta.json',
'methylation27': 'template.methylation.meta.json',
'methylation450': 'template.methylation.meta.json'}
meta_templates_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'Resources')
meta_templates = {
k: os.path.join(meta_templates_dir, v)
for k, v in meta_templates.items()
}
# Jinja2 template variables for corresponding "xena_dtype".
meta_vars = {
'htseq_counts': {'gdc_type': 'HTSeq - Counts',},
'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
'unit': 'fpkm'},
'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
'unit': 'fpkm-uq'},
'mirna': {'gdc_type': 'miRNA Expression Quantification'},
'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
'cnv': {'gdc_type': 'Copy Number Segment'},
'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
'mutect2_snv': {
'gdc_type': 'MuTect2 Variant Aggregation and Masking'
},
'somaticsniper_snv': {
'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
},
'varscan2_snv': {
'gdc_type': 'VarScan2 Variant Aggregation and Masking'
},
'methylation27': {'platform_num': '27'},
'methylation450': {'platform_num': '450'}
}

# Generate metadata.
print('Creating metadata file ...', end='')
sys.stdout.flush()
template_json = meta_templates[xena_dtypes]
file_dir = os.path.dirname(template_json)
file_name = os.path.basename(template_json)
jinja2_env = jinja2.Environment(
loader=jinja2.FileSystemLoader(file_dir)
)
metadata_template = jinja2_env.get_template(file_name)
matrix_date = time.strftime("%m-%d-%Y",
time.gmtime(os.path.getmtime(matrix)))
variables = {
'project_id': 'GDC-PANCAN',
'date': matrix_date,
'gdc_release': 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90',
'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
}
try:
variables.update(meta_vars[xena_dtypes])
except KeyError:
pass
outmetadata = matrix + '.json'
with open(outmetadata, 'w') as f:
f.write(metadata_template.render(**variables))
print('\rMetadata JSON is saved at {}.'.format(outmetadata))


def main():
valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv',
'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
'survival', 'methylation27', 'methylation450']
parser = argparse.ArgumentParser(
description='Generating Xena metadata for a Xena matrix.'
)
parser.add_argument('-m', '--matrix', type=str, required=True,
help='The path, including the filename, of the Xena '
'matrix.')
parser.add_argument('-t', '--datatype', type=str, required=True,
help='One data type code indication the data type in '
'matrices to be merged. Supported data type '
'codes include: {}'.format(str(valid_dtype)))
args = parser.parse_args()
metadata(args.matrix, args.datatype)


if __name__ == '__main__':
main()
37 changes: 37 additions & 0 deletions Scripts/equal_matrices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""This script provides a command line tool for testing the equality of 2 Xena
matrices.
"""

# Ensure Python 2 and 3 compatibility
from __future__ import print_function

import argparse

import pandas as pd
from pandas.util.testing import assert_frame_equal


def main():
parser = argparse.ArgumentParser(
description='Test the equality of 2 Xena matrices.'
)
parser.add_argument('df1', type=str,
help='Directory for the first matrix.')
parser.add_argument('df2', type=str,
help='Directory for the second matrix.')
args = parser.parse_args()
df1 = pd.read_table(args.df1, header=0,
index_col=0).sort_index(axis=0).sort_index(axis=1)
df2 = pd.read_table(args.df2, header=0,
index_col=0).sort_index(axis=0).sort_index(axis=1)
try:
assert_frame_equal(df1, df2)
print('Equal.')
except: # appeantly AssertionError doesn't catch all
print('Not equal.')

if __name__ == '__main__':
main()
67 changes: 67 additions & 0 deletions Scripts/join_xena.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146

set -e

usage () {
echo 'Combine Xena matrices by shared column (row name), i.e. grow horizontally.'
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
echo $'\npositional arguments:'
echo ' file matrix(es) to be joined'
echo $'\noptional arguments:'
echo ' -h, --help show this help message and exit'
echo ' -o, --output OUTPUT path to output file, including filename. Directory must'
echo ' exist and file must not exist (no overwritting).'
exit 0
}

# Get the list of matrices to be joined
IFS=$'\n' # allows spaces in path
matrixlist=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-o|--output)
if [ -e "$2" ]; then
echo "Output file $2 exist! Overwrite is not supported."$'\n'
usage
fi
outdir=$(dirname "$2")
if [ ! -d "$outdir" ]; then
echo "Output directory $outdir doesn't exist!"$'\n'
usage
fi
output="$2"
shift # past argument
shift # past value
;;
-h|--help)
usage
shift # past argument
;;
*) # unknown option; should be positional argument
for path in "$1"
do
matrixlist+=("$path")
done
shift
;;
esac
done

# Setup files
touch "$output"
temp="$output.$RANDOM.tmp"
sorted=$(mktemp -p "$outdir")
trap "{ rm -f $sorted; }" EXIT

# Real outer join one by one
for file in ${matrixlist[@]}
do
echo "Merging $file ..."
#LC_ALL=C sort -k 1b,1 "$file" > "$sorted"
(head -n 1 "$file" && tail -n +2 "$file" | LC_ALL=C sort -k 1b,1) > "$sorted"
LC_ALL=C join -t $'\t' -a1 -a2 -o auto --header --check-order "$output" "$sorted" > "$temp"
mv "$temp" "$output"
done
12 changes: 6 additions & 6 deletions Scripts/panTCGA.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@


def main():
root_dir = r'/mnt/gdc/xena/files'
out_dir = r'/mnt/gdc/TCGA-PANCAN/Xena_Matrices'
root_dir = r'/mnt/gdc/updates'
out_dir = r'/mnt/gdc/updates/GDC-PANCAN/Xena_Matrices'
datatypes = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv', 'somaticsniper_snv',
'varscan2_snv', 'survival']
gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90'
gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-100'

# Map xena_dtype to corresponding metadata template.
meta_templates = {'htseq_counts': 'template.rna.meta.json',
Expand Down Expand Up @@ -111,7 +111,7 @@ def main():
# Merge matrices
print('\rMerging {} "{}" matrices ...'.format(len(matrices), dtype))
merged = pd.concat(matrices, axis=merge_axis)
outmatrix = os.path.join(out_dir, 'TCGA-PANCAN.{}.tsv'.format(dtype))
outmatrix = os.path.join(out_dir, 'GDC-PANCAN.{}.tsv'.format(dtype))
print('Saving merged matrix to {} ...'.format(outmatrix), end='')
sys.stdout.flush()
merged.to_csv(outmatrix, sep='\t', encoding='utf-8')
Expand All @@ -128,12 +128,12 @@ def main():
)
metadata_template = jinja2_env.get_template(file_name)
variables = {
'project_id': 'TCGA-PANCAN',
'project_id': 'GDC-PANCAN',
'date': time.strftime(
"%m-%d-%Y", time.gmtime(os.path.getmtime(outmatrix))
),
'gdc_release': gdc_release,
'xena_cohort': 'GDC TCGA Pan-Cancer (PANCAN)'
'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
}
try:
variables.update(meta_vars[dtype])
Expand Down
68 changes: 68 additions & 0 deletions Scripts/union_xena.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146

set -e

usage () {
echo 'Combine Xena matrices by shared header row (1st row only; column name), i.e. grow vertically.'
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
echo $'\npositional arguments:'
echo ' file matrix(es) to be joined'
echo $'\noptional arguments:'
echo ' -h, --help show this help message and exit'
echo ' -o, --output OUTPUT path to output file, including filename. Directory must'
echo ' exist and file must not exist (no overwritting).'
exit 0
}

colorder=`head -1 -q "$@" | awk '
BEGIN {
ncol = 0
FS = "\t"
RS = "\n|\r\n"
}
{
for (i = 1; i <= NF; i++) {
if (! ($i in allfields)) {
ncol++
colorder[ncol] = $i
allfields[$i] = ""
}
}
}
END {
printf "%s", colorder[1]
for (i = 2; i <= ncol; i++) {
printf "\t%s", colorder[i]
}
}
'`

awk -v c="$colorder" '
BEGIN {
ncol = split(c, colorder, "\t")
print c
FS = "\t"
RS = "\n|\r\n"
}
FNR==1 {
for (i = 1; i <= NF; i++) {
thisfields[i] = $i
}
next
}
{
for (i in colorder) {
output[colorder[i]] = ""
}
for (i = 1; i <= NF; i++) {
output[thisfields[i]] = $i
}
printf "%s", output[colorder[1]]
for (i = 2; i <= ncol; i++) {
printf "\t%s", output[colorder[i]]
}
print ""
}
' "$@"
Loading

0 comments on commit 23c5dcb

Please sign in to comment.