Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize memory usage. #4

Merged
merged 2 commits into from
Mar 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions Resources/make_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""This script provides a command line tool for generating Xena metadata for a
Xena matrix.

Supported types of data are genomic data, phenotype data and survival data.
Please check help message with "-h" option for details.
"""

# Ensure Python 2 and 3 compatibility
from __future__ import print_function

import argparse
import os
import sys
import time

import jinja2


def metadata(matrix, xena_dtypes):
"""Generating Xena metadata for a Xena matrix.

Args:
matrix (str): The path, including the filename, of the Xena matrix.
xena_dtypes (str): One data type code indication the data type in
matrices to be merged. Supported data type codes include (without
quotation marks): "htseq_counts", "htseq_fpkm", "htseq_fpkm-uq",
"mirna", "masked_cnv", "muse_snv", "mutect2_snv",
"somaticsniper_snv", "varscan2_snv", "GDC_phenotype", "survival",
"methylation27".
"""

# Map xena_dtype to corresponding metadata template.
meta_templates = {'htseq_counts': 'template.rna.meta.json',
'htseq_fpkm': 'template.rna.meta.json',
'htseq_fpkm-uq': 'template.rna.meta.json',
'mirna': 'template.mirna.meta.json',
'mirna_isoform': 'template.mirna_isoform.meta.json',
'cnv': 'template.cnv.meta.json',
'masked_cnv': 'template.cnv.meta.json',
'muse_snv': 'template.snv.meta.json',
'mutect2_snv': 'template.snv.meta.json',
'somaticsniper_snv': 'template.snv.meta.json',
'varscan2_snv': 'template.snv.meta.json',
'GDC_phenotype': 'template.phenotype.meta.json',
'survival': 'template.survival.meta.json',
'methylation27': 'template.methylation.meta.json',
'methylation450': 'template.methylation.meta.json'}
meta_templates_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'Resources')
meta_templates = {
k: os.path.join(meta_templates_dir, v)
for k, v in meta_templates.items()
}
# Jinja2 template variables for corresponding "xena_dtype".
meta_vars = {
'htseq_counts': {'gdc_type': 'HTSeq - Counts',},
'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
'unit': 'fpkm'},
'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
'unit': 'fpkm-uq'},
'mirna': {'gdc_type': 'miRNA Expression Quantification'},
'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
'cnv': {'gdc_type': 'Copy Number Segment'},
'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
'mutect2_snv': {
'gdc_type': 'MuTect2 Variant Aggregation and Masking'
},
'somaticsniper_snv': {
'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
},
'varscan2_snv': {
'gdc_type': 'VarScan2 Variant Aggregation and Masking'
},
'methylation27': {'platform_num': '27'},
'methylation450': {'platform_num': '450'}
}

# Generate metadata.
print('Creating metadata file ...', end='')
sys.stdout.flush()
template_json = meta_templates[xena_dtypes]
file_dir = os.path.dirname(template_json)
file_name = os.path.basename(template_json)
jinja2_env = jinja2.Environment(
loader=jinja2.FileSystemLoader(file_dir)
)
metadata_template = jinja2_env.get_template(file_name)
matrix_date = time.strftime("%m-%d-%Y",
time.gmtime(os.path.getmtime(matrix)))
variables = {
'project_id': 'GDC-PANCAN',
'date': matrix_date,
'gdc_release': 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90',
'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
}
try:
variables.update(meta_vars[xena_dtypes])
except KeyError:
pass
outmetadata = matrix + '.json'
with open(outmetadata, 'w') as f:
f.write(metadata_template.render(**variables))
print('\rMetadata JSON is saved at {}.'.format(outmetadata))


def main():
valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv',
'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
'survival', 'methylation27', 'methylation450']
parser = argparse.ArgumentParser(
description='Generating Xena metadata for a Xena matrix.'
)
parser.add_argument('-m', '--matrix', type=str, required=True,
help='The path, including the filename, of the Xena '
'matrix.')
parser.add_argument('-t', '--datatype', type=str, required=True,
help='One data type code indication the data type in '
'matrices to be merged. Supported data type '
'codes include: {}'.format(str(valid_dtype)))
args = parser.parse_args()
metadata(args.matrix, args.datatype)


if __name__ == '__main__':
main()
37 changes: 37 additions & 0 deletions Scripts/equal_matrices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""This script provides a command line tool for testing the equality of 2 Xena
matrices.
"""

# Ensure Python 2 and 3 compatibility
from __future__ import print_function

import argparse

import pandas as pd
from pandas.util.testing import assert_frame_equal


def main():
parser = argparse.ArgumentParser(
description='Test the equality of 2 Xena matrices.'
)
parser.add_argument('df1', type=str,
help='Directory for the first matrix.')
parser.add_argument('df2', type=str,
help='Directory for the second matrix.')
args = parser.parse_args()
df1 = pd.read_table(args.df1, header=0,
index_col=0).sort_index(axis=0).sort_index(axis=1)
df2 = pd.read_table(args.df2, header=0,
index_col=0).sort_index(axis=0).sort_index(axis=1)
try:
assert_frame_equal(df1, df2)
print('Equal.')
except: # appeantly AssertionError doesn't catch all
print('Not equal.')

if __name__ == '__main__':
main()
67 changes: 67 additions & 0 deletions Scripts/join_xena.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146

set -e

usage () {
echo 'Combine Xena matrices by shared column (row name), i.e. grow horizontally.'
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
echo $'\npositional arguments:'
echo ' file matrix(es) to be joined'
echo $'\noptional arguments:'
echo ' -h, --help show this help message and exit'
echo ' -o, --output OUTPUT path to output file, including filename. Directory must'
echo ' exist and file must not exist (no overwritting).'
exit 0
}

# Get the list of matrices to be joined
IFS=$'\n' # allows spaces in path
matrixlist=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-o|--output)
if [ -e "$2" ]; then
echo "Output file $2 exist! Overwrite is not supported."$'\n'
usage
fi
outdir=$(dirname "$2")
if [ ! -d "$outdir" ]; then
echo "Output directory $outdir doesn't exist!"$'\n'
usage
fi
output="$2"
shift # past argument
shift # past value
;;
-h|--help)
usage
shift # past argument
;;
*) # unknown option; should be positional argument
for path in "$1"
do
matrixlist+=("$path")
done
shift
;;
esac
done

# Setup files
touch "$output"
temp="$output.$RANDOM.tmp"
sorted=$(mktemp -p "$outdir")
trap "{ rm -f $sorted; }" EXIT

# Real outer join one by one
for file in ${matrixlist[@]}
do
echo "Merging $file ..."
#LC_ALL=C sort -k 1b,1 "$file" > "$sorted"
(head -n 1 "$file" && tail -n +2 "$file" | LC_ALL=C sort -k 1b,1) > "$sorted"
LC_ALL=C join -t $'\t' -a1 -a2 -o auto --header --check-order "$output" "$sorted" > "$temp"
mv "$temp" "$output"
done
12 changes: 6 additions & 6 deletions Scripts/panTCGA.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@


def main():
root_dir = r'/mnt/gdc/xena/files'
out_dir = r'/mnt/gdc/TCGA-PANCAN/Xena_Matrices'
root_dir = r'/mnt/gdc/updates'
out_dir = r'/mnt/gdc/updates/GDC-PANCAN/Xena_Matrices'
datatypes = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
'masked_cnv', 'muse_snv', 'mutect2_snv', 'somaticsniper_snv',
'varscan2_snv', 'survival']
gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90'
gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-100'

# Map xena_dtype to corresponding metadata template.
meta_templates = {'htseq_counts': 'template.rna.meta.json',
Expand Down Expand Up @@ -111,7 +111,7 @@ def main():
# Merge matrices
print('\rMerging {} "{}" matrices ...'.format(len(matrices), dtype))
merged = pd.concat(matrices, axis=merge_axis)
outmatrix = os.path.join(out_dir, 'TCGA-PANCAN.{}.tsv'.format(dtype))
outmatrix = os.path.join(out_dir, 'GDC-PANCAN.{}.tsv'.format(dtype))
print('Saving merged matrix to {} ...'.format(outmatrix), end='')
sys.stdout.flush()
merged.to_csv(outmatrix, sep='\t', encoding='utf-8')
Expand All @@ -128,12 +128,12 @@ def main():
)
metadata_template = jinja2_env.get_template(file_name)
variables = {
'project_id': 'TCGA-PANCAN',
'project_id': 'GDC-PANCAN',
'date': time.strftime(
"%m-%d-%Y", time.gmtime(os.path.getmtime(outmatrix))
),
'gdc_release': gdc_release,
'xena_cohort': 'GDC TCGA Pan-Cancer (PANCAN)'
'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
}
try:
variables.update(meta_vars[dtype])
Expand Down
68 changes: 68 additions & 0 deletions Scripts/union_xena.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146

set -e

usage () {
echo 'Combine Xena matrices by shared header row (1st row only; column name), i.e. grow vertically.'
echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
echo $'\npositional arguments:'
echo ' file matrix(es) to be joined'
echo $'\noptional arguments:'
echo ' -h, --help show this help message and exit'
echo ' -o, --output OUTPUT path to output file, including filename. Directory must'
echo ' exist and file must not exist (no overwritting).'
exit 0
}

colorder=`head -1 -q "$@" | awk '
BEGIN {
ncol = 0
FS = "\t"
RS = "\n|\r\n"
}
{
for (i = 1; i <= NF; i++) {
if (! ($i in allfields)) {
ncol++
colorder[ncol] = $i
allfields[$i] = ""
}
}
}
END {
printf "%s", colorder[1]
for (i = 2; i <= ncol; i++) {
printf "\t%s", colorder[i]
}
}
'`

awk -v c="$colorder" '
BEGIN {
ncol = split(c, colorder, "\t")
print c
FS = "\t"
RS = "\n|\r\n"
}
FNR==1 {
for (i = 1; i <= NF; i++) {
thisfields[i] = $i
}
next
}

{
for (i in colorder) {
output[colorder[i]] = ""
}
for (i = 1; i <= NF; i++) {
output[thisfields[i]] = $i
}
printf "%s", output[colorder[1]]
for (i = 2; i <= ncol; i++) {
printf "\t%s", output[colorder[i]]
}
print ""
}
' "$@"
Loading