Update and add scripts facilitating ETL

ucscXena · Mar 12, 2019 · 23c5dcb · 23c5dcb
1 parent 6ec5684
commit 23c5dcb
Show file tree

Hide file tree

Showing 7 changed files with 605 additions and 31 deletions.
diff --git a/Resources/make_metadata.py b/Resources/make_metadata.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""This script provides a command line tool for generating Xena metadata for a
+Xena matrix.
+
+Supported types of data are genomic data, phenotype data and survival data.
+Please check help message with "-h" option for details.
+"""
+
+# Ensure Python 2 and 3 compatibility
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+import time
+
+import jinja2
+
+
+def metadata(matrix, xena_dtypes):
+    """Generating Xena metadata for a Xena matrix.
+    
+    Args:
+        matrix (str): The path, including the filename, of the Xena matrix.
+        xena_dtypes (str): One data type code indication the data type in
+            matrices to be merged. Supported data type codes include (without
+            quotation marks): "htseq_counts", "htseq_fpkm", "htseq_fpkm-uq",
+            "mirna", "masked_cnv", "muse_snv", "mutect2_snv",
+            "somaticsniper_snv", "varscan2_snv", "GDC_phenotype", "survival",
+            "methylation27".
+    """
+
+    # Map xena_dtype to corresponding metadata template.
+    meta_templates = {'htseq_counts': 'template.rna.meta.json',
+                      'htseq_fpkm': 'template.rna.meta.json',
+                      'htseq_fpkm-uq': 'template.rna.meta.json',
+                      'mirna': 'template.mirna.meta.json',
+                      'mirna_isoform': 'template.mirna_isoform.meta.json',
+                      'cnv': 'template.cnv.meta.json',
+                      'masked_cnv': 'template.cnv.meta.json',
+                      'muse_snv': 'template.snv.meta.json',
+                      'mutect2_snv': 'template.snv.meta.json',
+                      'somaticsniper_snv': 'template.snv.meta.json',
+                      'varscan2_snv': 'template.snv.meta.json',
+                      'GDC_phenotype': 'template.phenotype.meta.json',
+                      'survival': 'template.survival.meta.json',
+                      'methylation27': 'template.methylation.meta.json',
+                      'methylation450': 'template.methylation.meta.json'}
+    meta_templates_dir = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            'Resources')
+    meta_templates = {
+            k: os.path.join(meta_templates_dir, v)
+            for k, v in meta_templates.items()
+        }
+    # Jinja2 template variables for corresponding "xena_dtype".
+    meta_vars = {
+            'htseq_counts': {'gdc_type': 'HTSeq - Counts',},
+            'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM',
+                           'unit': 'fpkm'},
+            'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ',
+                              'unit': 'fpkm-uq'},
+            'mirna': {'gdc_type': 'miRNA Expression Quantification'},
+            'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
+            'cnv': {'gdc_type': 'Copy Number Segment'},
+            'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
+            'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
+            'mutect2_snv': {
+                    'gdc_type': 'MuTect2 Variant Aggregation and Masking'
+                },
+            'somaticsniper_snv': {
+                    'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
+                },
+            'varscan2_snv': {
+                    'gdc_type': 'VarScan2 Variant Aggregation and Masking'
+                },
+            'methylation27': {'platform_num': '27'},
+            'methylation450': {'platform_num': '450'}
+        }
+
+    # Generate metadata.
+    print('Creating metadata file ...', end='')
+    sys.stdout.flush()
+    template_json = meta_templates[xena_dtypes]
+    file_dir = os.path.dirname(template_json)
+    file_name = os.path.basename(template_json)
+    jinja2_env = jinja2.Environment(
+            loader=jinja2.FileSystemLoader(file_dir)
+        )
+    metadata_template = jinja2_env.get_template(file_name)
+    matrix_date = time.strftime("%m-%d-%Y",
+                                time.gmtime(os.path.getmtime(matrix)))
+    variables = {
+            'project_id': 'GDC-PANCAN',
+            'date': matrix_date,
+            'gdc_release': 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90',
+            'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
+        }
+    try:
+        variables.update(meta_vars[xena_dtypes])
+    except KeyError:
+        pass
+    outmetadata = matrix + '.json'
+    with open(outmetadata, 'w') as f:
+        f.write(metadata_template.render(**variables))
+    print('\rMetadata JSON is saved at {}.'.format(outmetadata))
+
+
+def main():
+    valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
+                   'masked_cnv', 'muse_snv', 'mutect2_snv',
+                   'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
+                   'survival', 'methylation27', 'methylation450']
+    parser = argparse.ArgumentParser(
+            description='Generating Xena metadata for a Xena matrix.'
+        )
+    parser.add_argument('-m', '--matrix', type=str, required=True,
+                        help='The path, including the filename, of the Xena '
+                             'matrix.')
+    parser.add_argument('-t', '--datatype', type=str, required=True,
+                        help='One data type code indication the data type in '
+                             'matrices to be merged. Supported data type '
+                             'codes include: {}'.format(str(valid_dtype)))
+    args = parser.parse_args()
+    metadata(args.matrix, args.datatype)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/Scripts/equal_matrices.py b/Scripts/equal_matrices.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""This script provides a command line tool for testing the equality of 2 Xena
+matrices.
+"""
+
+# Ensure Python 2 and 3 compatibility
+from __future__ import print_function
+
+import argparse
+
+import pandas as pd
+from pandas.util.testing import assert_frame_equal
+
+
+def main():
+    parser = argparse.ArgumentParser(
+            description='Test the equality of 2 Xena matrices.'
+        )
+    parser.add_argument('df1', type=str,
+                        help='Directory for the first matrix.')
+    parser.add_argument('df2', type=str,
+                        help='Directory for the second matrix.')
+    args = parser.parse_args()
+    df1 = pd.read_table(args.df1, header=0,
+                        index_col=0).sort_index(axis=0).sort_index(axis=1)
+    df2 = pd.read_table(args.df2, header=0,
+                        index_col=0).sort_index(axis=0).sort_index(axis=1)
+    try:
+        assert_frame_equal(df1, df2)
+        print('Equal.')
+    except:  # appeantly AssertionError doesn't catch all
+        print('Not equal.')
+
+if __name__ == '__main__':
+    main()
diff --git a/Scripts/join_xena.sh b/Scripts/join_xena.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146
+
+set -e
+
+usage () {
+    echo 'Combine Xena matrices by shared column (row name), i.e. grow horizontally.'
+    echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
+    echo $'\npositional arguments:'
+    echo ' file                   matrix(es) to be joined'
+    echo $'\noptional arguments:'
+    echo ' -h, --help             show this help message and exit'
+    echo ' -o, --output OUTPUT    path to output file, including filename. Directory must'
+    echo '                        exist and file must not exist (no overwritting).'
+    exit 0
+}
+
+# Get the list of matrices to be joined
+IFS=$'\n' # allows spaces in path
+matrixlist=()
+while [[ $# -gt 0 ]]
+do
+    key="$1"
+    case $key in
+        -o|--output)
+        if [ -e "$2" ]; then
+            echo "Output file $2 exist! Overwrite is not supported."$'\n'
+            usage
+        fi
+        outdir=$(dirname "$2")
+        if [ ! -d "$outdir" ]; then
+            echo "Output directory $outdir doesn't exist!"$'\n'
+            usage
+        fi
+        output="$2"
+        shift # past argument
+        shift # past value
+        ;;
+        -h|--help)
+            usage
+        shift # past argument
+        ;;
+        *)    # unknown option; should be positional argument
+        for path in "$1"
+        do
+            matrixlist+=("$path")
+        done
+        shift
+        ;;
+    esac
+done
+
+# Setup files
+touch "$output"
+temp="$output.$RANDOM.tmp"
+sorted=$(mktemp -p "$outdir")
+trap "{ rm -f $sorted; }" EXIT
+
+# Real outer join one by one
+for file in ${matrixlist[@]}
+do
+    echo "Merging $file ..."
+    #LC_ALL=C sort -k 1b,1 "$file" > "$sorted"
+    (head -n 1 "$file" && tail -n +2 "$file" | LC_ALL=C sort -k 1b,1) > "$sorted"
+    LC_ALL=C join -t $'\t' -a1 -a2 -o auto --header --check-order "$output" "$sorted" > "$temp"
+    mv "$temp" "$output"
+done
diff --git a/Scripts/panTCGA.py b/Scripts/panTCGA.py
@@ -33,12 +33,12 @@
 
 
 def main():
-    root_dir = r'/mnt/gdc/xena/files'
-    out_dir = r'/mnt/gdc/TCGA-PANCAN/Xena_Matrices'
+    root_dir = r'/mnt/gdc/updates'
+    out_dir = r'/mnt/gdc/updates/GDC-PANCAN/Xena_Matrices'
     datatypes = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
                  'masked_cnv', 'muse_snv', 'mutect2_snv', 'somaticsniper_snv',
                  'varscan2_snv', 'survival']
-    gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90'
+    gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-100'
 
     # Map xena_dtype to corresponding metadata template.
     meta_templates = {'htseq_counts': 'template.rna.meta.json',
@@ -111,7 +111,7 @@ def main():
         # Merge matrices
         print('\rMerging {} "{}" matrices ...'.format(len(matrices), dtype))
         merged = pd.concat(matrices, axis=merge_axis)
-        outmatrix = os.path.join(out_dir, 'TCGA-PANCAN.{}.tsv'.format(dtype))
+        outmatrix = os.path.join(out_dir, 'GDC-PANCAN.{}.tsv'.format(dtype))
         print('Saving merged matrix to {} ...'.format(outmatrix), end='')
         sys.stdout.flush()
         merged.to_csv(outmatrix, sep='\t', encoding='utf-8')
@@ -128,12 +128,12 @@ def main():
             )
         metadata_template = jinja2_env.get_template(file_name)
         variables = {
-                'project_id': 'TCGA-PANCAN',
+                'project_id': 'GDC-PANCAN',
                 'date': time.strftime(
                         "%m-%d-%Y", time.gmtime(os.path.getmtime(outmatrix))
                     ),
                 'gdc_release': gdc_release,
-                'xena_cohort': 'GDC TCGA Pan-Cancer (PANCAN)'
+                'xena_cohort': 'GDC Pan-Cancer (PANCAN)'
             }
         try:
             variables.update(meta_vars[dtype])

diff --git a/Scripts/union_xena.sh b/Scripts/union_xena.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146
+
+set -e
+
+usage () {
+    echo 'Combine Xena matrices by shared header row (1st row only; column name), i.e. grow vertically.'
+    echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]'
+    echo $'\npositional arguments:'
+    echo ' file                   matrix(es) to be joined'
+    echo $'\noptional arguments:'
+    echo ' -h, --help             show this help message and exit'
+    echo ' -o, --output OUTPUT    path to output file, including filename. Directory must'
+    echo '                        exist and file must not exist (no overwritting).'
+    exit 0
+}
+
+colorder=`head -1 -q "$@" | awk '
+BEGIN {
+    ncol = 0
+    FS = "\t"
+    RS = "\n|\r\n"
+}
+{
+    for (i = 1; i <= NF; i++) {
+        if (! ($i in allfields)) {
+            ncol++
+            colorder[ncol] = $i
+            allfields[$i] = ""
+        }
+    }
+}
+END {
+    printf "%s", colorder[1]
+    for (i = 2; i <= ncol; i++) {
+        printf "\t%s", colorder[i]
+    }
+}
+'`
+
+awk -v c="$colorder" '
+BEGIN {
+    ncol = split(c, colorder, "\t")
+    print c
+    FS = "\t"
+    RS = "\n|\r\n"
+}
+FNR==1 {
+    for (i = 1; i <= NF; i++) {
+        thisfields[i] = $i
+    }
+    next
+}
+
+{
+    for (i in colorder) {
+        output[colorder[i]] = ""
+    }
+    for (i = 1; i <= NF; i++) {
+        output[thisfields[i]] = $i
+    }
+    printf "%s", output[colorder[1]]
+    for (i = 2; i <= ncol; i++) {
+        printf "\t%s", output[colorder[i]]
+    }
+    print ""
+}
+' "$@"