def metadata(matrix, xena_dtypes, project_id='GDC-PANCAN',
             gdc_release=('https://docs.gdc.cancer.gov/Data/Release_Notes/'
                          'Data_Release_Notes/#data-release-90'),
             xena_cohort='GDC Pan-Cancer (PANCAN)'):
    """Generate the Xena metadata JSON for a Xena matrix.

    The metadata is rendered from the Jinja2 template matching
    ``xena_dtypes`` and written next to the matrix as ``<matrix>.json``.
    The ``date`` template variable is the matrix's modification time (UTC),
    formatted as ``%m-%d-%Y``.

    Args:
        matrix (str): The path, including the filename, of the Xena matrix.
        xena_dtypes (str): One data type code indicating the data type of
            the matrix. Supported data type codes are (without quotation
            marks): "htseq_counts", "htseq_fpkm", "htseq_fpkm-uq", "mirna",
            "mirna_isoform", "cnv", "masked_cnv", "muse_snv", "mutect2_snv",
            "somaticsniper_snv", "varscan2_snv", "GDC_phenotype",
            "survival", "methylation27", "methylation450".
        project_id (str): Value of the ``project_id`` template variable.
            Defaults to "GDC-PANCAN".
        gdc_release (str): Value of the ``gdc_release`` template variable
            (URL of the GDC data release notes). Defaults to the link for
            data release 9.0.
        xena_cohort (str): Value of the ``xena_cohort`` template variable.
            Defaults to "GDC Pan-Cancer (PANCAN)".

    Raises:
        KeyError: If ``xena_dtypes`` is not a supported data type code.
        OSError: If ``matrix`` does not exist or cannot be inspected.
    """

    # Map xena_dtype to the file name of its metadata template.
    meta_templates = {
        'htseq_counts': 'template.rna.meta.json',
        'htseq_fpkm': 'template.rna.meta.json',
        'htseq_fpkm-uq': 'template.rna.meta.json',
        'mirna': 'template.mirna.meta.json',
        'mirna_isoform': 'template.mirna_isoform.meta.json',
        'cnv': 'template.cnv.meta.json',
        'masked_cnv': 'template.cnv.meta.json',
        'muse_snv': 'template.snv.meta.json',
        'mutect2_snv': 'template.snv.meta.json',
        'somaticsniper_snv': 'template.snv.meta.json',
        'varscan2_snv': 'template.snv.meta.json',
        'GDC_phenotype': 'template.phenotype.meta.json',
        'survival': 'template.survival.meta.json',
        'methylation27': 'template.methylation.meta.json',
        'methylation450': 'template.methylation.meta.json',
    }
    # Extra Jinja2 template variables for the corresponding "xena_dtype".
    # Data types without an entry need no extra variables.
    meta_vars = {
        'htseq_counts': {'gdc_type': 'HTSeq - Counts'},
        'htseq_fpkm': {'gdc_type': 'HTSeq - FPKM', 'unit': 'fpkm'},
        'htseq_fpkm-uq': {'gdc_type': 'HTSeq - FPKM-UQ', 'unit': 'fpkm-uq'},
        'mirna': {'gdc_type': 'miRNA Expression Quantification'},
        'mirna_isoform': {'gdc_type': 'Isoform Expression Quantification'},
        'cnv': {'gdc_type': 'Copy Number Segment'},
        'masked_cnv': {'gdc_type': 'Masked Copy Number Segment'},
        'muse_snv': {'gdc_type': 'MuSE Variant Aggregation and Masking'},
        'mutect2_snv': {
            'gdc_type': 'MuTect2 Variant Aggregation and Masking'
        },
        'somaticsniper_snv': {
            'gdc_type': 'SomaticSniper Variant Aggregation and Masking'
        },
        'varscan2_snv': {
            'gdc_type': 'VarScan2 Variant Aggregation and Masking'
        },
        'methylation27': {'platform_num': '27'},
        'methylation450': {'platform_num': '450'},
    }

    # Validate the inputs before printing anything, so a bad call does not
    # leave a dangling "Creating metadata file ..." progress line behind.
    if xena_dtypes not in meta_templates:
        raise KeyError(
            'Unsupported data type "{}"; supported data types are: {}'.format(
                xena_dtypes, sorted(meta_templates)
            )
        )
    matrix_date = time.strftime("%m-%d-%Y",
                                time.gmtime(os.path.getmtime(matrix)))

    # Templates are looked up in the "Resources" directory located one level
    # above the directory containing this file.
    meta_templates_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'Resources')

    # Generate metadata.
    print('Creating metadata file ...', end='')
    sys.stdout.flush()
    jinja2_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(meta_templates_dir)
    )
    metadata_template = jinja2_env.get_template(meta_templates[xena_dtypes])
    variables = {
        'project_id': project_id,
        'date': matrix_date,
        'gdc_release': gdc_release,
        'xena_cohort': xena_cohort,
    }
    variables.update(meta_vars.get(xena_dtypes, {}))
    outmetadata = matrix + '.json'
    with open(outmetadata, 'w') as f:
        f.write(metadata_template.render(**variables))
    print('\rMetadata JSON is saved at {}.'.format(outmetadata))
def main(argv=None):
    """Compare two Xena matrices and report whether they are equal.

    Both matrices are read with ``pandas.read_table`` (first row as header,
    first column as index) and sorted by row and column labels, so the order
    of rows and columns in the files does not matter. The comparison itself
    is done by pandas' ``assert_frame_equal``. The result is printed as
    "Equal." or "Not equal.".

    Args:
        argv (list of str, optional): Command line arguments, i.e. the paths
            of the two matrices. Defaults to None, in which case argparse
            reads ``sys.argv[1:]``.

    Returns:
        bool: True if the matrices are equal, False otherwise.
    """
    # ``pandas.util.testing`` is a deprecated location that newer pandas
    # releases no longer provide; prefer the public ``pandas.testing`` module
    # and fall back only for old pandas versions.
    try:
        from pandas.testing import assert_frame_equal
    except ImportError:
        from pandas.util.testing import assert_frame_equal

    parser = argparse.ArgumentParser(
        description='Test the equality of 2 Xena matrices.'
    )
    parser.add_argument('df1', type=str,
                        help='Directory for the first matrix.')
    parser.add_argument('df2', type=str,
                        help='Directory for the second matrix.')
    args = parser.parse_args(argv)
    first, second = (
        pd.read_table(path, header=0, index_col=0)
        .sort_index(axis=0)
        .sort_index(axis=1)
        for path in (args.df1, args.df2)
    )
    try:
        assert_frame_equal(first, second)
    except Exception:
        # Apparently AssertionError does not cover every way the comparison
        # can fail, so any ordinary exception counts as "not equal".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        print('Not equal.')
        return False
    print('Equal.')
    return True
+ echo 'usage: join_xena.sh [-h] [-o OUTPUT] file [file ...]' + echo $'\npositional arguments:' + echo ' file matrix(es) to be joined' + echo $'\noptional arguments:' + echo ' -h, --help show this help message and exit' + echo ' -o, --output OUTPUT path to output file, including filename. Directory must' + echo ' exist and file must not exist (no overwritting).' + exit 0 +} + +# Get the list of matrices to be joined +IFS=$'\n' # allows spaces in path +matrixlist=() +while [[ $# -gt 0 ]] +do + key="$1" + case $key in + -o|--output) + if [ -e "$2" ]; then + echo "Output file $2 exist! Overwrite is not supported."$'\n' + usage + fi + outdir=$(dirname "$2") + if [ ! -d "$outdir" ]; then + echo "Output directory $outdir doesn't exist!"$'\n' + usage + fi + output="$2" + shift # past argument + shift # past value + ;; + -h|--help) + usage + shift # past argument + ;; + *) # unknown option; should be positional argument + for path in "$1" + do + matrixlist+=("$path") + done + shift + ;; + esac +done + +# Setup files +touch "$output" +temp="$output.$RANDOM.tmp" +sorted=$(mktemp -p "$outdir") +trap "{ rm -f $sorted; }" EXIT + +# Real outer join one by one +for file in ${matrixlist[@]} +do + echo "Merging $file ..." 
+ #LC_ALL=C sort -k 1b,1 "$file" > "$sorted" + (head -n 1 "$file" && tail -n +2 "$file" | LC_ALL=C sort -k 1b,1) > "$sorted" + LC_ALL=C join -t $'\t' -a1 -a2 -o auto --header --check-order "$output" "$sorted" > "$temp" + mv "$temp" "$output" +done diff --git a/Scripts/panTCGA.py b/Scripts/panTCGA.py index 951b72d..b53f033 100644 --- a/Scripts/panTCGA.py +++ b/Scripts/panTCGA.py @@ -33,12 +33,12 @@ def main(): - root_dir = r'/mnt/gdc/xena/files' - out_dir = r'/mnt/gdc/TCGA-PANCAN/Xena_Matrices' + root_dir = r'/mnt/gdc/updates' + out_dir = r'/mnt/gdc/updates/GDC-PANCAN/Xena_Matrices' datatypes = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna', 'masked_cnv', 'muse_snv', 'mutect2_snv', 'somaticsniper_snv', 'varscan2_snv', 'survival'] - gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-90' + gdc_release = 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-100' # Map xena_dtype to corresponding metadata template. 
#!/bin/bash
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash#answer-14203146
#
# Combine Xena matrices by stacking their rows (grow vertically). The output
# header is the union of the header rows of all inputs, in order of first
# appearance; cells of columns a matrix does not have are left empty.
# The result goes to OUTPUT when -o is given, otherwise to standard output.

set -e
set -o pipefail

# Print the help message and exit with the given status (default 0).
usage () {
    echo 'Combine Xena matrices by shared header row (1st row only; column name), i.e. grow vertically.'
    echo 'usage: union_xena.sh [-h] [-o OUTPUT] file [file ...]'
    echo $'\npositional arguments:'
    echo '  file                  matrix(es) to be joined'
    echo $'\noptional arguments:'
    echo '  -h, --help            show this help message and exit'
    echo '  -o, --output OUTPUT   path to output file, including filename. Directory must'
    echo '                        exist and file must not exist (no overwritting).'
    exit "${1:-0}"
}

# Separate the options from the list of matrices to be combined.
files=()
output=''
while [[ $# -gt 0 ]]
do
    case "$1" in
        -o|--output)
            if [[ $# -lt 2 ]]; then
                echo 'Option -o requires a path.'$'\n' >&2
                usage 1 >&2
            fi
            output="$2"
            shift # past argument
            shift # past value
            ;;
        -h|--help)
            usage 0
            ;;
        *)  # anything else is a positional argument
            files+=("$1")
            shift
            ;;
    esac
done

if [[ ${#files[@]} -eq 0 ]]; then
    echo 'No matrix to be joined.'$'\n' >&2
    usage 1 >&2
fi
for path in "${files[@]}"
do
    if [[ ! -r "$path" ]]; then
        echo "Matrix $path is not readable!"$'\n' >&2
        usage 1 >&2
    fi
done

if [[ -n "$output" ]]; then
    if [[ -e "$output" ]]; then
        echo "Output file $output exist! Overwrite is not supported."$'\n' >&2
        usage 1 >&2
    fi
    outdir=$(dirname "$output")
    if [[ ! -d "$outdir" ]]; then
        echo "Output directory $outdir doesn't exist!"$'\n' >&2
        usage 1 >&2
    fi
    # From here on everything written to stdout lands in the output file.
    exec > "$output"
fi

# Pass 1: union of all header fields, in order of first appearance.
colorder=$(head -q -n 1 "${files[@]}" | awk '
BEGIN {
    ncol = 0
    FS = "\t"
    RS = "\n|\r\n"
}
{
    for (i = 1; i <= NF; i++) {
        if (! ($i in allfields)) {
            ncol++
            colorder[ncol] = $i
            allfields[$i] = ""
        }
    }
}
END {
    printf "%s", colorder[1]
    for (i = 2; i <= ncol; i++) {
        printf "\t%s", colorder[i]
    }
}
')

# Pass 2: print the combined header once, then every data row re-ordered to
# match it.
awk -v c="$colorder" '
BEGIN {
    ncol = split(c, colorder, "\t")
    print c
    FS = "\t"
    RS = "\n|\r\n"
}
FNR==1 {
    # Header row of the current file: remember its column names. Clear the
    # names of the previous file first so that none of them can leak into
    # rows of this file.
    split("", thisfields)
    for (i = 1; i <= NF; i++) {
        thisfields[i] = $i
    }
    next
}

{
    for (i = 1; i <= ncol; i++) {
        output[colorder[i]] = ""
    }
    for (i = 1; i <= NF; i++) {
        output[thisfields[i]] = $i
    }
    printf "%s", output[colorder[1]]
    for (i = 2; i <= ncol; i++) {
        printf "\t%s", output[colorder[i]]
    }
    print ""
}
' "${files[@]}"
parser.add_mutually_exclusive_group() + datatype_group = etlparser.add_mutually_exclusive_group() datatype_group.add_argument('-t', '--datatype', type=str, nargs='+', help='Data type code(s) to be imported; or ' '"all" if all supported types are going ' @@ -130,27 +139,64 @@ def main(): 'This option and the "-t" option are ' 'mutually exclusive.', default=[]) + + # Subcommand for making metadata + metaparser = subparsers.add_parser( + 'metadata', + help='Generate metadata for a Xena matrix', + epilog='Supported data types are: {}'.format(str(valid_dtype)) + ) + metaparser.add_argument('-p', '--project', type=str, required=True, + help='The project of the matrix.') + metaparser.add_argument('-t', '--datatype', type=str, required=True, + help='One data type code for the matrix.') + metaparser.add_argument('-m', '--matrix', type=str, required=True, + help='Path to a Xena matrix') + metaparser.add_argument('-r', '--release', type=float, required=True, + help='GDC data release number.') + args = parser.parse_args() - root_dir = os.path.abspath(args.root) - projects = args.projects - if 'all' in [p.lower() for p in projects]: - projects = [str(x) for x in gdc.get_project_info().index] - for p in args.not_projects: - projects.remove(p) - xena_dtypes = args.datatype - if 'all' in [t.lower() for t in xena_dtypes]: - xena_dtypes = valid_dtype - for t in args.not_datatype: - xena_dtypes.remove(t) - print('#### GDC to Xena Importing Settings ####') - total_projects = len(projects) - print('Import the following {} projects:'.format(total_projects)) - print(repr(projects), end='\n\n') - print('for the following {} types of data:'.format(len(xena_dtypes))) - print(str(xena_dtypes), end='\n\n') - print('into this directory: {}'.format(root_dir)) - print('########################################', end='\n\n') - gdc2xena(root_dir, projects, xena_dtypes) + if args.subcomm == 'etl': + root_dir = os.path.abspath(args.root) + projects = args.projects + if 'all' in [p.lower() for p in 
projects]: + projects = [str(x) for x in gdc.get_project_info().index] + for p in args.not_projects: + projects.remove(p) + xena_dtypes = args.datatype + if 'all' in [t.lower() for t in xena_dtypes]: + xena_dtypes = valid_dtype + for t in args.not_datatype: + xena_dtypes.remove(t) + print('#### GDC to Xena Importing Settings ####') + total_projects = len(projects) + print('Import the following {} projects:'.format(total_projects)) + print(repr(projects), end='\n\n') + print('for the following {} types of data:'.format(len(xena_dtypes))) + print(str(xena_dtypes), end='\n\n') + print('into this directory: {}'.format(root_dir)) + print('########################################', end='\n\n') + gdc2xena(root_dir, projects, xena_dtypes) + elif args.subcomm == 'metadata': + root_dir = os.path.dirname(args.matrix) + if args.datatype == 'survival': + dataset = GDCSurvivalset(args.project, root_dir) + elif args.datatype == 'raw_phenotype': + if args.project.startswith('TCGA'): + dataset = GDCPhenoset(args.project, 'raw_phenotype', + root_dir) + if args.project.startswith('TARGET'): + dataset = GDCPhenoset(args.project, 'clinical', root_dir) + elif args.datatype == 'GDC_phenotype': + dataset = GDCPhenoset(args.project, 'GDC_phenotype', root_dir) + else: + dataset = GDCOmicset(args.project, args.datatype, root_dir) + dataset.matrix = args.matrix + dataset.gdc_release = ( + 'https://docs.gdc.cancer.gov/Data/Release_Notes/Data_Release_Notes/#data-release-' + + str(args.release).replace('.', '') + ) + dataset.metadata() if __name__ == '__main__': diff --git a/merge_xena.py b/merge_xena.py new file mode 100644 index 0000000..293e5c5 --- /dev/null +++ b/merge_xena.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""This script provides a command line tool for merging Xena matrices of the +same data type. + +This module also provides an stand-alone function ``merge`` which can be used +in scripts. 
def merge(filelist, xena_dtypes, out_matrix):
    """Merge Xena matrices of one data type into a single Xena matrix.

    The matrices in ``filelist`` are read one at a time with
    ``pandas.read_table`` (first row as header, first column as index) and
    concatenated onto the running result, which is finally saved as a
    tab-separated file at ``out_matrix``. No metadata is generated.

    Args:
        filelist (list of path): A list of Xena matrices files to be merged,
            which will be read by pandas.read_table.
        xena_dtypes (str): One data type code indicating the data type in
            matrices to be merged. "htseq_counts", "htseq_fpkm",
            "htseq_fpkm-uq", "mirna" and "methylation27" matrices are merged
            column-wise (axis 1); "masked_cnv", "muse_snv", "mutect2_snv",
            "somaticsniper_snv", "varscan2_snv", "GDC_phenotype" and
            "survival" matrices are merged row-wise (axis 0).
        out_matrix (str): The path, including the filename, for merged Xena
            matrix.

    Raises:
        ValueError: If ``xena_dtypes`` is not one of the supported codes.
    """

    column_wise = ('htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
                   'methylation27')
    row_wise = ('masked_cnv', 'muse_snv', 'mutect2_snv',
                'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
                'survival')
    if xena_dtypes in column_wise:
        merge_axis = 1
    elif xena_dtypes in row_wise:
        merge_axis = 0
    else:
        msg = 'Invalid datatype: {}\nSupported data types are: {}'
        valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq',
                       'mirna', 'masked_cnv', 'muse_snv', 'mutect2_snv',
                       'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
                       'survival', 'methylation27']
        raise ValueError(msg.format(xena_dtypes, valid_dtype))

    # Merge by growing the merged matrix one matrix at a time. The first
    # matrix is taken as is rather than concatenated onto an empty frame, so
    # the result never depends on how pandas treats empty objects in concat.
    merged = None
    total = len(filelist)
    for count, path in enumerate(filelist, 1):
        print('\r[{}/{}] Merging {} ...'.format(count, total, path), end='')
        sys.stdout.flush()
        matrix = pd.read_table(path, header=0, index_col=0)
        if merged is None:
            merged = matrix
        else:
            merged = pd.concat([merged, matrix], axis=merge_axis, copy=False)
        del matrix  # Drop the extra reference to the matrix just merged
    if merged is None:
        # Nothing to merge: keep writing an empty matrix, as before.
        merged = pd.DataFrame()
    print('\rSaving merged matrix to {} ...'.format(out_matrix), end='')
    sys.stdout.flush()
    merged.to_csv(out_matrix, sep='\t', encoding='utf-8')
    del merged  # Prevent from doubling memory usage
    print('\rMerged "{}" matrix is ready at {}'.format(xena_dtypes,
                                                       out_matrix))


def main(argv=None):
    """Command line entry point for merging Xena matrices.

    Expands the given paths with ``glob``, keeps the regular files among the
    matches, derives the name of the merged matrix and calls ``merge``.

    Args:
        argv (list of str, optional): Command line arguments. Defaults to
            None, in which case argparse reads ``sys.argv[1:]``.
    """
    valid_dtype = ['htseq_counts', 'htseq_fpkm', 'htseq_fpkm-uq', 'mirna',
                   'masked_cnv', 'muse_snv', 'mutect2_snv',
                   'somaticsniper_snv', 'varscan2_snv', 'GDC_phenotype',
                   'survival', 'methylation27']
    default_cohort = 'MergedCohort{}'.format(date.today().strftime('%m%d%Y'))
    parser = argparse.ArgumentParser(
        description='Pipeline for merging Xena matrices of the same data '
                    'type.'
    )
    parser.add_argument('-f', '--files', type=str, nargs='+', required=True,
                        help='A list of paths for Xena matrices files to be '
                             'merged. All paths in this list support UNIX '
                             'style pathname pattern expansion with "glob". '
                             'Files will be read by pandas.read_table.')
    parser.add_argument('-t', '--datatype', type=str, required=True,
                        help='One data type code indication the data type in '
                             'matrices to be merged. Supported data type '
                             'codes include: {}'.format(str(valid_dtype)))
    parser.add_argument('-o', '--outdir', type=str, default='.',
                        help='A directory to put the merged matrix. Defaults '
                             'to the current working directory of python.')
    parser.add_argument('-n', '--name', type=str, default=None,
                        help='Filename for the merged matrix. Defaults to '
                             'None. If None, the filename will be derived '
                             'from the cohort name and the data type. Check '
                             '"-t" and "-c" options for details.')
    parser.add_argument('-c', '--cohort', type=str, default=None,
                        help='A cohort name for the merged matrix. Defaults '
                             'to None. If None, it will be set to a format of '
                             '"MergedCohort" by default. For example, '
                             '"{}".'.format(default_cohort))
    args = parser.parse_args(argv)
    print('Checking matrices to be merged ...')
    matrix_list = []
    for pattern in args.files:
        for f in glob.glob(pattern):
            f = os.path.abspath(f)
            if os.path.isfile(f):
                print('\r{}'.format(f), end='')
                matrix_list.append(f)
    print('\r{} matrices to be merged'.format(len(matrix_list)))
    if not matrix_list:
        # Stop here instead of silently writing an empty merged matrix.
        parser.error('no matrix file found for: {}'.format(
            ' '.join(args.files)))
    if args.name is None:
        cohort = default_cohort if args.cohort is None else args.cohort
        matrix_name = '{}.{}.tsv'.format(cohort, args.datatype)
    else:
        matrix_name = args.name
    matrix = os.path.join(os.path.abspath(args.outdir), matrix_name)
    merge(matrix_list, args.datatype, matrix)