diff --git a/xena_gdc_etl/gdc2xena.py b/xena_gdc_etl/gdc2xena.py
index 8cb76aa..708b7cc 100644
--- a/xena_gdc_etl/gdc2xena.py
+++ b/xena_gdc_etl/gdc2xena.py
@@ -29,7 +29,7 @@
 import time
 import shutil
 
-from .xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset
+from .xena_dataset import GDCOmicset, GDCPhenoset, GDCSurvivalset, TCGAPhenoset
 
 
 def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
@@ -86,7 +86,7 @@ def gdc2xena(root_dir, projects, xena_dtypes, delete_raw_data=False):
                 if project.startswith('TARGET'):
                     dataset = GDCPhenoset(project, 'clinical', root_dir)
             elif dtype == 'GDC_phenotype':
-                dataset = GDCPhenoset(project, 'GDC_phenotype', root_dir)
+                dataset = TCGAPhenoset(project, root_dir)
             else:
                 dataset = GDCOmicset(project, dtype, root_dir)
             try:
diff --git a/xena_gdc_etl/xena_dataset.py b/xena_gdc_etl/xena_dataset.py
index 79bf785..976558a 100644
--- a/xena_gdc_etl/xena_dataset.py
+++ b/xena_gdc_etl/xena_dataset.py
@@ -1864,6 +1864,437 @@ def transform(self):
         return self
 
 
+class TCGAPhenoset(XenaDataset):
+    r"""TCGAPhenoset is derived from the ``XenaDataset`` class and represents
+    a Xena matrix whose data is phenotype data of TCGA projects.
+
+    This class provides a set of default configurations for downloading and
+    transforming phenotype data of TCGA projects, as well as generating
+    associated metadata for the transformed Xena matrix. These default
+    configurations are stored as private constants, and they can be checked
+    and/or changed through the following attributes: ``gdc_release``,
+    ``gdc_filter``, ``download_map``, ``raws2matrix``, ``metadata_template``,
+    and ``metadata_vars``.
+
+    Attributes:
+        projects (str or list): One (string) or a list of GDC's
+            "cases.project.project_id". All corresponding projects will be
+            included in this dataset.
+        gdc_release (str): URL to the data release note for the dataset. It
+            will be used by the ``metadata`` method when making the metadata
+            for this dataset. It is highly recommended that this attribute is
+            set explicitly by the user so that it is guaranteed to match the
+            data (raw data) underlying this dataset. If it is not available,
+            the most recent data release will be queried and used.
+        gdc_filter (dict): A filter for querying GDC data underlying this
+            dataset. Each item of this dict is meant to be an "in" operation,
+            with its key being one GDC API available field and its value being
+            a string or a list of strings. It can be automatically derived
+            from ``projects`` and ``xena_dtype`` if it is not assigned
+            explicitly by the user when being used. Please check the GDC API
+            documentation at
+            https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/
+            for details.
+        download_map (dict): A dict with the key being a URL for one raw data
+            to be downloaded and the value being a path for saving downloaded
+            raw data. If it hasn't been assigned explicitly by the user when
+            being used, it can be automatically generated by querying through
+            GDC API according to ``gdc_filter`` which are based on
+            ``projects`` and ``xena_dtype``. Filename of data files, by
+            default, will adapt a pattern of
+            "data_category.file_UUID.file_extension"
+
+            It is worth noting the "data_category" prefix can be useful or
+            even necessary for ``transform`` method to apply correct
+            transformation to the file. "file_extension" is closely related
+            to the format of the file.
+        metadata_template (jinja2.environment.Template or str): A Jinja2
+            template for rendering metadata of this dataset. When setting this
+            attribute with a string, it will be taken as a path to the
+            template file and the corresponding template will be retrieved and
+            assigned to this attribute. Defaults, if needed, can be mapped
+            from ``xena_dtype``.
+        metadata_vars (dict): A dict of variables which will be used (by \*\*
+            unpacking) when rendering the ``metadata_template``. Defaults, if
+            needed, can be derived from corresponding matrix and ``projects``
+            and ``xena_dtype`` properties.
+    """
+
+    # To resolve overlapping between raw data and API data, remove columns
+    # according to the following lists.
+    _API_DROPS = [
+        'id',
+        'case_id',
+        'state',
+        'created_datetime',
+        'updated_datetime',
+        'demographic_id.demographic',
+        'submitter_id.demographic',
+        'state.demographic',
+        'created_datetime.demographic',
+        'updated_datetime.demographic',
+        'diagnosis_id.diagnoses',
+        'submitter_id.diagnoses',
+        'state.diagnoses',
+        'created_datetime.diagnoses',
+        'updated_datetime.diagnoses',
+        'treatment_id.treatments.diagnoses',
+        'submitter_id.treatments.diagnoses',
+        'state.treatments.diagnoses',
+        'created_datetime.treatments.diagnoses',
+        'updated_datetime.treatments.diagnoses',
+        'exposure_id.exposures',
+        'submitter_id.exposures',
+        'state.exposures',
+        'created_datetime.exposures',
+        'updated_datetime.exposures',
+        'pathology_report_uuid.samples',
+        'state.project',
+        'released.project',
+        'sample_id.samples',
+        'created_datetime.samples',
+        'updated_datetime.samples',
+        'tissue_source_site_id.tissue_source_site',
+    ]
+    _RAW_DROPS = [
+        'alcohol_history_documented',
+        'bcr_patient_barcode',
+        'bcr_patient_uuid',
+        'bcr_sample_uuid',
+        'composition',
+        'current_weight',
+        'days_to_birth',
+        'days_to_collection',
+        'days_to_death',
+        'days_to_last_followup',
+        'days_to_sample_procurement',
+        'ethnicity',
+        'freezing_method',
+        'gender',
+        'height',
+        'icd_10',
+        'icd_o_3_histology',
+        'icd_o_3_site',
+        'initial_weight',
+        'intermediate_dimension',
+        'is_ffpe',
+        'longest_dimension',
+        'oct_embedded',
+        'pathologic_stage',
+        'pathology_report_uuid',
+        'preservation_method',
+        'primary_diagnosis',
+        'race',
+        'sample_type',
+        'sample_type_id',
+        'shortest_dimension',
+        'state',
+        'time_between_clamping_and_freezing',
+        'time_between_excision_and_freezing',
+        'tissue_type',
+        'tumor_descriptor',
+        'tumor_tissue_site',
+        'vital_status',
+    ]
+
+    @property
+    def gdc_release(self):
+        """URL of the GDC data release note; queried from GDC if unset."""
+        try:
+            return self.__gdc_release
+        except AttributeError:
+            # Build the release-note anchor from the API status string,
+            # e.g. "Data Release 10.0 - ..." becomes "data-release-100".
+            data_release = gdc.search('status', typ='json')['data_release']
+            anchor = (
+                re.match(r'(Data Release [^\s]+)\s', data_release)
+                .group(1)
+                .replace(' ', '-')
+                .replace('.', '')
+                .lower()
+            )
+            self.__gdc_release = GDC_RELEASE_URL + '#' + anchor
+            return self.__gdc_release
+
+    @gdc_release.setter
+    def gdc_release(self, url):
+        self.__gdc_release = url
+
+    # Set default query filter dict for GDC API if it hasn't been set yet.
+    @property
+    def gdc_filter(self):
+        try:
+            assert self.__gdc_filter
+            return self.__gdc_filter
+        except (AttributeError, AssertionError):
+            self.__gdc_filter = {
+                'access': 'open',
+                'cases.project.project_id': self.projects,
+                'data_category': 'Clinical',
+                'data_format': 'BCR XML',
+            }
+            return self.__gdc_filter
+
+    @gdc_filter.setter
+    def gdc_filter(self, filter_dict):
+        self.__gdc_filter = filter_dict
+
+    @XenaDataset.download_map.getter
+    def download_map(self):
+        """Map of GDC download URLs to local paths for raw clinical files."""
+        try:
+            assert self._download_map
+            return self._download_map
+        except (AttributeError, AssertionError):
+            fields = ['file_id', 'file_name', 'data_category']
+            try:
+                print('Searching for raw clinical data ...', end='')
+                file_df = gdc.search(
+                    'files', in_filter=self.gdc_filter, fields=fields
+                )
+            except Exception:
+                file_dict = {}
+            else:
+                file_df.set_index('file_id', drop=False, inplace=True)
+                file_dict = (
+                    file_df['data_category'].astype(str)
+                    + '.'
+                    + file_df['file_id'].astype(str)
+                    + '.'
+                    + file_df['file_name'].apply(gdc.get_ext)
+                ).to_dict()
+            if not file_dict:
+                # This class defines no ``_XENA_GDC_DTYPE`` table, so name
+                # the data type after the filter that was actually queried.
+                msg = '\rNo {} data found for project {}.'
+                gdc_dtype = [
+                    str(self.gdc_filter[key])
+                    for key in ('data_category', 'data_format')
+                    if key in self.gdc_filter
+                ]
+                print(msg.format(' - '.join(gdc_dtype), str(self.projects)))
+                return file_dict
+            file_dict = {
+                '{}/data/{}'.format(gdc.GDC_API_BASE, uuid): os.path.join(
+                    self.raw_data_dir, name
+                )
+                for uuid, name in file_dict.items()
+            }
+            self._download_map = file_dict
+            msg = '\r{} files found for clinical data of {}.'
+            print(msg.format(len(file_dict), self.projects))
+            return self._download_map
+
+    @property
+    def metadata_vars(self):
+        try:
+            assert self.__metadata_vars and isinstance(
+                self.__metadata_vars, dict
+            )
+            return self.__metadata_vars
+        except (AttributeError, AssertionError):
+            matrix_date = time.strftime(
+                "%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix))
+            )
+            projects = ','.join(self.projects)
+            variables = {
+                'project_id': projects,
+                'date': matrix_date,
+                'gdc_release': self.gdc_release,
+            }
+            if projects in GDC_XENA_COHORT:
+                variables['xena_cohort'] = GDC_XENA_COHORT[projects]
+            else:
+                variables['xena_cohort'] = 'GDC ' + projects
+            self.__metadata_vars = variables
+            return self.__metadata_vars
+
+    @metadata_vars.setter
+    def metadata_vars(self, variables):
+        self.__metadata_vars = variables
+
+    def __init__(
+        self,
+        projects,
+        root_dir='.',
+        raw_data_dir=None,
+        matrix_dir=None,
+    ):
+        self.projects = projects
+        self.xena_dtype = 'phenotype'
+        self.root_dir = root_dir
+        # Honor an explicit raw data directory instead of silently ignoring
+        # the argument; handled the same way as ``matrix_dir`` below.
+        if raw_data_dir is not None:
+            self.raw_data_dir = raw_data_dir
+        if matrix_dir is not None:
+            self.matrix_dir = matrix_dir
+        jinja2_env = jinja2.Environment(
+            loader=jinja2.PackageLoader('xena_gdc_etl', 'resources')
+        )
+        self.metadata_template = jinja2_env.get_template(
+            'template.phenotype.meta.json'
+        )
+
+    def __process_one_clinical_supplement(self, path):
+        """Extract info from GDC's TCGA BCR XML clinical supplement and
+        re-organize them into a pandas DataFrame.
+
+        Args:
+            path (str): XML file of GDC's TCGA clinical supplement.
+
+        Returns:
+            pandas.core.frame.DataFrame: Transformed pandas DataFrame.
+        """
+
+        # Sanity check on TCGA phenotype clinical supplement file
+        ext = os.path.splitext(path)[1]
+        if ext != '.xml':
+            raise IOError(
+                'Unknown file type for TCGA clinical data: {}'.format(ext)
+            )
+
+        root = etree.parse(path).getroot()
+        ns = root.nsmap
+        assert (
+            'clinical'
+            in root.xpath('@xsi:schemaLocation', namespaces=ns)[0].lower()
+        )
+        patient = {}
+        # "Dirty" extraction
+        for child in root.xpath('.//*[not(*)]'):
+            try:
+                patient[child.tag.split('}', 1)[-1]] = child.text.strip()
+            except AttributeError:
+                patient[child.tag.split('}', 1)[-1]] = ''
+        # Redo 'race'
+        if 'race_list' in patient:
+            del patient['race_list']
+        try:
+            patient['race'] = ','.join(
+                [
+                    child.text.strip()
+                    for child in root.find('.//clin_shared:race_list', ns)
+                    if child.text and child.text.strip()
+                ]
+            )
+        except Exception:
+            patient['race'] = ''
+        # Redo the most recent "follow_up" and update the patient dict if there
+        # is an overlapped key.
+        follow_ups = root.xpath('.//*[local-name()="follow_up"]')
+        if follow_ups:
+            # Compare versions numerically: as plain strings "10.0" would
+            # sort before "4.0". ``max`` keeps the first of equal versions.
+            most_recent = max(
+                follow_ups,
+                key=lambda f: tuple(
+                    int(n) for n in re.findall(r'\d+', f.get('version', ''))
+                ),
+            )
+            for child in most_recent:
+                try:
+                    patient[child.tag.split('}', 1)[-1]] = child.text.strip()
+                except AttributeError:
+                    patient[child.tag.split('}', 1)[-1]] = ''
+        return pd.DataFrame({patient['bcr_patient_barcode']: patient}).T
+
+    def transform(self):
+        """Transform TCGA phenotype data into Xena matrix.
+
+        Raw clinical data will first be transformed individually. Then more
+        phenotype data will be retrieved and transformed from GDC API. After
+        both types of data are transformed into Xena matrices, two matrices
+        will be merged on "cases.submitter_id". Finally, normal samples and
+        samples without genomic data will be removed.
+
+        Returns:
+            self: allow method chaining.
+        """
+
+        message = 'Make Xena matrix for raw clinical data of {}.'
+        print(message.format(self.projects))
+        total = len(self.raw_data_list)
+        count = 0
+        clin_dfs = []
+        for path in self.raw_data_list:
+            count = count + 1
+            print('\rProcessing {}/{} file...'.format(count, total), end='')
+            sys.stdout.flush()
+            clin_dfs.append(self.__process_one_clinical_supplement(path))
+        print('\rAll {} files have been processed. '.format(total))
+        xena_matrix = (
+            pd.concat(clin_dfs, axis=0)
+            .replace(r'\r\n', ' ', regex=True)
+            .replace(r'^\s*$', np.nan, regex=True)
+            .dropna(axis=1, how='all')
+            .rename_axis('submitter_id')
+            .reset_index()
+        )
+        # Query GDC API for GDC harmonized phenotype info
+        api_clin = gdc.get_samples_clinical(self.projects)
+        # Revert hierarchy order in column names
+        api_clin = api_clin.rename(
+            columns={
+                n: '.'.join(reversed(n.split('.')))
+                for n in api_clin.columns
+            }
+        )
+        # For overlapping columns between raw data matrix and GDC'S
+        # API data matrix, use API data.
+        api_clin = api_clin.drop(self._API_DROPS, axis=1, errors='ignore')
+        xena_matrix = xena_matrix.drop(
+            self._RAW_DROPS, axis=1, errors='ignore'
+        )
+        xena_matrix = (
+            pd.merge(
+                xena_matrix,
+                api_clin,
+                how='outer',
+                on='submitter_id',
+            )
+            .replace(r'^\s*$', np.nan, regex=True)
+        )
+        # The outer merge can leave rows without a sample ID (cases found
+        # only in the raw supplements); never send those to the API.
+        sample_ids = xena_matrix['submitter_id.samples'].dropna().tolist()
+        associated_data_map = gdc.map_two_fields(
+            'files',
+            'cases.samples.submitter_id',
+            'data_category',
+            input_values=sample_ids,
+        )
+        sample_type_map = gdc.map_two_fields(
+            'cases',
+            'samples.submitter_id',
+            'samples.sample_type_id',
+            input_values=sample_ids,
+        )
+
+        def is_kept(sample_id):
+            # Samples missing from either map have no sample type or no
+            # associated data, hence are dropped rather than raising.
+            # NOTE(review): assumes both maps are dict-like -- confirm.
+            sample_types = sample_type_map.get(sample_id)
+            if not sample_types or sample_types[0] == '10':
+                return False
+            return any(
+                dcat not in ['Biospecimen', 'Clinical']
+                for dcat in associated_data_map.get(sample_id, [])
+            )
+
+        sample_mask = xena_matrix['submitter_id.samples'].map(is_kept)
+        xena_matrix = xena_matrix[sample_mask].set_index(
+            'submitter_id.samples'
+        ).dropna(axis=1, how='all')
+        # Transformation done
+        print('\rSaving matrix to {} ...'.format(self.matrix), end='')
+        mkdir_p(self.matrix_dir)
+        xena_matrix.to_csv(self.matrix, sep='\t', encoding='utf-8')
+        print('\rXena matrix is saved at {}.'.format(self.matrix))
+        return self
+
+
 def main():
     print('A python module of Xena specific importing pipeline for GDC data.')
     start = time.time()