From f201dbcf4ba65e93bd594f23e7e7589fffca3ca3 Mon Sep 17 00:00:00 2001 From: Yunhai Luo Date: Wed, 10 Jul 2019 23:26:09 -0700 Subject: [PATCH] Simplify TCGAPhenoset --- xena_gdc_etl/xena_dataset.py | 141 +++++++++++------------------------ 1 file changed, 43 insertions(+), 98 deletions(-) diff --git a/xena_gdc_etl/xena_dataset.py b/xena_gdc_etl/xena_dataset.py index 976558a..83cab8d 100644 --- a/xena_gdc_etl/xena_dataset.py +++ b/xena_gdc_etl/xena_dataset.py @@ -1880,12 +1880,6 @@ class TCGAPhenoset(XenaDataset): projects (str or list): One (string) or a list of GDC's "cases.project.project_id". All corresponding projects will be included in this dataset. - gdc_release (str): URL to the data release note for the dataset. It - will be used by the ``metadata`` method when making the metadata - for this dataset. It is highly recommended that this attribute is - set explicitly by the user so that it is guaranteed to match the - data (raw data) underlying this dataset. If it is not available, - the most recent data release will be queried and used. gdc_filter (dict): A filter for querying GDC data underlying this dataset. Each item of this dict means to be an "in" operation, with its key being one GDC API available field and its value being @@ -1997,45 +1991,6 @@ class TCGAPhenoset(XenaDataset): 'vital_status', ] - @property - def gdc_release(self): - try: - return self.__gdc_release - except AttributeError: - data_release = gdc.search('status', typ='json')['data_release'] - anchor = ( - re.match(r'(Data Release [^\s]+)\s', data_release) - .group(1) - .replace(' ', '-') - .replace('.', '') - .lower() - ) - self.__gdc_release = GDC_RELEASE_URL + '#' + anchor - return self.__gdc_release - - @gdc_release.setter - def gdc_release(self, url): - self.__gdc_release = url - - # Set default query filter dict for GDC API if it hasn't been set yet. - @property - def gdc_filter(self): - try: - assert self.__gdc_filter - return self.__gdc_filter - except (AttributeError, AssertionError): - self.__gdc_filter = { - 'access': 'open', - 'cases.project.project_id': self.projects, - 'data_category': 'Clinical', - 'data_format': 'BCR XML', - } - return self.__gdc_filter - - @gdc_filter.setter - def gdc_filter(self, filter_dict): - self.__gdc_filter = filter_dict - @XenaDataset.download_map.getter def download_map(self): try: @@ -2060,14 +2015,8 @@ def download_map(self): + file_df['file_name'].apply(gdc.get_ext) ).to_dict() if not file_dict: - msg = '\rNo {} data found for project {}.' - gdc_dtype = self._XENA_GDC_DTYPE[self.xena_dtype] - print( - msg.format( - ' - '.join(sorted(gdc_dtype.values())), - str(self.projects), - ) - ) + msg = '\rNo clinical data found for project {}.' + print(msg.format(str(self.projects))) return file_dict file_dict = { '{}/data/{}'.format(gdc.GDC_API_BASE, uuid): os.path.join( @@ -2080,34 +2029,6 @@ def download_map(self): print(msg.format(len(file_dict), self.projects)) return self._download_map - @property - def metadata_vars(self): - try: - assert self.__metadata_vars and isinstance( - self.__metadata_vars, dict - ) - return self.__metadata_vars - except (AttributeError, AssertionError): - matrix_date = time.strftime( - "%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix)) - ) - projects = ','.join(self.projects) - variables = { - 'project_id': projects, - 'date': matrix_date, - 'gdc_release': self.gdc_release, - } - if projects in GDC_XENA_COHORT: - variables['xena_cohort'] = GDC_XENA_COHORT[projects] - else: - variables['xena_cohort'] = 'GDC ' + projects - self.__metadata_vars = variables - return self.__metadata_vars - - @metadata_vars.setter - def metadata_vars(self, variables): - self.__metadata_vars = variables - def __init__( self, projects, @@ -2115,17 +2036,37 @@ def __init__( raw_data_dir=None, matrix_dir=None, ): - self.projects = projects - self.xena_dtype = 'phenotype' - self.root_dir = root_dir - if matrix_dir is not None: - self.matrix_dir = matrix_dir + super().__init__( + projects, + 'phenotype', + root_dir=root_dir, + raw_data_dir=raw_data_dir, + matrix_dir=matrix_dir, + ) + self.gdc_filter = { + 'access': 'open', + 'cases.project.project_id': self.projects, + 'data_category': 'Clinical', + 'data_format': 'BCR XML', + } jinja2_env = jinja2.Environment( loader=jinja2.PackageLoader('xena_gdc_etl', 'resources') ) self.metadata_template = jinja2_env.get_template( 'template.phenotype.meta.json' ) + release_anchor = re.match( + r'(Data Release [^\s]+)\s', + gdc.search('status', typ='json')['data_release'] + ).group(1).replace(' ', '-').replace('.', '').lower() + self.metadata_vars = { + 'project_id': ','.join(self.projects), + 'xena_cohort': GDC_XENA_COHORT[self.projects], + 'date': time.strftime( + "%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix)) + ), + 'gdc_release': GDC_RELEASE_URL + '#' + release_anchor, + } def __process_one_clinical_supplement(self, path): """Extract info from GDC's TCGA BCR XML clinical supplement and @@ -2239,31 +2180,35 @@ def transform(self): xena_matrix.drop(c, axis=1, inplace=True) except Exception: pass - xena_matrix = ( - pd.merge( - xena_matrix, - api_clin, - how='outer', - on='submitter_id', - ) - .replace(r'^\s*$', np.nan, regex=True) - ) + xena_matrix = pd.merge( + xena_matrix, + api_clin, + how='outer', + on='submitter_id', + ).replace(r'^\s*$', np.nan, regex=True) + # Map sample ID to data category associated_data_map = gdc.map_two_fields( 'files', 'cases.samples.submitter_id', 'data_category', input_values=xena_matrix['submitter_id.samples'].tolist() ) + # Map sample ID to sample type code sample_type_map = gdc.map_two_fields( 'cases', 'samples.submitter_id', 'samples.sample_type_id', input_values=xena_matrix['submitter_id.samples'].tolist() ) + # Remove normal samples + # Remove samples without genomic data sample_mask = xena_matrix['submitter_id.samples'].map( - lambda s: sample_type_map[s][0] != '10' and any( - dcat not in ['Biospecimen', 'Clinical'] - for dcat in associated_data_map[s] + lambda s: ( + sample_type_map[s][0] not in ['10', '11', '12', '13', '14'] + and any( + dcat not in ['Biospecimen', 'Clinical'] + for dcat in associated_data_map[s] + ) ) ) xena_matrix = xena_matrix[sample_mask].set_index(