Skip to content

Commit

Permalink
Simplify TCGAPhenoset
Browse files Browse the repository at this point in the history
  • Loading branch information
yunhailuo committed Jul 11, 2019
1 parent 88e8494 commit 3a037e2
Showing 1 changed file with 42 additions and 86 deletions.
128 changes: 42 additions & 86 deletions xena_gdc_etl/xena_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1880,12 +1880,6 @@ class TCGAPhenoset(XenaDataset):
projects (str or list): One (string) or a list of GDC's
"cases.project.project_id". All corresponding projects will be
included in this dataset.
gdc_release (str): URL to the data release note for the dataset. It
will be used by the ``metadata`` method when making the metadata
for this dataset. It is highly recommended that this attribute be
set explicitly by the user so that it is guaranteed to match the
data (raw data) underlying this dataset. If it is not available,
the most recent data release will be queried and used.
gdc_filter (dict): A filter for querying GDC data underlying this
dataset. Each item of this dict is meant to be an "in" operation,
with its key being one GDC API available field and its value being
Expand Down Expand Up @@ -1997,45 +1991,6 @@ class TCGAPhenoset(XenaDataset):
'vital_status',
]

@property
def gdc_release(self):
    """Return the URL of the GDC data release note for this dataset.

    If a URL was assigned explicitly through the setter, it is returned
    unchanged. Otherwise the ``data_release`` string returned by
    ``gdc.search('status', typ='json')`` is converted into an anchor
    (leading "Data Release <version>" token, spaces replaced by dashes,
    dots removed, lower-cased), appended to ``GDC_RELEASE_URL`` and cached
    for later calls.

    Returns:
        str: URL of the data release note.

    Raises:
        ValueError: If the reported ``data_release`` string does not start
            with "Data Release <version>" followed by whitespace.
    """
    try:
        return self.__gdc_release
    except AttributeError:
        # Nothing set or cached yet; derive the URL below.
        pass
    data_release = gdc.search('status', typ='json')['data_release']
    matched = re.match(r'(Data Release [^\s]+)\s', data_release)
    if matched is None:
        # Fail with a clear message instead of the opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            'Unrecognized GDC data release string: {!r}'.format(data_release)
        )
    anchor = (
        matched.group(1).replace(' ', '-').replace('.', '').lower()
    )
    self.__gdc_release = GDC_RELEASE_URL + '#' + anchor
    return self.__gdc_release

@gdc_release.setter
def gdc_release(self, url):
    """Store ``url`` as the data release note URL for this dataset."""
    self.__gdc_release = url

# Fall back to the default GDC API query filter until a non-empty filter
# has been set explicitly.
@property
def gdc_filter(self):
    """Return the GDC query filter, building the default on demand.

    The default selects open-access "Clinical" files in "BCR XML" format
    for ``self.projects``. It is created and cached whenever no filter has
    been set yet or the stored filter is empty/falsy.

    Returns:
        dict: Filter dict keyed by GDC API field names.
    """
    try:
        current = self.__gdc_filter
    except AttributeError:
        current = None
    # Explicit truthiness check rather than ``assert``: assertions are
    # stripped under ``python -O``, which would let an empty filter through.
    if not current:
        current = {
            'access': 'open',
            'cases.project.project_id': self.projects,
            'data_category': 'Clinical',
            'data_format': 'BCR XML',
        }
        self.__gdc_filter = current
    return current

@gdc_filter.setter
def gdc_filter(self, filter_dict):
    """Store ``filter_dict`` as the GDC query filter for this dataset."""
    self.__gdc_filter = filter_dict

@XenaDataset.download_map.getter
def download_map(self):
try:
Expand All @@ -2060,14 +2015,8 @@ def download_map(self):
+ file_df['file_name'].apply(gdc.get_ext)
).to_dict()
if not file_dict:
msg = '\rNo {} data found for project {}.'
gdc_dtype = self._XENA_GDC_DTYPE[self.xena_dtype]
print(
msg.format(
' - '.join(sorted(gdc_dtype.values())),
str(self.projects),
)
)
msg = '\rNo clinical data found for project {}.'
print(msg.format(str(self.projects)))
return file_dict
file_dict = {
'{}/data/{}'.format(gdc.GDC_API_BASE, uuid): os.path.join(
Expand All @@ -2083,25 +2032,20 @@ def download_map(self):
@property
def metadata_vars(self):
try:
assert self.__metadata_vars and isinstance(
self.__metadata_vars, dict
)
return self.__metadata_vars
except (AttributeError, AssertionError):
matrix_date = time.strftime(
"%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix))
)
projects = ','.join(self.projects)
variables = {
'project_id': projects,
'date': matrix_date,
'gdc_release': self.gdc_release,
release_anchor = re.match(
r'(Data Release [^\s]+)\s',
gdc.search('status', typ='json')['data_release']
).group(1).replace(' ', '-').replace('.', '').lower()
self.__metadata_vars = {
'project_id': ','.join(self.projects),
'xena_cohort': GDC_XENA_COHORT[','.join(self.projects)],
'date': time.strftime(
"%m-%d-%Y", time.gmtime(os.path.getmtime(self.matrix))
),
'gdc_release': GDC_RELEASE_URL + '#' + release_anchor,
}
if projects in GDC_XENA_COHORT:
variables['xena_cohort'] = GDC_XENA_COHORT[projects]
else:
variables['xena_cohort'] = 'GDC ' + projects
self.__metadata_vars = variables
return self.__metadata_vars

@metadata_vars.setter
Expand All @@ -2115,11 +2059,19 @@ def __init__(
raw_data_dir=None,
matrix_dir=None,
):
self.projects = projects
self.xena_dtype = 'phenotype'
self.root_dir = root_dir
if matrix_dir is not None:
self.matrix_dir = matrix_dir
super().__init__(
projects,
'phenotype',
root_dir=root_dir,
raw_data_dir=raw_data_dir,
matrix_dir=matrix_dir,
)
self.gdc_filter = {
'access': 'open',
'cases.project.project_id': self.projects,
'data_category': 'Clinical',
'data_format': 'BCR XML',
}
jinja2_env = jinja2.Environment(
loader=jinja2.PackageLoader('xena_gdc_etl', 'resources')
)
Expand Down Expand Up @@ -2239,31 +2191,35 @@ def transform(self):
xena_matrix.drop(c, axis=1, inplace=True)
except Exception:
pass
xena_matrix = (
pd.merge(
xena_matrix,
api_clin,
how='outer',
on='submitter_id',
)
.replace(r'^\s*$', np.nan, regex=True)
)
xena_matrix = pd.merge(
xena_matrix,
api_clin,
how='outer',
on='submitter_id',
).replace(r'^\s*$', np.nan, regex=True)
# Map sample ID to data category
associated_data_map = gdc.map_two_fields(
'files',
'cases.samples.submitter_id',
'data_category',
input_values=xena_matrix['submitter_id.samples'].tolist()
)
# Map sample ID to sample type code
sample_type_map = gdc.map_two_fields(
'cases',
'samples.submitter_id',
'samples.sample_type_id',
input_values=xena_matrix['submitter_id.samples'].tolist()
)
# Remove normal samples
# Remove samples without genomic data
sample_mask = xena_matrix['submitter_id.samples'].map(
lambda s: sample_type_map[s][0] != '10' and any(
dcat not in ['Biospecimen', 'Clinical']
for dcat in associated_data_map[s]
lambda s: (
sample_type_map[s][0] not in ['10', '11', '12', '13', '14']
and any(
dcat not in ['Biospecimen', 'Clinical']
for dcat in associated_data_map[s]
)
)
)
xena_matrix = xena_matrix[sample_mask].set_index(
Expand Down

0 comments on commit 3a037e2

Please sign in to comment.