contrib/MedicalSeg/tools/prepare_abdomen.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The file structure is as follows:
abdomen
|--RawData.zip
|--abdomen_raw
│   ├── RawData
│   │   ├──RawData
│   │   │   ├── Training
│   │   │   │   ├── img
│   │   │   │   │   ├── img0001.nii.gz
│   │   │   │   │   └── ...
│   │   │   │   └── ...
│   │   │   │   ├── label
│   │   │   │   │   ├── label0001.nii.gz
│   │   │   │   │   └── ...
│   │   │   │   └── ...
├── abdomen_phase0
│   ├── images
│   │   ├── img0001-0001.npy
│   │   └── ...
│   ├── labels
│   │   ├── label0001-0001.npy
│   │   └── ...
│   ├── train_list.txt
│   └── val_list.txt
support:
1. download and uncompress the file.
2. save the data as the above format.
3. split the training data and save the split result in train_list.txt and val_list.txt

"""
import os
import sys

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))

import os.path as osp
import time
import json

import numpy as np
from tqdm import tqdm

from prepare import Prep
from preprocess_utils import HUnorm, ignore_label
from medicalseg.utils import wrapped_partial

# Archive file name -> download URL, handed to Prep via Prep_abdomen.__init__.
# The URL is an empty string here.
# NOTE(review): presumably the archive has to be obtained manually -- confirm.
urls = {"Reg-Training-Training.zip": ""}

# Raw label id (as a string key) -> label id used in the prepared dataset.
# Raw ids 5, 9, 10, 12 and 13 are mapped to 0; the remaining ids are
# renumbered to the contiguous range 1..8. The names of the resulting classes
# are listed in the `labels` argument of generate_dataset_json at the bottom
# of this file. The map is passed to `ignore_label` as `label_map`.
# NOTE(review): exact remapping semantics live in preprocess_utils.ignore_label.
label_map = {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 0,
    "6": 5,
    "7": 6,
    "8": 7,
    "9": 0,
    "10": 0,
    "11": 8,
    "12": 0,
    "13": 0,
}


class Prep_abdomen(Prep):
    """Preparation pipeline for the abdomen CT dataset.

    The class configures the generic ``Prep`` base with the abdomen directory
    layout, splits the raw volumes into a training and a validation subset,
    converts them to ``.npy`` files and writes ``train_list.txt`` /
    ``val_list.txt``.

    Attributes such as ``image_files``, ``label_files``, ``image_path``,
    ``label_path``, ``phase_path`` and ``gpu_tag`` are not defined here; they
    are expected to be provided by ``Prep`` (assumption -- see the base class).
    """

    def __init__(self):
        """Configure paths and preprocessing, then split into train/val."""
        super().__init__(
            dataset_root="data/abdomen",
            raw_dataset_dir="abdomen_raw/",
            images_dir="RawData/RawData/Training/img",
            labels_dir="RawData/RawData/Training/label",
            phase_dir="abdomen_phase0/",
            urls=urls,
            valid_suffix=("nii.gz", "nii.gz"),
            filter_key=(None, None),
            uncompress_params={"format": "zip",
                               "num_files": 1})

        # Operations applied, in order, to every image / label array.
        # Images are clipped to [-125, 275] and then passed through HUnorm
        # with the same bounds; labels are remapped through `label_map`.
        self.preprocess = {
            "images": [
                wrapped_partial(
                    np.clip, a_min=-125, a_max=275), wrapped_partial(
                        HUnorm, HU_min=-125, HU_max=275, multiply_255=False)
            ],
            "labels": [wrapped_partial(
                ignore_label, label_map=label_map)]
        }

        # Raw files per split; filled by train_val_split().
        self.train_image_files = []
        self.val_image_files = []
        self.train_label_files = []
        self.val_label_files = []

        # Names of the saved .npy files per split; filled by load_save().
        self.train_image_files_npy = []
        self.val_image_files_npy = []
        self.train_label_files_npy = []
        self.val_label_files_npy = []
        self.train_val_split()

    def load_save(self, mode='train'):
        """Preprocess the raw files of one split and save them as ``.npy``.

        Each file is loaded with ``Prep.load_medical_data`` (its first
        returned element is used), its axes are reordered with
        ``transpose(2, 0, 1)`` and the configured preprocessing operations
        are applied.

        * ``mode == 'train'``: every slice along the new first axis is
          preprocessed and saved on its own as ``<stem>-<index>.npy`` with a
          zero-padded 4-digit index.
        * any other ``mode``: the validation files are used and every volume
          is preprocessed and saved as a whole as ``<stem>.npy``.

        The saved file names are appended to the matching ``*_files_npy``
        lists so that ``generate_txt`` can write them out afterwards.

        Args:
            mode: ``'train'`` for the training split; every other value
                selects the validation split.
        """
        print(
            "Start convert {} images to numpy array using {}, please wait patiently"
            .format(mode, self.gpu_tag))

        tic = time.time()
        is_train = mode == 'train'
        if is_train:
            process_files = (self.train_image_files, self.train_label_files)
            target_files = (self.train_image_files_npy,
                            self.train_label_files_npy)
        else:
            process_files = (self.val_image_files, self.val_label_files)
            target_files = (self.val_image_files_npy, self.val_label_files_npy)
        process_tuple = ("images", "labels")
        save_tuple = (self.image_path, self.label_path)

        for kind, files, savepath, saved_names in zip(
                process_tuple, process_files, save_tuple, target_files):
            pre = self.preprocess[kind]

            for f in tqdm(
                    files,
                    total=len(files),
                    desc="preprocessing the {}".format(kind)):

                f_nps = Prep.load_medical_data(f)[0]
                # xyz to zxy
                f_nps = f_nps.transpose(2, 0, 1)
                # File name without any extension, e.g. "img0001".
                stem = osp.basename(f).split(".")[0]

                if is_train:
                    # Training data is stored slice by slice.
                    for volume_idx, f_np in enumerate(f_nps):
                        for op in pre:
                            f_np = op(f_np)
                        filename = stem + f"-{volume_idx:>04d}.npy"
                        np.save(os.path.join(savepath, filename), f_np)
                        saved_names.append(filename)
                else:
                    # Validation data is stored as one whole volume per file.
                    for op in pre:
                        f_nps = op(f_nps)
                    filename = stem + ".npy"
                    np.save(os.path.join(savepath, filename), f_nps)
                    saved_names.append(filename)

        print("The preprocess time on {} is {}".format(self.gpu_tag,
                                                       time.time() - tic))

    def generate_txt(self, train_split=0.6):
        """Write ``train_list.txt`` and ``val_list.txt`` into ``phase_path``.

        The lists contain the file names collected by ``load_save``, so
        ``load_save`` has to be run for both splits beforehand.

        Args:
            train_split: Unused here; the split is decided in
                ``train_val_split``. Kept so existing callers keep working.
        """
        train_txt = os.path.join(self.phase_path, 'train_list.txt')
        val_txt = os.path.join(self.phase_path, 'val_list.txt')
        self.write_txt(train_txt, self.train_image_files_npy,
                       self.train_label_files_npy)
        self.write_txt(val_txt, self.val_image_files_npy,
                       self.val_label_files_npy)

    def train_val_split(self, train_split=0.6):
        """Shuffle the image/label pairs and split them into train and val.

        A single permutation drawn with a fixed seed is applied to both
        ``self.image_files`` and ``self.label_files`` so that the i-th image
        stays paired with the i-th label and the split is reproducible.

        Args:
            train_split: Fraction of the cases assigned to the training
                split; the remainder goes to the validation split.

        Raises:
            ValueError: If the number of images and labels differs, since the
                files could then not be paired by position.
        """
        num_files = len(self.image_files)
        if num_files != len(self.label_files):
            raise ValueError(
                "Number of images ({}) and labels ({}) does not match.".format(
                    num_files, len(self.label_files)))

        # The previous implementation shuffled temporary copies and then
        # sliced the unshuffled lists, so the shuffle had no effect. Index
        # both lists with the same permutation instead.
        np.random.seed(0)
        order = np.random.permutation(num_files)
        image_files = [self.image_files[idx] for idx in order]
        label_files = [self.label_files[idx] for idx in order]

        train_len = round(num_files * train_split)
        self.train_image_files = image_files[:train_len]
        self.val_image_files = image_files[train_len:]
        self.train_label_files = label_files[:train_len]
        self.val_label_files = label_files[train_len:]


if __name__ == "__main__":
    # Label ids written to the dataset description and their names.
    class_names = {
        0: 'background',
        1: 'spleen',
        2: 'right kidney',
        3: 'left kidney',
        4: 'gallbladder',
        5: 'liver',
        6: 'stomach',
        7: 'aorta',
        8: 'pancreas'
    }
    description = "Under Institutional Review Board (IRB) supervision, 50 abdomen CT scans of were randomly selected from a combination of an ongoing colorectal cancer chemotherapy trial, and a retrospective ventral hernia study."

    prep = Prep_abdomen()

    # Step 1: write the dataset description.
    prep.generate_dataset_json(
        modalities=('CT', ),
        labels=class_names,
        dataset_name="Abdomen CT scans",
        dataset_description=description,
        license_desc="https://creativecommons.org/licenses/by/4.0/legalcode",
        dataset_reference="https://www.synapse.org/#!Synapse:syn3193805/wiki/89480",
    )

    # Step 2: convert the training split first, then the validation split.
    for split in ('train', 'val'):
        prep.load_save(mode=split)

    # Step 3: write train_list.txt / val_list.txt from the converted files.
    prep.generate_txt()