Add files needed for figures in paper

robjmcgibbon · Feb 3, 2023 · dab4be1 · dab4be1
1 parent 188e55b
commit dab4be1
Show file tree

Hide file tree

Showing 28 changed files with 4,262 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1 +1,27 @@
-# multi_epoch_2
+# multi_epoch_2
+
+To ignore/unignore config file
+`$ git update-index --skip-worktree config.yaml`
+`$ git update-index --no-skip-worktree config.yaml`
+
+Python environment 
+`$ conda create --name btml python=3.9.2`
+`$ conda activate btml`
+`$ conda install pip`
+`$ pip install -r requirements`
+
+Place virgo credentials in virgo_credentials.yaml, set permissions
+`$ chmod 600 virgo_credentials.yaml`
+Ignore virgo_credentials file to avoid committing credentials
+`$ git update-index --skip-worktree virgo_credentials.yaml`
+
+| Property                  | Units                            |
+|---------------------------|----------------------------------|
+| Ages                      | Gyr                              |
+| Mass                      | M<sub>⊙</sub>                    |
+| Black hole accretion, SFR | M<sub>⊙</sub> / yr               |
+| Lengths                   | Mpc                              |
+| Magnitudes                | mag                              |
+| Metallicity               | M<sub>z</sub> / M<sub>tot</sub>  |
+
+
diff --git a/camels/calculate_fi.py b/camels/calculate_fi.py
@@ -0,0 +1,108 @@
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+import sklearn.ensemble
+import yaml
+
+helpers_path = os.path.abspath(sys.path[0]+'/..')
+sys.path.append(helpers_path)
+import helpers
+from helpers import log
+
+base_dir = helpers.Config.get_base_dir()
+
+snapshots = np.arange(6, 34, 3)
+
+for sim in ['IllustrisTNG', 'SIMBA']:
+    log(f'Running for {sim}')
+    data_dir = f'{helpers.Config.get_base_dir()}generated/baryon_tree_ml/camels/{sim}/'
+    run_names = sorted(os.listdir(data_dir))
+    for run_name in run_names:
+        log(f'Calculating feature importance for {run_name}')
+        # TODO: Cut data so only tracked halos are used
+        histories = pd.read_pickle(f'{data_dir}{run_name}/histories.pickle')
+        histories = histories[histories[str(min(snapshots))+'stellar_mass'] != 0]
+
+        input_properties = ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
+        output_features = ['bh_mass', 'gas_mass', 'sfr', 'stellar_mass', 'stellar_metallicity']
+
+        hubble_constant = 0.6711
+        box_length = 25 / hubble_constant
+
+        # TODO: Set value for stellar mass
+        mask = histories['stellar_mass'] > 10**9
+        histories = histories[mask]
+        log(f'There are {histories.shape[0]} halos with stellar mass > 10**9')
+        min_snap = np.min(snapshots)
+        n_trackable = np.sum(histories[str(min_snap)+'dm_sub_mass'] != 0)
+        log(f'There are {n_trackable} halos which can be traced to snapshot {min_snap}')
+
+        for output_feature in ['bh_mass', 'gas_mass', 'sfr', 'stellar_metallicity']:
+            arr = histories[output_feature]
+            log(f'Fraction {output_feature} equal to zero: {np.sum(arr==0)/arr.shape[0]:.3g}')
+            min_nonzero = np.min(arr[arr != 0])
+            histories[output_feature] = np.maximum(arr, min_nonzero*np.ones_like(arr))
+
+        for output_feature in output_features:
+            histories['regr_'+output_feature] = np.log10(histories[output_feature])
+
+        input_columns = [str(snap)+prop for snap in snapshots for prop in input_properties]
+        regr_columns = ['regr_'+feat for feat in output_features]
+        output_columns = regr_columns + ['x', 'y', 'z']
+        data = histories[input_columns + output_columns]
+
+        output_features = {
+            'bh_mass': ['gas_mass', 'dm_sub_mass', 'stellar_mass'],
+            'gas_mass': ['bh_mass', 'dm_sub_mass', 'stellar_mass'],
+            'sfr': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass'],
+            'stellar_mass': ['bh_mass', 'dm_sub_mass', 'gas_mass'],
+            'stellar_metallicity': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
+        }
+        feature_importance_dir = f'{data_dir}{run_name}/feature_importance/'
+        if not os.path.exists(feature_importance_dir):
+            os.makedirs(feature_importance_dir)
+        with open(feature_importance_dir+'output_features.yaml', 'w') as yaml_file:
+            yaml.dump(output_features, yaml_file)
+        np.save(feature_importance_dir+'snapshots.npy', snapshots)
+
+        for output_feature, input_properties in output_features.items():
+            input_features = [str(snap) + prop for snap in snapshots for prop in input_properties]
+
+            n_dataset = 10
+            n_estimators = 100
+            n_process = 5
+            max_depth = 12
+            frac = 0.7
+
+            importances = np.zeros((n_dataset, len(input_features)))
+            for i_dataset in range(n_dataset):
+
+                train_box_length = np.cbrt(frac) * box_length
+
+                shift = np.random.uniform(low=0, high=box_length, size=3)
+                pos = np.array(data[['x', 'y', 'z']])
+                pos += shift
+                pos %= box_length
+
+                train_mask = np.all(pos < train_box_length, axis=1)
+                train_data = data[train_mask]
+
+                X_train = train_data[input_features]
+                y_train = train_data['regr_'+output_feature]
+
+                regr = sklearn.ensemble.ExtraTreesRegressor(n_estimators=n_estimators,
+                                                            n_jobs=n_process,
+                                                            max_depth=max_depth)
+                regr.fit(X_train, y_train)
+
+                importances[i_dataset] = regr.feature_importances_
+
+            mean_importance = np.mean(importances, axis=0)
+            sem_importance = 1.96 * np.std(importances, axis=0) / np.sqrt(n_dataset)
+
+            np.save(f'{feature_importance_dir}{output_feature}_mean_importance.npy', mean_importance)
+            np.save(f'{feature_importance_dir}{output_feature}_sem_importance.npy', sem_importance)
+
+log('Script finished')
diff --git a/camels/component_plot.py b/camels/component_plot.py
@@ -0,0 +1,82 @@
+import os
+import sys
+
+import joblib
+import matplotlib.cm
+import matplotlib.colors
+import matplotlib.pyplot as plt
+import numpy as np
+import sklearn.ensemble
+import yaml
+
+helpers_path = os.path.abspath(sys.path[0]+'/..')
+sys.path.append(helpers_path)
+import helpers
+from helpers import log
+
+base_dir = helpers.Config.get_base_dir()
+
+snapshots = np.arange(6, 34, 3)
+output_features = {
+    # 'bh_mass': ['gas_mass', 'dm_sub_mass', 'stellar_mass'],
+    # 'gas_mass': ['bh_mass', 'dm_sub_mass', 'stellar_mass'],
+    # 'sfr': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass'],
+    'stellar_mass': ['bh_mass', 'dm_sub_mass', 'gas_mass'],
+    # 'stellar_metallicity': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
+}
+
+for output_feature, input_properties in output_features.items():
+    pca_model_filepath = f'/home/rmcg/camels_pca/IllustrisTNG/{output_feature}/pca_model.joblib'
+    pca = joblib.load(pca_model_filepath)
+
+    input_features = [str(snap) + prop for snap in snapshots for prop in input_properties]
+    fig, axs = plt.subplots(nrows=4, figsize=(5, 12), sharex=True)
+    fig.subplots_adjust(hspace=0)
+    for i_component, (component_name, component_values, label, yticks) in enumerate([
+        ('mean', pca.mean_ / np.max(pca.mean_), 'Mean', [0, 0.2, 0.4, 0.6, 0.8, 1]),  # Note that mean is normalised
+        ('component_1', pca.components_[0], 'Component 1', [-0.2, 0, 0.2, 0.4]),
+        ('component_2', pca.components_[1], 'Component 2', [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]),
+        ('component_3', pca.components_[2], 'Component 3', [-0.2, 0, 0.2, 0.4, 0.6]),
+        ]):
+        ax = axs[i_component]
+        for i_prop, input_property in enumerate(input_properties):
+            # TODO: Could estimate std by bootstrapping, running PCA on 90% of camels simulations
+            mean_values, sem_values = [], []
+            for snap in snapshots:
+                idx = input_features.index(str(snap)+input_property)
+                mean_values.append(component_values[idx])
+            mean_values, sem_values = np.array(mean_values), np.array(sem_values)
+
+            p = ax.plot(snapshots, mean_values, '-o',
+                    markersize=2, color=helpers.Config.get_color(input_property))[0]
+            if i_component == i_prop:
+                ax.legend(handles=[p], labels=[helpers.Config.get_proper_name(input_property, False)],
+                          loc='upper right', fontsize=12)
+
+        padding = 0.015 * (np.max(snapshots) - np.min(snapshots))
+        ax.set_xlim([np.min(snapshots)-padding, np.max(snapshots)+padding])
+        xticks = np.linspace(np.min(snapshots), np.max(snapshots), 6, dtype=int)
+        xticks = np.round(xticks, 1)
+        ax.set_xticks(xticks)
+
+        ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1]*1.25)
+        ax.set_yticks(yticks)
+
+        ax.text(0.05, 0.88, label, fontsize=12, transform=ax.transAxes)
+
+    axs[0].set_ylabel('Feature importance')
+    for ax in axs[1:]:
+        ax.set_ylabel('Difference in\nfeature importance')
+
+    ax2 = axs[0].twiny()
+    ax2.set_xlim(axs[0].get_xlim())
+    ax2.set_xticks(axs[0].get_xticks())
+    ax2.set_xticklabels([2.63, 1.86, 1.25, 0.69, 0.33, 0])
+    ax2.set_xlabel('z')
+
+    axs[-1].set_xlabel('Snapshot')
+
+    plt.savefig('/home/rmcg/test.pdf', dpi=450, bbox_inches='tight')
+    plt.close()
+
+log('Script finished')
diff --git a/camels/download_data b/camels/download_data
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+base_dir="/disk01/rmcg/"
+#base_dir="/home/rmcg/data/"
+for sim in "IllustrisTNG" "SIMBA"; do
+    for i in {0..999}; do
+        mkdir -p "${base_dir}downloaded/camels/${sim}/LH_${i}"
+        cd "${base_dir}downloaded/camels/${sim}/LH_${i}"
+        wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/LH_${i}/trees/tree_0_0_0.dat"
+        for j in {000..033}; do
+            wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/LH_${i}/fof_subhalo_tab_${j}.hdf5"
+        done
+        mkdir -p "${base_dir}downloaded/camels/${sim}_DM/LH_${i}"
+        cd "${base_dir}downloaded/camels/${sim}_DM/LH_${i}"
+        wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/LH_${i}/trees/tree_0_0_0.dat"
+    done
+    for i in {1..6}; do
+        # 1P_{param}_0 is not available for IllustrisTNG with param != 1. I need to delete the folders
+        for run in "n5" "n4" "n3" "n2" "n1" "0" "1" "2" "3" "4" "5"; do
+            mkdir -p "${base_dir}downloaded/camels/${sim}/1P_${i}_${run}"
+            cd "${base_dir}downloaded/camels/${sim}/1P_${i}_${run}"
+            wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/1P_${i}_${run}/trees/tree_0_0_0.dat"
+            for j in {000..033}; do
+                wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/1P_${i}_${run}/fof_subhalo_tab_${j}.hdf5"
+            done
+            mkdir -p "${base_dir}downloaded/camels/${sim}_DM/1P_${i}_${run}"
+            cd "${base_dir}downloaded/camels/${sim}_DM/1P_${i}_${run}"
+            wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/1P_${i}_${run}/trees/tree_0_0_0.dat"
+        done
+    done
+    for i in {0..26}; do
+        mkdir -p "${base_dir}downloaded/camels/${sim}/CV_${i}"
+        cd "${base_dir}downloaded/camels/${sim}/CV_${i}"
+        wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/CV_${i}/trees/tree_0_0_0.dat"
+        for j in {000..033}; do
+            wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/CV_${i}/fof_subhalo_tab_${j}.hdf5"
+        done
+        mkdir -p "${base_dir}downloaded/camels/${sim}_DM/CV_${i}"
+        cd "${base_dir}downloaded/camels/${sim}_DM/CV_${i}"
+        wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/CV_${i}/trees/tree_0_0_0.dat"
+    done
+    cd "${base_dir}downloaded/camels"
+    wget "https://users.flatironinstitute.org/~camels/Sims/${sim}/CosmoAstroSeed_params.txt"
+    j=0
+    for i in {1..6}; do
+        for run in "n5" "n4" "n3" "n2" "n1" "0" "1" "2" "3" "4" "5"; do
+            sed -i "s/1P_${j} /1P_${i}_${run} /" CosmoAstroSeed_params.txt
+            j=$((j + 1))
+        done
+    done
+    grep -v 'EX' CosmoAstroSeed_params.txt > "${sim}_params.txt"
+    rm CosmoAstroSeed_params.txt
+done
+# SIMBA run LH_196 has zero gas mass after I extract it. I think it may be corrupted. I've deleted it