Skip to content

Commit

Permalink
Add files needed for figures in paper
Browse files Browse the repository at this point in the history
  • Loading branch information
robjmcgibbon committed Feb 3, 2023
1 parent 188e55b commit dab4be1
Show file tree
Hide file tree
Showing 28 changed files with 4,262 additions and 1 deletion.
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
# multi_epoch_2
# multi_epoch_2

To ignore/unignore config file
`$ git update-index --skip-worktree config.yaml`
`$ git update-index --no-skip-worktree config.yaml`

Python environment
`$ conda create --name btml python=3.9.2`
`$ conda activate btml`
`$ conda install pip`
`$ pip install -r requirements`

Place virgo credentials in virgo_credentials.yaml, set permissions
`$ chmod 600 virgo_credentials.yaml`
Ignore virgo_credentials file to avoid committing credentials
`$ git update-index --skip-worktree virgo_credentials.yaml`

| Property | Units |
|---------------------------|----------------------------------|
| Ages | Gyr |
| Mass | M<sub>⊙</sub> |
| Black hole accretion, SFR | M<sub>⊙</sub> / yr |
| Lengths | Mpc |
| Magnitudes | mag |
| Metallicity | M<sub>z</sub> / M<sub>tot</sub> |


108 changes: 108 additions & 0 deletions camels/calculate_fi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import os
import sys

import numpy as np
import pandas as pd
import sklearn.ensemble
import yaml

helpers_path = os.path.abspath(sys.path[0]+'/..')
sys.path.append(helpers_path)
import helpers
from helpers import log

base_dir = helpers.Config.get_base_dir()

snapshots = np.arange(6, 34, 3)

for sim in ['IllustrisTNG', 'SIMBA']:
log(f'Running for {sim}')
data_dir = f'{helpers.Config.get_base_dir()}generated/baryon_tree_ml/camels/{sim}/'
run_names = sorted(os.listdir(data_dir))
for run_name in run_names:
log(f'Calculating feature importance for {run_name}')
# TODO: Cut data so only tracked halos are used
histories = pd.read_pickle(f'{data_dir}{run_name}/histories.pickle')
histories = histories[histories[str(min(snapshots))+'stellar_mass'] != 0]

input_properties = ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
output_features = ['bh_mass', 'gas_mass', 'sfr', 'stellar_mass', 'stellar_metallicity']

hubble_constant = 0.6711
box_length = 25 / hubble_constant

# TODO: Set value for stellar mass
mask = histories['stellar_mass'] > 10**9
histories = histories[mask]
log(f'There are {histories.shape[0]} halos with stellar mass > 10**9')
min_snap = np.min(snapshots)
n_trackable = np.sum(histories[str(min_snap)+'dm_sub_mass'] != 0)
log(f'There are {n_trackable} halos which can be traced to snapshot {min_snap}')

for output_feature in ['bh_mass', 'gas_mass', 'sfr', 'stellar_metallicity']:
arr = histories[output_feature]
log(f'Fraction {output_feature} equal to zero: {np.sum(arr==0)/arr.shape[0]:.3g}')
min_nonzero = np.min(arr[arr != 0])
histories[output_feature] = np.maximum(arr, min_nonzero*np.ones_like(arr))

for output_feature in output_features:
histories['regr_'+output_feature] = np.log10(histories[output_feature])

input_columns = [str(snap)+prop for snap in snapshots for prop in input_properties]
regr_columns = ['regr_'+feat for feat in output_features]
output_columns = regr_columns + ['x', 'y', 'z']
data = histories[input_columns + output_columns]

output_features = {
'bh_mass': ['gas_mass', 'dm_sub_mass', 'stellar_mass'],
'gas_mass': ['bh_mass', 'dm_sub_mass', 'stellar_mass'],
'sfr': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass'],
'stellar_mass': ['bh_mass', 'dm_sub_mass', 'gas_mass'],
'stellar_metallicity': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
}
feature_importance_dir = f'{data_dir}{run_name}/feature_importance/'
if not os.path.exists(feature_importance_dir):
os.makedirs(feature_importance_dir)
with open(feature_importance_dir+'output_features.yaml', 'w') as yaml_file:
yaml.dump(output_features, yaml_file)
np.save(feature_importance_dir+'snapshots.npy', snapshots)

for output_feature, input_properties in output_features.items():
input_features = [str(snap) + prop for snap in snapshots for prop in input_properties]

n_dataset = 10
n_estimators = 100
n_process = 5
max_depth = 12
frac = 0.7

importances = np.zeros((n_dataset, len(input_features)))
for i_dataset in range(n_dataset):

train_box_length = np.cbrt(frac) * box_length

shift = np.random.uniform(low=0, high=box_length, size=3)
pos = np.array(data[['x', 'y', 'z']])
pos += shift
pos %= box_length

train_mask = np.all(pos < train_box_length, axis=1)
train_data = data[train_mask]

X_train = train_data[input_features]
y_train = train_data['regr_'+output_feature]

regr = sklearn.ensemble.ExtraTreesRegressor(n_estimators=n_estimators,
n_jobs=n_process,
max_depth=max_depth)
regr.fit(X_train, y_train)

importances[i_dataset] = regr.feature_importances_

mean_importance = np.mean(importances, axis=0)
sem_importance = 1.96 * np.std(importances, axis=0) / np.sqrt(n_dataset)

np.save(f'{feature_importance_dir}{output_feature}_mean_importance.npy', mean_importance)
np.save(f'{feature_importance_dir}{output_feature}_sem_importance.npy', sem_importance)

log('Script finished')
82 changes: 82 additions & 0 deletions camels/component_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import sys

import joblib
import matplotlib.cm
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import sklearn.ensemble
import yaml

helpers_path = os.path.abspath(sys.path[0]+'/..')
sys.path.append(helpers_path)
import helpers
from helpers import log

base_dir = helpers.Config.get_base_dir()

snapshots = np.arange(6, 34, 3)
output_features = {
# 'bh_mass': ['gas_mass', 'dm_sub_mass', 'stellar_mass'],
# 'gas_mass': ['bh_mass', 'dm_sub_mass', 'stellar_mass'],
# 'sfr': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass'],
'stellar_mass': ['bh_mass', 'dm_sub_mass', 'gas_mass'],
# 'stellar_metallicity': ['bh_mass', 'dm_sub_mass', 'gas_mass', 'stellar_mass']
}

for output_feature, input_properties in output_features.items():
pca_model_filepath = f'/home/rmcg/camels_pca/IllustrisTNG/{output_feature}/pca_model.joblib'
pca = joblib.load(pca_model_filepath)

input_features = [str(snap) + prop for snap in snapshots for prop in input_properties]
fig, axs = plt.subplots(nrows=4, figsize=(5, 12), sharex=True)
fig.subplots_adjust(hspace=0)
for i_component, (component_name, component_values, label, yticks) in enumerate([
('mean', pca.mean_ / np.max(pca.mean_), 'Mean', [0, 0.2, 0.4, 0.6, 0.8, 1]), # Note that mean is normalised
('component_1', pca.components_[0], 'Component 1', [-0.2, 0, 0.2, 0.4]),
('component_2', pca.components_[1], 'Component 2', [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3]),
('component_3', pca.components_[2], 'Component 3', [-0.2, 0, 0.2, 0.4, 0.6]),
]):
ax = axs[i_component]
for i_prop, input_property in enumerate(input_properties):
# TODO: Could estimate std by bootstrapping, running PCA on 90% of camels simulations
mean_values, sem_values = [], []
for snap in snapshots:
idx = input_features.index(str(snap)+input_property)
mean_values.append(component_values[idx])
mean_values, sem_values = np.array(mean_values), np.array(sem_values)

p = ax.plot(snapshots, mean_values, '-o',
markersize=2, color=helpers.Config.get_color(input_property))[0]
if i_component == i_prop:
ax.legend(handles=[p], labels=[helpers.Config.get_proper_name(input_property, False)],
loc='upper right', fontsize=12)

padding = 0.015 * (np.max(snapshots) - np.min(snapshots))
ax.set_xlim([np.min(snapshots)-padding, np.max(snapshots)+padding])
xticks = np.linspace(np.min(snapshots), np.max(snapshots), 6, dtype=int)
xticks = np.round(xticks, 1)
ax.set_xticks(xticks)

ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1]*1.25)
ax.set_yticks(yticks)

ax.text(0.05, 0.88, label, fontsize=12, transform=ax.transAxes)

axs[0].set_ylabel('Feature importance')
for ax in axs[1:]:
ax.set_ylabel('Difference in\nfeature importance')

ax2 = axs[0].twiny()
ax2.set_xlim(axs[0].get_xlim())
ax2.set_xticks(axs[0].get_xticks())
ax2.set_xticklabels([2.63, 1.86, 1.25, 0.69, 0.33, 0])
ax2.set_xlabel('z')

axs[-1].set_xlabel('Snapshot')

plt.savefig('/home/rmcg/test.pdf', dpi=450, bbox_inches='tight')
plt.close()

log('Script finished')
54 changes: 54 additions & 0 deletions camels/download_data
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash

base_dir="/disk01/rmcg/"
#base_dir="/home/rmcg/data/"
for sim in "IllustrisTNG" "SIMBA"; do
for i in {0..999}; do
mkdir -p "${base_dir}downloaded/camels/${sim}/LH_${i}"
cd "${base_dir}downloaded/camels/${sim}/LH_${i}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/LH_${i}/trees/tree_0_0_0.dat"
for j in {000..033}; do
wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/LH_${i}/fof_subhalo_tab_${j}.hdf5"
done
mkdir -p "${base_dir}downloaded/camels/${sim}_DM/LH_${i}"
cd "${base_dir}downloaded/camels/${sim}_DM/LH_${i}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/LH_${i}/trees/tree_0_0_0.dat"
done
for i in {1..6}; do
# 1P_{param}_0 is not available for IllustrisTNG with param != 1. I need to delete the folders
for run in "n5" "n4" "n3" "n2" "n1" "0" "1" "2" "3" "4" "5"; do
mkdir -p "${base_dir}downloaded/camels/${sim}/1P_${i}_${run}"
cd "${base_dir}downloaded/camels/${sim}/1P_${i}_${run}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/1P_${i}_${run}/trees/tree_0_0_0.dat"
for j in {000..033}; do
wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/1P_${i}_${run}/fof_subhalo_tab_${j}.hdf5"
done
mkdir -p "${base_dir}downloaded/camels/${sim}_DM/1P_${i}_${run}"
cd "${base_dir}downloaded/camels/${sim}_DM/1P_${i}_${run}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/1P_${i}_${run}/trees/tree_0_0_0.dat"
done
done
for i in {0..26}; do
mkdir -p "${base_dir}downloaded/camels/${sim}/CV_${i}"
cd "${base_dir}downloaded/camels/${sim}/CV_${i}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}/CV_${i}/trees/tree_0_0_0.dat"
for j in {000..033}; do
wget -N "https://users.flatironinstitute.org/~camels/FOF_Subfind/${sim}/CV_${i}/fof_subhalo_tab_${j}.hdf5"
done
mkdir -p "${base_dir}downloaded/camels/${sim}_DM/CV_${i}"
cd "${base_dir}downloaded/camels/${sim}_DM/CV_${i}"
wget -N "https://users.flatironinstitute.org/~camels/Rockstar/${sim}_DM/CV_${i}/trees/tree_0_0_0.dat"
done
cd "${base_dir}downloaded/camels"
wget "https://users.flatironinstitute.org/~camels/Sims/${sim}/CosmoAstroSeed_params.txt"
j=0
for i in {1..6}; do
for run in "n5" "n4" "n3" "n2" "n1" "0" "1" "2" "3" "4" "5"; do
sed -i "s/1P_${j} /1P_${i}_${run} /" CosmoAstroSeed_params.txt
j=$((j + 1))
done
done
grep -v 'EX' CosmoAstroSeed_params.txt > "${sim}_params.txt"
rm CosmoAstroSeed_params.txt
done
# SIMBA run LH_196 has zero gas mass after I extract it. I think it may be corrupted. I've deleted it
Loading

0 comments on commit dab4be1

Please sign in to comment.