Commit
Merge pull request #25 from CMB-S4/data_convert
Data conversion tools
tskisner authored Aug 1, 2023
2 parents 9725175 + 9d6b284 commit b3be0ce
Showing 3 changed files with 461 additions and 0 deletions.
173 changes: 173 additions & 0 deletions dc1/noise_sim/compress_hdf5.py
@@ -0,0 +1,173 @@
#!/usr/bin/env python3

"""
This script loads v0 uncompressed TOAST observations and writes v1 compressed files.
"""

import os
import sys
import shutil
import re
import glob

import datetime

import argparse

import numpy as np

from astropy import units as u

import toast
import toast.ops

from toast.timing import gather_timers, dump, Timer

from toast.observation import default_values as defaults


def parse_arguments():
    """
    Defines and parses the arguments for the script.
    """
    parser = argparse.ArgumentParser(
        description="Compress CMB-S4 simulation data"
    )

    parser.add_argument(
        "--verify",
        required=False,
        action="store_true",
        default=False,
        help="Re-load the converted data and verify consistency",
    )

    parser.add_argument(
        "--obs",
        type=str,
        required=False,
        nargs="+",
        help="One or more observation files",
    )

    # The operators we want to configure from the command line or a parameter file.
    operators = list()

    # Parse all of the operator configuration
    config, args, jobargs = toast.parse_config(parser, operators=operators)

    return config, args, jobargs


def main():
    env = toast.utils.Environment.get()
    log = toast.utils.Logger.get()
    env.enable_function_timers()
    global_timer = toast.timing.GlobalTimers.get()
    global_timer.start("compress HDF5 (total)")

    config, args, jobargs = parse_arguments()

    # Default group size
    comm = toast.Comm()

    # Process each observation
    for obs_path in args.obs:
        obs_dir = os.path.dirname(obs_path)
        file_root = os.path.splitext(obs_path)[0]
        if comm.world_rank == 0:
            print(f"Working on {obs_path}:")
        backup = f"{file_root}_uncompressed.h5"
        timer = Timer()
        timer.start()
        obs = toast.io.load_hdf5(
            obs_path,
            comm,
            process_rows=comm.group_size,
            meta=None,
            detdata=None,
            shared=None,
            intervals=None,
            detectors=None,
            force_serial=False,
        )

        if comm.comm_world is not None:
            comm.comm_world.barrier()
        timer.stop()
        if comm.world_rank == 0:
            print(f" Load {obs_path} in {timer.seconds()} s", flush=True)

        if comm.world_rank == 0:
            os.rename(obs_path, backup)

        if comm.comm_world is not None:
            comm.comm_world.barrier()

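        # Write the observation back to its original path, compressing
        # detector signal with lossless FLAC and detector flags with gzip.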
        timer.start()
        obf = toast.io.save_hdf5(
            obs,
            obs_dir,
            meta=None,
            detdata=[
                (defaults.det_data, {"type": "flac"}),
                (defaults.det_flags, {"type": "gzip"}),
            ],
            shared=None,
            intervals=None,
            config=None,
            times=defaults.times,
            force_serial=False,
        )
        if comm.comm_world is not None:
            comm.comm_world.barrier()
        timer.stop()
        if comm.world_rank == 0:
            print(f" Save {obs_path} in {timer.seconds()} s", flush=True)

        if obf != obs_path:
            msg = f"Generated HDF5 ({obf}) does not match original "
            msg += f"file name ({obs_path})"
            raise RuntimeError(msg)

        if args.verify:
            timer.start()
            compare = toast.io.load_hdf5(
                obs_path,
                comm,
                process_rows=comm.group_size,
            )
            if comm.comm_world is not None:
                comm.comm_world.barrier()
            timer.stop()
            if comm.world_rank == 0:
                print(
                    f" Re-load {obs_path} for verification in {timer.seconds()} s",
                    flush=True,
                )

            if compare != obs:
                msg = "Observation HDF5 verify failed:\n"
                msg += f"Input = {obs}\n"
                msg += f"Loaded = {compare}"
                log.error(msg)
                raise RuntimeError(msg)
            elif comm.world_rank == 0:
                print(" Verification PASS", flush=True)
        else:
            if comm.world_rank == 0:
                print(" Skipping verification", flush=True)

    # Dump all the timing information to ./timing
    global_timer.stop("compress HDF5 (total)")
    alltimers = gather_timers(comm=comm.comm_world)
    if comm.world_rank == 0:
        out = os.path.join(".", "timing")
        dump(alltimers, out)


if __name__ == "__main__":
    world, procs, rank = toast.mpi.get_world()
    with toast.mpi.exception_guard(comm=world):
        main()
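
For reference, the converter can also be run by hand on a few observation files, outside of the batch script below. A minimal sketch, assuming a hypothetical observation path and whatever MPI launcher the system provides in place of mpirun:

    mpirun -np 4 python compress_hdf5.py --verify \
        --obs /path/to/RISING_SCAN_40-400-4.h5

Omitting --verify skips the re-load consistency check and saves one full read of each observation.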
68 changes: 68 additions & 0 deletions dc1/noise_sim/compress_hdf5.slurm
@@ -0,0 +1,68 @@
#!/bin/bash
#SBATCH --qos=regular
#SBATCH --time=05:00:00
#SBATCH --nodes=1
#SBATCH --job-name=CMBS4_todcompress
#SBATCH --licenses=SCRATCH
#SBATCH --constraint=cpu
#SBATCH --account=mp107

# set parent directory corresponding to one frequency band
PARENT_DIRECTORY="/global/cfs/cdirs/cmbs4/dc/dc0/staging/noise_sim/outputs_rk/LAT0_CHLAT/f090"

# set the range of observations (inclusive) to be compressed by this script
START_INDEX=1
END_INDEX=30

echo "Listing all observations in $PARENT_DIRECTORY"

# list all observations in parent directory and save to a variable
SUBDIR_LIST=$(find "$PARENT_DIRECTORY" -mindepth 1 -maxdepth 1 -type d | sort)
# extract observation names for printing to console
SUBDIR_NAMES=$(echo "$SUBDIR_LIST" | xargs -I{} basename {})
echo "Observations found: "
echo "$SUBDIR_NAMES"

echo "Proceeding to compress observations indexed in range: $START_INDEX-$END_INDEX"
# select subset of observations (subdirectories) based on range and save to a variable
SELECTED_SUBDIRS=$(echo "$SUBDIR_LIST" | sed -n "${START_INDEX},${END_INDEX}p")
# extract selected observation names for printing to console
SELECTED_SUBDIR_NAMES=$(echo "$SELECTED_SUBDIRS" | xargs -I{} basename {})
echo "Selected observations: "
echo "$SELECTED_SUBDIR_NAMES"

# loop through selected subdirectories and process each one
for subdir in $SELECTED_SUBDIRS; do
    echo "Processing observation: $(basename $subdir)"
    # search for files with the expected starting keywords: 'RISING' or 'SETTING'
    FILE_LIST=$(find "$subdir" -type f \( -name "RISING*" -o -name "SETTING*" \) -printf "%p ")
    # extract file names for printing to console
    echo "Files to compress: "
    for filename in $FILE_LIST; do
        echo $(basename $filename)
    done

    # call the compression script on the list of files
    date
    echo "Calling flac compression script ..."
    srun -n 128 python compress_hdf5.py --verify --obs $FILE_LIST > "log_$(basename $subdir).txt" 2>&1

    # if the python script ran without error, delete the backup files
    if [ $? -eq 0 ]; then
        echo "FLAC compression script ran successfully. Deleting backup files..."
        if find "$subdir" -type f -name "*uncompressed.h5" -delete; then
            echo "Backup files deleted successfully."
            date
        else
            # If deleting the backup files fails for some reason, stop everything
            # to avoid any risk of running out of disk space.
            echo "Error deleting backup files. Exiting loop over observations."
            date
            break
        fi
    else
        echo "FLAC compression script encountered an error. Not deleting any files."
        date
    fi
done

echo "Done processing observation batch $START_INDEX-$END_INDEX in band $(basename $PARENT_DIRECTORY). Please verify the log files."
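
Once PARENT_DIRECTORY, START_INDEX, and END_INDEX have been edited for the band in question, the job is submitted in the usual way:

    sbatch compress_hdf5.slurm

Each observation's output is redirected to its own log_<observation>.txt in the submission directory; these logs are worth checking before deleting anything else by hand.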