From 89922841dad161e1cd33779db8b6d53a2fe02672 Mon Sep 17 00:00:00 2001 From: Miroslav Stoyanov Date: Wed, 12 Feb 2025 12:04:07 -0500 Subject: [PATCH 1/2] updated the help text --- benchmarks/speed3d.h | 67 ++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/benchmarks/speed3d.h b/benchmarks/speed3d.h index 742bd81..4df16fc 100644 --- a/benchmarks/speed3d.h +++ b/benchmarks/speed3d.h @@ -322,42 +322,49 @@ int main(int argc, char *argv[]){ if (argc < 6){ if (mpi::world_rank(0)){ - cout << "\nUsage:\n mpirun -np x " << bench_executable << " \n\n" - << " options\n" - << " backend is the 1-D FFT library\n" - << " available options for this build: " << backends << "\n" - << " precision is either float or double\n" - << " use float-long or double-long to enable 64-bit indexing\n" - << " size-x/y/z are the 3D array dimensions \n\n" - << " args is a set of optional arguments that define algorithmic tweaks and variations\n" - << " -reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data\n" - << " -no-reorder: some of the 1-D will be strided (non contiguous)\n" - << " -a2a: use MPI_Alltoall() communication method\n" - << " -a2av: use MPI_Alltoallv() communication method (default)\n" - << " -p2p: use MPI_Send() and MPI_Irecv() communication methods\n" - << " -p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods\n" - << " -no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only)\n" - << " -pencils: use pencil reshape logic\n" - << " -slabs: use slab reshape logic\n" - << " -io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries \n" - << " -ingrid x y z: specifies the processor grid to use in the input, x y z must be integers \n" - << " -outgrid x y z: specifies the processor grid to use in the output, x y z must be integers \n" - << " -subcomm num_ranks: specifies the number of ranks to use in intermediate 
reshapes\n" - << " -batch batch_size: specifies the size of the batch to use in the benchmark\n" - << " -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2 \n" - << " -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices\n" - << " -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50\n" - #ifdef BENCH_R2R - << "Examples:\n" + cout << "\nUsage:\n mpirun -np x " << bench_executable << " \n"; + cout << R"help1( + options + is the 1-D FFT library +)help1"; + cout << " available backends for this build: " << backends; + cout << R"help2( + is either float or double + use float-long or double-long to enable 64-bit indexing + are the 3D array dimensions + + is a set of optional arguments that define algorithmic tweaks and variations + -reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data + -no-reorder: some of the 1-D will be strided (non contiguous) + -a2a: use MPI_Alltoall() communication method + -a2av: use MPI_Alltoallv() communication method (default) + -p2p: use MPI_Send() and MPI_Irecv() communication methods + -p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods + -no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only) + -pencils: use pencil reshape logic + -slabs: use slab reshape logic + -io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries + -ingrid x y z: specifies the processor grid to use in the input, x y z must be integers + -outgrid x y z: specifies the processor grid to use in the output, x y z must be integers + -subcomm num_ranks: specifies the number of ranks to use in intermediate reshapes + -batch batch_size: specifies the size of the batch to use in the benchmark + -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2 + -mps: for the cufft backend and multiple gpus, associate the mpi 
ranks with different cuda devices + -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50 + +)help2"; + + #ifdef BENCH_R2R + cout << "Examples:\n" << " mpirun -np 4 " << bench_executable << " fftw-cos double 128 128 128 -p2p\n" << " mpirun -np 8 " << bench_executable << " cufft-cos float 256 256 256\n" << " mpirun -np 12 " << bench_executable << " fftw-sin double 512 512 512 -slabs\n\n"; - #else - << "Examples:\n" + #else + cout << "Examples:\n" << " mpirun -np 4 " << bench_executable << " fftw double 128 128 128 -no-reorder\n" << " mpirun -np 8 " << bench_executable << " cufft float 256 256 256\n" << " mpirun -np 12 " << bench_executable << " fftw double 512 512 512 -p2p -slabs\n\n"; - #endif + #endif } MPI_Finalize(); From fd6cf4cb585757475922dd7e3b42274859e3f10b Mon Sep 17 00:00:00 2001 From: Miroslav Stoyanov Date: Wed, 12 Feb 2025 12:50:07 -0500 Subject: [PATCH 2/2] added option to perform arbitrary number of runs --- benchmarks/speed3d.h | 15 ++++++++++++--- test/test_common.h | 27 +++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/benchmarks/speed3d.h b/benchmarks/speed3d.h index 4df16fc..b57d0b5 100644 --- a/benchmarks/speed3d.h +++ b/benchmarks/speed3d.h @@ -215,9 +215,16 @@ void benchmark_fft(std::array size_fft, std::deque const &ar // Print results if(me==0){ - t_max = t_max / (2.0 * ntest); - double const fftsize = static_cast(world.count()); - double const floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max; + double const fftsize = static_cast(world.count()); + + double floprate = 0.0; + if (ntest > 0) { // something was tested + t_max /= (2.0 * ntest); // time per test, 2 transforms forward/backward + floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max; + } else { + t_max = 0.0; // nothing was tested + } + long long mem_usage = static_cast(fft.size_inbox()) + static_cast(fft.size_outbox()) + 
static_cast(fft.size_workspace()); mem_usage *= sizeof(output_type); @@ -232,6 +239,7 @@ void benchmark_fft(std::array size_fft, std::deque const &ar for(int i=0; i<5; i++) print_proc_grid(i); cout << "\n"; + cout << "Num runs: " << ntest << '\n'; cout << "Time per run: " << t_max << " (s)\n"; cout << "Performance: " << floprate << " GFlops/s\n"; cout << "Memory usage: " << mem_usage << "MB/rank\n"; @@ -351,6 +359,7 @@ int main(int argc, char *argv[]){ -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2 -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50 + -nrunsXYZ: same as -n but allows for a custom number, XYZ must be a non-negative integer, e.g., -nruns17 )help2"; diff --git a/test/test_common.h b/test/test_common.h index b8345e6..c314d42 100644 --- a/test/test_common.h +++ b/test/test_common.h @@ -356,7 +356,7 @@ bool has_option(std::deque const &args, std::string const &opt){ return false; } //! \brief Takes the three arguments after \b opt and converts them to an array of ints, throws runtime_error if no arguments or cannot convert. 
-std::array get_grid(std::deque const &args, std::string const &opt){ +inline std::array get_grid(std::deque const &args, std::string const &opt){ auto iopt = args.begin(); while(iopt != args.end()){ if (*iopt == opt){ // found the argument, take the next three entries @@ -374,7 +374,7 @@ std::array get_grid(std::deque const &args, std::string con throw std::runtime_error(opt + " not found"); } -int get_int_arg(std::string const &name, std::deque const &args, int default_value = -1){ +inline int get_int_arg(std::string const &name, std::deque const &args, int default_value = -1){ auto iopt = args.begin(); while(iopt != args.end()){ if (*iopt == name){ @@ -388,8 +388,27 @@ int get_int_arg(std::string const &name, std::deque const &args, in } return default_value; } - -int nruns(std::deque const &args){ +//! returns the number of runs selected in the args +inline int nruns(std::deque const &args){ + for(auto &s : args) { + std::string::size_type nr = s.find("-nruns"); + if (nr != 0) + continue; + // found a string with -nruns, get the number + int num = 0; + try { + num = std::stoi(s.substr(6)); + } catch(std::invalid_argument &) { + std::cerr << "cannot convert '" << s.substr(6) << "' to 'int'\n"; + throw; + } catch(std::out_of_range &) { + std::cerr << "provided integer '" << s.substr(6) << "' is too large\n"; + throw; + } + if (num < 0) + throw std::runtime_error("the number of runs has to be non-negative"); + return num; + } for(auto &s : args) if (s == "-n1") return 1;