From 89922841dad161e1cd33779db8b6d53a2fe02672 Mon Sep 17 00:00:00 2001
From: Miroslav Stoyanov <stoyanovmk@ornl.gov>
Date: Wed, 12 Feb 2025 12:04:07 -0500
Subject: [PATCH 1/2] updated the help text

---
 benchmarks/speed3d.h | 67 ++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 30 deletions(-)
diff --git a/benchmarks/speed3d.h b/benchmarks/speed3d.h
index 742bd81..4df16fc 100644
--- a/benchmarks/speed3d.h
+++ b/benchmarks/speed3d.h
@@ -322,42 +322,49 @@ int main(int argc, char *argv[]){
 
     if (argc < 6){
         if (mpi::world_rank(0)){
-            cout << "\nUsage:\n    mpirun -np x " << bench_executable << " <backend> <precision> <size-x> <size-y> <size-z> <args>\n\n"
-                 << "    options\n"
-                 << "        backend is the 1-D FFT library\n"
-                 << "            available options for this build: " << backends << "\n"
-                 << "        precision is either float or double\n"
-                 << "          use float-long or double-long to enable 64-bit indexing\n"
-                 << "        size-x/y/z are the 3D array dimensions \n\n"
-                 << "        args is a set of optional arguments that define algorithmic tweaks and variations\n"
-                 << "         -reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data\n"
-                 << "         -no-reorder: some of the 1-D will be strided (non contiguous)\n"
-                 << "         -a2a: use MPI_Alltoall() communication method\n"
-                 << "         -a2av: use MPI_Alltoallv() communication method (default)\n"
-                 << "         -p2p: use MPI_Send() and MPI_Irecv() communication methods\n"
-                 << "         -p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods\n"
-                 << "         -no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only)\n"
-                 << "         -pencils: use pencil reshape logic\n"
-                 << "         -slabs: use slab reshape logic\n"
-                 << "         -io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries \n"
-                 << "         -ingrid x y z: specifies the processor grid to use in the input, x y z must be integers \n"
-                 << "         -outgrid x y z: specifies the processor grid to use in the output, x y z must be integers \n"
-                 << "         -subcomm num_ranks: specifies the number of ranks to use in intermediate reshapes\n"
-                 << "         -batch batch_size: specifies the size of the batch to use in the benchmark\n"
-                 << "         -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2 \n"
-                 << "         -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices\n"
-                 << "         -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50\n"
-                 #ifdef BENCH_R2R
-                 << "Examples:\n"
+            cout << "\nUsage:\n    mpirun -np x " << bench_executable << " <backend> <precision> <size-x> <size-y> <size-z> <args>\n";
+            cout << R"help1(
+  options
+    <backend> is the 1-D FFT library
+)help1";
+            cout << "      available backends for this build: " << backends;
+            cout << R"help2(
+    <precision> is either float or double
+      use float-long or double-long to enable 64-bit indexing
+    <size-x/y/z> are the 3D array dimensions
+
+    <args> is a set of optional arguments that define algorithmic tweaks and variations
+     -reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data
+     -no-reorder: some of the 1-D will be strided (non contiguous)
+     -a2a: use MPI_Alltoall() communication method
+     -a2av: use MPI_Alltoallv() communication method (default)
+     -p2p: use MPI_Send() and MPI_Irecv() communication methods
+     -p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods
+     -no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only)
+     -pencils: use pencil reshape logic
+     -slabs: use slab reshape logic
+     -io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries
+     -ingrid x y z: specifies the processor grid to use in the input, x y z must be integers
+     -outgrid x y z: specifies the processor grid to use in the output, x y z must be integers
+     -subcomm num_ranks: specifies the number of ranks to use in intermediate reshapes
+     -batch batch_size: specifies the size of the batch to use in the benchmark
+     -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2
+     -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices
+     -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50
+
+)help2";
+
+            #ifdef BENCH_R2R
+            cout << "Examples:\n"
                  << "    mpirun -np  4 " << bench_executable << " fftw-cos  double 128 128 128 -p2p\n"
                  << "    mpirun -np  8 " << bench_executable << " cufft-cos float  256 256 256\n"
                  << "    mpirun -np 12 " << bench_executable << " fftw-sin  double 512 512 512 -slabs\n\n";
-                 #else
-                 << "Examples:\n"
+            #else
+            cout << "Examples:\n"
                  << "    mpirun -np  4 " << bench_executable << " fftw  double 128 128 128 -no-reorder\n"
                  << "    mpirun -np  8 " << bench_executable << " cufft float  256 256 256\n"
                  << "    mpirun -np 12 " << bench_executable << " fftw  double 512 512 512 -p2p -slabs\n\n";
-                 #endif
+            #endif
         }
 
         MPI_Finalize();

From fd6cf4cb585757475922dd7e3b42274859e3f10b Mon Sep 17 00:00:00 2001
From: Miroslav Stoyanov <stoyanovmk@ornl.gov>
Date: Wed, 12 Feb 2025 12:50:07 -0500
Subject: [PATCH 2/2] added option to perform arbitrary number of runs

---
 benchmarks/speed3d.h | 15 ++++++++++++---
 test/test_common.h   | 27 +++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/benchmarks/speed3d.h b/benchmarks/speed3d.h
index 4df16fc..b57d0b5 100644
--- a/benchmarks/speed3d.h
+++ b/benchmarks/speed3d.h
@@ -215,9 +215,16 @@ void benchmark_fft(std::array<int,3> size_fft, std::deque<std::string> const &ar
 
     // Print results
     if(me==0){
-        t_max = t_max / (2.0 * ntest);
-        double const fftsize  = static_cast<double>(world.count());
-        double const floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max;
+        double const fftsize = static_cast<double>(world.count());
+
+        double floprate = 0.0;
+        if (ntest > 0) { // something was tested
+            t_max /= (2.0 * ntest); // time per test, 2 transforms forward/backward
+            floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max;
+        } else {
+            t_max = 0.0; // nothing was tested
+        }
+
         long long mem_usage = static_cast<long long>(fft.size_inbox()) + static_cast<long long>(fft.size_outbox())
                             + static_cast<long long>(fft.size_workspace());
         mem_usage *= sizeof(output_type);
@@ -232,6 +239,7 @@ void benchmark_fft(std::array<int,3> size_fft, std::deque<std::string> const &ar
         for(int i=0; i<5; i++)
             print_proc_grid(i);
         cout << "\n";
+        cout << "Num runs:     " << ntest << '\n';
         cout << "Time per run: " << t_max << " (s)\n";
         cout << "Performance:  " << floprate << " GFlops/s\n";
         cout << "Memory usage: " << mem_usage << "MB/rank\n";
@@ -351,6 +359,7 @@ int main(int argc, char *argv[]){
      -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2
      -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices
      -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50
+     -nrunsXYZ: same as -n but allows for a custom number, XYZ must be a non-negative integer, e.g., -nruns17
 
 )help2";
 
diff --git a/test/test_common.h b/test/test_common.h
index b8345e6..c314d42 100644
--- a/test/test_common.h
+++ b/test/test_common.h
@@ -356,7 +356,7 @@ bool has_option(std::deque<std::string> const &args, std::string const &opt){
     return false;
 }
 //! \brief Takes the three arguments after \b opt and converts them to an array of ints, throws runtime_error if no arguments or cannot convert.
-std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string const &opt){
+inline std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string const &opt){
     auto iopt = args.begin();
     while(iopt != args.end()){
         if (*iopt == opt){ // found the argument, take the next three entries
@@ -374,7 +374,7 @@ std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string con
     throw std::runtime_error(opt + " not found");
 }
 
-int get_int_arg(std::string const &name, std::deque<std::string> const &args, int default_value = -1){
+inline int get_int_arg(std::string const &name, std::deque<std::string> const &args, int default_value = -1){
     auto iopt = args.begin();
     while(iopt != args.end()){
         if (*iopt == name){
@@ -388,8 +388,27 @@ int get_int_arg(std::string const &name, std::deque<std::string> const &args, in
     }
     return default_value;
 }
-
-int nruns(std::deque<std::string> const &args){
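+//! \brief Returns the number of runs selected in \b args, an entry starting with "-nruns" is handled first: the text after the prefix is converted with std::stoi(), a failed conversion or a negative value throws.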
+inline int nruns(std::deque<std::string> const &args){
+    for(auto &s : args) {
+        std::string::size_type nr = s.find("-nruns");
+        if (nr != 0)
+            continue;
+        // s starts with "-nruns" (find() returned position 0), convert the text after the 6-character prefix
+        int num = 0;
+        try {
+            num = std::stoi(s.substr(6));
+        } catch(std::invalid_argument &) {
+            std::cerr << "cannot convert '" << s.substr(6) << "' to 'int'\n";
+            throw;
+        } catch(std::out_of_range &) {
+            std::cerr << "provided integer '" << s.substr(6) << "' is too large\n";
+            throw;
+        }
+        if (num < 0)
+            throw std::runtime_error("the number of runs has to be non-negative");
+        return num;
+    }
     for(auto &s : args)
         if (s == "-n1")
             return 1;