Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom nruns #75

Merged
merged 2 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 49 additions & 33 deletions benchmarks/speed3d.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,16 @@ void benchmark_fft(std::array<int,3> size_fft, std::deque<std::string> const &ar

// Print results
if(me==0){
t_max = t_max / (2.0 * ntest);
double const fftsize = static_cast<double>(world.count());
double const floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max;
double const fftsize = static_cast<double>(world.count());

double floprate = 0.0;
if (ntest > 0) { // something was tested
t_max /= (2.0 * ntest); // time per test, 2 transforms forward/backward
floprate = 5.0 * batch_size * fftsize * std::log(fftsize) * 1e-9 / std::log(2.0) / t_max;
} else {
t_max = 0.0; // nothing was tested
}

long long mem_usage = static_cast<long long>(fft.size_inbox()) + static_cast<long long>(fft.size_outbox())
+ static_cast<long long>(fft.size_workspace());
mem_usage *= sizeof(output_type);
Expand All @@ -232,6 +239,7 @@ void benchmark_fft(std::array<int,3> size_fft, std::deque<std::string> const &ar
for(int i=0; i<5; i++)
print_proc_grid(i);
cout << "\n";
cout << "Num runs: " << ntest << '\n';
cout << "Time per run: " << t_max << " (s)\n";
cout << "Performance: " << floprate << " GFlops/s\n";
cout << "Memory usage: " << mem_usage << "MB/rank\n";
Expand Down Expand Up @@ -322,42 +330,50 @@ int main(int argc, char *argv[]){

if (argc < 6){
if (mpi::world_rank(0)){
cout << "\nUsage:\n mpirun -np x " << bench_executable << " <backend> <precision> <size-x> <size-y> <size-z> <args>\n\n"
<< " options\n"
<< " backend is the 1-D FFT library\n"
<< " available options for this build: " << backends << "\n"
<< " precision is either float or double\n"
<< " use float-long or double-long to enable 64-bit indexing\n"
<< " size-x/y/z are the 3D array dimensions \n\n"
<< " args is a set of optional arguments that define algorithmic tweaks and variations\n"
<< " -reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data\n"
<< " -no-reorder: some of the 1-D will be strided (non contiguous)\n"
<< " -a2a: use MPI_Alltoall() communication method\n"
<< " -a2av: use MPI_Alltoallv() communication method (default)\n"
<< " -p2p: use MPI_Send() and MPI_Irecv() communication methods\n"
<< " -p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods\n"
<< " -no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only)\n"
<< " -pencils: use pencil reshape logic\n"
<< " -slabs: use slab reshape logic\n"
<< " -io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries \n"
<< " -ingrid x y z: specifies the processor grid to use in the input, x y z must be integers \n"
<< " -outgrid x y z: specifies the processor grid to use in the output, x y z must be integers \n"
<< " -subcomm num_ranks: specifies the number of ranks to use in intermediate reshapes\n"
<< " -batch batch_size: specifies the size of the batch to use in the benchmark\n"
<< " -r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2 \n"
<< " -mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices\n"
<< " -nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50\n"
#ifdef BENCH_R2R
<< "Examples:\n"
cout << "\nUsage:\n mpirun -np x " << bench_executable << " <backend> <precision> <size-x> <size-y> <size-z> <args>\n";
cout << R"help1(
options
<backend> is the 1-D FFT library
)help1";
cout << " available backends for this build: " << backends;
cout << R"help2(
<precision> is either float or double
use float-long or double-long to enable 64-bit indexing
<size-x/y/z> are the 3D array dimensions

<args> is a set of optional arguments that define algorithmic tweaks and variations
-reorder: reorder the elements of the arrays so that each 1-D FFT will use contiguous data
-no-reorder: some of the 1-D will be strided (non contiguous)
-a2a: use MPI_Alltoall() communication method
-a2av: use MPI_Alltoallv() communication method (default)
-p2p: use MPI_Send() and MPI_Irecv() communication methods
-p2p_pl: use MPI_Isend() and MPI_Irecv() communication methods
-no-gpu-aware: move the data to the cpu before doing gpu operations (gpu backends only)
-pencils: use pencil reshape logic
-slabs: use slab reshape logic
-io_pencils: if input and output proc grids are pencils, useful for comparison with other libraries
-ingrid x y z: specifies the processor grid to use in the input, x y z must be integers
-outgrid x y z: specifies the processor grid to use in the output, x y z must be integers
-subcomm num_ranks: specifies the number of ranks to use in intermediate reshapes
-batch batch_size: specifies the size of the batch to use in the benchmark
-r2c_dir dir: specifies the r2c direction for the r2c tests, dir must be 0 1 or 2
-mps: for the cufft backend and multiple gpus, associate the mpi ranks with different cuda devices
-nX: number of times to repeat the run, accepted variants are -n5 (default), -n1, -n10, -n50
-nrunsXYZ: same as -n but allows for a custom number, XYZ must be a non-negative integer, e.g., -nruns17

)help2";

#ifdef BENCH_R2R
cout << "Examples:\n"
<< " mpirun -np 4 " << bench_executable << " fftw-cos double 128 128 128 -p2p\n"
<< " mpirun -np 8 " << bench_executable << " cufft-cos float 256 256 256\n"
<< " mpirun -np 12 " << bench_executable << " fftw-sin double 512 512 512 -slabs\n\n";
#else
<< "Examples:\n"
#else
cout << "Examples:\n"
<< " mpirun -np 4 " << bench_executable << " fftw double 128 128 128 -no-reorder\n"
<< " mpirun -np 8 " << bench_executable << " cufft float 256 256 256\n"
<< " mpirun -np 12 " << bench_executable << " fftw double 512 512 512 -p2p -slabs\n\n";
#endif
#endif
}

MPI_Finalize();
Expand Down
27 changes: 23 additions & 4 deletions test/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ bool has_option(std::deque<std::string> const &args, std::string const &opt){
return false;
}
//! \brief Takes the three arguments after \b opt and converts them to an array of ints, throws runtime_error if no arguments or cannot convert.
std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string const &opt){
inline std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string const &opt){
auto iopt = args.begin();
while(iopt != args.end()){
if (*iopt == opt){ // found the argument, take the next three entries
Expand All @@ -374,7 +374,7 @@ std::array<int, 3> get_grid(std::deque<std::string> const &args, std::string con
throw std::runtime_error(opt + " not found");
}

int get_int_arg(std::string const &name, std::deque<std::string> const &args, int default_value = -1){
inline int get_int_arg(std::string const &name, std::deque<std::string> const &args, int default_value = -1){
auto iopt = args.begin();
while(iopt != args.end()){
if (*iopt == name){
Expand All @@ -388,8 +388,27 @@ int get_int_arg(std::string const &name, std::deque<std::string> const &args, in
}
return default_value;
}

int nruns(std::deque<std::string> const &args){
//! \brief Returns the number of runs selected in the args, a custom -nruns<N> entry is checked before the fixed -nX variants.
inline int nruns(std::deque<std::string> const &args){
for(auto &s : args) {
std::string::size_type nr = s.find("-nruns");
if (nr != 0)
continue;
// found a string with -nruns, get the number
int num = 0;
try {
num = std::stoi(s.substr(6));
} catch(std::invalid_argument &) {
std::cerr << "cannot convert '" << s.substr(6) << "' to 'int'\n";
throw;
} catch(std::out_of_range &) {
std::cerr << "provided integer '" << s.substr(6) << "' is too large\n";
throw;
}
if (num < 0)
throw std::runtime_error("the number of of runs has to be non-negative");
return num;
}
for(auto &s : args)
if (s == "-n1")
return 1;
Expand Down