From 0b00f45178b9222a31bd60a96a143e3d474d75c5 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 15 Nov 2023 23:48:24 +0000
Subject: [PATCH 01/10] Add sycl/device_ kernels and support for sycl complex
 definitions.

---
 include/slate/internal/device.hh       |   20 +-
 src/sycl/device_geadd.dp.cpp           |  338 ++++++
 src/sycl/device_gecopy.dp.cpp          |  249 +++++
 src/sycl/device_genorm.dp.cpp          |  666 ++++++++++
 src/sycl/device_gescale.dp.cpp         |  343 ++++++
 src/sycl/device_gescale_row_col.dp.cpp |  334 ++++++
 src/sycl/device_geset.dp.cpp           |  308 ++++++
 src/sycl/device_henorm.dp.cpp          |  487 +++++++++
 src/sycl/device_synorm.dp.cpp          |  736 +++++++++++
 src/sycl/device_transpose.dp.cpp       |  918 ++++++++++++++++
 src/sycl/device_trnorm.dp.cpp          |  632 +++++++++++
 src/sycl/device_tzadd.dp.cpp           |  213 ++++
 src/sycl/device_tzcopy.dp.cpp          |  246 +++++
 src/sycl/device_tzscale.dp.cpp         |  197 ++++
 src/sycl/device_tzset.dp.cpp           |  329 ++++++
 src/sycl/device_util.dp.hpp            | 1342 ++++++++++++++++++++++++
 16 files changed, 7355 insertions(+), 3 deletions(-)
 create mode 100644 src/sycl/device_geadd.dp.cpp
 create mode 100644 src/sycl/device_gecopy.dp.cpp
 create mode 100644 src/sycl/device_genorm.dp.cpp
 create mode 100644 src/sycl/device_gescale.dp.cpp
 create mode 100644 src/sycl/device_gescale_row_col.dp.cpp
 create mode 100644 src/sycl/device_geset.dp.cpp
 create mode 100644 src/sycl/device_henorm.dp.cpp
 create mode 100644 src/sycl/device_synorm.dp.cpp
 create mode 100644 src/sycl/device_transpose.dp.cpp
 create mode 100644 src/sycl/device_trnorm.dp.cpp
 create mode 100644 src/sycl/device_tzadd.dp.cpp
 create mode 100644 src/sycl/device_tzcopy.dp.cpp
 create mode 100644 src/sycl/device_tzscale.dp.cpp
 create mode 100644 src/sycl/device_tzset.dp.cpp
 create mode 100644 src/sycl/device_util.dp.hpp

diff --git a/include/slate/internal/device.hh b/include/slate/internal/device.hh
index a7092b7f0..b9099fda1 100644
--- a/include/slate/internal/device.hh
+++ b/include/slate/internal/device.hh
@@ -68,7 +68,19 @@
     };

     } // namespace blas

-#endif // #elif defined( BLAS_HAVE_ROCBLAS )
+
+#elif defined( BLAS_HAVE_SYCL )
+    #include <sycl/sycl.hpp>
+    namespace blas {
+
+    template <typename T>
+    struct blas::real_type_traits< sycl::vec< T, 2 > > {
+        using real_t = T;
+    };
+
+    } // namespace blas
+
+#endif // #defined( BLAS_HAVE_{CUBLAS,ROCBLAS,SYCL} )

 namespace slate {

@@ -76,9 +88,11 @@ namespace slate {
 /// GPU device implementations of kernels.
 namespace device {

-// Use omp-target-kernels when OneMKL-SYCL is used
+// Use when SYCL and oneMKL are used
 #if defined( BLAS_HAVE_SYCL )
-    #define SLATE_HAVE_OMPTARGET
+    // todo: make this build automatically
+    // Manually uncomment to compile OMP target-offload kernels
+    // #define SLATE_HAVE_OMPTARGET
 #endif

 // Simplify checking for GPU device support (CUDA / ROCm / SYCL).
diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp
new file mode 100644
index 000000000..3344ec0b3
--- /dev/null
+++ b/src/sycl/device_geadd.dp.cpp
@@ -0,0 +1,338 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile addition.
+/// Each thread deals with one row.
+/// Launched by geadd_kernel() and geadd_batch_kernel().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] A
+///     is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in,out] B
+///     is an m-by-n matrix stored in an ldb-by-n array.
+///
+/// @param[in] ldb
+///     Leading dimension of B. ldb >= m.
+///
+/// @copydoc geadd
+///
+template <typename scalar_t>
+void geadd_func(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t* A, int64_t lda,
+    scalar_t beta, scalar_t* B, int64_t ldb, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int64_t i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+        scalar_t* rowB = &B[ i ];
+
+        for (int64_t j = 0; j < n; ++j)
+            rowB[ j*ldb ] = (alpha * rowA[ j*lda ]) + (beta * rowB[ j*ldb ]);
+            // rowB[j * ldb] = dpct_operator_overloading::operator+(
+            //     dpct_operator_overloading::operator*(alpha, rowA[j * lda]),
+            //     dpct_operator_overloading::operator*(beta, rowB[j * ldb]));
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile addition.
+/// @copydoc geadd
+template <typename scalar_t>
+void geadd_kernel(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t* A, int64_t lda,
+    scalar_t beta, scalar_t* B, int64_t ldb, const sycl::nd_item<3> &item_ct1)
+{
+    geadd_func(m, n, alpha, A, lda, beta, B, ldb, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing batched element-wise tile addition.
+/// @copydoc geadd_batch
+template <typename scalar_t>
+void geadd_batch_kernel(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t** Aarray, int64_t lda,
+    scalar_t beta, scalar_t** Barray, int64_t ldb,
+    const sycl::nd_item<3> &item_ct1)
+{
+    geadd_func(m, n, alpha, Aarray[item_ct1.get_group(2)], lda, beta,
+               Barray[item_ct1.get_group(2)], ldb, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Routine for element-wise tile addition.
+/// Sets
+/// \[
+///     B = \alpha A + \beta B.
+/// \]
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] alpha
+///     The scalar alpha.
+///
+/// @param[in] A
+///     is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] beta
+///     The scalar beta.
+///
+/// @param[in,out] B
+///     is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
+///
+/// @param[in] ldb
+///     Leading dimension of B. ldb >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void geadd(
+    int64_t m, int64_t n,
+    scalar_t const& alpha, scalar_t* A, int64_t lda,
+    scalar_t const& beta, scalar_t* B, int64_t ldb,
+    blas::Queue &queue)
+{
+    // quick return
+    if (m == 0 || n == 0)
+        return;
+
+    /*
+    DPCT1093:146: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+ */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:49: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geadd_kernel(m, n, alpha, A, lda, beta, B, ldb, + item_ct1); + }); + + /* + DPCT1010:147: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geadd( + int64_t m, int64_t n, + float const& alpha, float* Aarray, int64_t lda, + float const& beta, float* Barray, int64_t ldb, + blas::Queue &queue); + +template +void geadd( + int64_t m, int64_t n, + double const& alpha, double* Aarray, int64_t lda, + double const& beta, double* Barray, int64_t ldb, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex* Aarray, int64_t lda, + std::complex const& beta, std::complex* Barray, int64_t ldb, + blas::Queue &queue) +{ + geadd(m, n, sycl::float2(real(alpha), imag(alpha)), (sycl::float2 *)Aarray, + lda, sycl::float2(real(beta), imag(beta)), (sycl::float2 *)Barray, + ldb, queue); +} + +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex* Aarray, int64_t lda, + std::complex const& beta, std::complex* Barray, int64_t ldb, + blas::Queue &queue) +{ + geadd(m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 *)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 *)Barray, ldb, queue); +} + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise tile addition. +/// Sets +/// \[ +/// Barray[k] = \alpha Aarray[k] + \beta Barray[k]. +/// \] +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] alpha +/// The scalar alpha. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] beta +/// The scalar beta. +/// +/// @param[in,out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. 
+/// +template +void geadd( + int64_t m, int64_t n, + scalar_t const& alpha, scalar_t** Aarray, int64_t lda, + scalar_t const& beta, scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (m == 0 || n == 0) + return; + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:148: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:50: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geadd_batch_kernel(m, n, alpha, Aarray, lda, beta, + Barray, ldb, item_ct1); + }); + + /* + DPCT1010:149: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geadd( + int64_t m, int64_t n, + float const& alpha, float** Aarray, int64_t lda, + float const& beta, float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +template +void geadd( + int64_t m, int64_t n, + double const& alpha, double** Aarray, int64_t lda, + double const& beta, double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + geadd(m, n, sycl::float2(real(alpha), imag(alpha)), (sycl::float2 **)Aarray, + lda, sycl::float2(real(beta), imag(beta)), (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + geadd(m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 **)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace batch +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gecopy.dp.cpp b/src/sycl/device_gecopy.dp.cpp new file mode 100644 index 000000000..660946f00 --- /dev/null +++ b/src/sycl/device_gecopy.dp.cpp @@ -0,0 +1,249 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing copy and precision conversions, copying A to B. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gecopy(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. +/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void gecopy_kernel( + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, const sycl::nd_item<3> &item_ct1) +{ + src_scalar_t const *tileA = Aarray[item_ct1.get_group(2)]; + dst_scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + src_scalar_t const* rowA = &tileA[ i ]; + dst_scalar_t* rowB = &tileB[ i ]; + + for (int64_t j = 0; j < n; ++j) + copy(rowA[j*lda], rowB[j*ldb]); + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise copy and precision conversion, +/// copying A to B. Sets +/// \[ +/// Barray[k] = Aarray[k]. +/// \] +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gecopy( + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:152: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:59: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gecopy_kernel(m, n, Aarray, lda, Barray, ldb, + item_ct1); + }); + + /* + DPCT1010:153: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. + +// float => float +template +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// float => double +template +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => double +template +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => float +template +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. + +// complex-float => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::float2 **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// complex-float => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::float2 **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +// complex-double => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::double2 **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +// complex-double => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::double2 **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// float => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (float **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// double => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (double **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp new file mode 
100644 index 000000000..bb20deaa9 --- /dev/null +++ b/src/sycl/device_genorm.dp.cpp @@ -0,0 +1,666 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Finds the largest absolute value of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Uses dynamic shared memory array of length sizeof(real_t) * m. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_maxima +/// Array of dimension gridDim.x. +/// On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for tile A^(k). +/// +template +void genorm_max_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_maxima, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_max = (real_t*) dynamic_data; + int chunk; + if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) { + row_max[item_ct1.get_local_id(2)] = 0; + } + + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + chunk = i % item_ct1.get_local_range(2); + scalar_t const* row = &tile[ i ]; + real_t max = 0; + + // Each thread finds max of one row. + for (int64_t j = 0; j < n; ++j) + max = max_nan(max, abs(row[j*lda])); + + // Save partial results in shared memory. + row_max[chunk] = max_nan(max, row_max[chunk]); + } + + // Reduction to find max of tile. + /* + DPCT1065:36: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2), + row_max, item_ct1); + if (item_ct1.get_local_id(2) == 0) { + tiles_maxima[item_ct1.get_group(2)] = row_max[0]; + } +} + +const int ib = 32; ///< block size for genorm_one_kernel +const int ib1 = 33; ///< ib + 1 for stride to avoid GPU bank conflicts + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each column of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one column. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. 
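genorm_max_kernel() performs its reduction by hand in local memory with max_nan(). For contrast, a SYCL 2020 built-in reduction can compute the same tile maximum; note that sycl::maximum<> does not force NaN propagation the way max_nan() does, so this is only a sketch, not a drop-in replacement:

```cpp
// Tile max-abs via a SYCL 2020 reduction; illustrative alternative to the
// manual local-memory reduction in genorm_max_kernel().
#include <sycl/sycl.hpp>
#include <cstdint>

float tile_max_abs( sycl::queue& q, float const* tile,
                    int64_t m, int64_t n, int64_t lda )
{
    float* result = sycl::malloc_shared<float>( 1, q );
    *result = 0.0f;  // combined into the reduction result by default
    q.parallel_for(
        sycl::range<2>( n, m ),  // idx[0] = column j, idx[1] = row i
        sycl::reduction( result, sycl::maximum<float>() ),
        [=]( sycl::id<2> idx, auto& red ) {
            red.combine( sycl::fabs( tile[ idx[0]*lda + idx[1] ] ) );
        }).wait();
    float r = *result;
    sycl::free( result, q );
    return r;
}
```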
+/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, tiles_sums[k*ldv + j] = max_{i} abs( A^(k)_(i, j) ) +/// for row j of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void genorm_one_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + const int k = item_ct1.get_local_id(2); + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t sum = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32x32 sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:37: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread sums one column. + for (int64_t i = 0; i < ib; ++i) + if (jj+k < n && ii+i < m) + sum += shmem_tile[ k*ib1 + i ]; + /* + DPCT1065:38: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // done with shmem_tile + } + + if (jj+k < n) + tiles_sums[item_ct1.get_group(2) * ldv + jj + k] = sum; + } +} + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each row of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block, hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) +/// for row i of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void genorm_inf_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t const* row = &tile[ i ]; + + // Each thread sums one row. 
+ // This does coalesced reads of one column at a time in parallel. + real_t sum = abs(row[0]); + for (int64_t j = 1; j < n; ++j) + sum += abs(row[j*lda]); + + tiles_sums[item_ct1.get_group(2) * ldv + i] = sum; + } +} + +//------------------------------------------------------------------------------ +/// Sum of squares, in scaled representation, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// Also the number of threads per block, hence, +/// +/// @param[in] Aarray +/// Array of tiles of dimension blockDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_values +/// Array of dimension 2 * blockDim.x. +/// On exit, +/// tiles_values[2*k + 0] = scale +/// tiles_values[2*k + 1] = sumsq +/// such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 +/// for tile A^(k). +/// +template +void genorm_fro_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_values, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + int chunk; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_scale = (real_t*) &dynamic_data[0]; + real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)]; + + real_t tile_scale = row_scale[0]; + real_t tile_sumsq = row_sumsq[0]; + + // Each thread finds sum-of-squares of one row. + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t const* row = &tile[ i ]; + real_t scale = 0; + real_t sumsq = 1; + chunk = i % item_ct1.get_local_range(2); + + for (int64_t j = 0; j < n; ++j) { + add_sumsq(scale, sumsq, abs(row[j*lda])); + } + + if (i < item_ct1.get_local_range(2)) { + row_scale[chunk] = 0; + row_sumsq[chunk] = 1; + } + + // Save partial results in shared memory. + combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); + } + + // Reduction to find sum-of-squares of tile. + // todo: parallel reduction. + /* + DPCT1065:39: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. 
+ */ + item_ct1.barrier(); + if (item_ct1.get_local_id(2) == 0) { + tile_scale = row_scale[0]; + tile_sumsq = row_sumsq[0]; + for (int64_t chunk = 1; + chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + } + + tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; + tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq; + } +} + +//------------------------------------------------------------------------------ +// todo docs +template +void ge_col_norms_max_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* col_max, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + const int k = item_ct1.get_local_id(2); + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t max = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32x32 sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:40: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread compute max of one column. + for (int64_t i = 0; i < ib; ++i) + if (jj+k < n && ii+i < m) + max = max_nan( shmem_tile[ k*ib1 + i ], max ); + /* + DPCT1065:41: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // done with shmem_tile + } + + if (jj+k < n) + col_max[item_ct1.get_group(2) * ldv + jj + k] = max; + } +} + +//------------------------------------------------------------------------------ +/// Batched routine that computes a partial norm for each tile. +/// +/// @param[in] norm +/// Norm to compute. See values for description. +/// +/// @param[in] scope +/// Scope of the norm. +/// - NormScope::Matrix computes partial norm of each tile. +/// - NormScope::Columns computes norm of each column. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] values +/// Array in GPU memory, dimension batch_count * ldv. +/// - Norm::Max: ldv = 1. +/// On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count. +/// +/// - Norm::One: ldv >= n. +/// On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count, 0 <= j < n. +/// +/// - Norm::Inf: ldv >= m. +/// On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count, 0 <= i < m. +/// +/// - Norm::Max: ldv = 2. +/// On exit, +/// values[k*2 + 0] = scale_k +/// values[k*2 + 1] = sumsq_k +/// where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 +/// for 0 <= k < batch_count. 
+/// +/// @param[in] ldv +/// Leading dimension of values array. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + using real_t = blas::real_type; + int64_t nb = 512; + + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:144: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + if (scope == NormScope::Matrix) { + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:43: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:42: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_max_kernel( + m, n, Aarray, lda, values, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1083:44: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * ib * ib1; + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, ib), + sycl::range<3>(1, 1, ib)), + [=](sycl::nd_item<3> item_ct1) { + genorm_one_kernel( + m, n, Aarray, lda, values, ldv, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // inf norm + else if (norm == lapack::Norm::Inf) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * m, queue); + } + else { + assert(ldv >= m); + /* + DPCT1049:45: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_inf_kernel(m, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:47: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:46: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_fro_kernel( + m, n, Aarray, lda, values, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + } + else if (scope == NormScope::Columns) { + + if (norm == Norm::Max) { + + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1083:48: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * ib * ib1; + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, ib), + sycl::range<3>(1, 1, ib)), + [=](sycl::nd_item<3> item_ct1) { + ge_col_norms_max_kernel( + m, n, Aarray, lda, values, ldv, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + else { + slate_not_implemented("The norm isn't yet supported"); + } + } + else { + slate_not_implemented("The norm scope isn't yet supported."); + } + + /* + DPCT1010:145: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
+template <> +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + genorm(norm, scope, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + genorm(norm, scope, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp new file mode 100644 index 000000000..f76343c7f --- /dev/null +++ b/src/sycl/device_gescale.dp.cpp @@ -0,0 +1,343 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Device function implementing element-wise tile scale. +/// Each thread block deals with one tile. gridDim.x == batch_count. +/// Each thread deals with one row. +/// Called by gescale_kernel and gescale_batch_kernel. +/// +/// @copydoc gescale +/// +template +void gescale_func( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &A[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * mul; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], mul); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// @copydoc gescale +template +void gescale_kernel( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + gescale_func(m, n, mul, A, lda, item_ct1); +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// @copydoc gescale_batch +template +void gescale_batch_kernel( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + gescale_func(m, n, mul, Aarray[item_ct1.get_group(2)], lda, item_ct1); +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] A +/// An m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. 
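A hypothetical host-side call of the non-batched gescale() documented below, assuming device memory already allocated through BLAS++ (all names other than gescale and blas::Queue are illustrative):

```cpp
// Sketch: scale a device-resident m-by-n tile by 2/3 using gescale().
#include "slate/internal/device.hh"
#include <cstdint>

void scale_tile_example( blas::Queue& queue, double* dA,
                         int64_t m, int64_t n, int64_t lda )
{
    // mul = numer/denom is formed once on the host; as the batch variant's
    // docs note, no extra care is taken against over/underflow.
    slate::device::gescale( m, n, 2.0, 3.0, dA, lda, queue );
    queue.sync();  // wait for the kernel on the BLAS++ queue
}
```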
+/// +template +void gescale( + int64_t m, int64_t n, + scalar_t2 numer, scalar_t2 denom, + scalar_t* A, int64_t lda, + blas::Queue& queue) +{ + // quick return + if (m == 0 || n == 0) + return; + + /* + DPCT1093:154: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + scalar_t2 mul = numer / denom; + + /* + DPCT1049:60: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_kernel(m, n, mul, A, lda, item_ct1); + }); + + /* + DPCT1010:155: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale( + int64_t m, int64_t n, + float numer, float denom, + float* A, int64_t lda, + blas::Queue& queue); + +template +void gescale( + int64_t m, int64_t n, + double numer, double denom, + double* A, int64_t lda, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void gescale( + int64_t m, int64_t n, + float numer, float denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::float2 *)A, lda, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, sycl::float2(real(numer), imag(numer)), + sycl::float2(real(denom), imag(denom)), (sycl::float2 *)A, lda, + queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + double numer, double denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::double2 *)A, lda, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, sycl::double2(real(numer), imag(numer)), + sycl::double2(real(denom), imag(denom)), (sycl::double2 *)A, lda, + queue); +} + + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise tile scale. Sets +/// \[ +/// Aarray[k] *= (numer / denom). +/// \] +/// This does NOT currently take extra care to avoid over/underflow. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. 
lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gescale( + int64_t m, int64_t n, + scalar_t2 numer, scalar_t2 denom, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (m == 0 || n == 0) + return; + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:156: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + scalar_t2 mul = numer / denom; + + /* + DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_batch_kernel(m, n, mul, Aarray, lda, + item_ct1); + }); + + /* + DPCT1010:157: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale( + int64_t m, int64_t n, + float numer, float denom, + float** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +template +void gescale( + int64_t m, int64_t n, + double numer, double denom, + double** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void gescale( + int64_t m, int64_t n, + float numer, float denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::float2 **)Aarray, lda, batch_count, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, sycl::float2(real(numer), imag(numer)), + sycl::float2(real(denom), imag(denom)), (sycl::float2 **)Aarray, + lda, batch_count, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + double numer, double denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::double2 **)Aarray, lda, batch_count, + queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, sycl::double2(real(numer), imag(numer)), + sycl::double2(real(denom), imag(denom)), (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +} // namespace batch +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp new file mode 100644 index 000000000..ffb727c4d --- /dev/null +++ b/src/sycl/device_gescale_row_col.dp.cpp @@ -0,0 +1,334 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing row and column scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_row_col_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *R = Rarray[item_ct1.get_group(2)]; + scalar_t2 const *C = Carray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t2 ri = R[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); + // rowA[j * lda] = dpct_operator_overloading::operator*( + // rowA[j * lda], + // dpct_operator_overloading::operator*(ri, C[j]))); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing column scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_col_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *C = Carray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * C[ j ]; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], C[j]); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing row scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. 
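As a reference point for the three Equed cases handled below, the Both case applies A = diag(R) A diag(C). A plain host loop with the same semantics, useful for checking the device kernels (hypothetical helper, not in the patch):

```cpp
// Host reference for Equed::Both: A = diag(R) * A * diag(C) on one tile.
// The Row and Col cases follow by dropping C[j] or R[i], respectively.
#include <cstdint>

template <typename scalar_t>
void scale_row_col_ref(
    int64_t m, int64_t n,
    scalar_t const* R, scalar_t const* C,
    scalar_t* A, int64_t lda )
{
    for (int64_t j = 0; j < n; ++j)
        for (int64_t i = 0; i < m; ++i)
            A[ i + j*lda ] *= R[ i ] * C[ j ];
}
```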
+/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_row_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *R = Rarray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t2 ri = R[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * ri; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], ri); + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for row and column scaling. +/// +/// @param[in] equed +/// Form of scaling to do. +/// - Equed::Row: sets $ A = diag(R) A $ +/// - Equed::Col: sets $ A = A diag(C) $ +/// - Equed::Both: sets $ A = diag(R) A diag(C) $ +/// for each R in Rarray, C in Carray, and A in Aarray. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:140: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + if (equed == Equed::Row) { + /* + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_row_batch_kernel(m, n, Rarray, Aarray, + lda, item_ct1); + }); + } + else if (equed == Equed::Col) { + /* + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. 
Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_col_batch_kernel(m, n, Carray, Aarray, + lda, item_ct1); + }); + } + else if (equed == Equed::Both) { + /* + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_row_col_batch_kernel( + m, n, Rarray, Carray, Aarray, lda, item_ct1); + }); + } + + /* + DPCT1010:141: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + float const* const* Rarray, + float const* const* Carray, + float** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + double const* const* Rarray, + double const* const* Carray, + double** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +// real R, C +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + float const* const* Rarray, + float const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, Rarray, Carray, (sycl::float2 **)Aarray, + lda, batch_count, queue); +} + +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + double const* const* Rarray, + double const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, Rarray, Carray, (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +// complex R, C +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + std::complex const* const* Rarray, + std::complex const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, (sycl::float2 **)Rarray, + (sycl::float2 **)Carray, (sycl::float2 **)Aarray, lda, + batch_count, queue); +} + +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + std::complex const* const* Rarray, + std::complex const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, (sycl::double2 **)Rarray, + (sycl::double2 **)Carray, (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_geset.dp.cpp b/src/sycl/device_geset.dp.cpp new file mode 100644 index 000000000..f82d58fdf --- /dev/null +++ b/src/sycl/device_geset.dp.cpp @@ -0,0 +1,308 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row.
+/// Launched by geset_kernel() and geset_batch_kernel().
+///
+/// @copydoc geset
+///
+template <typename scalar_t>
+void geset_func(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int64_t i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+
+        for (int64_t j = 0; j < n; ++j)
+            rowA[ j*lda ] = (j != i) ? offdiag_value : diag_value;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc geset
+template <typename scalar_t>
+void geset_kernel(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    geset_func(m, n, offdiag_value, diag_value, A, lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc geset_batch
+template <typename scalar_t>
+void geset_batch_kernel(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    geset_func(m, n, offdiag_value, diag_value, Aarray[item_ct1.get_group(2)],
+               lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Element-wise set of an m-by-n matrix A
+/// to diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] m
+///     Number of rows of A. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of A. n >= 0.
+///
+/// @param[in] offdiag_value
+///     The value to set outside of the diagonal.
+///
+/// @param[in] diag_value
+///     The value to set on the diagonal.
+///
+/// @param[out] A
+///     An m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void geset(
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t* A, int64_t lda,
+    blas::Queue &queue)
+{
+    // quick return
+    if (m == 0 || n == 0)
+        return;
+
+    /*
+    DPCT1093:134: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geset_kernel(m, n, offdiag_value, diag_value, A, lda, + item_ct1); + }); + + /* + DPCT1010:135: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geset( + int64_t m, int64_t n, + float const& offdiag_value, float const& diag_value, + float* A, int64_t lda, + blas::Queue &queue); + +template +void geset( + int64_t m, int64_t n, + double const& offdiag_value, double const& diag_value, + double* A, int64_t lda, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geset( + int64_t m, int64_t n, + std::complex const& offdiag_value, + std::complex const& diag_value, + std::complex* A, int64_t lda, + blas::Queue &queue) +{ + geset(m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)), + sycl::float2(real(diag_value), imag(diag_value)), (sycl::float2 *)A, + lda, queue); +} + +template <> +void geset( + int64_t m, int64_t n, + std::complex const& offdiag_value, + std::complex const& diag_value, + std::complex* A, int64_t lda, + blas::Queue &queue) +{ + geset(m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)), + sycl::double2(real(diag_value), imag(diag_value)), (sycl::double2 *)A, + lda, queue); +} + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Initializes a batch of m-by-n matrices Aarray[k] +/// to diag_value on the diagonal and offdiag_value on the off-diagonals. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] offdiag_value +/// The value to set outside of the diagonal. +/// +/// @param[in] diag_value +/// The value to set on the diagonal. +/// +/// @param[out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void geset( + int64_t m, int64_t n, + scalar_t const& offdiag_value, + scalar_t const& diag_value, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + // quick return + if (m == 0 || n == 0) + return; + + /* + DPCT1093:136: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                             sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           geset_batch_kernel(m, n, offdiag_value, diag_value,
+                                              Aarray, lda, item_ct1);
+                       });
+
+    /*
+    DPCT1010:137: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void geset(
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue);
+
+template
+void geset(
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <>
+void geset(
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue)
+{
+    geset(m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)),
+          (sycl::float2 **)Aarray, lda, batch_count, queue);
+}
+
+template <>
+void geset(
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue)
+{
+    geset(m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)),
+          (sycl::double2 **)Aarray, lda, batch_count, queue);
+}
+
+} // namespace batch
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp
new file mode 100644
index 000000000..3a3ea4569
--- /dev/null
+++ b/src/sycl/device_henorm.dp.cpp
@@ -0,0 +1,487 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Finds the largest absolute value of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Uses dynamic shared memory array of length sizeof(real_t) * n.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_maxima
+///     Array of dimension gridDim.x.
+///     On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) )
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void henorm_max_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_maxima, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_max = (real_t*) dynamic_data;
+    if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) {
+        row_max[item_ct1.get_local_id(2)] = 0;
+    }
+
+    // Each thread finds max of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        chunk = i % item_ct1.get_local_range(2);
+
+        scalar_t const* row = &tile[ i ];
+        if (i < item_ct1.get_local_range(2)) {
+            row_max[chunk] = 0;
+        }
+
+        real_t max = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                max = max_nan(max, abs(row[j*lda]));
+            int64_t j = i;
+            max = max_nan(max, abs( real( row[j*lda] ))); // diag (real)
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                max = max_nan(max, abs(row[j*lda]));
+            int64_t j = i;
+            max = max_nan(max, abs( real( row[j*lda] ))); // diag (real)
+        }
+        row_max[chunk] = max_nan(max, row_max[chunk]);
+    }
+
+    // Reduction to find max of tile.
+    /*
+    DPCT1065:29: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2),
+                   row_max, item_ct1);
+    if (item_ct1.get_local_id(2) == 0) {
+        tiles_maxima[item_ct1.get_group(2)] = row_max[0];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each column of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one column.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void henorm_one_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row/column.
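+    // Only one triangle is stored, so the sum for index k combines the
+    // stored part of row k with the stored part of column k; e.g., for
+    // Lower with n = 3, the sum for k = 1 is
+    // abs( A(1,0) ) + abs( real( A(1,1) ) ) + abs( A(2,1) ).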
+    // todo: the row reads are coalesced, but the col reads are not coalesced
+    for (int k = item_ct1.get_local_id(2); k < n;
+         k += item_ct1.get_local_range(2)) {
+        scalar_t const* row    = &tile[ k ];
+        scalar_t const* column = &tile[ lda*k ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < k; ++j) // strictly lower
+                sum += abs(row[j*lda]);
+            int64_t j = k;
+            sum += abs( real( row[j*lda] )); // diag (real)
+            for (int64_t i = k + 1; i < n; ++i) // strictly lower
+                sum += abs(column[i]);
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > k; --j) // strictly upper
+                sum += abs(row[j*lda]);
+            int64_t j = k;
+            sum += abs( real( row[j*lda] )); // diag (real)
+            for (int64_t i = 0; i < k && i < n; ++i) // strictly upper
+                sum += abs(column[i]);
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + k] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void henorm_fro_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal (real)
+            add_sumsq( scale, sumsq, abs( real( row[ i*lda ] ) ) );
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                add_sumsq( scale, sumsq, abs( row[ j*lda ] ) );
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal (real)
+            add_sumsq( scale, sumsq, abs( real( row[ i*lda ] ) ) );
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
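+    // The (scale, sumsq) pairs are assumed to follow the LAPACK lassq
+    // convention: a partial result represents scale^2 * sumsq, with scale
+    // tracking the largest magnitude seen so far, so squaring never
+    // overflows or underflows; combine_sumsq merges two such pairs by
+    // rescaling the smaller one.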
+    /*
+    DPCT1065:30: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is stored in the upper or lower triangle.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///         On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///         On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: for symmetric, same as Norm::One
+///
+///     - Norm::Fro: ldv = 2.
+///         On exit,
+///             values[k*2 + 0] = scale_k
+///             values[k*2 + 1] = sumsq_k
+///         where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///         for 0 <= k < batch_count.
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+    int64_t nb = 512;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:142: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    //---------
+    // max norm
+    if (norm == lapack::Norm::Max) {
+        if (n == 0) {
+            blas::device_memset(values, 0, batch_count, queue);
+        }
+        else {
+            assert(ldv == 1);
+            /*
+            DPCT1083:32: The size of local memory in the migrated code may be
+            different from the original code. Check that the allocated memory
+            size in the migrated code is correct.
+            */
+            size_t shared_mem = sizeof(real_t) * nb;
+            /*
+            DPCT1049:31: The work-group size passed to the SYCL kernel may
+            exceed the limit. To get the device limit, query
+            info::device::max_work_group_size. Adjust the work-group size if
+            needed.
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_max_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:33: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_one_kernel(uplo, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:35: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_fro_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:143: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void henorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue& queue); + +template +void henorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
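+// (In this SYCL port the casts below reinterpret std::complex<float> /
+// std::complex<double> pointer arrays as sycl::float2 / sycl::double2,
+// which are assumed to share the same two-element layout.)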
+template <>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<float> const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    henorm(norm, uplo, n, (sycl::float2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+template <>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<double> const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    henorm(norm, uplo, n, (sycl::double2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp
new file mode 100644
index 000000000..d57370262
--- /dev/null
+++ b/src/sycl/device_synorm.dp.cpp
@@ -0,0 +1,736 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Finds the largest absolute value of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Uses dynamic shared memory array of length sizeof(real_t) * n.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_maxima
+///     Array of dimension gridDim.x.
+///     On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) )
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void synorm_max_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_maxima, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_max = (real_t*) dynamic_data;
+    if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) {
+        row_max[item_ct1.get_local_id(2)] = 0;
+    }
+
+    // Each thread finds max of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        chunk = i % item_ct1.get_local_range(2);
+
+        scalar_t const* row = &tile[ i ];
+        if (i < item_ct1.get_local_range(2)) {
+            row_max[chunk] = 0;
+        }
+
+        real_t max = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                max = max_nan(max, abs(row[j*lda]));
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j >= i; --j) // upper
+                max = max_nan(max, abs(row[j*lda]));
+        }
+        row_max[chunk] = max_nan(max, row_max[chunk]);
+    }
+
+    // Reduction to find max of tile.
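+    // max_nan and max_nan_reduce are assumed to propagate NaN values, so a
+    // NaN anywhere in the tile shows up in the final maximum.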
+    /*
+    DPCT1065:73: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2),
+                   row_max, item_ct1);
+    if (item_ct1.get_local_id(2) == 0) {
+        tiles_maxima[item_ct1.get_group(2)] = row_max[0];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each column of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one column.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void synorm_one_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row/column.
+    // todo: the row reads are coalesced, but the col reads are not coalesced
+    for (int k = item_ct1.get_local_id(2); k < n;
+         k += item_ct1.get_local_range(2)) {
+        scalar_t const* row    = &tile[ k ];
+        scalar_t const* column = &tile[ lda*k ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= k; ++j) // lower
+                sum += abs(row[j*lda]);
+            for (int64_t i = k + 1; i < n; ++i) // strictly lower
+                sum += abs(column[i]);
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j >= k; --j) // upper
+                sum += abs(row[j*lda]);
+            for (int64_t i = 0; i < k && i < n; ++i) // strictly upper
+                sum += abs(column[i]);
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + k] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
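+///     For example, the pair (scale, sumsq) = (3, 5) represents
+///     sum_{i,j} abs( A^(k)_{i,j} )^2 = 3^2 * 5 = 45.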
+///
+template <typename scalar_t>
+void synorm_fro_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal
+            add_sumsq( scale, sumsq, abs( row[ i*lda ] ) );
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal
+            add_sumsq( scale, sumsq, abs( row[ i*lda ] ) );
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
+    /*
+    DPCT1065:74: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is stored in the upper or lower triangle.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///         On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///         On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: for symmetric, same as Norm::One
+///
+///     - Norm::Fro: ldv = 2.
+///         On exit,
+///             values[k*2 + 0] = scale_k
+///             values[k*2 + 1] = sumsq_k
+///         where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///         for 0 <= k < batch_count.
+/// +/// @param[in] ldv +/// Leading dimension of values array. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void synorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + using real_t = blas::real_type; + int64_t nb = 512; + + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:172: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:76: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:75: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_max_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:77: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_one_kernel(uplo, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:79: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:78: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_fro_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:173: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +const int ib = 32; +const int ib1 = 33; + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each row and each column of elements, +/// for each tile in tiles. +/// Each thread block deals with one tile. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by synormOffdiag(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, +/// tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) +/// for column j of tile A^(k), and +/// tiles_sums[k*ldv + i + n] = sum_{j} abs( A^(k)_(i, j) ) +/// for row i of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void synorm_offdiag_one_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + // row_sums doesn't need to be shared, it could be in registers, + // but we don't know how large it is beforehand -- each thread uses + // ceil(m/ib) entries; in total it is ceil(m/ib)*ib entries. + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + real_t* row_sums = &shmem_tile[ ib1*ib ]; + const int k = item_ct1.get_local_id(2); + + // Initialize row sums. + for (int64_t ii = 0; ii < m; ii += ib) { + row_sums[ ii+k ] = 0; + } + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t sum = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32 x 32 (ib x ib) sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:80: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread sums one column. + for (int64_t i = 0; i < ib; ++i) + if (ii+i < m) + sum += shmem_tile[ k*ib1 + i ]; + + // Each thread sums one row. 
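+            // (row_sums accumulates across all jj passes; the ib1 = ib + 1
+            // stride of shmem_tile is assumed to be padding that avoids
+            // shared-memory bank conflicts, as in the transpose kernels.)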
+            for (int64_t j = 0; j < ib; ++j)
+                if (jj+j < n)
+                    row_sums[ ii+k ] += shmem_tile[ j*ib1 + k ];
+            /*
+            DPCT1065:81: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier(); // done with shmem_tile
+        }
+
+        if (jj+k < n)
+            tiles_sums[item_ct1.get_group(2) * ldv + jj + k] = sum;
+    }
+
+    // Save row sums.
+    for (int64_t ii = 0; ii < m; ii += ib) {
+        if (ii+k < m)
+            tiles_sums[item_ct1.get_group(2) * ldv + ii + k + n] = row_sums[ii + k];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+/// Used for full, off-diagonal tiles within a symmetric matrix,
+/// where element Aij contributes to both column i and j.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::One: ldv >= n + m.
+///         On exit,
+///             values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///             for 0 <= k < batch_count, 0 <= j < n, and
+///             values[k*ldv + n + i] = sum_{j} abs( A^(k)_(i, j) )
+///             for 0 <= k < batch_count, 0 <= i < m.
+///
+///     - Norm::Inf: same as Norm::One.
+///
+///     Norm::Max and Norm::Fro are not implemented here; see synorm().
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:174: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    //---------
+    // one norm
+    if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) {
+        assert(ldv >= n);
+        size_t shared_mem
+            /*
+            DPCT1083:82: The size of local memory in the migrated code may be
+            different from the original code. Check that the allocated memory
+            size in the migrated code is correct.
+            */
+            = sizeof(real_t) * (ib * ib1 + roundup(m, int64_t(ib)));
+        assert( shared_mem <= 48*1024 ); // max 48 KiB
+        ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                      sycl::range<3>(1, 1, 32),
+                                  sycl::range<3>(1, 1, 32)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    synorm_offdiag_one_kernel(m, n, Aarray, lda, values, ldv,
+                                              item_ct1,
+                                              dpct_local_acc_ct1.get_pointer());
+                });
+        });
+    }
+    else {
+        slate_not_implemented("Only Norm::One and Norm::Inf are supported.");
+    }
+
+    /*
+    DPCT1010:175: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    float const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue);
+
+template
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    double const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <>
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<float> const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    synorm(norm, uplo, n, (sycl::float2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+template <>
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<double> const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    synorm(norm, uplo, n, (sycl::double2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    float const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue);
+
+template
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    double const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <> +void synormOffdiag( + lapack::Norm norm, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, + int64_t batch_count, + blas::Queue &queue) +{ + synormOffdiag(norm, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void synormOffdiag( + lapack::Norm norm, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, + int64_t batch_count, + blas::Queue &queue) + +{ + synormOffdiag(norm, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_transpose.dp.cpp b/src/sycl/device_transpose.dp.cpp new file mode 100644 index 000000000..33d7007eb --- /dev/null +++ b/src/sycl/device_transpose.dp.cpp @@ -0,0 +1,918 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +/// internal blocking +/// 16 x 16 thread block = 256 threads +/// 32 x 32 thread block = 1024 threads +static const int ib = 16; + +//------------------------------------------------------------------------------ +/// Device routine handles one matrix. +/// Thread block grid: +/// x = batch index (ignored here; see batch kernel), +/// y = block row index, +/// z = block col index. +/// Each thread block is ib-by-ib threads and does one ib-by-ib block of an +/// n-by-n matrix. +/// +/// Let nt = ceildiv( n, ib ) be the number of blocks for one n-by-n matrix. +/// An even number of blocks uses an (nt + 1) by (nt/2) grid. +/// Example: for nt = 4 blocks, y by z = 5 by 2 grid: +/// [ A00 A01 ] +/// [----. A11 ] [ A10 . | . . ] +/// [ A10 '----] [ A20 A21 | . . ] +/// [ A20 A21 ] covers matrix as [ A30 A31 | A00 . ] +/// [ A30 A31 ] [ A40 A41 | A01 A11 ] +/// [ A40 A41 ] +/// +/// An odd number of blocks uses an (nt) by (nt + 1)/2 grid. +/// Example: for nt = 5 blocks, y by z = 5 by 3 grid: +/// [ A00 | A01 A02 ] +/// [ '----. ] [ A00 . . | . . ] +/// [ A10 A11 | A12 ] [ A10 A11 . | . . ] +/// [ '-----] covers matrix as [ A20 A21 A22 | . . ] +/// [ A20 A21 A22 ] [ A30 A31 A32 | A01 . ] +/// [ A30 A31 A32 ] [ A40 A41 A42 | A02 A12 ] +/// [ A40 A41 A42 ] +/// +template +void transpose_func( + bool is_conj, + int n, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1, + sycl::local_accessor sA1, sycl::local_accessor sA2, + sycl::local_accessor sA) +{ + // +1 to avoid memory bank conflicts. + + // i, j are row & column indices of top-left corner of each block. + // ii, jj are row & column offsets within each block. + int ii = item_ct1.get_local_id(2); + int jj = item_ct1.get_local_id(1); + + int i, j; + if (item_ct1.get_group_range(1) - 1 == item_ct1.get_group_range(0) * 2) { + // Even number of blocks. + //assert( ceildiv(n, ib) % 2 == 0 ); + bool lower = (item_ct1.get_group(1) > item_ct1.get_group(0)); + i = (lower ? (item_ct1.get_group(1) - 1) + : (item_ct1.get_group(0) + item_ct1.get_group_range(0))); + j = (lower ? (item_ct1.get_group(0)) + : (item_ct1.get_group(1) + item_ct1.get_group_range(0))); + } + else { + // Odd number of blocks. 
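+        // Mirror of the even case: blocks on one side of the grid diagonal
+        // map directly to tile pairs; the rest are folded over to cover the
+        // remaining pairs, per the nt = 5 sketch in the comment above.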
+ //assert( ceildiv(n, ib) % 2 == 1 ); + bool lower = (item_ct1.get_group(1) >= item_ct1.get_group(0)); + i = (lower ? item_ct1.get_group(1) + : (item_ct1.get_group(0) + item_ct1.get_group_range(0) - 1)); + j = (lower ? item_ct1.get_group(0) + : (item_ct1.get_group(1) + item_ct1.get_group_range(0))); + } + i *= ib; + j *= ib; + + scalar_t* A1 = A + i + ii + (j + jj)*lda; // A(i, j) + if (i == j) { // diagonal block + // Load block A(i, j) into shared memory sA1. + if (i + ii < n && j + jj < n) { + sA1[jj][ii] = *A1; + } + /* + DPCT1065:62: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // Save transposed block, A(i, j) = trans(sA1). + if (i + ii < n && j + jj < n) { + if (is_conj) + *A1 = conj(sA1[ii][jj]); + else + *A1 = sA1[ii][jj]; + } + } + else { // off-diagonal block + scalar_t* A2 = A + j + ii + (i + jj)*lda; // A(j, i) + // Load blocks A(i, j) and A(j, i) into shared memory sA1 and sA2. + if (i + ii < n && j + jj < n) { + sA1[jj][ii] = *A1; + } + if (j + ii < n && i + jj < n) { + sA2[jj][ii] = *A2; + } + /* + DPCT1065:63: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // Save transposed blocks, A(i, j) = trans(sA2), A(j, i) = trans(sA1). + if (i + ii < n && j + jj < n) { + if (is_conj) + *A1 = conj(sA2[ii][jj]); + else + *A1 = sA2[ii][jj]; + } + if (j + ii < n && i + jj < n) { + if (is_conj) + *A2 = conj(sA1[ii][jj]); + else + *A2 = sA1[ii][jj]; + } + } +} + +//------------------------------------------------------------------------------ +static const int NB = 32; ///< block size for transpose_func +static const int NY = 8; ///< y dim of thread block size for transpose_func +// static const int NX = 32; handled as template parameter, look below + + +/// tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB. +/// uses NX-by-NY threads, where NB/NX, NB/NY, NX/NY evenly. 
+/// subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB +/// for each subtile +/// load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY +/// save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY +/// A += NX +/// AT += NX*ldat +/// +/// e.g., with NB=32, NX=32, NY=8 ([sdc] precisions) +/// load 32x32 subtile as 4 blocks of 32x8 columns: (A11 A12 A13 A14 ) +/// save 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14) +/// +/// e.g., with NB=32, NX=16, NY=8 (z precision) +/// load 16x32 subtile as 4 blocks of 16x8 columns: (A11 A12 A13 A14) +/// save 32x16 subtile as 2*2 blocks of 16x8 columns: (AT11 AT12) +/// (AT21 AT22) +/// +template +void transpose_func( + bool is_conj, + int m, int n, + const scalar_t *A, int64_t lda, + scalar_t *AT, int64_t ldat, const sycl::nd_item<3> &item_ct1, + sycl::local_accessor sA1, + sycl::local_accessor sA2, + sycl::local_accessor sA) +{ + + int tx = item_ct1.get_local_id(2); + int ty = item_ct1.get_local_id(1); + int iby = item_ct1.get_group(1) * NB; + int ibz = item_ct1.get_group(0) * NB; + int i, j; + + A += iby + tx + (ibz + ty)*lda; + AT += ibz + tx + (iby + ty)*ldat; + + #pragma unroll + for (int tile=0; tile < NB/NX; ++tile) { + // load NX-by-NB subtile transposed from A into sA + i = iby + tx + tile*NX; + j = ibz + ty; + if (i < m) { + if (is_conj) { + #pragma unroll + for (int j2=0; j2 < NB; j2 += NY) { + if (j + j2 < n) { + sA[ty + j2][tx] = conj(A[j2*lda]); + } + } + } + else { + #pragma unroll + for (int j2=0; j2 < NB; j2 += NY) { + if (j + j2 < n) { + sA[ty + j2][tx] = A[j2*lda]; + } + } + } + } + /* + DPCT1065:64: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // save NB-by-NX subtile from sA into AT + i = ibz + tx; + j = iby + ty + tile*NX; + #pragma unroll + for (int i2=0; i2 < NB; i2 += NX) { + if (i + i2 < n) { + #pragma unroll + for (int j2=0; j2 < NX; j2 += NY) { + if (j + j2 < m) { + AT[i2 + j2*ldat] = sA[tx + i2][ty + j2]; + } + } + } + } + /* + DPCT1065:65: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. 
+        */
+        item_ct1.barrier();
+
+        // move to next subtile
+        A += NX;
+        AT += NX*ldat;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// in-place transpose of a square buffer
+template <typename scalar_t>
+void transpose_kernel(
+    bool is_conj,
+    int n,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func(is_conj, n, A, lda, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// in-place transpose of array of square buffers
+template <typename scalar_t>
+void transpose_batch_kernel(
+    bool is_conj,
+    int n,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func(is_conj, n, Aarray[item_ct1.get_group(2)], lda, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// out-of-place transpose of a rectangular buffer
+/// transposes A onto AT
+///
+template <int NX, typename scalar_t>
+void transpose_kernel(
+    bool is_conj,
+    int m, int n,
+    const scalar_t *A, int64_t lda,
+    scalar_t *AT, int64_t ldat, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func<NX>(is_conj, m, n, A, lda, AT, ldat, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// out-of-place transpose of an array of rectangular buffers
+/// transposes dA_array onto dAT_array
+///
+template <int NX, typename scalar_t>
+void transpose_batch_kernel(
+    bool is_conj,
+    int m, int n,
+    scalar_t **dA_array, int64_t lda,
+    scalar_t **dAT_array, int64_t ldat, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func<NX>(is_conj, m, n, dA_array[item_ct1.get_group(2)],
+                       lda, dAT_array[item_ct1.get_group(2)], ldat,
+                       item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// Physically transpose a square matrix in place.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in,out] A
+///     A square n-by-n matrix stored in an lda-by-n array in GPU memory.
+///     On output, A is transposed.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= n.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void transpose(
+    bool is_conj,
+    int64_t n,
+    scalar_t* A, int64_t lda,
+    blas::Queue& queue)
+{
+    if (n <= 1)
+        return;
+    assert(lda >= n);
+
+    /*
+    DPCT1093:158: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    int nt = ceildiv( n, int64_t(ib) );
+    assert(nt <= 65535); // CUDA limitation
+
+    // Need 1/2 * (nt + 1) * nt to cover lower triangle and diagonal of matrix.
+    // Block assignment differs depending on whether nt is odd or even.
+    sycl::range<3> blocks(1, 1, 1);
+    if (nt % 2 == 0) {
+        // even blocks
+        blocks = sycl::range<3>(uint(nt / 2), uint(nt + 1), 1);
+    }
+    else {
+        // odd blocks
+        blocks = sycl::range<3>(uint((nt + 1) / 2), uint(nt), 1);
+    }
+    sycl::range<3> threads(1, ib, ib);
+
+    /*
+    DPCT1049:66: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
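+    (Here the work-group is ib x ib = 16 x 16 = 256 work-items, which should
+    be within the limit on current devices.)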
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:176: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:177: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:178: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:179: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:180: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_kernel(is_conj, n, A, lda, item_ct1, sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:159: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Physically transpose a batch of square matrices in place. +/// +/// @param[in] n +/// Number of rows and columns of each tile. n >= 0. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each Aarray[k] is a square n-by-n matrix stored in an +/// lda-by-n array in GPU memory. +/// On output, each Aarray[k] is transposed. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= n. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose_batch( + bool is_conj, + int64_t n, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + if (batch_count < 0 || n <= 1) + return; + assert(lda >= n); + + /* + DPCT1093:160: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + int nt = ceildiv( n, int64_t(ib) ); + assert(nt <= 65535); // CUDA limitation + assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1 + + // Need 1/2 * (nt + 1) * nt to cover lower triangle and diagonal of matrix. + // Block assignment differs depending on whether nt is odd or even. + sycl::range<3> blocks(1, 1, 1); + if (nt % 2 == 0) { + // even blocks + blocks = sycl::range<3>(uint(nt / 2), uint(nt + 1), uint(batch_count)); + } + else { + // odd blocks + blocks = sycl::range<3>(uint((nt + 1) / 2), uint(nt), uint(batch_count)); + } + sycl::range<3> threads(1, ib, ib); + + /* + DPCT1049:67: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:181: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:182: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:183: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:184: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:185: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_batch_kernel(is_conj, n, Aarray, lda, + item_ct1, sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:161: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Look up NX based on data type. +/// float, double, complex-float use NX = 32. +template +struct nx_traits +{ + static const int NX = 32; +}; + +template <> struct nx_traits +{ + // static const int NX = 16; + static const int NX = 32; // always use 32 for SYCL +}; + +//------------------------------------------------------------------------------ +/// Physically transpose a rectangular matrix out-of-place. +/// +/// @param[in] m +/// Number of columns of tile. m >= 0. +/// +/// @param[in] n +/// Number of rows of tile. n >= 0. +/// +/// @param[in] dA +/// A rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of dA. lda >= m. +/// +/// @param[out] dAT +/// A rectangular m-by-n matrix stored in an ldat-by-m array in GPU memory. +/// On output, dAT is the transpose of dA. +/// +/// @param[in] ldat +/// Leading dimension of dAT. ldat >= n. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + scalar_t* dA, int64_t lda, + scalar_t* dAT, int64_t ldat, + blas::Queue& queue) +{ + const int NX = nx_traits::NX; + + if ((m <= 0) || (n <= 0)) + return; + assert(lda >= m); + assert(ldat >= n); + + /* + DPCT1093:162: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + int mt = ceildiv( m, int64_t(NB) ); + assert(mt <= 65535); // CUDA limitation + int nt = ceildiv( n, int64_t(NB) ); + assert(nt <= 65535); // CUDA limitation + + sycl::range<3> grid(nt, mt, 1); + sycl::range<3> threads(1, NY, NX); + /* + DPCT1049:68: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:186: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:187: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:188: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:189: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:190: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_kernel( + is_conj, m, n, dA, lda, dAT, ldat, item_ct1, + sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:163: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Physically transpose a batch of rectangular matrices out-of-place. +/// +/// @param[in] m +/// Number of columns of each tile. m >= 0. +/// +/// @param[in] n +/// Number of rows of each tile. n >= 0. +/// +/// @param[in] dA_array +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an +/// lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each dA_array[k] tile. lda >= m. +/// +/// @param[out] dAT_array +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each dAT_array[k] is a rectangular m-by-n matrix stored in an +/// ldat-by-m array in GPU memory. +/// On output, each dAT_array[k] is the transpose of dA_array[k]. +/// +/// @param[in] ldat +/// Leading dimension of each dAT_array[k] tile. ldat >= n. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + scalar_t **dA_array, int64_t lda, + scalar_t **dAT_array, int64_t ldat, + int64_t batch_count, + blas::Queue& queue) +{ + const int NX = nx_traits::NX; + + if ((m <= 0) || (n <= 0)) + return; + assert(lda >= m); + assert(ldat >= n); + + /* + DPCT1093:164: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + dpct::select_device(queue.device()); + + int mt = ceildiv( m, int64_t(NB) ); + assert(mt <= 65535); // CUDA limitation + int nt = ceildiv( n, int64_t(NB) ); + assert(nt <= 65535); // CUDA limitation + assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1 + + sycl::range<3> grid(nt, mt, uint(batch_count)); + sycl::range<3> threads(1, NY, NX); + /* + DPCT1049:69: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:191: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:192: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:193: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:194: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:195: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_batch_kernel( + is_conj, m, n, dA_array, lda, dAT_array, ldat, + item_ct1, sA1_acc_ct1, sA2_acc_ct1, + sA_acc_ct1); + }); + }); + + /* + DPCT1010:165: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void transpose( + bool is_conj, + int64_t n, + float* A, int64_t lda, + blas::Queue& queue); + +template +void transpose( + bool is_conj, + int64_t n, + double* A, int64_t lda, + blas::Queue& queue); + +//----- rectangular, out-of-place +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + float* A, int64_t lda, + float* B, int64_t ldb, + blas::Queue& queue); + +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + double* A, int64_t lda, + double* B, int64_t ldb, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
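+// Here sycl::float2 / sycl::double2 play the role cuComplex plays in the
+// CUDA sources. The casts assume std::complex<T> and the SYCL vector type
+// are layout-compatible (both store two contiguous Ts); a guard such as
+//     static_assert( sizeof(std::complex<float>) == sizeof(sycl::float2),
+//                    "std::complex<float> must match sycl::float2" );
+// could check that assumption.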
+template <> +void transpose( + bool is_conj, + int64_t n, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + transpose(is_conj, n, (sycl::float2 *)A, lda, queue); +} + +template <> +void transpose( + bool is_conj, + int64_t n, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + transpose(is_conj, n, (sycl::double2 *)A, lda, queue); +} + +template <> +void transpose( + bool is_conj, + int64_t m, int64_t n, + std::complex* A, int64_t lda, + std::complex* B, int64_t ldb, + blas::Queue& queue) +{ + transpose(is_conj, m, n, (sycl::float2 *)A, lda, (sycl::float2 *)B, ldb, + queue); +} + +template <> +void transpose( + bool is_conj, + int64_t m, int64_t n, + std::complex* A, int64_t lda, + std::complex* B, int64_t ldb, + blas::Queue& queue) +{ + transpose(is_conj, m, n, (sycl::double2 *)A, lda, (sycl::double2 *)B, ldb, + queue); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void transpose_batch( + bool is_conj, + int64_t n, + float** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue); + +template +void transpose_batch( + bool is_conj, + int64_t n, + double** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue); + +//----- rectangular, out-of-place +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + float** Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue); + +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + double** Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void transpose_batch( + bool is_conj, + int64_t n, + std::complex** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, n, (sycl::float2 **)Aarray, lda, batch_count, queue); +} + +template <> +void transpose_batch( + bool is_conj, + int64_t n, + std::complex** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, n, (sycl::double2 **)Aarray, lda, batch_count, + queue); +} + +//----- rectangular, out-of-place +template <> +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + std::complex** Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, m, n, (sycl::float2 **)Aarray, lda, + (sycl::float2 **)Barray, ldb, batch_count, queue); +} + +template <> +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + std::complex** Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, m, n, (sycl::double2 **)Aarray, lda, + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp new file mode 100644 index 000000000..f0b802937 --- /dev/null +++ b/src/sycl/device_trnorm.dp.cpp @@ -0,0 +1,632 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Finds the largest absolute value of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Uses dynamic shared memory array of length sizeof(real_t) * m. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by trnorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_maxima +/// Array of dimension gridDim.x. +/// On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for tile A^(k). +/// +template +void trnorm_max_kernel( + lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_maxima, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + int chunk; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_max = (real_t*) dynamic_data; + + if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) { + row_max[item_ct1.get_local_id(2)] = 0; + } + // Each thread finds max of one row. + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + chunk = i % item_ct1.get_local_range(2); + + scalar_t const* row = &tile[ i ]; + + real_t max = 0; + if (uplo == lapack::Uplo::Lower) { + if (diag == lapack::Diag::Unit) { + if (i < n) // diag + max = 1; + for (int64_t j = 0; j < i && j < n; ++j) // strictly lower + max = max_nan(max, abs(row[j*lda])); + } + else { + for (int64_t j = 0; j <= i && j < n; ++j) // lower + max = max_nan(max, abs(row[j*lda])); + } + } + else { + // Loop backwards (n-1 down to i) to maintain coalesced reads. + if (diag == lapack::Diag::Unit) { + if (i < n) // diag + max = 1; + for (int64_t j = n-1; j > i; --j) // strictly upper + max = max_nan(max, abs(row[j*lda])); + } + else { + for (int64_t j = n-1; j >= i; --j) // upper + max = max_nan(max, abs(row[j*lda])); + } + } + + row_max[chunk] = max_nan(max, row_max[chunk]); + } + + // Reduction to find max of tile. + /* + DPCT1065:51: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2), + row_max, item_ct1); + if (item_ct1.get_local_id(2) == 0) { + tiles_maxima[item_ct1.get_group(2)] = row_max[0]; + } +} + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each column of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one column. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by trnorm(). 
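+///
+/// For reference, a serial sketch of what one tile's column sums compute
+/// (lower, non-unit case; illustrative only):
+/// @code
+///     for (int64_t j = 0; j < n; ++j) {
+///         real_t sum = 0;
+///         for (int64_t i = j; i < m; ++i) // lower trapezoid of column j
+///             sum += abs( tile[ i + j*lda ] );
+///         tiles_sums[ k*ldv + j ] = sum;
+///     }
+/// @endcode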
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void trnorm_one_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one column.
+    // todo: this doesn't do coalesced reads
+    for (int j = item_ct1.get_local_id(2); j < n;
+         j += item_ct1.get_local_range(2)) {
+
+        scalar_t const* column = &tile[ lda*j ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (j < m) // diag
+                    sum += 1;
+                for (int64_t i = j+1; i < m; ++i) // strictly lower
+                    sum += abs(column[i]);
+            }
+            else {
+                for (int64_t i = j; i < m; ++i) // lower
+                    sum += abs(column[i]);
+            }
+        }
+        else {
+            if (diag == lapack::Diag::Unit) {
+                if (j < m) // diag
+                    sum += 1;
+                for (int64_t i = 0; i < j && i < m; ++i) // strictly upper
+                    sum += abs(column[i]);
+            }
+            else {
+                for (int64_t i = 0; i <= j && i < m; ++i) // upper
+                    sum += abs(column[i]);
+            }
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + j] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each row of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row.
+/// Kernel assumes non-trivial tiles (m, n >= 1).
+/// Launched by trnorm().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) )
+///     for row i of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void trnorm_inf_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row.
+    // This does coalesced reads of one column at a time in parallel.
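+    // (For fixed j, threads tid, tid+1, ... access tile[ i + j*lda ] at
+    // consecutive addresses i, giving the coalesced pattern noted above.)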
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t const* row = &tile[ i ];
+        real_t sum = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    sum += 1;
+                for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                    sum += abs(row[j*lda]);
+            }
+            else {
+                for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                    sum += abs(row[j*lda]);
+            }
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    sum += 1;
+                for (int64_t j = n-1; j > i; --j) // strictly upper
+                    sum += abs(row[j*lda]);
+            }
+            else {
+                for (int64_t j = n-1; j >= i; --j) // upper
+                    sum += abs(row[j*lda]);
+            }
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + i] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (m, n >= 1).
+/// Launched by trnorm().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void trnorm_fro_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    add_sumsq(scale, sumsq, real_t(1));
+                for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+            else {
+                for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
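+            // (add_sumsq maintains scale^2 * sumsq == the sum of squares
+            // absorbed so far, in the style of LAPACK's lassq; e.g. absorbing
+            // 3 then 4 yields scale = 4, sumsq = 1.5625, and 16 * 1.5625 == 25.)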
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    add_sumsq(scale, sumsq, real_t(1));
+                for (int64_t j = n-1; j > i; --j) // strictly upper
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+            else {
+                for (int64_t j = n-1; j >= i; --j) // upper
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
+    /*
+    DPCT1065:52: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each trapezoidal tile.
+///
+/// todo: rename to tznorm for consistency with other tz routines.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is upper or lower trapezoidal.
+///
+/// @param[in] diag
+///     Whether or not each Aarray[k] has unit diagonal.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///       On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///       On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: ldv >= m.
+///       On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count, 0 <= i < m.
+///
+///     - Norm::Fro: ldv = 2.
+///       On exit,
+///           values[k*2 + 0] = scale_k
+///           values[k*2 + 1] = sumsq_k
+///       where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///       for 0 <= k < batch_count.
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void trnorm(
+    lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+    int64_t nb = 512;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:150: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+ */ + dpct::select_device(queue.device()); + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:54: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:53: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_max_kernel(uplo, diag, m, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:55: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_one_kernel(uplo, diag, m, n, Aarray, lda, values, + ldv, item_ct1); + }); + } + } + //--------- + // inf norm + else if (norm == lapack::Norm::Inf) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * m, queue); + } + else { + assert(ldv >= m); + /* + DPCT1049:56: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_inf_kernel(uplo, diag, m, n, Aarray, lda, values, + ldv, item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:58: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:57: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_fro_kernel(uplo, diag, m, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:151: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +template +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + trnorm(norm, uplo, diag, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + trnorm(norm, uplo, diag, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp new file mode 100644 index 000000000..e588fb4c9 --- /dev/null +++ b/src/sycl/device_tzadd.dp.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile addition. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by tzadd(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[in,out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. 
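+/// Only the upper or lower trapezoid selected by uplo is updated;
+/// the rest of each Barray[k] is not referenced.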
+/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void tzadd_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + scalar_t alpha, scalar_t** Aarray, int64_t lda, + scalar_t beta, scalar_t** Barray, int64_t ldb, + const sycl::nd_item<3> &item_ct1) +{ + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t* rowB = &tileB[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + rowB[j*ldb] = alpha * rowA[j*lda] + beta * rowB[ j*ldb ]; + // rowB[j * ldb] = dpct_operator_overloading::operator+( + // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), + // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + } + } + else { + for (int64_t j = n-1; j >= i; --j) { // upper + rowB[j*ldb] = alpha * rowA[ j*lda ] + beta * rowB[ j*ldb ]; + // rowB[j * ldb] = dpct_operator_overloading::operator+( + // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), + // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + } + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal tile addition. +/// Sets upper or lower part of +/// \[ +/// Barray[k] = \alpha Aarray[k] + \beta Barray[k]. +/// \] +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] alpha +/// The scalar alpha. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] beta +/// The scalar beta. +/// +/// @param[in,out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + scalar_t const& alpha, scalar_t** Aarray, int64_t lda, + scalar_t const& beta, scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:138: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
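+    (Here nthreads = min( 1024, m ), so the launch should already respect
+    the usual 1024 work-item limit on current devices.)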
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzadd_kernel(uplo, m, n, alpha, Aarray, lda, beta, + Barray, ldb, item_ct1); + }); + + /* + DPCT1010:139: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const& alpha, float** Aarray, int64_t lda, + float const& beta, float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const& alpha, double** Aarray, int64_t lda, + double const& beta, double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzadd(uplo, m, n, sycl::float2(real(alpha), imag(alpha)), + (sycl::float2 **)Aarray, lda, sycl::float2(real(beta), imag(beta)), + (sycl::float2 **)Barray, ldb, batch_count, queue); +} + +template <> +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzadd(uplo, m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 **)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzcopy.dp.cpp b/src/sycl/device_tzcopy.dp.cpp new file mode 100644 index 000000000..586ba697f --- /dev/null +++ b/src/sycl/device_tzcopy.dp.cpp @@ -0,0 +1,246 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing copy and precision conversions, copying A to B. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by tzcopy(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. 
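+/// Only the upper or lower trapezoid selected by uplo is written;
+/// each element is converted from src_scalar_t to dst_scalar_t by the
+/// device copy() helper.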
+/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void tzcopy_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, const sycl::nd_item<3> &item_ct1) +{ + src_scalar_t const *tileA = Aarray[item_ct1.get_group(2)]; + dst_scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + src_scalar_t const* rowA = &tileA[ i ]; + dst_scalar_t* rowB = &tileB[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + copy(rowA[j*lda], rowB[j*ldb]); + } + } + else { + for (int64_t j = n-1; j >= i; --j) { // upper + copy(rowA[j*lda], rowB[j*ldb]); + } + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal copy and precision conversion, +/// copying A to B. Sets upper or lower part of +/// \[ +/// Barray[k] = Aarray[k]. +/// \] +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:170: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:72: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzcopy_kernel(uplo, m, n, Aarray, lda, Barray, ldb, + item_ct1); + }); + + /* + DPCT1010:171: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. 
+ +// float => float +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// float => double +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => double +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => float +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. + +// complex-float => complex-float +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::float2 **)Aarray, lda, (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +// complex-float => complex-double +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::float2 **)Aarray, lda, (sycl::double2 **)Barray, + ldb, batch_count, queue); +} + +// complex-double => complex-double +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::double2 **)Aarray, lda, (sycl::double2 **)Barray, + ldb, batch_count, queue); +} + +// complex-double => complex-float +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::double2 **)Aarray, lda, (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp new file mode 100644 index 000000000..61df572f2 --- /dev/null +++ b/src/sycl/device_tzscale.dp.cpp @@ -0,0 +1,197 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] numer +/// Scale value numerator. 
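+/// (The factor actually applied is numer / denom, formed in real arithmetic.)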
+/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void tzscale_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + blas::real_type numer, blas::real_type denom, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + blas::real_type mul = numer / denom; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + rowA[j*lda] = rowA[j*lda] * mul; + // rowA[j * lda] = + // dpct_operator_overloading::operator*(rowA[j * lda], mul); + } + } + else { + for (int64_t j = n-1; j >= i; --j) // upper + rowA[j*lda] = rowA[j*lda] * mul; + // rowA[j * lda] = + // dpct_operator_overloading::operator*(rowA[j * lda], mul); + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal tile scale. +/// Sets upper or lower part of +/// \[ +/// Aarray[k] *= (numer / denom). +/// \] +/// This does NOT currently take extra care to avoid over/underflow. +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzscale( + lapack::Uplo uplo, + int64_t m, int64_t n, + blas::real_type numer, blas::real_type denom, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:132: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzscale_kernel(uplo, m, n, numer, denom, Aarray, lda, + item_ct1); + }); + + /* + DPCT1010:133: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. 
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float numer, float denom, float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue);
+
+template
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double numer, double denom, double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
+template <>
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float numer, float denom,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue)
+{
+    tzscale(uplo, m, n, numer, denom, (sycl::float2 **)Aarray, lda, batch_count,
+            queue);
+}
+
+template <>
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double numer, double denom,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue)
+{
+    tzscale(uplo, m, n, numer, denom, (sycl::double2 **)Aarray, lda,
+            batch_count, queue);
+}
+
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_tzset.dp.cpp b/src/sycl/device_tzset.dp.cpp
new file mode 100644
index 000000000..932d1c901
--- /dev/null
+++ b/src/sycl/device_tzset.dp.cpp
@@ -0,0 +1,329 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+#include <cstdio>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Device function implementing element-wise tile set.
+/// Each thread block deals with one tile. gridDim.x == batch_count.
+/// Each thread deals with one row.
+/// Called by tzset_kernel and tzset_batch_kernel.
+///
+/// @copydoc tzset
+///
+template <typename scalar_t>
+void tzset_func(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= i && j < n; ++j) { // lower
+                rowA[ j*lda ] = i == j ? diag_value : offdiag_value;
+            }
+        }
+        else {
+            for (int64_t j = n-1; j >= i; --j) { // upper
+                rowA[ j*lda ] = i == j ? diag_value : offdiag_value;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc tzset
+template <typename scalar_t>
+void tzset_kernel(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    tzset_func(uplo, m, n, offdiag_value, diag_value, A, lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise batched tile set.
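+/// Each work-group handles one tile of the batch; the group index selects
+/// Aarray[k].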
+/// @copydoc tzset_batch
+template <typename scalar_t>
+void tzset_batch_kernel(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    tzset_func(uplo, m, n, offdiag_value, diag_value,
+               Aarray[item_ct1.get_group(2)], lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Element-wise trapezoidal tile set.
+/// Sets upper or lower part of A to
+/// diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] uplo
+///     Whether A is upper or lower trapezoidal.
+///
+/// @param[in] m
+///     Number of rows of A. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of A. n >= 0.
+///
+/// @param[in] offdiag_value
+///     Constant to set offdiagonal entries to.
+///
+/// @param[in] diag_value
+///     Constant to set diagonal entries to.
+///
+/// @param[out] A
+///     An m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t* A, int64_t lda,
+    blas::Queue& queue )
+{
+    /*
+    DPCT1093:166: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:70: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           tzset_kernel(uplo, m, n, offdiag_value, diag_value,
+                                        A, lda, item_ct1);
+                       });
+
+    /*
+    DPCT1010:167: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float* A, int64_t lda,
+    blas::Queue& queue );
+
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double* A, int64_t lda,
+    blas::Queue& queue );
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
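+//
+// Note: these casts assume std::complex<T> and sycl::vec<T, 2> share the same
+// object layout; a compile-time check (sketch only, not part of this patch)
+// could be:
+//     static_assert( sizeof(std::complex<double>) == sizeof(sycl::double2),
+//                    "std::complex<double> and sycl::double2 must match" );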
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>* A, int64_t lda,
+    blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)), (sycl::float2 *)A,
+          lda, queue);
+}
+
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>* A, int64_t lda,
+    blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)), (sycl::double2 *)A,
+          lda, queue);
+}
+
+//==============================================================================
+namespace batch {
+
+//------------------------------------------------------------------------------
+/// Batched routine for element-wise trapezoidal tile set.
+/// Sets upper or lower part of Aarray[k] to
+/// diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is upper or lower trapezoidal.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] offdiag_value
+///     Constant to set offdiagonal entries to.
+///
+/// @param[in] diag_value
+///     Constant to set diagonal entries to.
+///
+/// @param[out] Aarray
+///     Array in GPU memory of dimension batch_count, containing
+///     pointers to tiles, where each Aarray[k] is an m-by-n matrix
+///     stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile in Aarray. lda >= m.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:168: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:71: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                             sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           tzset_batch_kernel(uplo, m, n, offdiag_value,
+                                              diag_value, Aarray, lda,
+                                              item_ct1);
+                       });
+
+    /*
+    DPCT1010:169: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
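+//
+// Usage sketch (illustration only; mb, nb, d_tiles, ldda, batch_count, and
+// queue are assumed to be defined by the caller). Setting each
+// lower-trapezoidal double tile to identity:
+//     slate::device::batch::tzset( lapack::Uplo::Lower, mb, nb,
+//                                  0.0, 1.0, d_tiles, ldda,
+//                                  batch_count, queue );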
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue );
+
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue );
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)),
+          (sycl::float2 **)Aarray, lda, batch_count, queue);
+}
+
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)),
+          (sycl::double2 **)Aarray, lda, batch_count, queue);
+}
+
+} // namespace batch
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp
new file mode 100644
index 000000000..6a3ded79c
--- /dev/null
+++ b/src/sycl/device_util.dp.hpp
@@ -0,0 +1,1342 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#ifndef SLATE_DEVICE_UTIL_DP_HPP
+#define SLATE_DEVICE_UTIL_DP_HPP
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <cstdio>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// max that propagates nan consistently:
+///     max_nan( 1, nan ) = nan
+///     max_nan( nan, 1 ) = nan
+template <typename real_t>
+
+inline real_t max_nan(real_t x, real_t y)
+{
+    return (sycl::isnan(y) || (y) >= (x) ? (y) : (x));
+}
+
+//------------------------------------------------------------------------------
+/// Max reduction of n-element array x, leaving total in x[0]. Propagates NaN
+/// values consistently.
+/// With k threads, can reduce array up to 2*k in size. Assumes number of
+/// threads <= 1024, which is the current max number of CUDA threads.
+///
+/// @param[in] n
+///     Size of array.
+///
+/// @param[in] tid
+///     Thread id.
+///
+/// @param[in] x
+///     Array of dimension n. On exit, x[0] = max(x[0], ..., x[n-1]);
+///     the rest of x is overwritten.
+///
+template <typename real_t>
+
+void max_nan_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
+{
+    /*
+    DPCT1065:0: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 1024) {
+        if (tid < 1024 && tid + 1024 < n) {
+            x[tid] = max_nan(x[tid], x[tid + 1024]);
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:1: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+ */ + if (n > 512) { + if (tid < 512 && tid + 512 < n) { + x[tid] = max_nan(x[tid], x[tid + 512]); + } item_ct1.barrier(); + } + /* + DPCT1065:2: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 256) { + if (tid < 256 && tid + 256 < n) { + x[tid] = max_nan(x[tid], x[tid + 256]); + } item_ct1.barrier(); + } + /* + DPCT1065:3: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 128) { + if (tid < 128 && tid + 128 < n) { + x[tid] = max_nan(x[tid], x[tid + 128]); + } item_ct1.barrier(); + } + /* + DPCT1065:4: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 64) { + if (tid < 64 && tid + 64 < n) { + x[tid] = max_nan(x[tid], x[tid + 64]); + } item_ct1.barrier(); + } + /* + DPCT1065:5: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 32) { + if (tid < 32 && tid + 32 < n) { + x[tid] = max_nan(x[tid], x[tid + 32]); + } item_ct1.barrier(); + } + /* + DPCT1065:6: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 16) { + if (tid < 16 && tid + 16 < n) { + x[tid] = max_nan(x[tid], x[tid + 16]); + } item_ct1.barrier(); + } + /* + DPCT1065:7: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 8) { + if (tid < 8 && tid + 8 < n) { + x[tid] = max_nan(x[tid], x[tid + 8]); + } item_ct1.barrier(); + } + /* + DPCT1065:8: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 4) { + if (tid < 4 && tid + 4 < n) { + x[tid] = max_nan(x[tid], x[tid + 4]); + } item_ct1.barrier(); + } + /* + DPCT1065:9: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 2) { + if (tid < 2 && tid + 2 < n) { + x[tid] = max_nan(x[tid], x[tid + 2]); + } item_ct1.barrier(); + } + /* + DPCT1065:10: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 1) { + if (tid < 1 && tid + 1 < n) { + x[tid] = max_nan(x[tid], x[tid + 1]); + } item_ct1.barrier(); + } +} + +//------------------------------------------------------------------------------ +/// Sum reduction of n-element array x, leaving total in x[0]. +/// With k threads, can reduce array up to 2*k in size. Assumes number of +/// threads <= 1024 (which is current max number of CUDA threads). +/// +/// @param[in] n +/// Size of array. +/// +/// @param[in] tid +/// Thread id. +/// +/// @param[in] x +/// Array of dimension n. On exit, x[0] = sum(x[0], ..., x[n-1]); +/// rest of x is overwritten. 
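+///     For example, with 512 threads an array of up to n = 1024 elements is
+///     reduced: the first active pass adds x[tid + 512] into x[tid], and each
+///     later pass halves the active range.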
+///
+template <typename real_t>
+
+void sum_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
+{
+    /*
+    DPCT1065:11: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 1024) {
+        if (tid < 1024 && tid + 1024 < n) {
+            x[tid] += x[tid + 1024];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:12: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 512) {
+        if (tid < 512 && tid + 512 < n) {
+            x[tid] += x[tid + 512];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:13: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 256) {
+        if (tid < 256 && tid + 256 < n) {
+            x[tid] += x[tid + 256];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:14: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 128) {
+        if (tid < 128 && tid + 128 < n) {
+            x[tid] += x[tid + 128];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:15: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 64) {
+        if (tid < 64 && tid + 64 < n) {
+            x[tid] += x[tid + 64];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:16: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 32) {
+        if (tid < 32 && tid + 32 < n) {
+            x[tid] += x[tid + 32];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:17: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 16) {
+        if (tid < 16 && tid + 16 < n) {
+            x[tid] += x[tid + 16];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:18: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 8) {
+        if (tid < 8 && tid + 8 < n) {
+            x[tid] += x[tid + 8];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:19: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 4) {
+        if (tid < 4 && tid + 4 < n) {
+            x[tid] += x[tid + 4];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:20: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 2) {
+        if (tid < 2 && tid + 2 < n) {
+            x[tid] += x[tid + 2];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:21: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+ */ + if (n > 1) { + if (tid < 1 && tid + 1 < n) { + x[tid] += x[tid + 1]; + } item_ct1.barrier(); + } +} + +//============================================================================== +// real, imag, conj. + +#ifdef DPCT_COMPATIBILITY_TEMP + +// CUDA doesn't provide real, imag, conj. +inline double real(sycl::double2 x) { return x.x(); } +inline float real(sycl::float2 x) { return x.x(); } + +inline double imag(sycl::double2 x) { return x.y(); } +inline float imag(sycl::float2 x) { return x.y(); } + +inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); } +inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); } + +#else + +__host__ __device__ inline double real( rocblas_double_complex x ) { return x.real(); } +__host__ __device__ inline float real( rocblas_float_complex x ) { return x.real(); } + +__host__ __device__ inline double imag( rocblas_double_complex x ) { return x.imag(); } +__host__ __device__ inline float imag( rocblas_float_complex x ) { return x.imag(); } + +__host__ __device__ inline rocblas_double_complex conj( rocblas_double_complex x ) { return { x.real(), -x.imag() }; } +__host__ __device__ inline rocblas_float_complex conj( rocblas_float_complex x ) { return { x.real(), -x.imag() }; } + +#endif + +//---------------------------------------- +// Overloads for real numbers. + +/// @return real component of complex number x; x for real number. +/// @ingroup complex +inline double real( double x ) { return x; } +inline float real( float x ) { return x; } + +/// @return imaginary component of complex number x; 0 for real number. +/// @ingroup complex +inline double imag( double x ) { return 0; } +inline float imag( float x ) { return 0; } + +/// @return conjugate of complex number x; x for real number. +/// @ingroup complex +inline double conj( double x ) { return x; } +inline float conj( float x ) { return x; } + +//------------------------------------------------------------------------------ +/// Overloaded versions of absolute value on device. + +inline float abs(float x) +{ + return sycl::fabs(x); +} + +//---------------------------------------- + +inline double abs(double x) +{ + return sycl::fabs(x); +} + +//---------------------------------------- + +inline float abs(sycl::float2 x) +{ +#ifdef DPCT_COMPATIBILITY_TEMP + // CUDA has a good implementation. + return dpct::cabs(x); +#else + // For HIP, use our implementation that scales per LAPACK. + float a = real( x ); + float b = imag( x ); + float z, w, t; + if (isnan( a )) { + return a; + } + else if (isnan( b )) { + return b; + } + else { + a = fabsf(a); + b = fabsf(b); + w = max(a, b); + z = min(a, b); + if (z == 0) { + t = w; + } + else { + t = z/w; + t = 1 + t*t; + t = w * sqrtf(t); + } + return t; + } +#endif +} + +//---------------------------------------- + +inline double abs(sycl::double2 x) +{ +#ifdef DPCT_COMPATIBILITY_TEMP + // CUDA has a good implementation. + return dpct::cabs(x); +#else + // For HIP, use our implementation that scales per LAPACK. + double a = real( x ); + double b = imag( x ); + double z, w, t; + if (isnan( a )) { + return a; + } + else if (isnan( b )) { + return b; + } + else { + a = fabs(a); + b = fabs(b); + w = max(a, b); + z = min(a, b); + if (z == 0) { + t = w; + } + else { + t = z/w; + t = 1.0 + t*t; + t = w * sqrt(t); + } + return t; + } +#endif +} + +//------------------------------------------------------------------------------ +/// Square of number. 
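+/// Used by combine_sumsq() and add_sumsq() below, e.g.,
+/// sumsq1 + sumsq2 * sqr( scale2 / scale1 ).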
+/// @return x^2
+template <typename scalar_t>
+
+inline scalar_t sqr(scalar_t x)
+{
+    return x*x;
+}
+
+//------------------------------------------------------------------------------
+/// Adds two scaled, sum-of-squares representations.
+/// On exit, scale1 and sumsq1 are updated such that:
+///     scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2.
+template <typename real_t>
+
+void combine_sumsq(
+    real_t& scale1, real_t& sumsq1,
+    real_t scale2, real_t sumsq2 )
+{
+    if (scale1 > scale2) {
+        sumsq1 = sumsq1 + sumsq2*sqr(scale2 / scale1);
+        // scale1 stays same
+    }
+    else if (scale2 != 0) {
+        sumsq1 = sumsq1*sqr(scale1 / scale2) + sumsq2;
+        scale1 = scale2;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Adds new value to scaled, sum-of-squares representation.
+/// On exit, scale and sumsq are updated such that:
+///     scale^2 sumsq := scale^2 sumsq + (absx)^2
+template <typename real_t>
+
+void add_sumsq(
+    real_t& scale, real_t& sumsq,
+    real_t absx)
+{
+    if (scale < absx) {
+        sumsq = 1 + sumsq * sqr(scale / absx);
+        scale = absx;
+    }
+    else if (scale != 0) {
+        sumsq = sumsq + sqr(absx / scale);
+    }
+}
+
+//------------------------------------------------------------------------------
+/// @return ceil( x / y ), for integer type T.
+template <typename T>
+
+inline constexpr T ceildiv(T x, T y)
+{
+    return T((x + y - 1) / y);
+}
+
+//------------------------------------------------------------------------------
+/// @return ceil( x / y )*y, i.e., x rounded up to next multiple of y.
+template <typename T>
+
+inline constexpr T roundup(T x, T y)
+{
+    return T((x + y - 1) / y) * y;
+}
+
+//------------------------------------------------------------------------------
+/// Overloaded copy and precision conversion.
+/// Sets b = a, converting from type TA to type TB.
+template <typename TA, typename TB>
+
+inline void copy(TA a, TB& b)
+{
+    b = a;
+}
+
+/// Sets b = a, converting from complex-float to complex-double.
+
+inline void copy(sycl::float2 a, sycl::double2 &b)
+{
+    b = sycl::double2(real(a), imag(a));
+}
+
+/// Sets b = a, converting from complex-double to complex-float.
+
+inline void copy(sycl::double2 a, sycl::float2 &b)
+{
+    b = sycl::float2(real(a), imag(a));
+}
+
+/// Sets b = a, converting from float to complex-float.
+
+inline void copy(float a, sycl::float2 &b)
+{
+    b = sycl::float2(a, 0);
+}
+
+/// Sets b = a, converting from double to complex-double.
+
+inline void copy(double a, sycl::double2 &b)
+{
+    b = sycl::double2(a, 0);
+}
+
+//==============================================================================
+// CUDA doesn't provide operators, so define our own.
+// rocBLAS provides operators.
+//
+// complex-double
+
+#if defined( BLAS_HAVE_SYCL )
+
+// ---------- negate
+/*
+DPCT1011:83: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::double2 operator-(const sycl::double2 &a)
+{
+    return sycl::double2(-real(a), -imag(a));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:84: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) + real(b), imag(a) + imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:85: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) + s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:86: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const double s, const sycl::double2 b) +{ + return sycl::double2(s + real(b), imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:87: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:88: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator += (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) + s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- subtract +/* +DPCT1011:89: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) - real(b), imag(a) - imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:90: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) - s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:91: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const double s, const sycl::double2 b) +{ + return sycl::double2(s - real(b), -imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:92: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:93: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator -= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) - s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- multiply +/* +DPCT1011:94: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:95: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:96: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const float s) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:97: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const double s, const sycl::double2 a) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:98: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:99: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator *= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) * s, imag(a) * s); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- divide +/* From LAPACK DLADIV + * Performs complex division in real arithmetic, avoiding unnecessary overflow. + * + * a + i*b + * p + i*q = --------- + * c + i*d + */ +/* +DPCT1011:100: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) +{ + double a = real(x); + double b = imag(x); + double c = real(y); + double d = imag(y); + double e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = ( a + b*e ) / f; + q = ( b - a*e ) / f; + } + else { + e = c / d; + f = d + c*e; + p = ( b + a*e ) / f; + q = ( -a + b*e ) / f; + } + return sycl::double2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:101: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) / s, imag(a) / s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:102: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const double a, const sycl::double2 y) +{ + double c = real(y); + double d = imag(y); + double e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = a / f; + q = -a*e / f; + } + else { + e = c / d; + f = d + c*e; + p = a*e / f; + q = -a / f; + } + return sycl::double2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:103: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) +{ + a = dpct_operator_overloading::operator/(a, b); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:104: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator /= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) / s, imag(a) / s); + return a; +} +} // namespace dpct_operator_overloading + +//============================================================================== +// complex-float + +// ---------- negate +/* +DPCT1011:105: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 &a) +{ + return sycl::float2(-real(a), -imag(a)); +} +} // namespace dpct_operator_overloading + +// ---------- add +/* +DPCT1011:106: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) + real(b), imag(a) + imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:107: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) + s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:108: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const float s, const sycl::float2 b) +{ + return sycl::float2(s + real(b), imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:109: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:110: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator += (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) + s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- subtract +/* +DPCT1011:111: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) - real(b), imag(a) - imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:112: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) - s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:113: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const float s, const sycl::float2 b) +{ + return sycl::float2(s - real(b), -imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:114: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:115: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator -= (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) - s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- multiply +/* +DPCT1011:116: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:117: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:118: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const float s, const sycl::float2 a) +{ + return sycl::float2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:119: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:120: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator *= (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) * s, imag(a) * s); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- divide +/* From LAPACK DLADIV + * Performs complex division in real arithmetic, avoiding unnecessary overflow. + * + * a + i*b + * p + i*q = --------- + * c + i*d + */ +/* +DPCT1011:121: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) +{ + float a = real(x); + float b = imag(x); + float c = real(y); + float d = imag(y); + float e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = ( a + b*e ) / f; + q = ( b - a*e ) / f; + } + else { + e = c / d; + f = d + c*e; + p = ( b + a*e ) / f; + q = ( -a + b*e ) / f; + } + return sycl::float2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:122: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator/(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) / s, imag(a) / s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:123: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 operator/(const float a, const sycl::float2 y)
+{
+    float c = real(y);
+    float d = imag(y);
+    float e, f, p, q;
+    if (abs( d ) < abs( c )) {
+        e = d / c;
+        f = c + d*e;
+        p =  a / f;
+        q = -a*e / f;
+    }
+    else {
+        e = c / d;
+        f = d + c*e;
+        p =  a*e / f;
+        q = -a / f;
+    }
+    return sycl::float2(p, q);
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:124: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b)
+{
+    a = dpct_operator_overloading::operator/(a, b);
+    return a;
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:125: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 &operator /= (sycl::float2 &a, const float s)
+{
+    a = sycl::float2(real(a) / s, imag(a) / s);
+    return a;
+}
+} // namespace dpct_operator_overloading
+
+// ---------- equality
+/*
+DPCT1011:126: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const sycl::float2 a, const sycl::float2 b)
+{
+    return ( real(a) == real(b) &&
+             imag(a) == imag(b) );
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:127: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const sycl::float2 a, const float s)
+{
+    return ( real(a) == s &&
+             imag(a) == 0. );
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:128: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const float s, const sycl::float2 a)
+{
+    return ( real(a) == s &&
+             imag(a) == 0. );
+}
+} // namespace dpct_operator_overloading
+
+// ---------- not equality
+/*
+DPCT1011:129: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const sycl::float2 a, const sycl::float2 b)
+{
+    return !(dpct_operator_overloading::operator == (a, b));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:130: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const sycl::float2 a, const float s)
+{
+    return !(dpct_operator_overloading::operator == (a, s));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:131: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const float s, const sycl::float2 a)
+{
+    return !(dpct_operator_overloading::operator == (a, s));
+}
+} // namespace dpct_operator_overloading
+
+#endif // BLAS_HAVE_SYCL
+
+} // namespace device
+} // namespace slate
+
+#endif // SLATE_DEVICE_UTIL_DP_HPP

From 41cbf01fe58c18f1e5cd6e3f2a7dd2e7bec9e8c9 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 15 Nov 2023 23:49:48 +0000
Subject: [PATCH 02/10] Build for sycl using sycl/device_ kernels (instead of
 omptarget).

---
 GNUmakefile | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index aae7fa9e1..01e95f38a 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -147,9 +147,8 @@ omptarget = 0
 ifneq ($(cuda),1)
 ifneq ($(hip),1)
     ifeq (${gpu_backend},sycl)
-        # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
-        $(info Note: enabling omp-target-offload kernels)
-        omptarget = 1
+        # enable the SYCL kernels in SLATE for oneMKL-SYCL devices
+        sycl = 1
 
         # -Wno-unused-command-line-argument avoids
         # icpx warning: -Wl,-rpath,...: 'linker' input unused.
@@ -163,7 +162,18 @@ ifneq ($(hip),1)
         CXXFLAGS += -fsycl -fp-model=precise -Wno-unused-command-line-argument \
                     -Wno-c99-extensions -Wno-pass-failed
         LIBS += -lsycl
-    endif
+
+        # How should the SLATE kernels be compiled?
+        ifeq (${sycl_kernels},omptarget)  # src/omptarget kernels
+            # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
+            omptarget = 1
+        else  # src/sycl kernels - default/fall-through option
+            sycl_kernels = 1
+            CXXFLAGS += -fsycl-unnamed-lambda  # allow unnamed sycl lambda kernels
+            LDFLAGS += -fsycl -fsycl-unnamed-lambda  # allow unnamed sycl lambda kernels
+        endif
+
+    endif
 endif
 endif
 
@@ -210,8 +220,13 @@ endif
 ifeq ($(openmp),1)
     ifeq (${gpu_backend},sycl)
         # Intel icpx options for OpenMP offload.
-        CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-        LDFLAGS += -fiopenmp -fopenmp-targets=spir64
+        CXXFLAGS += -fiopenmp
+        LDFLAGS += -fiopenmp
+        ifeq (${omptarget},1)
+            # If SYCL + OpenMP-offload-kernels, specify omp device type
+            CXXFLAGS += -fopenmp-targets=spir64
+            LDFLAGS += -fopenmp-targets=spir64
+        endif
    else
        # Most other compilers recognize this.
CXXFLAGS += -fopenmp @@ -542,6 +557,10 @@ cuda_hdr := \ hip_src := $(patsubst src/cuda/%.cu,src/hip/%.hip.cc,$(cuda_src)) hip_hdr := $(patsubst src/cuda/%.cuh,src/hip/%.hip.hh,$(cuda_hdr)) +# SYCL implementations of device kernels +sycl_kernels_src := $(patsubst src/cuda/%.cu,src/sycl/%.dp.cpp,$(cuda_src)) +sycl_kernels_hdr := $(patsubst src/cuda/%.cuh,src/sycl/%.dp.hpp,$(cuda_hdr)) + # OpenMP implementations of device kernels omptarget_src := \ src/omptarget/device_geadd.cc \ @@ -564,6 +583,8 @@ ifeq (${cuda},1) libslate_src += ${cuda_src} else ifeq (${hip},1) libslate_src += ${hip_src} +else ifeq ($(sycl_kernels),1) + libslate_src += $(sycl_kernels_src) else # Used for both OpenMP offload (${omptarget} == 1) and as stubs for # CPU-only build. @@ -1322,6 +1343,9 @@ hooks: ${hooks} %.hip.o: %.hip.cc | $(hip_hdr) $(HIPCC) $(HIPCCFLAGS) -c $< -o $@ +%.dp.o: %.dp.cpp | $(sycl_kernels_hdr) + $(CXX) $(CXXFLAGS) -c $< -o $@ + %.o: %.cc $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -1462,6 +1486,12 @@ echo: @echo "---------- OMP target-offload kernel options" @echo "omptarget = '${omptarget}'" @echo "omptarget_src = ${omptarget_src}" + @echo "omptarget_hdr = ${omptarget_hdr}" + @echo + @echo "---------- SYCL device kernels" + @echo "sycl_kernels = '$(sycl_kernels)'" + @echo "sycl_kernels_src = '$(sycl_kernels_src)'" + @echo "sycl_kernels_hdr = '$(sycl_kernels_hdr)'" @echo @echo "---------- Fortran compiler" @echo "FC = $(FC)" From c47b9e8287c03b687c382336c41f9e33745fa920 Mon Sep 17 00:00:00 2001 From: Asim YarKhan Date: Thu, 16 Nov 2023 21:27:41 +0000 Subject: [PATCH 03/10] Update sycl-kernels to handle multiplication, need for complex (axpby, multiply_ab). --- src/sycl/device_geadd.dp.cpp | 5 +- src/sycl/device_gescale.dp.cpp | 3 +- src/sycl/device_gescale_row_col.dp.cpp | 9 +- src/sycl/device_tzadd.dp.cpp | 10 +- src/sycl/device_tzscale.dp.cpp | 8 +- src/sycl/device_util.dp.hpp | 1653 ++++++++++++------------ 6 files changed, 851 insertions(+), 837 deletions(-) diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp index 3344ec0b3..ebeddb9ed 100644 --- a/src/sycl/device_geadd.dp.cpp +++ b/src/sycl/device_geadd.dp.cpp @@ -54,10 +54,7 @@ void geadd_func( scalar_t* rowB = &B[ i ]; for (int64_t j = 0; j < n; ++j) - rowB[ j*ldb ] = (alpha * rowA[ j*lda ]) + (beta * rowB[ j*ldb ]); - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[ j*ldb ] = axpby( alpha, rowA[ j*lda ], beta, rowB[ j*ldb ] ); } } diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp index f76343c7f..afe24370a 100644 --- a/src/sycl/device_gescale.dp.cpp +++ b/src/sycl/device_gescale.dp.cpp @@ -35,8 +35,7 @@ void gescale_func( i += item_ct1.get_local_range(2)) { scalar_t* rowA = &A[ i ]; for (int64_t j = 0; j < n; ++j) - rowA[ j*lda ] = rowA[ j*lda ] * mul; - // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[ j*lda ] = multiply_ax( mul, rowA[ j*lda ] ); } } diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp index ffb727c4d..a4b145d46 100644 --- a/src/sycl/device_gescale_row_col.dp.cpp +++ b/src/sycl/device_gescale_row_col.dp.cpp @@ -57,11 +57,10 @@ void gescale_row_col_batch_kernel( i += item_ct1.get_local_range(2)) { scalar_t* rowA = &tileA[ i ]; scalar_t2 ri = R[ i ]; - for (int64_t j = 0; j < n; ++j) - rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); - // rowA[j * lda] = 
dpct_operator_overloading::operator*( - // rowA[j * lda], - // dpct_operator_overloading::operator*(ri, C[j]))); + for (int64_t j = 0; j < n; ++j) { + rowA[ j*lda ] = multiply_ax( multiply_ax(ri, C[ j ]), rowA[ j*lda ] ); + // rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); + } } } diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp index e588fb4c9..158eb92ab 100644 --- a/src/sycl/device_tzadd.dp.cpp +++ b/src/sycl/device_tzadd.dp.cpp @@ -61,18 +61,12 @@ void tzadd_kernel( if (uplo == lapack::Uplo::Lower) { for (int64_t j = 0; j <= i && j < n; ++j) { // lower - rowB[j*ldb] = alpha * rowA[j*lda] + beta * rowB[ j*ldb ]; - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[j*ldb] = axpby( alpha, rowA[j*lda], beta, rowB[ j*ldb ] ); } } else { for (int64_t j = n-1; j >= i; --j) { // upper - rowB[j*ldb] = alpha * rowA[ j*lda ] + beta * rowB[ j*ldb ]; - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[j*ldb] = axpby( alpha, rowA[j*lda], beta, rowB[ j*ldb ] ); } } } diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp index 61df572f2..01bc55ea5 100644 --- a/src/sycl/device_tzscale.dp.cpp +++ b/src/sycl/device_tzscale.dp.cpp @@ -58,16 +58,12 @@ void tzscale_kernel( if (uplo == lapack::Uplo::Lower) { for (int64_t j = 0; j <= i && j < n; ++j) { // lower - rowA[j*lda] = rowA[j*lda] * mul; - // rowA[j * lda] = - // dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[j*lda] = multiply_ax( mul, rowA[j*lda] ); } } else { for (int64_t j = n-1; j >= i; --j) // upper - rowA[j*lda] = rowA[j*lda] * mul; - // rowA[j * lda] = - // dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[j*lda] = multiply_ax( mul, rowA[j*lda] ); } } } diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp index 6a3ded79c..8c440ae50 100644 --- a/src/sycl/device_util.dp.hpp +++ b/src/sycl/device_util.dp.hpp @@ -353,7 +353,7 @@ inline double abs(double x) inline float abs(sycl::float2 x) { #ifdef DPCT_COMPATIBILITY_TEMP - // CUDA has a good implementation. + // Use DPCT routine return dpct::cabs(x); #else // For HIP, use our implementation that scales per LAPACK. @@ -389,7 +389,7 @@ inline float abs(sycl::float2 x) inline double abs(sycl::double2 x) { #ifdef DPCT_COMPATIBILITY_TEMP - // CUDA has a good implementation. + // Use DPCT routine return dpct::cabs(x); #else // For HIP, use our implementation that scales per LAPACK. @@ -424,7 +424,6 @@ inline double abs(sycl::double2 x) /// Square of number. /// @return x^2 template - inline scalar_t sqr(scalar_t x) { return x*x; @@ -435,7 +434,6 @@ inline scalar_t sqr(scalar_t x) /// On exit, scale1 and sumsq1 are updated such that: /// scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2. template - void combine_sumsq( real_t& scale1, real_t& sumsq1, real_t scale2, real_t sumsq2 ) @@ -455,7 +453,6 @@ void combine_sumsq( /// On exit, scale and sumsq are updated such that: /// scale^2 sumsq := scale^2 sumsq + (absx)^2 template - void add_sumsq( real_t& scale, real_t& sumsq, real_t absx) @@ -472,7 +469,6 @@ void add_sumsq( //------------------------------------------------------------------------------ /// @return ceil( x / y ), for integer type T. 
template - inline constexpr T ceildiv(T x, T y) { return T((x + y - 1) / y); @@ -481,7 +477,6 @@ inline constexpr T ceildiv(T x, T y) //------------------------------------------------------------------------------ /// @return ceil( x / y )*y, i.e., x rounded up to next multiple of y. template - inline constexpr T roundup(T x, T y) { return T((x + y - 1) / y) * y; @@ -491,850 +486,884 @@ inline constexpr T roundup(T x, T y) /// Overloaded copy and precision conversion. /// Sets b = a, converting from type TA to type TB. template - inline void copy(TA a, TB& b) { b = a; } /// Sets b = a, converting from complex-float to complex-double. - inline void copy(sycl::float2 a, sycl::double2 &b) { b = sycl::double2(real(a), imag(a)); } /// Sets b = a, converting from complex-double to complex-float. - inline void copy(sycl::double2 a, sycl::float2 &b) { b = sycl::float2(real(a), imag(a)); } /// Sets b = a, converting from float to complex-float. - inline void copy(float a, sycl::float2 &b) { b = sycl::float2(a, 0); } /// Sets b = a, converting from double to complex-double. - inline void copy(double a, sycl::double2 &b) { b = sycl::double2(a, 0); } -//============================================================================== -// CUDA doesn't provide operators, so define our own. -// rocBLAS provides operators. -// -// complex-double - -#if defined( BLAS_HAVE_SYCL ) - -// ---------- negate -/* -DPCT1011:83: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 &a) -{ - return sycl::double2(-real(a), -imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:84: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) + real(b), imag(a) + imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:85: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) + s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:86: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const double s, const sycl::double2 b) -{ - return sycl::double2(s + real(b), imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:87: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). 
The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:88: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator += (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) + s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- subtract -/* -DPCT1011:89: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) - real(b), imag(a) - imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:90: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) - s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:91: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const double s, const sycl::double2 b) -{ - return sycl::double2(s - real(b), -imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:92: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:93: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator -= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) - s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- multiply -/* -DPCT1011:94: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). 
The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:95: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:96: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const float s) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:97: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const double s, const sycl::double2 a) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:98: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:99: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator *= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) * s, imag(a) * s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- divide -/* From LAPACK DLADIV - * Performs complex division in real arithmetic, avoiding unnecessary overflow. - * - * a + i*b - * p + i*q = --------- - * c + i*d - */ -/* -DPCT1011:100: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) -{ - double a = real(x); - double b = imag(x); - double c = real(y); - double d = imag(y); - double e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = ( a + b*e ) / f; - q = ( b - a*e ) / f; - } - else { - e = c / d; - f = d + c*e; - p = ( b + a*e ) / f; - q = ( -a + b*e ) / f; - } - return sycl::double2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:101: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) / s, imag(a) / s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:102: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const double a, const sycl::double2 y) -{ - double c = real(y); - double d = imag(y); - double e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = a / f; - q = -a*e / f; - } - else { - e = c / d; - f = d + c*e; - p = a*e / f; - q = -a / f; - } - return sycl::double2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:103: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) -{ - a = dpct_operator_overloading::operator/(a, b); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:104: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator /= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) / s, imag(a) / s); - return a; -} -} // namespace dpct_operator_overloading - -//============================================================================== -// complex-float - -// ---------- negate -/* -DPCT1011:105: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 &a) -{ - return sycl::float2(-real(a), -imag(a)); -} -} // namespace dpct_operator_overloading - -// ---------- add -/* -DPCT1011:106: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) + real(b), imag(a) + imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:107: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) + s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:108: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const float s, const sycl::float2 b) -{ - return sycl::float2(s + real(b), imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:109: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:110: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator += (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) + s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- subtract -/* -DPCT1011:111: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) - real(b), imag(a) - imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:112: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) - s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:113: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const float s, const sycl::float2 b) -{ - return sycl::float2(s - real(b), -imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:114: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:115: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator -= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) - s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- multiply -/* -DPCT1011:116: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:117: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:118: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const float s, const sycl::float2 a) -{ - return sycl::float2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:119: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:120: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator *= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) * s, imag(a) * s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- divide -/* From LAPACK DLADIV - * Performs complex division in real arithmetic, avoiding unnecessary overflow. - * - * a + i*b - * p + i*q = --------- - * c + i*d - */ -/* -DPCT1011:121: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) -{ - float a = real(x); - float b = imag(x); - float c = real(y); - float d = imag(y); - float e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = ( a + b*e ) / f; - q = ( b - a*e ) / f; - } - else { - e = c / d; - f = d + c*e; - p = ( b + a*e ) / f; - q = ( -a + b*e ) / f; - } - return sycl::float2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:122: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) / s, imag(a) / s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:123: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const float a, const sycl::float2 y) -{ - float c = real(y); - float d = imag(y); - float e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = a / f; - q = -a*e / f; - } - else { - e = c / d; - f = d + c*e; - p = a*e / f; - q = -a / f; - } - return sycl::float2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:124: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b) -{ - a = dpct_operator_overloading::operator/(a, b); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:125: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator /= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) / s, imag(a) / s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- equality -/* -DPCT1011:126: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator *= =(const sycl::float2 a, const sycl::float2 b) -{ - return ( real(a) == real(b) && - imag(a) == imag(b) ); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:127: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator *= =(const sycl::float2 a, const float s) +//------------------------------------------------------------------------------ +/// Overloaded versions of Ax+By on device, specified for complex +template +inline T axpby(T alpha, T x, T beta, T y) { - return ( real(a) == s && - imag(a) == 0. ); + return alpha*x + beta*y; } -} // namespace dpct_operator_overloading - -/* -DPCT1011:128: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { -inline bool operator *= =(const float s, const sycl::float2 a) -{ - return ( real(a) == s && - imag(a) == 0. ); -} -} // namespace dpct_operator_overloading - -// ---------- not equality -/* -DPCT1011:129: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator != (const sycl::float2 a, const sycl::float2 b) +inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x, + sycl::float2 beta, sycl::float2 y) { - return !(dpct_operator_overloading::operator *= =(a, b)); + return dpct::cmul(alpha, x) + dpct::cmul(beta, y); } -} // namespace dpct_operator_overloading - -/* -DPCT1011:130: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { -inline bool operator != (const sycl::float2 a, const float s) +inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x, + sycl::double2 beta, sycl::double2 y) { - return !(dpct_operator_overloading::operator *= =(a, s)); + return dpct::cmul(alpha, x) + dpct::cmul(beta, y); } -} // namespace dpct_operator_overloading - -/* -DPCT1011:131: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { -inline bool operator != (const float s, const sycl::float2 a) -{ - return !(dpct_operator_overloading::operator *= =(a, s)); -} -} // namespace dpct_operator_overloading -#endif // BLAS_WITH_CUBLAS +//------------------------------------------------------------------------------ +/// Overloaded versions of multiply on device, specified for complex +template +inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x) +{ + return alpha * x; +} + +inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x) +{ + return dpct::cmul(alpha, x); +} + +inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x) +{ + return dpct::cmul(alpha, x); +} + +// //============================================================================== +// // CUDA doesn't provide operators, so define our own. +// // rocBLAS provides operators. +// // +// // complex-double + +// #if defined( BLAS_HAVE_SYCL ) + +// // ---------- negate +// /* +// DPCT1011:83: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 &a) +// { +// return sycl::double2(-real(a), -imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:84: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) + real(b), imag(a) + imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:85: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) + s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:86: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const double s, const sycl::double2 b) +// { +// return sycl::double2(s + real(b), imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:87: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:88: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator += (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) + s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- subtract +// /* +// DPCT1011:89: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) - real(b), imag(a) - imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:90: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) - s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:91: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const double s, const sycl::double2 b) +// { +// return sycl::double2(s - real(b), -imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:92: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:93: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator -= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) - s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- multiply +// /* +// DPCT1011:94: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:95: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:96: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const float s) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:97: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const double s, const sycl::double2 a) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:98: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:99: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator *= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) * s, imag(a) * s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- divide +// /* From LAPACK DLADIV +// * Performs complex division in real arithmetic, avoiding unnecessary overflow. +// * +// * a + i*b +// * p + i*q = --------- +// * c + i*d +// */ +// /* +// DPCT1011:100: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) +// { +// double a = real(x); +// double b = imag(x); +// double c = real(y); +// double d = imag(y); +// double e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = ( a + b*e ) / f; +// q = ( b - a*e ) / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = ( b + a*e ) / f; +// q = ( -a + b*e ) / f; +// } +// return sycl::double2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:101: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) / s, imag(a) / s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:102: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const double a, const sycl::double2 y) +// { +// double c = real(y); +// double d = imag(y); +// double e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = a / f; +// q = -a*e / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = a*e / f; +// q = -a / f; +// } +// return sycl::double2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:103: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) +// { +// a = dpct_operator_overloading::operator/(a, b); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:104: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator /= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) / s, imag(a) / s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// //============================================================================== +// // complex-float + +// // ---------- negate +// /* +// DPCT1011:105: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 &a) +// { +// return sycl::float2(-real(a), -imag(a)); +// } +// } // namespace dpct_operator_overloading + +// // ---------- add +// /* +// DPCT1011:106: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) + real(b), imag(a) + imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:107: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) + s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:108: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const float s, const sycl::float2 b) +// { +// return sycl::float2(s + real(b), imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:109: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:110: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator += (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) + s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- subtract +// /* +// DPCT1011:111: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) - real(b), imag(a) - imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:112: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) - s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:113: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const float s, const sycl::float2 b) +// { +// return sycl::float2(s - real(b), -imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:114: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:115: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator -= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) - s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- multiply +// /* +// DPCT1011:116: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:117: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:118: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const float s, const sycl::float2 a) +// { +// return sycl::float2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:119: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:120: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator *= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) * s, imag(a) * s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- divide +// /* From LAPACK DLADIV +// * Performs complex division in real arithmetic, avoiding unnecessary overflow. +// * +// * a + i*b +// * p + i*q = --------- +// * c + i*d +// */ +// /* +// DPCT1011:121: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) +// { +// float a = real(x); +// float b = imag(x); +// float c = real(y); +// float d = imag(y); +// float e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = ( a + b*e ) / f; +// q = ( b - a*e ) / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = ( b + a*e ) / f; +// q = ( -a + b*e ) / f; +// } +// return sycl::float2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:122: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) / s, imag(a) / s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:123: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const float a, const sycl::float2 y) +// { +// float c = real(y); +// float d = imag(y); +// float e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = a / f; +// q = -a*e / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = a*e / f; +// q = -a / f; +// } +// return sycl::float2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:124: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b) +// { +// a = dpct_operator_overloading::operator/(a, b); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:125: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator /= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) / s, imag(a) / s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- equality +// /* +// DPCT1011:126: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const sycl::float2 a, const sycl::float2 b)
+// {
+//     return ( real(a) == real(b) &&
+//              imag(a) == imag(b) );
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:127: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const sycl::float2 a, const float s)
+// {
+//     return ( real(a) == s &&
+//              imag(a) == 0. );
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:128: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const float s, const sycl::float2 a)
+// {
+//     return ( real(a) == s &&
+//              imag(a) == 0. );
+// }
+// } // namespace dpct_operator_overloading
+
+// // ---------- not equality
+// /*
+// DPCT1011:129: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const sycl::float2 a, const sycl::float2 b)
+// {
+//     return !(dpct_operator_overloading::operator == (a, b));
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:130: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const sycl::float2 a, const float s)
+// {
+//     return !(dpct_operator_overloading::operator == (a, s));
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:131: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const float s, const sycl::float2 a)
+// {
+//     return !(dpct_operator_overloading::operator == (a, s));
+// }
+// } // namespace dpct_operator_overloading
+
+// #endif // BLAS_WITH_CUBLAS
 
 } // namespace device
 } // namespace slate

From 974e9c77d5aa65055171b800c389df327d580b45 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Thu, 16 Nov 2023 21:29:10 +0000
Subject: [PATCH 04/10] Better separation for sycl build and
 omptarget-vs-sycl-kernels.
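
The kernel flavor is selected with the user-facing sycl_kernels variable,
which now drives the internal use_omptarget_kernels / use_sycl_kernels
flags. A sketch of the intended invocations (assuming an otherwise
configured build; exact options may differ per site):

    # default for gpu_backend=sycl: the dpct-generated kernels in src/sycl
    make gpu_backend=sycl

    # opt back into the OpenMP target-offload kernels in src/omptarget
    make gpu_backend=sycl sycl_kernels=omptarget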
---
 GNUmakefile | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 01e95f38a..d35192878 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -143,7 +143,8 @@ ifneq ($(cuda),1)
     endif
 endif
 
-omptarget = 0
+use_omptarget_kernels = 0
+use_sycl_kernels = 0
 ifneq ($(cuda),1)
     ifneq ($(hip),1)
         ifeq (${gpu_backend},sycl)
@@ -166,9 +167,9 @@ ifneq ($(hip),1)
             # How should the slate kernels be compiled
             ifeq (${sycl_kernels},omptarget) # src/omptarget kernels
                 # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
-                omptarget = 1
+                use_omptarget_kernels = 1
             else # src/sycl kernels - default/fall-through option
-                sycl_kernels = 1
+                use_sycl_kernels = 1
                 CXXFLAGS += -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
                 LDFLAGS += -fsycl -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
             endif
@@ -222,7 +223,7 @@ ifeq ($(openmp),1)
         # Intel icpx options for OpenMP offload.
         CXXFLAGS += -fiopenmp
         LDFLAGS += -fiopenmp
-        ifeq (${omptarget},1)
+        ifeq (${use_omptarget_kernels},1)
             # If SYCL + OpenMP-offload-kernels, specify omp device type
             CXXFLAGS += -fopenmp-targets=spir64
             LDFLAGS += -fopenmp-targets=spir64
@@ -583,7 +584,7 @@ ifeq (${cuda},1)
     libslate_src += ${cuda_src}
 else ifeq (${hip},1)
     libslate_src += ${hip_src}
-else ifeq ($(sycl_kernels),1)
+else ifeq ($(use_sycl_kernels),1)
     libslate_src += $(sycl_kernels_src)
 else
     # Used for both OpenMP offload (${omptarget} == 1) and as stubs for
@@ -1484,12 +1485,12 @@ echo:
 	@echo "sycl = '$(sycl)'"
 	@echo
 	@echo "---------- OMP target-offload kernel options"
-	@echo "omptarget = '${omptarget}'"
+	@echo "omptarget = '${use_omptarget_kernels}'"
 	@echo "omptarget_src = ${omptarget_src}"
 	@echo "omptarget_hdr = ${omptarget_hdr}"
 	@echo
 	@echo "---------- SYCL device kernels"
-	@echo "sycl_kernels = '$(sycl_kernels)'"
+	@echo "sycl_kernels = '$(use_sycl_kernels)'"
 	@echo "sycl_kernels_src = '$(sycl_kernels_src)'"
 	@echo "sycl_kernels_hdr = '$(sycl_kernels_hdr)'"
 	@echo

From 21d4025798553865589e8f6a0c15043b29261ef5 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Mon, 20 Nov 2023 19:02:30 +0000
Subject: [PATCH 05/10] For sycl kernels (generated by dpct tool), remove dependency on dpct/dpct.hpp header.

---
 src/sycl/device_geadd.dp.cpp           | 26 --------
 src/sycl/device_gecopy.dp.cpp          | 13 ----
 src/sycl/device_genorm.dp.cpp          | 13 ----
 src/sycl/device_gescale.dp.cpp         | 25 --------
 src/sycl/device_gescale_row_col.dp.cpp | 13 ----
 src/sycl/device_geset.dp.cpp           | 25 --------
 src/sycl/device_henorm.dp.cpp          | 13 ----
 src/sycl/device_synorm.dp.cpp          | 25 --------
 src/sycl/device_transpose.dp.cpp       | 49 ---------------
 src/sycl/device_trnorm.dp.cpp          | 13 ----
 src/sycl/device_tzadd.dp.cpp           | 13 ----
 src/sycl/device_tzcopy.dp.cpp          | 13 ----
 src/sycl/device_tzscale.dp.cpp         | 13 ----
 src/sycl/device_tzset.dp.cpp           | 25 --------
 src/sycl/device_util.dp.hpp            | 87 +++++++++++++++++++-------
 15 files changed, 63 insertions(+), 303 deletions(-)

diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp
index ebeddb9ed..53ee3e69d 100644
--- a/src/sycl/device_geadd.dp.cpp
+++ b/src/sycl/device_geadd.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -129,12 +128,6 @@ void geadd(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:146: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
- */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -150,13 +143,6 @@ void geadd(
             geadd_kernel(m, n, alpha, A, lda, beta, B, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:147: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -257,12 +243,6 @@ void geadd(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:148: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -280,12 +260,6 @@ void geadd(
                               Barray, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:149: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gecopy.dp.cpp b/src/sycl/device_gecopy.dp.cpp
index 660946f00..299363c12 100644
--- a/src/sycl/device_gecopy.dp.cpp
+++ b/src/sycl/device_gecopy.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -107,12 +106,6 @@ void gecopy(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:152: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:59: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -127,12 +120,6 @@ void gecopy(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:153: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp
index bb20deaa9..45136ca52 100644
--- a/src/sycl/device_genorm.dp.cpp
+++ b/src/sycl/device_genorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -427,12 +426,6 @@ void genorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:144: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     if (scope == NormScope::Matrix) {
 
         //---------
@@ -610,12 +603,6 @@ void genorm(
         slate_not_implemented("The norm scope isn't yet supported.");
     }
 
-    /*
-    DPCT1010:145: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp
index afe24370a..57d04b203 100644
--- a/src/sycl/device_gescale.dp.cpp
+++ b/src/sycl/device_gescale.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -98,12 +97,6 @@ void gescale(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:154: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -121,12 +114,6 @@ void gescale(
             gescale_kernel(m, n, mul, A, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:155: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -241,12 +228,6 @@ void gescale(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:156: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -266,12 +247,6 @@ void gescale(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:157: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp
index a4b145d46..cd24eaeef 100644
--- a/src/sycl/device_gescale_row_col.dp.cpp
+++ b/src/sycl/device_gescale_row_col.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -194,12 +193,6 @@ void gescale_row_col_batch(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:140: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -249,12 +242,6 @@ void gescale_row_col_batch(
         });
     }
 
-    /*
-    DPCT1010:141: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_geset.dp.cpp b/src/sycl/device_geset.dp.cpp
index f82d58fdf..713a9d3e6 100644
--- a/src/sycl/device_geset.dp.cpp
+++ b/src/sycl/device_geset.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -105,12 +104,6 @@ void geset(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:134: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -127,12 +120,6 @@ void geset(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:135: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -226,12 +213,6 @@ void geset(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:136: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -249,12 +230,6 @@ void geset(
                 Aarray, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:137: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp
index 3a3ea4569..d7a829185 100644
--- a/src/sycl/device_henorm.dp.cpp
+++ b/src/sycl/device_henorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -327,12 +326,6 @@ void henorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:142: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -431,12 +424,6 @@ void henorm(
         }
     }
 
-    /*
-    DPCT1010:143: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp
index d57370262..26e6e4648 100644
--- a/src/sycl/device_synorm.dp.cpp
+++ b/src/sycl/device_synorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -319,12 +318,6 @@ void synorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:172: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -423,12 +416,6 @@ void synorm(
         }
     }
 
-    /*
-    DPCT1010:173: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 const int ib = 32;
@@ -594,12 +581,6 @@ void synormOffdiag(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:174: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // one norm
     if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) {
@@ -631,12 +612,6 @@ void synormOffdiag(
         slate_not_implemented("Only Norm::One and Norm::Inf is supported.");
     }
 
-    /*
-    DPCT1010:175: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_transpose.dp.cpp b/src/sycl/device_transpose.dp.cpp
index 33d7007eb..01ed4d7d6 100644
--- a/src/sycl/device_transpose.dp.cpp
+++ b/src/sycl/device_transpose.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -328,12 +327,6 @@ void transpose(
         return;
     assert(lda >= n);
 
-    /*
-    DPCT1093:158: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int nt = ceildiv( n, int64_t(ib) );
     assert(nt <= 65535); // CUDA limitation
 
@@ -393,12 +386,6 @@ void transpose(
         });
     });
 
-    /*
-    DPCT1010:159: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -434,12 +421,6 @@ void transpose_batch(
         return;
     assert(lda >= n);
 
-    /*
-    DPCT1093:160: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int nt = ceildiv( n, int64_t(ib) );
     assert(nt <= 65535); // CUDA limitation
     assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1
@@ -501,12 +482,6 @@ void transpose_batch(
         });
     });
 
-    /*
-    DPCT1010:161: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -564,12 +539,6 @@ void transpose(
     assert(lda >= m);
     assert(ldat >= n);
 
-    /*
-    DPCT1093:162: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int mt = ceildiv( m, int64_t(NB) );
     assert(mt <= 65535); // CUDA limitation
     int nt = ceildiv( n, int64_t(NB) );
@@ -622,12 +591,6 @@ void transpose(
         });
     });
 
-    /*
-    DPCT1010:163: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -678,12 +641,6 @@ void transpose_batch(
     assert(lda >= m);
     assert(ldat >= n);
 
-    /*
-    DPCT1093:164: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
- */
-    dpct::select_device(queue.device());
-
     int mt = ceildiv( m, int64_t(NB) );
     assert(mt <= 65535); // CUDA limitation
     int nt = ceildiv( n, int64_t(NB) );
@@ -738,12 +695,6 @@ void transpose_batch(
         });
     });
 
-    /*
-    DPCT1010:165: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp
index f0b802937..9f2e94939 100644
--- a/src/sycl/device_trnorm.dp.cpp
+++ b/src/sycl/device_trnorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -447,12 +446,6 @@ void trnorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:150: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -576,12 +569,6 @@ void trnorm(
         }
     }
 
-    /*
-    DPCT1010:151: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp
index 158eb92ab..b10ec9bce 100644
--- a/src/sycl/device_tzadd.dp.cpp
+++ b/src/sycl/device_tzadd.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -129,12 +128,6 @@ void tzadd(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:138: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -149,12 +142,6 @@ void tzadd(
                               Barray, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:139: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzcopy.dp.cpp b/src/sycl/device_tzcopy.dp.cpp
index 586ba697f..7a2dd5341 100644
--- a/src/sycl/device_tzcopy.dp.cpp
+++ b/src/sycl/device_tzcopy.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -120,12 +119,6 @@ void tzcopy(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:170: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:72: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -140,12 +133,6 @@ void tzcopy(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:171: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp
index 01bc55ea5..afb082173 100644
--- a/src/sycl/device_tzscale.dp.cpp
+++ b/src/sycl/device_tzscale.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -116,12 +115,6 @@ void tzscale(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:132: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -139,12 +132,6 @@ void tzscale(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:133: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzset.dp.cpp b/src/sycl/device_tzset.dp.cpp
index 932d1c901..786201076 100644
--- a/src/sycl/device_tzset.dp.cpp
+++ b/src/sycl/device_tzset.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -115,12 +114,6 @@ void tzset(
     scalar_t* A, int64_t lda,
     blas::Queue& queue )
 {
-    /*
-    DPCT1093:166: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int nthreads = std::min( int64_t( 1024 ), m );
 
@@ -137,12 +130,6 @@ void tzset(
                 A, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:167: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -242,12 +229,6 @@ void tzset(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:168: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int nthreads = std::min( int64_t( 1024 ), m );
 
@@ -266,12 +247,6 @@ void tzset(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:169: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp
index 8c440ae50..b0d6694f7 100644
--- a/src/sycl/device_util.dp.hpp
+++ b/src/sycl/device_util.dp.hpp
@@ -7,7 +7,8 @@
 #define SLATE_DEVICE_UTIL_CUH
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
+// #include <dpct/dpct.hpp>
+#define DPCT_COMPATIBILITY_TEMP
 #include <cstdint>
 
 namespace slate {
@@ -41,7 +42,6 @@ inline real_t max_nan(real_t x, real_t y)
 /// the rest of x is overwritten.
 ///
 template <typename real_t>
-
 void max_nan_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
 {
     /*
@@ -299,8 +299,32 @@ inline float real(sycl::float2 x) { return x.x(); }
 inline double imag(sycl::double2 x) { return x.y(); }
 inline float imag(sycl::float2 x) { return x.y(); }
 
-inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); }
-inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); }
+// inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); }
+// inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); }
+//------------------------------------------------------------------------------
+/// Computes the complex conjugate of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+// Taken from dpct util.hpp
+template <typename T>
+sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
+    std::complex<T> t(x[0], x[1]);
+    t = std::conj(t);
+    return sycl::vec<T, 2>(t.real(), t.imag());
+}
+
+/// Computes the magnitude of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+// Taken from dpct util.hpp
+template <typename T>
+T cabs(sycl::vec<T, 2> x) {
+    std::complex<T> t(x[0], x[1]);
+    return std::abs(t);
+}
+
 
 #else
 
@@ -354,7 +378,8 @@ inline float abs(sycl::float2 x)
 {
 #ifdef DPCT_COMPATIBILITY_TEMP
     // Use DPCT routine
-    return dpct::cabs(x);
+    //return dpct::cabs(x);
+    return cabs(x);
 #else
     // For HIP, use our implementation that scales per LAPACK.
     float a = real( x );
@@ -390,7 +415,8 @@ inline double abs(sycl::double2 x)
 {
 #ifdef DPCT_COMPATIBILITY_TEMP
     // Use DPCT routine
-    return dpct::cabs(x);
+    // return dpct::cabs(x);
+    return cabs(x);
#else
     // For HIP, use our implementation that scales per LAPACK.
     double a = real( x );
@@ -516,44 +542,57 @@ inline void copy(double a, sycl::double2 &b)
 }
 
 //------------------------------------------------------------------------------
-/// Overloaded versions of Ax+By on device, specified for complex
+/// Computes the multiplication of two complex numbers.
+/// \tparam T Complex element type
+/// \param [in] x The first input complex number
+/// \param [in] y The second input complex number
+/// \returns The result
 template <typename T>
-inline T axpby(T alpha, T x, T beta, T y)
-{
-    return alpha*x + beta*y;
+sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
+    std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
+    t1 = t1 * t2;
+    return sycl::vec<T, 2>(t1.real(), t1.imag());
 }
 
-inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x,
-                          sycl::float2 beta, sycl::float2 y)
+//------------------------------------------------------------------------------
+/// Overloaded versions of multiply on device, specified for complex
+template <typename scalar_t, typename scalar_t2>
+inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x)
 {
-    return dpct::cmul(alpha, x) + dpct::cmul(beta, y);
+    return alpha * x;
 }
 
-inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x,
-                           sycl::double2 beta, sycl::double2 y)
+inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x)
 {
-    return dpct::cmul(alpha, x) + dpct::cmul(beta, y);
+    return cmul(alpha, x);
 }
 
+inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x)
+{
+    return cmul(alpha, x);
+}
 //------------------------------------------------------------------------------
-/// Overloaded versions of multiply on device, specified for complex
-template <typename scalar_t, typename scalar_t2>
-inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x)
+/// Overloaded versions of Ax+By on device, specified for complex
+template <typename T>
+inline T axpby(T alpha, T x, T beta, T y)
 {
-    return alpha * x;
+    return alpha*x + beta*y;
 }
 
-inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x)
+inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x,
+                          sycl::float2 beta, sycl::float2 y)
 {
-    return dpct::cmul(alpha, x);
+    return cmul(alpha, x) + cmul(beta, y);
 }
 
-inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x)
+inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x,
+                           sycl::double2 beta, sycl::double2 y)
 {
-    return dpct::cmul(alpha, x);
+    return cmul(alpha, x) + cmul(beta, y);
 }
 
+
 // //==============================================================================
 // // CUDA doesn't provide operators, so define our own.
 // // rocBLAS provides operators.

From 83d345bef0e502fd546a3afc3f69a33c3dc240c8 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Tue, 21 Nov 2023 15:26:25 +0000
Subject: [PATCH 06/10] In sycl device code, rename some shadowed variables.
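
The inner reduction indices below are renamed from chunk to chunk1 so they
no longer reuse a name already bound in an enclosing scope, which compilers
flag with shadow warnings (e.g., -Wshadow). A reduced, hypothetical C++
illustration of the pattern, not taken from the SLATE sources:

    // Hypothetical sketch: naming the inner index 'chunk' again would
    // shadow the outer loop variable; 'chunk1' avoids the warning.
    #include <cstdio>

    int main()
    {
        for (int chunk = 0; chunk < 2; ++chunk) {
            for (int chunk1 = 0; chunk1 < 3; ++chunk1) {
                std::printf("outer %d inner %d\n", chunk, chunk1);
            }
        }
        return 0;
    }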
--- src/sycl/device_genorm.dp.cpp | 6 +++--- src/sycl/device_henorm.dp.cpp | 6 +++--- src/sycl/device_synorm.dp.cpp | 6 +++--- src/sycl/device_trnorm.dp.cpp | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp index 45136ca52..b4d9b0fac 100644 --- a/src/sycl/device_genorm.dp.cpp +++ b/src/sycl/device_genorm.dp.cpp @@ -300,9 +300,9 @@ void genorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { tile_scale = row_scale[0]; tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < m; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp index d7a829185..fb172764b 100644 --- a/src/sycl/device_henorm.dp.cpp +++ b/src/sycl/device_henorm.dp.cpp @@ -254,9 +254,9 @@ void henorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < n; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp index 26e6e4648..ab08d073e 100644 --- a/src/sycl/device_synorm.dp.cpp +++ b/src/sycl/device_synorm.dp.cpp @@ -246,9 +246,9 @@ void synorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < n; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp index 9f2e94939..816cf7ae1 100644 --- a/src/sycl/device_trnorm.dp.cpp +++ b/src/sycl/device_trnorm.dp.cpp @@ -364,9 +364,9 @@ void trnorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < m; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; From e98380672664f3bc87de1d293236e0dc7b9a213a Mon Sep 17 00:00:00 2001 From: Asim YarKhan Date: Tue, 21 Nov 2023 17:07:22 +0000 Subject: [PATCH 07/10] Allow CI testing for Intel GPUs. 
---
 .github/workflows/test.sh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/test.sh b/.github/workflows/test.sh
index 655ecf4f3..df203d52c 100755
--- a/.github/workflows/test.sh
+++ b/.github/workflows/test.sh
@@ -12,10 +12,6 @@ err=0
 
 export OMP_NUM_THREADS=8
 
-# Currently, OpenMP offload tests don't work on our Intel GPU.
-# CI checks only compilation.
-if [ "${device}" != "gpu_intel" ]; then
-
 print "======================================== Unit tests"
 cd unit_test
 
@@ -85,7 +81,5 @@ if [ "${maker}" = "make" ]; then
     fi
 fi
 
-fi # device != gpu_intel
-
 print "======================================== Finished test"
 exit ${err}

From f7c775a1564e14d0c590af136b4a008d13ae1ca4 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 22 Nov 2023 23:54:08 +0000
Subject: [PATCH 08/10] For sycl-kernels, update CMakeLists.txt.

---
 CMakeLists.txt | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2ca27b2a..15761171c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -364,28 +364,41 @@ if (gpu_backend MATCHES "^(sycl|auto)$")
         # Intel-IntelLLVM compiler while compiling omptarget offload
         # routines. (the compiler uses fast floating point mode by
         # default).
-        target_compile_options( slate PRIVATE "-fp-model=precise" )
+        target_compile_options( slate PRIVATE
+            "$<$<COMPILE_LANGUAGE:CXX>:-fp-model=precise>" )
 
-        # -Wno-unused-command-line-argument avoids
-        # icpx warning: -Wl,-rpath,...: 'linker' input unused.
-        #
         # -Wno-c99-extensions avoids
         # icpx warning: '_Complex' is a C99 extension.
-        #
-        # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
-        # icpx warning: loop not vectorized.
-        #
         target_compile_options( slate PRIVATE
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>" )
-
-        # Intel icpx options for OpenMP offload.
-        target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
-        target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )
-
-        # Source files are set below after CUDA and HIP.
+            $<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions> )
+
+        if (sycl_kernels MATCHES "^(omptarget)$") # src/omptarget kernels
+            message( "sycl_kernels = omptarget" )
+            # Enable the OpenMP omptarget offload kernels in SLATE for oneMKL-SYCL devices
+            file( GLOB libslate_omptarget_src CONFIGURE_DEPENDS src/omptarget/*.cc )
+            target_sources( slate PRIVATE ${libslate_omptarget_src} )
+            # -Wno-unused-command-line-argument avoids
+            # icpx warning: -Wl,-rpath,...: 'linker' input unused.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument> )
+            # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
+            # icpx warning: loop not vectorized.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed> )
+            # specify the OpenMP offload target
+            target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
+            target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )
+        else() # src/sycl kernels - default/fall-through option
+            message( "sycl_kernels = sycl" )
+            file( GLOB libslate_sycl_src CONFIGURE_DEPENDS src/sycl/*.dp.cpp )
+            target_sources( slate PRIVATE ${libslate_sycl_src} )
+            target_compile_options( slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl> )
+            target_compile_options(
+                slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl-unnamed-lambda> )
+            target_link_options( slate PRIVATE "-fsycl" )
+            target_link_options( slate PRIVATE "-fsycl-unnamed-lambda" )
+        endif()
 
         target_link_libraries( slate PUBLIC -lmkl_sycl -lsycl -lOpenCL )
         message( STATUS "Building SYCL support" )
@@ -669,7 +682,7 @@ endif()
 
 #-------------------------------------------------------------------------------
 # Files for OpenMP offload or CPU-only builds.
-if (NOT "${gpu_backend}" MATCHES "^(cuda|hip)$")
+if (NOT "${gpu_backend}" MATCHES "^(cuda|hip|sycl)$")
     file(
         GLOB libslate_omptarget_src
         CONFIGURE_DEPENDS # glob at build time

From 7935dcb2e8f4a8014e8f432219a3312d8a2fe62c Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Mon, 27 Nov 2023 19:36:39 +0000
Subject: [PATCH 09/10] The Intel C++ flag "Wno-unused-command-line-argument" is set in Make/CMake.

---
 .github/workflows/configure.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/configure.sh b/.github/workflows/configure.sh
index 0b71dfeb0..eb45750cf 100755
--- a/.github/workflows/configure.sh
+++ b/.github/workflows/configure.sh
@@ -42,10 +42,7 @@ if [ "${maker}" = "make" ]; then
         || exit 10
 
 elif [ "${maker}" = "cmake" ]; then
-    # Intel icpx needs -Wno-unused-command-line-argument to avoid
-    # warnings: 'linker' input unused, which prevent CMake finding OpenMP.
     cmake -Dcolor=no \
-          -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
           -DCMAKE_INSTALL_PREFIX=${top}/install \
           -Dgpu_backend=${gpu_backend} .. \
           || exit 12

From c4ec23e6b97c0b221badf2b988a6a156eb731192 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Thu, 21 Dec 2023 16:56:10 +0000
Subject: [PATCH 10/10] In sycl/device_tzscale, add missing namespace batch.

---
 src/sycl/device_tzscale.dp.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp
index afb082173..2ecc61106 100644
--- a/src/sycl/device_tzscale.dp.cpp
+++ b/src/sycl/device_tzscale.dp.cpp
@@ -67,6 +67,9 @@ void tzscale_kernel(
     }
 }
 
+//==============================================================================
+namespace batch {
+
 //------------------------------------------------------------------------------
 /// Batched routine for element-wise trapezoidal tile scale.
 /// Sets upper or lower part of
@@ -116,5 +119,6 @@ void tzscale(
                  batch_count, queue);
 }
 
+} // namespace batch
 } // namespace device
 } // namespace slate
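
The dpct-free complex helpers introduced in patch 05 (conj, cabs, cmul,
axpby) all round-trip through std::complex, so their behavior can be
sanity-checked with a small host-only program. A minimal sketch, assuming a
SYCL 2020 compiler such as icpx with -fsycl; the file name and the
re-declared cmul are illustrative only, not part of the series:

    // check.cpp - hypothetical host-side sanity check, not part of the patches.
    // Compile (assumption): icpx -fsycl check.cpp -o check
    #include <sycl/sycl.hpp>
    #include <cassert>
    #include <cmath>
    #include <complex>

    // Same shape as the cmul helper in src/sycl/device_util.dp.hpp above.
    template <typename T>
    sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y)
    {
        std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
        t1 = t1 * t2;
        return sycl::vec<T, 2>(t1.real(), t1.imag());
    }

    int main()
    {
        sycl::float2 a(1.0f, 2.0f), b(3.0f, -1.0f);
        sycl::float2 c = cmul(a, b);  // (1 + 2i)(3 - i) = 5 + 5i
        assert(std::fabs(c[0] - 5.0f) < 1e-5f);
        assert(std::fabs(c[1] - 5.0f) < 1e-5f);
        return 0;
    }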