From 0b00f45178b9222a31bd60a96a143e3d474d75c5 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 15 Nov 2023 23:48:24 +0000
Subject: [PATCH 01/10] Add sycl/device_ kernels and support for sycl complex
 definitions.

---
 include/slate/internal/device.hh       |   20 +-
 src/sycl/device_geadd.dp.cpp           |  338 ++++++
 src/sycl/device_gecopy.dp.cpp          |  249 +++++
 src/sycl/device_genorm.dp.cpp          |  666 ++++++++++
 src/sycl/device_gescale.dp.cpp         |  343 ++++++
 src/sycl/device_gescale_row_col.dp.cpp |  334 ++++++
 src/sycl/device_geset.dp.cpp           |  308 ++++++
 src/sycl/device_henorm.dp.cpp          |  487 +++++++++
 src/sycl/device_synorm.dp.cpp          |  736 +++++++++++
 src/sycl/device_transpose.dp.cpp       |  918 ++++++++++++++++
 src/sycl/device_trnorm.dp.cpp          |  632 +++++++++++
 src/sycl/device_tzadd.dp.cpp           |  213 ++++
 src/sycl/device_tzcopy.dp.cpp          |  246 +++++
 src/sycl/device_tzscale.dp.cpp         |  197 ++++
 src/sycl/device_tzset.dp.cpp           |  329 ++++++
 src/sycl/device_util.dp.hpp            | 1342 ++++++++++++++++++++++++
 16 files changed, 7355 insertions(+), 3 deletions(-)
 create mode 100644 src/sycl/device_geadd.dp.cpp
 create mode 100644 src/sycl/device_gecopy.dp.cpp
 create mode 100644 src/sycl/device_genorm.dp.cpp
 create mode 100644 src/sycl/device_gescale.dp.cpp
 create mode 100644 src/sycl/device_gescale_row_col.dp.cpp
 create mode 100644 src/sycl/device_geset.dp.cpp
 create mode 100644 src/sycl/device_henorm.dp.cpp
 create mode 100644 src/sycl/device_synorm.dp.cpp
 create mode 100644 src/sycl/device_transpose.dp.cpp
 create mode 100644 src/sycl/device_trnorm.dp.cpp
 create mode 100644 src/sycl/device_tzadd.dp.cpp
 create mode 100644 src/sycl/device_tzcopy.dp.cpp
 create mode 100644 src/sycl/device_tzscale.dp.cpp
 create mode 100644 src/sycl/device_tzset.dp.cpp
 create mode 100644 src/sycl/device_util.dp.hpp

diff --git a/include/slate/internal/device.hh b/include/slate/internal/device.hh
index a7092b7f0..b9099fda1 100644
--- a/include/slate/internal/device.hh
+++ b/include/slate/internal/device.hh
@@ -68,7 +68,19 @@
     };

     } // namespace blas

-#endif // #elif defined( BLAS_HAVE_ROCBLAS )
+
+#elif defined( BLAS_HAVE_SYCL )
+    #include <sycl/sycl.hpp>
+    namespace blas {
+
+    template <typename T>
+    struct blas::real_type_traits< sycl::vec< T, 2 > > {
+        using real_t = T;
+    };
+
+    } // namespace blas
+
+#endif // #defined( BLAS_HAVE_{CUBLAS,ROCBLAS,SYCL} )

 namespace slate {

@@ -76,9 +88,11 @@ namespace slate {
 /// GPU device implementations of kernels.
 namespace device {

-// Use omp-target-kernels when OneMKL-SYCL is used
+// Use when SYCL and oneMKL are used
 #if defined( BLAS_HAVE_SYCL )
-    #define SLATE_HAVE_OMPTARGET
+    // todo: make this build automatically
+    // Manually uncomment to compile OMP target-offload kernels
+    // #define SLATE_HAVE_OMPTARGET
 #endif

 // Simplify checking for GPU device support (CUDA / ROCm / SYCL).
diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp
new file mode 100644
index 000000000..3344ec0b3
--- /dev/null
+++ b/src/sycl/device_geadd.dp.cpp
@@ -0,0 +1,338 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile addition.
+/// Each thread deals with one row.
+/// Launched by geadd_kernel() and geadd_batch_kernel().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] A
+///     is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in,out] B
+///     is an m-by-n matrix stored in an ldb-by-n array.
+///
+/// @param[in] ldb
+///     Leading dimension of B. ldb >= m.
+///
+/// @copydoc geadd
+///
+template <typename scalar_t>
+void geadd_func(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t* A, int64_t lda,
+    scalar_t beta, scalar_t* B, int64_t ldb, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int64_t i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+        scalar_t* rowB = &B[ i ];
+
+        for (int64_t j = 0; j < n; ++j)
+            rowB[ j*ldb ] = (alpha * rowA[ j*lda ]) + (beta * rowB[ j*ldb ]);
+            // rowB[j * ldb] = dpct_operator_overloading::operator+(
+            //     dpct_operator_overloading::operator*(alpha, rowA[j * lda]),
+            //     dpct_operator_overloading::operator*(beta, rowB[j * ldb]));
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile addition.
+/// @copydoc geadd
+template <typename scalar_t>
+void geadd_kernel(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t* A, int64_t lda,
+    scalar_t beta, scalar_t* B, int64_t ldb, const sycl::nd_item<3> &item_ct1)
+{
+    geadd_func(m, n, alpha, A, lda, beta, B, ldb, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing batched element-wise tile addition.
+/// @copydoc geadd_batch
+template <typename scalar_t>
+void geadd_batch_kernel(
+    int64_t m, int64_t n,
+    scalar_t alpha, scalar_t** Aarray, int64_t lda,
+    scalar_t beta, scalar_t** Barray, int64_t ldb,
+    const sycl::nd_item<3> &item_ct1)
+{
+    geadd_func(m, n, alpha, Aarray[item_ct1.get_group(2)], lda, beta,
+               Barray[item_ct1.get_group(2)], ldb, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Routine for element-wise tile addition.
+/// Sets
+/// \[
+///     B = \alpha A + \beta B.
+/// \]
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] alpha
+///     The scalar alpha.
+///
+/// @param[in] A
+///     is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] beta
+///     The scalar beta.
+///
+/// @param[in,out] B
+///     is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
+///
+/// @param[in] ldb
+///     Leading dimension of B. ldb >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void geadd(
+    int64_t m, int64_t n,
+    scalar_t const& alpha, scalar_t* A, int64_t lda,
+    scalar_t const& beta, scalar_t* B, int64_t ldb,
+    blas::Queue &queue)
+{
+    // quick return
+    if (m == 0 || n == 0)
+        return;
+
+    /*
+    DPCT1093:146: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+ */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:49: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geadd_kernel(m, n, alpha, A, lda, beta, B, ldb, + item_ct1); + }); + + /* + DPCT1010:147: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geadd( + int64_t m, int64_t n, + float const& alpha, float* Aarray, int64_t lda, + float const& beta, float* Barray, int64_t ldb, + blas::Queue &queue); + +template +void geadd( + int64_t m, int64_t n, + double const& alpha, double* Aarray, int64_t lda, + double const& beta, double* Barray, int64_t ldb, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex* Aarray, int64_t lda, + std::complex const& beta, std::complex* Barray, int64_t ldb, + blas::Queue &queue) +{ + geadd(m, n, sycl::float2(real(alpha), imag(alpha)), (sycl::float2 *)Aarray, + lda, sycl::float2(real(beta), imag(beta)), (sycl::float2 *)Barray, + ldb, queue); +} + +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex* Aarray, int64_t lda, + std::complex const& beta, std::complex* Barray, int64_t ldb, + blas::Queue &queue) +{ + geadd(m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 *)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 *)Barray, ldb, queue); +} + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise tile addition. +/// Sets +/// \[ +/// Barray[k] = \alpha Aarray[k] + \beta Barray[k]. +/// \] +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] alpha +/// The scalar alpha. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] beta +/// The scalar beta. +/// +/// @param[in,out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. 
+/// +template +void geadd( + int64_t m, int64_t n, + scalar_t const& alpha, scalar_t** Aarray, int64_t lda, + scalar_t const& beta, scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (m == 0 || n == 0) + return; + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:148: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:50: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geadd_batch_kernel(m, n, alpha, Aarray, lda, beta, + Barray, ldb, item_ct1); + }); + + /* + DPCT1010:149: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geadd( + int64_t m, int64_t n, + float const& alpha, float** Aarray, int64_t lda, + float const& beta, float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +template +void geadd( + int64_t m, int64_t n, + double const& alpha, double** Aarray, int64_t lda, + double const& beta, double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + geadd(m, n, sycl::float2(real(alpha), imag(alpha)), (sycl::float2 **)Aarray, + lda, sycl::float2(real(beta), imag(beta)), (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +template <> +void geadd( + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + geadd(m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 **)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace batch +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gecopy.dp.cpp b/src/sycl/device_gecopy.dp.cpp new file mode 100644 index 000000000..660946f00 --- /dev/null +++ b/src/sycl/device_gecopy.dp.cpp @@ -0,0 +1,249 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing copy and precision conversions, copying A to B. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gecopy(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. +/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void gecopy_kernel( + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, const sycl::nd_item<3> &item_ct1) +{ + src_scalar_t const *tileA = Aarray[item_ct1.get_group(2)]; + dst_scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + src_scalar_t const* rowA = &tileA[ i ]; + dst_scalar_t* rowB = &tileB[ i ]; + + for (int64_t j = 0; j < n; ++j) + copy(rowA[j*lda], rowB[j*ldb]); + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise copy and precision conversion, +/// copying A to B. Sets +/// \[ +/// Barray[k] = Aarray[k]. +/// \] +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gecopy( + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:152: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:59: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gecopy_kernel(m, n, Aarray, lda, Barray, ldb, + item_ct1); + }); + + /* + DPCT1010:153: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. + +// float => float +template +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// float => double +template +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => double +template +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => float +template +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. + +// complex-float => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::float2 **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// complex-float => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::float2 **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +// complex-double => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::double2 **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +// complex-double => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (sycl::double2 **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// float => complex-float +template <> +void gecopy( + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (float **)Aarray, lda, (sycl::float2 **)Barray, ldb, + batch_count, queue); +} + +// double => complex-double +template <> +void gecopy( + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + gecopy(m, n, (double **)Aarray, lda, (sycl::double2 **)Barray, ldb, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp new file mode 
100644 index 000000000..bb20deaa9 --- /dev/null +++ b/src/sycl/device_genorm.dp.cpp @@ -0,0 +1,666 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Finds the largest absolute value of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Uses dynamic shared memory array of length sizeof(real_t) * m. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_maxima +/// Array of dimension gridDim.x. +/// On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for tile A^(k). +/// +template +void genorm_max_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_maxima, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_max = (real_t*) dynamic_data; + int chunk; + if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) { + row_max[item_ct1.get_local_id(2)] = 0; + } + + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + chunk = i % item_ct1.get_local_range(2); + scalar_t const* row = &tile[ i ]; + real_t max = 0; + + // Each thread finds max of one row. + for (int64_t j = 0; j < n; ++j) + max = max_nan(max, abs(row[j*lda])); + + // Save partial results in shared memory. + row_max[chunk] = max_nan(max, row_max[chunk]); + } + + // Reduction to find max of tile. + /* + DPCT1065:36: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2), + row_max, item_ct1); + if (item_ct1.get_local_id(2) == 0) { + tiles_maxima[item_ct1.get_group(2)] = row_max[0]; + } +} + +const int ib = 32; ///< block size for genorm_one_kernel +const int ib1 = 33; ///< ib + 1 for stride to avoid GPU bank conflicts + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each column of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one column. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. 
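genorm_max_kernel() performs its reduction by hand in local memory with max_nan(). For contrast, a SYCL 2020 built-in reduction can compute the same tile maximum; note that sycl::maximum<> does not force NaN propagation the way max_nan() does, so this is only a sketch, not a drop-in replacement:

```cpp
// Tile max-abs via a SYCL 2020 reduction; illustrative alternative to the
// manual local-memory reduction in genorm_max_kernel().
#include <sycl/sycl.hpp>
#include <cstdint>

float tile_max_abs( sycl::queue& q, float const* tile,
                    int64_t m, int64_t n, int64_t lda )
{
    float* result = sycl::malloc_shared<float>( 1, q );
    *result = 0.0f;  // combined into the reduction result by default
    q.parallel_for(
        sycl::range<2>( n, m ),  // idx[0] = column j, idx[1] = row i
        sycl::reduction( result, sycl::maximum<float>() ),
        [=]( sycl::id<2> idx, auto& red ) {
            red.combine( sycl::fabs( tile[ idx[0]*lda + idx[1] ] ) );
        }).wait();
    float r = *result;
    sycl::free( result, q );
    return r;
}
```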
+/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, tiles_sums[k*ldv + j] = max_{i} abs( A^(k)_(i, j) ) +/// for row j of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void genorm_one_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + const int k = item_ct1.get_local_id(2); + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t sum = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32x32 sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:37: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread sums one column. + for (int64_t i = 0; i < ib; ++i) + if (jj+k < n && ii+i < m) + sum += shmem_tile[ k*ib1 + i ]; + /* + DPCT1065:38: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // done with shmem_tile + } + + if (jj+k < n) + tiles_sums[item_ct1.get_group(2) * ldv + jj + k] = sum; + } +} + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each row of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block, hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) +/// for row i of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void genorm_inf_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t const* row = &tile[ i ]; + + // Each thread sums one row. 
+ // This does coalesced reads of one column at a time in parallel. + real_t sum = abs(row[0]); + for (int64_t j = 1; j < n; ++j) + sum += abs(row[j*lda]); + + tiles_sums[item_ct1.get_group(2) * ldv + i] = sum; + } +} + +//------------------------------------------------------------------------------ +/// Sum of squares, in scaled representation, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by genorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// Also the number of threads per block, hence, +/// +/// @param[in] Aarray +/// Array of tiles of dimension blockDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_values +/// Array of dimension 2 * blockDim.x. +/// On exit, +/// tiles_values[2*k + 0] = scale +/// tiles_values[2*k + 1] = sumsq +/// such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 +/// for tile A^(k). +/// +template +void genorm_fro_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_values, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + int chunk; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_scale = (real_t*) &dynamic_data[0]; + real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)]; + + real_t tile_scale = row_scale[0]; + real_t tile_sumsq = row_sumsq[0]; + + // Each thread finds sum-of-squares of one row. + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t const* row = &tile[ i ]; + real_t scale = 0; + real_t sumsq = 1; + chunk = i % item_ct1.get_local_range(2); + + for (int64_t j = 0; j < n; ++j) { + add_sumsq(scale, sumsq, abs(row[j*lda])); + } + + if (i < item_ct1.get_local_range(2)) { + row_scale[chunk] = 0; + row_sumsq[chunk] = 1; + } + + // Save partial results in shared memory. + combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); + } + + // Reduction to find sum-of-squares of tile. + // todo: parallel reduction. + /* + DPCT1065:39: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. 
+ */ + item_ct1.barrier(); + if (item_ct1.get_local_id(2) == 0) { + tile_scale = row_scale[0]; + tile_sumsq = row_sumsq[0]; + for (int64_t chunk = 1; + chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + } + + tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; + tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq; + } +} + +//------------------------------------------------------------------------------ +// todo docs +template +void ge_col_norms_max_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* col_max, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + const int k = item_ct1.get_local_id(2); + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t max = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32x32 sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:40: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread compute max of one column. + for (int64_t i = 0; i < ib; ++i) + if (jj+k < n && ii+i < m) + max = max_nan( shmem_tile[ k*ib1 + i ], max ); + /* + DPCT1065:41: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // done with shmem_tile + } + + if (jj+k < n) + col_max[item_ct1.get_group(2) * ldv + jj + k] = max; + } +} + +//------------------------------------------------------------------------------ +/// Batched routine that computes a partial norm for each tile. +/// +/// @param[in] norm +/// Norm to compute. See values for description. +/// +/// @param[in] scope +/// Scope of the norm. +/// - NormScope::Matrix computes partial norm of each tile. +/// - NormScope::Columns computes norm of each column. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] values +/// Array in GPU memory, dimension batch_count * ldv. +/// - Norm::Max: ldv = 1. +/// On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count. +/// +/// - Norm::One: ldv >= n. +/// On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count, 0 <= j < n. +/// +/// - Norm::Inf: ldv >= m. +/// On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) +/// for 0 <= k < batch_count, 0 <= i < m. +/// +/// - Norm::Max: ldv = 2. +/// On exit, +/// values[k*2 + 0] = scale_k +/// values[k*2 + 1] = sumsq_k +/// where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 +/// for 0 <= k < batch_count. 
+/// +/// @param[in] ldv +/// Leading dimension of values array. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + using real_t = blas::real_type; + int64_t nb = 512; + + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:144: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + if (scope == NormScope::Matrix) { + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:43: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:42: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_max_kernel( + m, n, Aarray, lda, values, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1083:44: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * ib * ib1; + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, ib), + sycl::range<3>(1, 1, ib)), + [=](sycl::nd_item<3> item_ct1) { + genorm_one_kernel( + m, n, Aarray, lda, values, ldv, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // inf norm + else if (norm == lapack::Norm::Inf) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * m, queue); + } + else { + assert(ldv >= m); + /* + DPCT1049:45: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_inf_kernel(m, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:47: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:46: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + genorm_fro_kernel( + m, n, Aarray, lda, values, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + } + else if (scope == NormScope::Columns) { + + if (norm == Norm::Max) { + + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1083:48: The size of local memory in the migrated code may + be different from the original code. Check that the allocated + memory size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * ib * ib1; + ((sycl::queue *)(&queue.stream())) + ->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, ib), + sycl::range<3>(1, 1, ib)), + [=](sycl::nd_item<3> item_ct1) { + ge_col_norms_max_kernel( + m, n, Aarray, lda, values, ldv, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + else { + slate_not_implemented("The norm isn't yet supported"); + } + } + else { + slate_not_implemented("The norm scope isn't yet supported."); + } + + /* + DPCT1010:145: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +template +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
+template <> +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + genorm(norm, scope, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void genorm( + lapack::Norm norm, NormScope scope, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + genorm(norm, scope, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp new file mode 100644 index 000000000..f76343c7f --- /dev/null +++ b/src/sycl/device_gescale.dp.cpp @@ -0,0 +1,343 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Device function implementing element-wise tile scale. +/// Each thread block deals with one tile. gridDim.x == batch_count. +/// Each thread deals with one row. +/// Called by gescale_kernel and gescale_batch_kernel. +/// +/// @copydoc gescale +/// +template +void gescale_func( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &A[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * mul; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], mul); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// @copydoc gescale +template +void gescale_kernel( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + gescale_func(m, n, mul, A, lda, item_ct1); +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// @copydoc gescale_batch +template +void gescale_batch_kernel( + int64_t m, int64_t n, + scalar_t2 mul, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + gescale_func(m, n, mul, Aarray[item_ct1.get_group(2)], lda, item_ct1); +} + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] A +/// An m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. 
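A hypothetical host-side call of the non-batched gescale() documented below, assuming device memory already allocated through BLAS++ (all names other than gescale and blas::Queue are illustrative):

```cpp
// Sketch: scale a device-resident m-by-n tile by 2/3 using gescale().
#include "slate/internal/device.hh"
#include <cstdint>

void scale_tile_example( blas::Queue& queue, double* dA,
                         int64_t m, int64_t n, int64_t lda )
{
    // mul = numer/denom is formed once on the host; as the batch variant's
    // docs note, no extra care is taken against over/underflow.
    slate::device::gescale( m, n, 2.0, 3.0, dA, lda, queue );
    queue.sync();  // wait for the kernel on the BLAS++ queue
}
```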
+/// +template +void gescale( + int64_t m, int64_t n, + scalar_t2 numer, scalar_t2 denom, + scalar_t* A, int64_t lda, + blas::Queue& queue) +{ + // quick return + if (m == 0 || n == 0) + return; + + /* + DPCT1093:154: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + scalar_t2 mul = numer / denom; + + /* + DPCT1049:60: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_kernel(m, n, mul, A, lda, item_ct1); + }); + + /* + DPCT1010:155: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale( + int64_t m, int64_t n, + float numer, float denom, + float* A, int64_t lda, + blas::Queue& queue); + +template +void gescale( + int64_t m, int64_t n, + double numer, double denom, + double* A, int64_t lda, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void gescale( + int64_t m, int64_t n, + float numer, float denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::float2 *)A, lda, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, sycl::float2(real(numer), imag(numer)), + sycl::float2(real(denom), imag(denom)), (sycl::float2 *)A, lda, + queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + double numer, double denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::double2 *)A, lda, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + gescale(m, n, sycl::double2(real(numer), imag(numer)), + sycl::double2(real(denom), imag(denom)), (sycl::double2 *)A, lda, + queue); +} + + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise tile scale. Sets +/// \[ +/// Aarray[k] *= (numer / denom). +/// \] +/// This does NOT currently take extra care to avoid over/underflow. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. 
lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gescale( + int64_t m, int64_t n, + scalar_t2 numer, scalar_t2 denom, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (m == 0 || n == 0) + return; + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:156: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + scalar_t2 mul = numer / denom; + + /* + DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_batch_kernel(m, n, mul, Aarray, lda, + item_ct1); + }); + + /* + DPCT1010:157: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale( + int64_t m, int64_t n, + float numer, float denom, + float** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +template +void gescale( + int64_t m, int64_t n, + double numer, double denom, + double** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void gescale( + int64_t m, int64_t n, + float numer, float denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::float2 **)Aarray, lda, batch_count, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, sycl::float2(real(numer), imag(numer)), + sycl::float2(real(denom), imag(denom)), (sycl::float2 **)Aarray, + lda, batch_count, queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + double numer, double denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, numer, denom, (sycl::double2 **)Aarray, lda, batch_count, + queue); +} + +template <> +void gescale( + int64_t m, int64_t n, + std::complex numer, std::complex denom, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale(m, n, sycl::double2(real(numer), imag(numer)), + sycl::double2(real(denom), imag(denom)), (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +} // namespace batch +} // namespace device +} // namespace slate diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp new file mode 100644 index 000000000..ffb727c4d --- /dev/null +++ b/src/sycl/device_gescale_row_col.dp.cpp @@ -0,0 +1,334 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing row and column scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_row_col_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *R = Rarray[item_ct1.get_group(2)]; + scalar_t2 const *C = Carray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t2 ri = R[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); + // rowA[j * lda] = dpct_operator_overloading::operator*( + // rowA[j * lda], + // dpct_operator_overloading::operator*(ri, C[j]))); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing column scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_col_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *C = Carray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * C[ j ]; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], C[j]); + } +} + +//------------------------------------------------------------------------------ +/// Kernel implementing row scaling. +/// Each thread block deals with one tile. +/// Each thread deals with one row. 
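As a reference point for the three Equed cases handled below, the Both case applies A = diag(R) A diag(C). A plain host loop with the same semantics, useful for checking the device kernels (hypothetical helper, not in the patch):

```cpp
// Host reference for Equed::Both: A = diag(R) * A * diag(C) on one tile.
// The Row and Col cases follow by dropping C[j] or R[i], respectively.
#include <cstdint>

template <typename scalar_t>
void scale_row_col_ref(
    int64_t m, int64_t n,
    scalar_t const* R, scalar_t const* C,
    scalar_t* A, int64_t lda )
{
    for (int64_t j = 0; j < n; ++j)
        for (int64_t i = 0; i < m; ++i)
            A[ i + j*lda ] *= R[ i ] * C[ j ];
}
```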
+/// Launched by gescale_row_col(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void gescale_row_batch_kernel( + int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t2 const *R = Rarray[item_ct1.get_group(2)]; + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t2 ri = R[ i ]; + for (int64_t j = 0; j < n; ++j) + rowA[ j*lda ] = rowA[ j*lda ] * ri; + // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], ri); + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for row and column scaling. +/// +/// @param[in] equed +/// Form of scaling to do. +/// - Equed::Row: sets $ A = diag(R) A $ +/// - Equed::Col: sets $ A = A diag(C) $ +/// - Equed::Both: sets $ A = diag(R) A diag(C) $ +/// for each R in Rarray, C in Carray, and A in Aarray. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Rarray +/// Vector of length m containing row scaling factors. +/// +/// @param[in] Carray +/// Vector of length n containing column scaling factors. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + scalar_t2 const* const* Rarray, + scalar_t2 const* const* Carray, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:140: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + if (equed == Equed::Row) { + /* + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_row_batch_kernel(m, n, Rarray, Aarray, + lda, item_ct1); + }); + } + else if (equed == Equed::Col) { + /* + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. 
Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_col_batch_kernel(m, n, Carray, Aarray, + lda, item_ct1); + }); + } + else if (equed == Equed::Both) { + /* + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + gescale_row_col_batch_kernel( + m, n, Rarray, Carray, Aarray, lda, item_ct1); + }); + } + + /* + DPCT1010:141: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + float const* const* Rarray, + float const* const* Carray, + float** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +template +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + double const* const* Rarray, + double const* const* Carray, + double** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +// real R, C +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + float const* const* Rarray, + float const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, Rarray, Carray, (sycl::float2 **)Aarray, + lda, batch_count, queue); +} + +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + double const* const* Rarray, + double const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, Rarray, Carray, (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +// complex R, C +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + std::complex const* const* Rarray, + std::complex const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, (sycl::float2 **)Rarray, + (sycl::float2 **)Carray, (sycl::float2 **)Aarray, lda, + batch_count, queue); +} + +template <> +void gescale_row_col_batch( + Equed equed, int64_t m, int64_t n, + std::complex const* const* Rarray, + std::complex const* const* Carray, + std::complex** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + gescale_row_col_batch(equed, m, n, (sycl::double2 **)Rarray, + (sycl::double2 **)Carray, (sycl::double2 **)Aarray, + lda, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_geset.dp.cpp b/src/sycl/device_geset.dp.cpp new file mode 100644 index 000000000..f82d58fdf --- /dev/null +++ b/src/sycl/device_geset.dp.cpp @@ -0,0 +1,308 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row.
+/// Launched by geset_kernel() and geset_batch_kernel().
+///
+/// @copydoc geset
+///
+template <typename scalar_t>
+void geset_func(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int64_t i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+
+        for (int64_t j = 0; j < n; ++j)
+            rowA[ j*lda ] = (j != i) ? offdiag_value : diag_value;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc geset
+template <typename scalar_t>
+void geset_kernel(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    geset_func(m, n, offdiag_value, diag_value, A, lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc geset_batch
+template <typename scalar_t>
+void geset_batch_kernel(
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    geset_func(m, n, offdiag_value, diag_value, Aarray[item_ct1.get_group(2)],
+               lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Element-wise set of an m-by-n matrix A
+/// to diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] m
+///     Number of rows of A. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of A. n >= 0.
+///
+/// @param[in] offdiag_value
+///     The value to set outside of the diagonal.
+///
+/// @param[in] diag_value
+///     The value to set on the diagonal.
+///
+/// @param[out] A
+///     An m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void geset(
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t* A, int64_t lda,
+    blas::Queue &queue)
+{
+    // quick return
+    if (m == 0 || n == 0)
+        return;
+
+    /*
+    DPCT1093:134: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + geset_kernel(m, n, offdiag_value, diag_value, A, lda, + item_ct1); + }); + + /* + DPCT1010:135: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void geset( + int64_t m, int64_t n, + float const& offdiag_value, float const& diag_value, + float* A, int64_t lda, + blas::Queue &queue); + +template +void geset( + int64_t m, int64_t n, + double const& offdiag_value, double const& diag_value, + double* A, int64_t lda, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void geset( + int64_t m, int64_t n, + std::complex const& offdiag_value, + std::complex const& diag_value, + std::complex* A, int64_t lda, + blas::Queue &queue) +{ + geset(m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)), + sycl::float2(real(diag_value), imag(diag_value)), (sycl::float2 *)A, + lda, queue); +} + +template <> +void geset( + int64_t m, int64_t n, + std::complex const& offdiag_value, + std::complex const& diag_value, + std::complex* A, int64_t lda, + blas::Queue &queue) +{ + geset(m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)), + sycl::double2(real(diag_value), imag(diag_value)), (sycl::double2 *)A, + lda, queue); +} + +//============================================================================== +namespace batch { + +//------------------------------------------------------------------------------ +/// Initializes a batch of m-by-n matrices Aarray[k] +/// to diag_value on the diagonal and offdiag_value on the off-diagonals. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] offdiag_value +/// The value to set outside of the diagonal. +/// +/// @param[in] diag_value +/// The value to set on the diagonal. +/// +/// @param[out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void geset( + int64_t m, int64_t n, + scalar_t const& offdiag_value, + scalar_t const& diag_value, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + // quick return + if (m == 0 || n == 0) + return; + + /* + DPCT1093:136: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                             sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           geset_batch_kernel(m, n, offdiag_value, diag_value,
+                                              Aarray, lda, item_ct1);
+                       });
+
+    /*
+    DPCT1010:137: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void geset(
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue);
+
+template
+void geset(
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <>
+void geset(
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue)
+{
+    geset(m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)),
+          (sycl::float2 **)Aarray, lda, batch_count, queue);
+}
+
+template <>
+void geset(
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue &queue)
+{
+    geset(m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)),
+          (sycl::double2 **)Aarray, lda, batch_count, queue);
+}
+
+} // namespace batch
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp
new file mode 100644
index 000000000..3a3ea4569
--- /dev/null
+++ b/src/sycl/device_henorm.dp.cpp
@@ -0,0 +1,487 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Finds the largest absolute value of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Uses dynamic shared memory array of length sizeof(real_t) * n.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_maxima
+///     Array of dimension gridDim.x.
+///     On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) )
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void henorm_max_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_maxima, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_max = (real_t*) dynamic_data;
+    if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) {
+        row_max[item_ct1.get_local_id(2)] = 0;
+    }
+
+    // Each thread finds max of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        chunk = i % item_ct1.get_local_range(2);
+
+        scalar_t const* row = &tile[ i ];
+        if (i < item_ct1.get_local_range(2)) {
+            row_max[chunk] = 0;
+        }
+
+        real_t max = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                max = max_nan(max, abs(row[j*lda]));
+            int64_t j = i;
+            max = max_nan(max, abs( real( row[j*lda] ))); // diag (real)
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                max = max_nan(max, abs(row[j*lda]));
+            int64_t j = i;
+            max = max_nan(max, abs( real( row[j*lda] ))); // diag (real)
+        }
+        row_max[chunk] = max_nan(max, row_max[chunk]);
+    }
+
+    // Reduction to find max of tile.
+    /*
+    DPCT1065:29: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2),
+                   row_max, item_ct1);
+    if (item_ct1.get_local_id(2) == 0) {
+        tiles_maxima[item_ct1.get_group(2)] = row_max[0];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each column of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one column.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void henorm_one_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row/column.
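+    // Only one triangle is stored, so the sum for index k combines the
+    // stored part of row k with the stored part of column k; e.g., for
+    // Lower with n = 3, the sum for k = 1 is
+    // abs( A(1,0) ) + abs( real( A(1,1) ) ) + abs( A(2,1) ).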
+    // todo: the row reads are coalesced, but the col reads are not coalesced
+    for (int k = item_ct1.get_local_id(2); k < n;
+         k += item_ct1.get_local_range(2)) {
+        scalar_t const* row    = &tile[ k ];
+        scalar_t const* column = &tile[ lda*k ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < k; ++j) // strictly lower
+                sum += abs(row[j*lda]);
+            int64_t j = k;
+            sum += abs( real( row[j*lda] )); // diag (real)
+            for (int64_t i = k + 1; i < n; ++i) // strictly lower
+                sum += abs(column[i]);
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > k; --j) // strictly upper
+                sum += abs(row[j*lda]);
+            int64_t j = k;
+            sum += abs( real( row[j*lda] )); // diag (real)
+            for (int64_t i = 0; i < k && i < n; ++i) // strictly upper
+                sum += abs(column[i]);
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + k] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by henorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void henorm_fro_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal (real)
+            add_sumsq( scale, sumsq, abs( real( row[ i*lda ] ) ) );
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                add_sumsq( scale, sumsq, abs( row[ j*lda ] ) );
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal (real)
+            add_sumsq( scale, sumsq, abs( real( row[ i*lda ] ) ) );
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
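+    // The (scale, sumsq) pairs are assumed to follow the LAPACK lassq
+    // convention: a partial result represents scale^2 * sumsq, with scale
+    // tracking the largest magnitude seen so far, so squaring never
+    // overflows or underflows; combine_sumsq merges two such pairs by
+    // rescaling the smaller one.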
+    /*
+    DPCT1065:30: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is stored in the upper or lower triangle.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///         On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///         On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: for symmetric, same as Norm::One
+///
+///     - Norm::Fro: ldv = 2.
+///         On exit,
+///             values[k*2 + 0] = scale_k
+///             values[k*2 + 1] = sumsq_k
+///         where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///         for 0 <= k < batch_count.
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+    int64_t nb = 512;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:142: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    //---------
+    // max norm
+    if (norm == lapack::Norm::Max) {
+        if (n == 0) {
+            blas::device_memset(values, 0, batch_count, queue);
+        }
+        else {
+            assert(ldv == 1);
+            /*
+            DPCT1083:32: The size of local memory in the migrated code may be
+            different from the original code. Check that the allocated memory
+            size in the migrated code is correct.
+            */
+            size_t shared_mem = sizeof(real_t) * nb;
+            /*
+            DPCT1049:31: The work-group size passed to the SYCL kernel may
+            exceed the limit. To get the device limit, query
+            info::device::max_work_group_size. Adjust the work-group size if
+            needed.
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_max_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:33: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_one_kernel(uplo, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:35: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + henorm_fro_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:143: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void henorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue& queue); + +template +void henorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
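+// (In this SYCL port the casts below reinterpret std::complex<float> /
+// std::complex<double> pointer arrays as sycl::float2 / sycl::double2,
+// which are assumed to share the same two-element layout.)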
+template <>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<float> const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    henorm(norm, uplo, n, (sycl::float2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+template <>
+void henorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<double> const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue& queue)
+{
+    henorm(norm, uplo, n, (sycl::double2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp
new file mode 100644
index 000000000..d57370262
--- /dev/null
+++ b/src/sycl/device_synorm.dp.cpp
@@ -0,0 +1,736 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+
+#include <cstdio>
+#include <complex>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Finds the largest absolute value of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Uses dynamic shared memory array of length sizeof(real_t) * n.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_maxima
+///     Array of dimension gridDim.x.
+///     On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) )
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void synorm_max_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_maxima, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_max = (real_t*) dynamic_data;
+    if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) {
+        row_max[item_ct1.get_local_id(2)] = 0;
+    }
+
+    // Each thread finds max of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        chunk = i % item_ct1.get_local_range(2);
+
+        scalar_t const* row = &tile[ i ];
+        if (i < item_ct1.get_local_range(2)) {
+            row_max[chunk] = 0;
+        }
+
+        real_t max = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                max = max_nan(max, abs(row[j*lda]));
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j >= i; --j) // upper
+                max = max_nan(max, abs(row[j*lda]));
+        }
+        row_max[chunk] = max_nan(max, row_max[chunk]);
+    }
+
+    // Reduction to find max of tile.
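+    // max_nan and max_nan_reduce are assumed to propagate NaN values, so a
+    // NaN anywhere in the tile shows up in the final maximum.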
+    /*
+    DPCT1065:73: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2),
+                   row_max, item_ct1);
+    if (item_ct1.get_local_id(2) == 0) {
+        tiles_maxima[item_ct1.get_group(2)] = row_max[0];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each column of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one column.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block (blockDim.x).
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void synorm_one_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row/column.
+    // todo: the row reads are coalesced, but the col reads are not coalesced
+    for (int k = item_ct1.get_local_id(2); k < n;
+         k += item_ct1.get_local_range(2)) {
+        scalar_t const* row    = &tile[ k ];
+        scalar_t const* column = &tile[ lda*k ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= k; ++j) // lower
+                sum += abs(row[j*lda]);
+            for (int64_t i = k + 1; i < n; ++i) // strictly lower
+                sum += abs(column[i]);
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j >= k; --j) // upper
+                sum += abs(row[j*lda]);
+            for (int64_t i = 0; i < k && i < n; ++i) // strictly upper
+                sum += abs(column[i]);
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + k] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (n >= 1).
+/// Launched by synorm().
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 1.
+///     Also the number of threads per block.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
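+///     For example, the pair (scale, sumsq) = (3, 5) represents
+///     sum_{i,j} abs( A^(k)_{i,j} )^2 = 3^2 * 5 = 45.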
+///
+template <typename scalar_t>
+void synorm_fro_kernel(
+    lapack::Uplo uplo,
+    int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < n;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal
+            add_sumsq( scale, sumsq, abs( row[ i*lda ] ) );
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            for (int64_t j = n-1; j > i; --j) // strictly upper
+                add_sumsq(scale, sumsq, abs(row[j*lda]));
+            // double for symmetric entries
+            sumsq *= 2;
+            // diagonal
+            add_sumsq( scale, sumsq, abs( row[ i*lda ] ) );
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
+    /*
+    DPCT1065:74: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is stored in the upper or lower triangle.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= n.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///         On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///         On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///         for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: for symmetric, same as Norm::One
+///
+///     - Norm::Fro: ldv = 2.
+///         On exit,
+///             values[k*2 + 0] = scale_k
+///             values[k*2 + 1] = sumsq_k
+///         where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///         for 0 <= k < batch_count.
+/// +/// @param[in] ldv +/// Leading dimension of values array. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void synorm( + lapack::Norm norm, lapack::Uplo uplo, + int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + using real_t = blas::real_type; + int64_t nb = 512; + + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:172: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:76: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:75: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_max_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:77: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_one_kernel(uplo, n, Aarray, lda, values, ldv, + item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:79: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:78: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + synorm_fro_kernel(uplo, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:173: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +const int ib = 32; +const int ib1 = 33; + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each row and each column of elements, +/// for each tile in tiles. +/// Each thread block deals with one tile. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by synormOffdiag(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_sums +/// Array of dimension gridDim.x * ldv. +/// On exit, +/// tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) +/// for column j of tile A^(k), and +/// tiles_sums[k*ldv + i + n] = sum_{j} abs( A^(k)_(i, j) ) +/// for row i of tile A^(k). +/// +/// @param[in] ldv +/// Leading dimension of tiles_sums (values) array. +/// +template +void synorm_offdiag_one_kernel( + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_sums, int64_t ldv, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) +{ + // row_sums doesn't need to be shared, it could be in registers, + // but we don't know how large it is beforehand -- each thread uses + // ceil(m/ib) entries; in total it is ceil(m/ib)*ib entries. + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + auto dynamic_data = (char *)dpct_local; + real_t* shmem_tile = (real_t*)dynamic_data; + real_t* row_sums = &shmem_tile[ ib1*ib ]; + const int k = item_ct1.get_local_id(2); + + // Initialize row sums. + for (int64_t ii = 0; ii < m; ii += ib) { + row_sums[ ii+k ] = 0; + } + + for (int64_t jj = 0; jj < n; jj += ib) { + real_t sum = 0.0; + for (int64_t ii = 0; ii < m; ii += ib) { + // Read 32 x 32 (ib x ib) sub-tile into shared memory. + // This does coalesced reads of one column at a time in parallel. + for (int64_t j = 0; j < ib; ++j) + if (jj+j < n && ii+k < m) + shmem_tile[ j*ib1 + k ] = abs( tile[ (jj+j)*lda + ii+k ] ); + /* + DPCT1065:80: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); // shmem_tile loaded + + // Each thread sums one column. + for (int64_t i = 0; i < ib; ++i) + if (ii+i < m) + sum += shmem_tile[ k*ib1 + i ]; + + // Each thread sums one row. 
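+            // (row_sums accumulates across all jj passes; the ib1 = ib + 1
+            // stride of shmem_tile is assumed to be padding that avoids
+            // shared-memory bank conflicts, as in the transpose kernels.)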
+            for (int64_t j = 0; j < ib; ++j)
+                if (jj+j < n)
+                    row_sums[ ii+k ] += shmem_tile[ j*ib1 + k ];
+            /*
+            DPCT1065:81: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier(); // done with shmem_tile
+        }
+
+        if (jj+k < n)
+            tiles_sums[item_ct1.get_group(2) * ldv + jj + k] = sum;
+    }
+
+    // Save row sums.
+    for (int64_t ii = 0; ii < m; ii += ib) {
+        if (ii+k < m)
+            tiles_sums[item_ct1.get_group(2) * ldv + ii + k + n] = row_sums[ii + k];
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each tile.
+/// Used for full, off-diagonal tiles within a symmetric matrix,
+/// where element Aij contributes to both column i and j.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::One: ldv >= n + m.
+///         On exit,
+///             values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///             for 0 <= k < batch_count, 0 <= j < n, and
+///             values[k*ldv + n + i] = sum_{j} abs( A^(k)_(i, j) )
+///             for 0 <= k < batch_count, 0 <= i < m.
+///
+///     - Norm::Inf: same as Norm::One.
+///
+///     Norm::Max and Norm::Fro are not implemented here; see synorm().
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:174: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    //---------
+    // one norm
+    if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) {
+        assert(ldv >= n);
+        size_t shared_mem
+            /*
+            DPCT1083:82: The size of local memory in the migrated code may be
+            different from the original code. Check that the allocated memory
+            size in the migrated code is correct.
+            */
+            = sizeof(real_t) * (ib * ib1 + roundup(m, int64_t(ib)));
+        assert( shared_mem <= 48*1024 ); // max 48 KiB
+        ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+                sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                      sycl::range<3>(1, 1, 32),
+                                  sycl::range<3>(1, 1, 32)),
+                [=](sycl::nd_item<3> item_ct1) {
+                    synorm_offdiag_one_kernel(m, n, Aarray, lda, values, ldv,
+                                              item_ct1,
+                                              dpct_local_acc_ct1.get_pointer());
+                });
+        });
+    }
+    else {
+        slate_not_implemented("Only Norm::One and Norm::Inf are supported.");
+    }
+
+    /*
+    DPCT1010:175: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    float const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue);
+
+template
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    double const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <>
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<float> const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    synorm(norm, uplo, n, (sycl::float2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+template <>
+void synorm(
+    lapack::Norm norm, lapack::Uplo uplo,
+    int64_t n,
+    std::complex<double> const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    synorm(norm, uplo, n, (sycl::double2 **)Aarray, lda, values, ldv,
+           batch_count, queue);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    float const* const* Aarray, int64_t lda,
+    float* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue);
+
+template
+void synormOffdiag(
+    lapack::Norm norm,
+    int64_t m, int64_t n,
+    double const* const* Aarray, int64_t lda,
+    double* values, int64_t ldv,
+    int64_t batch_count,
+    blas::Queue &queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => cuComplex.
+template <> +void synormOffdiag( + lapack::Norm norm, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, + int64_t batch_count, + blas::Queue &queue) +{ + synormOffdiag(norm, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void synormOffdiag( + lapack::Norm norm, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, + int64_t batch_count, + blas::Queue &queue) + +{ + synormOffdiag(norm, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_transpose.dp.cpp b/src/sycl/device_transpose.dp.cpp new file mode 100644 index 000000000..33d7007eb --- /dev/null +++ b/src/sycl/device_transpose.dp.cpp @@ -0,0 +1,918 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +/// internal blocking +/// 16 x 16 thread block = 256 threads +/// 32 x 32 thread block = 1024 threads +static const int ib = 16; + +//------------------------------------------------------------------------------ +/// Device routine handles one matrix. +/// Thread block grid: +/// x = batch index (ignored here; see batch kernel), +/// y = block row index, +/// z = block col index. +/// Each thread block is ib-by-ib threads and does one ib-by-ib block of an +/// n-by-n matrix. +/// +/// Let nt = ceildiv( n, ib ) be the number of blocks for one n-by-n matrix. +/// An even number of blocks uses an (nt + 1) by (nt/2) grid. +/// Example: for nt = 4 blocks, y by z = 5 by 2 grid: +/// [ A00 A01 ] +/// [----. A11 ] [ A10 . | . . ] +/// [ A10 '----] [ A20 A21 | . . ] +/// [ A20 A21 ] covers matrix as [ A30 A31 | A00 . ] +/// [ A30 A31 ] [ A40 A41 | A01 A11 ] +/// [ A40 A41 ] +/// +/// An odd number of blocks uses an (nt) by (nt + 1)/2 grid. +/// Example: for nt = 5 blocks, y by z = 5 by 3 grid: +/// [ A00 | A01 A02 ] +/// [ '----. ] [ A00 . . | . . ] +/// [ A10 A11 | A12 ] [ A10 A11 . | . . ] +/// [ '-----] covers matrix as [ A20 A21 A22 | . . ] +/// [ A20 A21 A22 ] [ A30 A31 A32 | A01 . ] +/// [ A30 A31 A32 ] [ A40 A41 A42 | A02 A12 ] +/// [ A40 A41 A42 ] +/// +template +void transpose_func( + bool is_conj, + int n, + scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1, + sycl::local_accessor sA1, sycl::local_accessor sA2, + sycl::local_accessor sA) +{ + // +1 to avoid memory bank conflicts. + + // i, j are row & column indices of top-left corner of each block. + // ii, jj are row & column offsets within each block. + int ii = item_ct1.get_local_id(2); + int jj = item_ct1.get_local_id(1); + + int i, j; + if (item_ct1.get_group_range(1) - 1 == item_ct1.get_group_range(0) * 2) { + // Even number of blocks. + //assert( ceildiv(n, ib) % 2 == 0 ); + bool lower = (item_ct1.get_group(1) > item_ct1.get_group(0)); + i = (lower ? (item_ct1.get_group(1) - 1) + : (item_ct1.get_group(0) + item_ct1.get_group_range(0))); + j = (lower ? (item_ct1.get_group(0)) + : (item_ct1.get_group(1) + item_ct1.get_group_range(0))); + } + else { + // Odd number of blocks. 
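+        // Mirror of the even case: blocks on one side of the grid diagonal
+        // map directly to tile pairs; the rest are folded over to cover the
+        // remaining pairs, per the nt = 5 sketch in the comment above.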
+ //assert( ceildiv(n, ib) % 2 == 1 ); + bool lower = (item_ct1.get_group(1) >= item_ct1.get_group(0)); + i = (lower ? item_ct1.get_group(1) + : (item_ct1.get_group(0) + item_ct1.get_group_range(0) - 1)); + j = (lower ? item_ct1.get_group(0) + : (item_ct1.get_group(1) + item_ct1.get_group_range(0))); + } + i *= ib; + j *= ib; + + scalar_t* A1 = A + i + ii + (j + jj)*lda; // A(i, j) + if (i == j) { // diagonal block + // Load block A(i, j) into shared memory sA1. + if (i + ii < n && j + jj < n) { + sA1[jj][ii] = *A1; + } + /* + DPCT1065:62: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // Save transposed block, A(i, j) = trans(sA1). + if (i + ii < n && j + jj < n) { + if (is_conj) + *A1 = conj(sA1[ii][jj]); + else + *A1 = sA1[ii][jj]; + } + } + else { // off-diagonal block + scalar_t* A2 = A + j + ii + (i + jj)*lda; // A(j, i) + // Load blocks A(i, j) and A(j, i) into shared memory sA1 and sA2. + if (i + ii < n && j + jj < n) { + sA1[jj][ii] = *A1; + } + if (j + ii < n && i + jj < n) { + sA2[jj][ii] = *A2; + } + /* + DPCT1065:63: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // Save transposed blocks, A(i, j) = trans(sA2), A(j, i) = trans(sA1). + if (i + ii < n && j + jj < n) { + if (is_conj) + *A1 = conj(sA2[ii][jj]); + else + *A1 = sA2[ii][jj]; + } + if (j + ii < n && i + jj < n) { + if (is_conj) + *A2 = conj(sA1[ii][jj]); + else + *A2 = sA1[ii][jj]; + } + } +} + +//------------------------------------------------------------------------------ +static const int NB = 32; ///< block size for transpose_func +static const int NY = 8; ///< y dim of thread block size for transpose_func +// static const int NX = 32; handled as template parameter, look below + + +/// tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB. +/// uses NX-by-NY threads, where NB/NX, NB/NY, NX/NY evenly. 
+/// subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB +/// for each subtile +/// load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY +/// save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY +/// A += NX +/// AT += NX*ldat +/// +/// e.g., with NB=32, NX=32, NY=8 ([sdc] precisions) +/// load 32x32 subtile as 4 blocks of 32x8 columns: (A11 A12 A13 A14 ) +/// save 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14) +/// +/// e.g., with NB=32, NX=16, NY=8 (z precision) +/// load 16x32 subtile as 4 blocks of 16x8 columns: (A11 A12 A13 A14) +/// save 32x16 subtile as 2*2 blocks of 16x8 columns: (AT11 AT12) +/// (AT21 AT22) +/// +template +void transpose_func( + bool is_conj, + int m, int n, + const scalar_t *A, int64_t lda, + scalar_t *AT, int64_t ldat, const sycl::nd_item<3> &item_ct1, + sycl::local_accessor sA1, + sycl::local_accessor sA2, + sycl::local_accessor sA) +{ + + int tx = item_ct1.get_local_id(2); + int ty = item_ct1.get_local_id(1); + int iby = item_ct1.get_group(1) * NB; + int ibz = item_ct1.get_group(0) * NB; + int i, j; + + A += iby + tx + (ibz + ty)*lda; + AT += ibz + tx + (iby + ty)*ldat; + + #pragma unroll + for (int tile=0; tile < NB/NX; ++tile) { + // load NX-by-NB subtile transposed from A into sA + i = iby + tx + tile*NX; + j = ibz + ty; + if (i < m) { + if (is_conj) { + #pragma unroll + for (int j2=0; j2 < NB; j2 += NY) { + if (j + j2 < n) { + sA[ty + j2][tx] = conj(A[j2*lda]); + } + } + } + else { + #pragma unroll + for (int j2=0; j2 < NB; j2 += NY) { + if (j + j2 < n) { + sA[ty + j2][tx] = A[j2*lda]; + } + } + } + } + /* + DPCT1065:64: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + // save NB-by-NX subtile from sA into AT + i = ibz + tx; + j = iby + ty + tile*NX; + #pragma unroll + for (int i2=0; i2 < NB; i2 += NX) { + if (i + i2 < n) { + #pragma unroll + for (int j2=0; j2 < NX; j2 += NY) { + if (j + j2 < m) { + AT[i2 + j2*ldat] = sA[tx + i2][ty + j2]; + } + } + } + } + /* + DPCT1065:65: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. 
+        */
+        item_ct1.barrier();
+
+        // move to next subtile
+        A += NX;
+        AT += NX*ldat;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// in-place transpose of a square buffer
+template <typename scalar_t>
+void transpose_kernel(
+    bool is_conj,
+    int n,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func(is_conj, n, A, lda, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// in-place transpose of array of square buffers
+template <typename scalar_t>
+void transpose_batch_kernel(
+    bool is_conj,
+    int n,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func(is_conj, n, Aarray[item_ct1.get_group(2)], lda, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// out-of-place transpose of a rectangular buffer
+/// transposes A onto AT
+///
+template <int NX, typename scalar_t>
+void transpose_kernel(
+    bool is_conj,
+    int m, int n,
+    const scalar_t *A, int64_t lda,
+    scalar_t *AT, int64_t ldat, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func<NX>(is_conj, m, n, A, lda, AT, ldat, item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// out-of-place transpose of an array of rectangular buffers
+/// transposes dA_array onto dAT_array
+///
+template <int NX, typename scalar_t>
+void transpose_batch_kernel(
+    bool is_conj,
+    int m, int n,
+    scalar_t **dA_array, int64_t lda,
+    scalar_t **dAT_array, int64_t ldat, const sycl::nd_item<3> &item_ct1,
+    sycl::local_accessor<scalar_t, 2> sA1, sycl::local_accessor<scalar_t, 2> sA2,
+    sycl::local_accessor<scalar_t, 2> sA)
+{
+    transpose_func<NX>(is_conj, m, n, dA_array[item_ct1.get_group(2)],
+                       lda, dAT_array[item_ct1.get_group(2)], ldat,
+                       item_ct1, sA1, sA2, sA);
+}
+
+//------------------------------------------------------------------------------
+/// Physically transpose a square matrix in place.
+///
+/// @param[in] n
+///     Number of rows and columns of each tile. n >= 0.
+///
+/// @param[in,out] A
+///     A square n-by-n matrix stored in an lda-by-n array in GPU memory.
+///     On output, A is transposed.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= n.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void transpose(
+    bool is_conj,
+    int64_t n,
+    scalar_t* A, int64_t lda,
+    blas::Queue& queue)
+{
+    if (n <= 1)
+        return;
+    assert(lda >= n);
+
+    /*
+    DPCT1093:158: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    int nt = ceildiv( n, int64_t(ib) );
+    assert(nt <= 65535); // CUDA limitation
+
+    // Need 1/2 * (nt + 1) * nt to cover lower triangle and diagonal of matrix.
+    // Block assignment differs depending on whether nt is odd or even.
+    sycl::range<3> blocks(1, 1, 1);
+    if (nt % 2 == 0) {
+        // even blocks
+        blocks = sycl::range<3>(uint(nt / 2), uint(nt + 1), 1);
+    }
+    else {
+        // odd blocks
+        blocks = sycl::range<3>(uint((nt + 1) / 2), uint(nt), 1);
+    }
+    sycl::range<3> threads(1, ib, ib);
+
+    /*
+    DPCT1049:66: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
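+    (Here the work-group is ib x ib = 16 x 16 = 256 work-items, which should
+    be within the limit on current devices.)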
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:176: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:177: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:178: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:179: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:180: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_kernel(is_conj, n, A, lda, item_ct1, sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:159: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Physically transpose a batch of square matrices in place. +/// +/// @param[in] n +/// Number of rows and columns of each tile. n >= 0. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each Aarray[k] is a square n-by-n matrix stored in an +/// lda-by-n array in GPU memory. +/// On output, each Aarray[k] is transposed. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= n. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose_batch( + bool is_conj, + int64_t n, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + if (batch_count < 0 || n <= 1) + return; + assert(lda >= n); + + /* + DPCT1093:160: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + int nt = ceildiv( n, int64_t(ib) ); + assert(nt <= 65535); // CUDA limitation + assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1 + + // Need 1/2 * (nt + 1) * nt to cover lower triangle and diagonal of matrix. + // Block assignment differs depending on whether nt is odd or even. + sycl::range<3> blocks(1, 1, 1); + if (nt % 2 == 0) { + // even blocks + blocks = sycl::range<3>(uint(nt / 2), uint(nt + 1), uint(batch_count)); + } + else { + // odd blocks + blocks = sycl::range<3>(uint((nt + 1) / 2), uint(nt), uint(batch_count)); + } + sycl::range<3> threads(1, ib, ib); + + /* + DPCT1049:67: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:181: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:182: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:183: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:184: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:185: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(blocks * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_batch_kernel(is_conj, n, Aarray, lda, + item_ct1, sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:161: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Look up NX based on data type. +/// float, double, complex-float use NX = 32. +template +struct nx_traits +{ + static const int NX = 32; +}; + +template <> struct nx_traits +{ + // static const int NX = 16; + static const int NX = 32; // always use 32 for SYCL +}; + +//------------------------------------------------------------------------------ +/// Physically transpose a rectangular matrix out-of-place. +/// +/// @param[in] m +/// Number of columns of tile. m >= 0. +/// +/// @param[in] n +/// Number of rows of tile. n >= 0. +/// +/// @param[in] dA +/// A rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of dA. lda >= m. +/// +/// @param[out] dAT +/// A rectangular m-by-n matrix stored in an ldat-by-m array in GPU memory. +/// On output, dAT is the transpose of dA. +/// +/// @param[in] ldat +/// Leading dimension of dAT. ldat >= n. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + scalar_t* dA, int64_t lda, + scalar_t* dAT, int64_t ldat, + blas::Queue& queue) +{ + const int NX = nx_traits::NX; + + if ((m <= 0) || (n <= 0)) + return; + assert(lda >= m); + assert(ldat >= n); + + /* + DPCT1093:162: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + int mt = ceildiv( m, int64_t(NB) ); + assert(mt <= 65535); // CUDA limitation + int nt = ceildiv( n, int64_t(NB) ); + assert(nt <= 65535); // CUDA limitation + + sycl::range<3> grid(nt, mt, 1); + sycl::range<3> threads(1, NY, NX); + /* + DPCT1049:68: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:186: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:187: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:188: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:189: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:190: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_kernel( + is_conj, m, n, dA, lda, dAT, ldat, item_ct1, + sA1_acc_ct1, sA2_acc_ct1, sA_acc_ct1); + }); + }); + + /* + DPCT1010:163: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +/// Physically transpose a batch of rectangular matrices out-of-place. +/// +/// @param[in] m +/// Number of columns of each tile. m >= 0. +/// +/// @param[in] n +/// Number of rows of each tile. n >= 0. +/// +/// @param[in] dA_array +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an +/// lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each dA_array[k] tile. lda >= m. +/// +/// @param[out] dAT_array +/// Array in GPU memory of dimension batch_count, containing pointers to +/// matrices, where each dAT_array[k] is a rectangular m-by-n matrix stored in an +/// ldat-by-m array in GPU memory. +/// On output, each dAT_array[k] is the transpose of dA_array[k]. +/// +/// @param[in] ldat +/// Leading dimension of each dAT_array[k] tile. ldat >= n. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + scalar_t **dA_array, int64_t lda, + scalar_t **dAT_array, int64_t ldat, + int64_t batch_count, + blas::Queue& queue) +{ + const int NX = nx_traits::NX; + + if ((m <= 0) || (n <= 0)) + return; + assert(lda >= m); + assert(ldat >= n); + + /* + DPCT1093:164: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. 
+ */ + dpct::select_device(queue.device()); + + int mt = ceildiv( m, int64_t(NB) ); + assert(mt <= 65535); // CUDA limitation + int nt = ceildiv( n, int64_t(NB) ); + assert(nt <= 65535); // CUDA limitation + assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1 + + sycl::range<3> grid(nt, mt, uint(batch_count)); + sycl::range<3> threads(1, NY, NX); + /* + DPCT1049:69: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + /* + DPCT1101:191: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:192: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA1_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:193: 'ib' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + /* + DPCT1101:194: 'ib+1' expression was replaced with a value. Modify the + code to use the original expression, provided in comments, if it is + correct. + */ + sycl::local_accessor sA2_acc_ct1( + sycl::range<2>(16 /*ib*/, 17 /*ib+1*/), cgh); + /* + DPCT1101:195: 'NB' expression was replaced with a value. Modify the code + to use the original expression, provided in comments, if it is correct. + */ + sycl::local_accessor sA_acc_ct1( + sycl::range<2>(32 /*NB*/, + /* dpct_placeholder NX */ 32 /*Fix the type mannually*/ + 1), + cgh); + + cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads), + [=](sycl::nd_item<3> item_ct1) { + transpose_batch_kernel( + is_conj, m, n, dA_array, lda, dAT_array, ldat, + item_ct1, sA1_acc_ct1, sA2_acc_ct1, + sA_acc_ct1); + }); + }); + + /* + DPCT1010:165: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void transpose( + bool is_conj, + int64_t n, + float* A, int64_t lda, + blas::Queue& queue); + +template +void transpose( + bool is_conj, + int64_t n, + double* A, int64_t lda, + blas::Queue& queue); + +//----- rectangular, out-of-place +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + float* A, int64_t lda, + float* B, int64_t ldb, + blas::Queue& queue); + +template +void transpose( + bool is_conj, + int64_t m, int64_t n, + double* A, int64_t lda, + double* B, int64_t ldb, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. 
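+// Here sycl::float2 / sycl::double2 play the role cuComplex plays in the
+// CUDA sources. The casts assume std::complex<T> and the SYCL vector type
+// are layout-compatible (both store two contiguous Ts); a guard such as
+//     static_assert( sizeof(std::complex<float>) == sizeof(sycl::float2),
+//                    "std::complex<float> must match sycl::float2" );
+// could check that assumption.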
+template <> +void transpose( + bool is_conj, + int64_t n, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + transpose(is_conj, n, (sycl::float2 *)A, lda, queue); +} + +template <> +void transpose( + bool is_conj, + int64_t n, + std::complex* A, int64_t lda, + blas::Queue& queue) +{ + transpose(is_conj, n, (sycl::double2 *)A, lda, queue); +} + +template <> +void transpose( + bool is_conj, + int64_t m, int64_t n, + std::complex* A, int64_t lda, + std::complex* B, int64_t ldb, + blas::Queue& queue) +{ + transpose(is_conj, m, n, (sycl::float2 *)A, lda, (sycl::float2 *)B, ldb, + queue); +} + +template <> +void transpose( + bool is_conj, + int64_t m, int64_t n, + std::complex* A, int64_t lda, + std::complex* B, int64_t ldb, + blas::Queue& queue) +{ + transpose(is_conj, m, n, (sycl::double2 *)A, lda, (sycl::double2 *)B, ldb, + queue); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void transpose_batch( + bool is_conj, + int64_t n, + float** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue); + +template +void transpose_batch( + bool is_conj, + int64_t n, + double** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue); + +//----- rectangular, out-of-place +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + float** Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue); + +template +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + double** Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void transpose_batch( + bool is_conj, + int64_t n, + std::complex** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, n, (sycl::float2 **)Aarray, lda, batch_count, queue); +} + +template <> +void transpose_batch( + bool is_conj, + int64_t n, + std::complex** Aarray, int64_t lda, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, n, (sycl::double2 **)Aarray, lda, batch_count, + queue); +} + +//----- rectangular, out-of-place +template <> +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + std::complex** Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, m, n, (sycl::float2 **)Aarray, lda, + (sycl::float2 **)Barray, ldb, batch_count, queue); +} + +template <> +void transpose_batch( + bool is_conj, + int64_t m, int64_t n, + std::complex** Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, + blas::Queue& queue) +{ + transpose_batch(is_conj, m, n, (sycl::double2 **)Aarray, lda, + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp new file mode 100644 index 000000000..f0b802937 --- /dev/null +++ b/src/sycl/device_trnorm.dp.cpp @@ -0,0 +1,632 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
+ +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Finds the largest absolute value of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one row, followed by a reduction. +/// Uses dynamic shared memory array of length sizeof(real_t) * m. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by trnorm(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// Also the number of threads per block (blockDim.x), hence, +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile. lda >= m. +/// +/// @param[out] tiles_maxima +/// Array of dimension gridDim.x. +/// On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) +/// for tile A^(k). +/// +template +void trnorm_max_kernel( + lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + scalar_t const* const* Aarray, int64_t lda, + blas::real_type* tiles_maxima, const sycl::nd_item<3> &item_ct1, + uint8_t *dpct_local) +{ + using real_t = blas::real_type; + scalar_t const *tile = Aarray[item_ct1.get_group(2)]; + int chunk; + + // Save partial results in shared memory. + auto dynamic_data = (char *)dpct_local; + real_t* row_max = (real_t*) dynamic_data; + + if (item_ct1.get_local_id(2) < item_ct1.get_local_range(2)) { + row_max[item_ct1.get_local_id(2)] = 0; + } + // Each thread finds max of one row. + // This does coalesced reads of one column at a time in parallel. + for (int i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + chunk = i % item_ct1.get_local_range(2); + + scalar_t const* row = &tile[ i ]; + + real_t max = 0; + if (uplo == lapack::Uplo::Lower) { + if (diag == lapack::Diag::Unit) { + if (i < n) // diag + max = 1; + for (int64_t j = 0; j < i && j < n; ++j) // strictly lower + max = max_nan(max, abs(row[j*lda])); + } + else { + for (int64_t j = 0; j <= i && j < n; ++j) // lower + max = max_nan(max, abs(row[j*lda])); + } + } + else { + // Loop backwards (n-1 down to i) to maintain coalesced reads. + if (diag == lapack::Diag::Unit) { + if (i < n) // diag + max = 1; + for (int64_t j = n-1; j > i; --j) // strictly upper + max = max_nan(max, abs(row[j*lda])); + } + else { + for (int64_t j = n-1; j >= i; --j) // upper + max = max_nan(max, abs(row[j*lda])); + } + } + + row_max[chunk] = max_nan(max, row_max[chunk]); + } + + // Reduction to find max of tile. + /* + DPCT1065:51: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + max_nan_reduce(item_ct1.get_local_range(2), item_ct1.get_local_id(2), + row_max, item_ct1); + if (item_ct1.get_local_id(2) == 0) { + tiles_maxima[item_ct1.get_group(2)] = row_max[0]; + } +} + +//------------------------------------------------------------------------------ +/// Sum of absolute values of each column of elements, for each tile in Aarray. +/// Each thread block deals with one tile. +/// Each thread deals with one column. +/// Kernel assumes non-trivial tiles (m, n >= 1). +/// Launched by trnorm(). 
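+///
+/// For reference, a serial sketch of what one tile's column sums compute
+/// (lower, non-unit case; illustrative only):
+/// @code
+///     for (int64_t j = 0; j < n; ++j) {
+///         real_t sum = 0;
+///         for (int64_t i = j; i < m; ++i) // lower trapezoid of column j
+///             sum += abs( tile[ i + j*lda ] );
+///         tiles_sums[ k*ldv + j ] = sum;
+///     }
+/// @endcode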
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///     for column j of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void trnorm_one_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one column.
+    // todo: this doesn't do coalesced reads
+    for (int j = item_ct1.get_local_id(2); j < n;
+         j += item_ct1.get_local_range(2)) {
+
+        scalar_t const* column = &tile[ lda*j ];
+        real_t sum = 0;
+
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (j < m) // diag
+                    sum += 1;
+                for (int64_t i = j+1; i < m; ++i) // strictly lower
+                    sum += abs(column[i]);
+            }
+            else {
+                for (int64_t i = j; i < m; ++i) // lower
+                    sum += abs(column[i]);
+            }
+        }
+        else {
+            if (diag == lapack::Diag::Unit) {
+                if (j < m) // diag
+                    sum += 1;
+                for (int64_t i = 0; i < j && i < m; ++i) // strictly upper
+                    sum += abs(column[i]);
+            }
+            else {
+                for (int64_t i = 0; i <= j && i < m; ++i) // upper
+                    sum += abs(column[i]);
+            }
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + j] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of absolute values of each row of elements, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row.
+/// Kernel assumes non-trivial tiles (m, n >= 1).
+/// Launched by trnorm().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_sums
+///     Array of dimension gridDim.x * ldv.
+///     On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) )
+///     for row i of tile A^(k).
+///
+/// @param[in] ldv
+///     Leading dimension of tiles_sums (values) array.
+///
+template <typename scalar_t>
+void trnorm_inf_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_sums, int64_t ldv,
+    const sycl::nd_item<3> &item_ct1)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+
+    // Each thread sums one row.
+    // This does coalesced reads of one column at a time in parallel.
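+    // (For fixed j, threads tid, tid+1, ... access tile[ i + j*lda ] at
+    // consecutive addresses i, giving the coalesced pattern noted above.)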
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t const* row = &tile[ i ];
+        real_t sum = 0;
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    sum += 1;
+                for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                    sum += abs(row[j*lda]);
+            }
+            else {
+                for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                    sum += abs(row[j*lda]);
+            }
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    sum += 1;
+                for (int64_t j = n-1; j > i; --j) // strictly upper
+                    sum += abs(row[j*lda]);
+            }
+            else {
+                for (int64_t j = n-1; j >= i; --j) // upper
+                    sum += abs(row[j*lda]);
+            }
+        }
+        tiles_sums[item_ct1.get_group(2) * ldv + i] = sum;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Sum of squares, in scaled representation, for each tile in Aarray.
+/// Each thread block deals with one tile.
+/// Each thread deals with one row, followed by a reduction.
+/// Kernel assumes non-trivial tiles (m, n >= 1).
+/// Launched by trnorm().
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 1.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 1.
+///
+/// @param[in] Aarray
+///     Array of tiles of dimension gridDim.x,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] tiles_values
+///     Array of dimension 2 * gridDim.x.
+///     On exit,
+///         tiles_values[2*k + 0] = scale
+///         tiles_values[2*k + 1] = sumsq
+///     such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2
+///     for tile A^(k).
+///
+template <typename scalar_t>
+void trnorm_fro_kernel(
+    lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* tiles_values, const sycl::nd_item<3> &item_ct1,
+    uint8_t *dpct_local)
+{
+    using real_t = blas::real_type<scalar_t>;
+    scalar_t const *tile = Aarray[item_ct1.get_group(2)];
+    int chunk;
+
+    // Save partial results in shared memory.
+    auto dynamic_data = (char *)dpct_local;
+    real_t* row_scale = (real_t*) &dynamic_data[0];
+    real_t *row_sumsq = &row_scale[item_ct1.get_local_range(2)];
+
+    // Each thread finds sum-of-squares of one row.
+    // This does coalesced reads of one column at a time in parallel.
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        real_t scale = 0;
+        real_t sumsq = 1;
+        chunk = i % item_ct1.get_local_range(2);
+        scalar_t const* row = &tile[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    add_sumsq(scale, sumsq, real_t(1));
+                for (int64_t j = 0; j < i && j < n; ++j) // strictly lower
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+            else {
+                for (int64_t j = 0; j <= i && j < n; ++j) // lower
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+        }
+        else {
+            // Loop backwards (n-1 down to i) to maintain coalesced reads.
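+            // (add_sumsq maintains scale^2 * sumsq == the sum of squares
+            // absorbed so far, in the style of LAPACK's lassq; e.g. absorbing
+            // 3 then 4 yields scale = 4, sumsq = 1.5625, and 16 * 1.5625 == 25.)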
+            if (diag == lapack::Diag::Unit) {
+                if (i < n) // diag
+                    add_sumsq(scale, sumsq, real_t(1));
+                for (int64_t j = n-1; j > i; --j) // strictly upper
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+            else {
+                for (int64_t j = n-1; j >= i; --j) // upper
+                    add_sumsq(scale, sumsq, abs(row[j*lda]));
+            }
+        }
+
+        if (i < item_ct1.get_local_range(2)) {
+            row_scale[chunk] = 0;
+            row_sumsq[chunk] = 1;
+        }
+
+        combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
+    }
+
+    // Reduction to find sum-of-squares of tile.
+    // todo: parallel reduction.
+    /*
+    DPCT1065:52: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (item_ct1.get_local_id(2) == 0) {
+        real_t tile_scale = row_scale[0];
+        real_t tile_sumsq = row_sumsq[0];
+        for (int64_t chunk = 1;
+             chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) {
+            combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]);
+        }
+
+        tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale;
+        tiles_values[item_ct1.get_group(2) * 2 + 1] = tile_sumsq;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Batched routine that computes a partial norm for each trapezoidal tile.
+///
+/// todo: rename to tznorm for consistency with other tz routines.
+///
+/// @param[in] norm
+///     Norm to compute. See values for description.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is upper or lower trapezoidal.
+///
+/// @param[in] diag
+///     Whether or not each Aarray[k] has unit diagonal.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] Aarray
+///     Array in GPU memory of dimension batch_count, containing pointers to tiles,
+///     where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile. lda >= m.
+///
+/// @param[out] values
+///     Array in GPU memory, dimension batch_count * ldv.
+///     - Norm::Max: ldv = 1.
+///       On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count.
+///
+///     - Norm::One: ldv >= n.
+///       On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count, 0 <= j < n.
+///
+///     - Norm::Inf: ldv >= m.
+///       On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) )
+///       for 0 <= k < batch_count, 0 <= i < m.
+///
+///     - Norm::Fro: ldv = 2.
+///       On exit,
+///           values[k*2 + 0] = scale_k
+///           values[k*2 + 1] = sumsq_k
+///       where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2
+///       for 0 <= k < batch_count.
+///
+/// @param[in] ldv
+///     Leading dimension of values array.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void trnorm(
+    lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag,
+    int64_t m, int64_t n,
+    scalar_t const* const* Aarray, int64_t lda,
+    blas::real_type<scalar_t>* values, int64_t ldv, int64_t batch_count,
+    blas::Queue &queue)
+{
+    using real_t = blas::real_type<scalar_t>;
+    int64_t nb = 512;
+
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:150: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+ */ + dpct::select_device(queue.device()); + + //--------- + // max norm + if (norm == lapack::Norm::Max) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count, queue); + } + else { + assert(ldv == 1); + /* + DPCT1083:54: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb; + /* + DPCT1049:53: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_max_kernel(uplo, diag, m, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + //--------- + // one norm + else if (norm == lapack::Norm::One) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * n, queue); + } + else { + assert(ldv >= n); + /* + DPCT1049:55: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_one_kernel(uplo, diag, m, n, Aarray, lda, values, + ldv, item_ct1); + }); + } + } + //--------- + // inf norm + else if (norm == lapack::Norm::Inf) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * m, queue); + } + else { + assert(ldv >= m); + /* + DPCT1049:56: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_inf_kernel(uplo, diag, m, n, Aarray, lda, values, + ldv, item_ct1); + }); + } + } + //--------- + // Frobenius norm + else if (norm == lapack::Norm::Fro) { + if (m == 0 || n == 0) { + blas::device_memset(values, 0, batch_count * 2, queue); + } + else { + assert(ldv == 2); + /* + DPCT1083:58: The size of local memory in the migrated code may be + different from the original code. Check that the allocated memory + size in the migrated code is correct. + */ + size_t shared_mem = sizeof(real_t) * nb * 2; + /* + DPCT1049:57: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + ((sycl::queue *)(&queue.stream()))->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nb), + sycl::range<3>(1, 1, nb)), + [=](sycl::nd_item<3> item_ct1) { + trnorm_fro_kernel(uplo, diag, m, n, Aarray, lda, values, + item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); + } + } + + /* + DPCT1010:151: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +template +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + float* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + trnorm(norm, uplo, diag, m, n, (sycl::float2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +template <> +void trnorm( + lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + double* values, int64_t ldv, int64_t batch_count, + blas::Queue &queue) +{ + trnorm(norm, uplo, diag, m, n, (sycl::double2 **)Aarray, lda, values, ldv, + batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp new file mode 100644 index 000000000..e588fb4c9 --- /dev/null +++ b/src/sycl/device_tzadd.dp.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile addition. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by tzadd(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[in,out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. 
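+/// Only the upper or lower trapezoid selected by uplo is updated;
+/// the rest of each Barray[k] is not referenced.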
+/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void tzadd_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + scalar_t alpha, scalar_t** Aarray, int64_t lda, + scalar_t beta, scalar_t** Barray, int64_t ldb, + const sycl::nd_item<3> &item_ct1) +{ + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + scalar_t* rowB = &tileB[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + rowB[j*ldb] = alpha * rowA[j*lda] + beta * rowB[ j*ldb ]; + // rowB[j * ldb] = dpct_operator_overloading::operator+( + // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), + // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + } + } + else { + for (int64_t j = n-1; j >= i; --j) { // upper + rowB[j*ldb] = alpha * rowA[ j*lda ] + beta * rowB[ j*ldb ]; + // rowB[j * ldb] = dpct_operator_overloading::operator+( + // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), + // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + } + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal tile addition. +/// Sets upper or lower part of +/// \[ +/// Barray[k] = \alpha Aarray[k] + \beta Barray[k]. +/// \] +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] alpha +/// The scalar alpha. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] beta +/// The scalar beta. +/// +/// @param[in,out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + scalar_t const& alpha, scalar_t** Aarray, int64_t lda, + scalar_t const& beta, scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:138: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
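+    (Here nthreads = min( 1024, m ), so the launch should already respect
+    the usual 1024 work-item limit on current devices.)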
+ */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzadd_kernel(uplo, m, n, alpha, Aarray, lda, beta, + Barray, ldb, item_ct1); + }); + + /* + DPCT1010:139: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const& alpha, float** Aarray, int64_t lda, + float const& beta, float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +template +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const& alpha, double** Aarray, int64_t lda, + double const& beta, double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. +template <> +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzadd(uplo, m, n, sycl::float2(real(alpha), imag(alpha)), + (sycl::float2 **)Aarray, lda, sycl::float2(real(beta), imag(beta)), + (sycl::float2 **)Barray, ldb, batch_count, queue); +} + +template <> +void tzadd( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const& alpha, std::complex** Aarray, int64_t lda, + std::complex const& beta, std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzadd(uplo, m, n, sycl::double2(real(alpha), imag(alpha)), + (sycl::double2 **)Aarray, lda, sycl::double2(real(beta), imag(beta)), + (sycl::double2 **)Barray, ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzcopy.dp.cpp b/src/sycl/device_tzcopy.dp.cpp new file mode 100644 index 000000000..586ba697f --- /dev/null +++ b/src/sycl/device_tzcopy.dp.cpp @@ -0,0 +1,246 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing copy and precision conversions, copying A to B. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by tzcopy(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +/// @param[out] Barray +/// Array of tiles of dimension gridDim.x, +/// where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. 
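+/// Only the upper or lower trapezoid selected by uplo is written;
+/// each element is converted from src_scalar_t to dst_scalar_t by the
+/// device copy() helper.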
+/// +/// @param[in] ldb +/// Leading dimension of each tile in Barray. ldb >= m. +/// +template +void tzcopy_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, const sycl::nd_item<3> &item_ct1) +{ + src_scalar_t const *tileA = Aarray[item_ct1.get_group(2)]; + dst_scalar_t *tileB = Barray[item_ct1.get_group(2)]; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + src_scalar_t const* rowA = &tileA[ i ]; + dst_scalar_t* rowB = &tileB[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + copy(rowA[j*lda], rowB[j*ldb]); + } + } + else { + for (int64_t j = n-1; j >= i; --j) { // upper + copy(rowA[j*lda], rowB[j*ldb]); + } + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal copy and precision conversion, +/// copying A to B. Sets upper or lower part of +/// \[ +/// Barray[k] = Aarray[k]. +/// \] +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[out] Barray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Barray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] ldb +/// Leading dimension of each tile in B. ldb >= m. +/// +/// @param[in] batch_count +/// Size of Aarray and Barray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + src_scalar_t const* const* Aarray, int64_t lda, + dst_scalar_t** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + // quick return + if (batch_count == 0) + return; + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1093:170: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + /* + DPCT1049:72: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzcopy_kernel(uplo, m, n, Aarray, lda, Barray, ldb, + item_ct1); + }); + + /* + DPCT1010:171: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + dpct::err0 error = 0; + slate_assert(error == 0); +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. 
+ +// float => float +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// float => double +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + float const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => double +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + double** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +// double => float +template +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + double const* const* Aarray, int64_t lda, + float** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue); + +//------------------------------------------------------------------------------ +// Specializations to cast std::complex => cuComplex. + +// complex-float => complex-float +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::float2 **)Aarray, lda, (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +// complex-float => complex-double +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::float2 **)Aarray, lda, (sycl::double2 **)Barray, + ldb, batch_count, queue); +} + +// complex-double => complex-double +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::double2 **)Aarray, lda, (sycl::double2 **)Barray, + ldb, batch_count, queue); +} + +// complex-double => complex-float +template <> +void tzcopy( + lapack::Uplo uplo, + int64_t m, int64_t n, + std::complex const* const* Aarray, int64_t lda, + std::complex** Barray, int64_t ldb, + int64_t batch_count, blas::Queue &queue) +{ + tzcopy(uplo, m, n, (sycl::double2 **)Aarray, lda, (sycl::float2 **)Barray, + ldb, batch_count, queue); +} + +} // namespace device +} // namespace slate diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp new file mode 100644 index 000000000..61df572f2 --- /dev/null +++ b/src/sycl/device_tzscale.dp.cpp @@ -0,0 +1,197 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include +#include +#include "slate/Exception.hh" +#include "slate/internal/device.hh" + +#include "device_util.dp.hpp" + +#include +#include + +namespace slate { +namespace device { + +//------------------------------------------------------------------------------ +/// Kernel implementing element-wise tile scale. +/// Each thread block deals with one tile. +/// Each thread deals with one row. +/// Launched by gescale(). +/// +/// @param[in] m +/// Number of rows of each tile. m >= 1. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 1. +/// +/// @param[in] numer +/// Scale value numerator. 
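+/// (The factor actually applied is numer / denom, formed in real arithmetic.)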
+/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array of tiles of dimension gridDim.x, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. +/// +/// @param[in] lda +/// Leading dimension of each tile in Aarray. lda >= m. +/// +template +void tzscale_kernel( + lapack::Uplo uplo, + int64_t m, int64_t n, + blas::real_type numer, blas::real_type denom, + scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1) +{ + scalar_t *tileA = Aarray[item_ct1.get_group(2)]; + blas::real_type mul = numer / denom; + + // thread per row, if more rows than threads, loop by blockDim.x + for (int64_t i = item_ct1.get_local_id(2); i < m; + i += item_ct1.get_local_range(2)) { + scalar_t* rowA = &tileA[ i ]; + + if (uplo == lapack::Uplo::Lower) { + for (int64_t j = 0; j <= i && j < n; ++j) { // lower + rowA[j*lda] = rowA[j*lda] * mul; + // rowA[j * lda] = + // dpct_operator_overloading::operator*(rowA[j * lda], mul); + } + } + else { + for (int64_t j = n-1; j >= i; --j) // upper + rowA[j*lda] = rowA[j*lda] * mul; + // rowA[j * lda] = + // dpct_operator_overloading::operator*(rowA[j * lda], mul); + } + } +} + +//------------------------------------------------------------------------------ +/// Batched routine for element-wise trapezoidal tile scale. +/// Sets upper or lower part of +/// \[ +/// Aarray[k] *= (numer / denom). +/// \] +/// This does NOT currently take extra care to avoid over/underflow. +/// +/// @param[in] uplo +/// Whether each Aarray[k] is upper or lower trapezoidal. +/// +/// @param[in] m +/// Number of rows of each tile. m >= 0. +/// +/// @param[in] n +/// Number of columns of each tile. n >= 0. +/// +/// @param[in] numer +/// Scale value numerator. +/// +/// @param[in] denom +/// Scale value denominator. +/// +/// @param[in,out] Aarray +/// Array in GPU memory of dimension batch_count, containing pointers to tiles, +/// where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. +/// +/// @param[in] lda +/// Leading dimension of each tile in A. lda >= m. +/// +/// @param[in] batch_count +/// Size of Aarray. batch_count >= 0. +/// +/// @param[in] queue +/// BLAS++ queue to execute in. +/// +template +void tzscale( + lapack::Uplo uplo, + int64_t m, int64_t n, + blas::real_type numer, blas::real_type denom, + scalar_t** Aarray, int64_t lda, + int64_t batch_count, blas::Queue& queue) +{ + // quick return + if (batch_count == 0) + return; + + /* + DPCT1093:132: The "queue.device()" device may be not the one intended for + use. Adjust the selected device if needed. + */ + dpct::select_device(queue.device()); + + // Max threads/block=1024 for current CUDA compute capability (<= 7.5) + int64_t nthreads = std::min( int64_t( 1024 ), m ); + + /* + DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + ((sycl::queue *)(&queue.stream())) + ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) * + sycl::range<3>(1, 1, nthreads), + sycl::range<3>(1, 1, nthreads)), + [=](sycl::nd_item<3> item_ct1) { + tzscale_kernel(uplo, m, n, numer, denom, Aarray, lda, + item_ct1); + }); + + /* + DPCT1010:133: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. 
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float numer, float denom, float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue);
+
+template
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double numer, double denom, double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue);
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
+template <>
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float numer, float denom,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue)
+{
+    tzscale(uplo, m, n, numer, denom, (sycl::float2 **)Aarray, lda, batch_count,
+            queue);
+}
+
+template <>
+void tzscale(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double numer, double denom,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue)
+{
+    tzscale(uplo, m, n, numer, denom, (sycl::double2 **)Aarray, lda,
+            batch_count, queue);
+}
+
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_tzset.dp.cpp b/src/sycl/device_tzset.dp.cpp
new file mode 100644
index 000000000..932d1c901
--- /dev/null
+++ b/src/sycl/device_tzset.dp.cpp
@@ -0,0 +1,329 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include "slate/Exception.hh"
+#include "slate/internal/device.hh"
+
+#include "device_util.dp.hpp"
+#include <cstdio>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// Device function implementing element-wise tile set.
+/// Each thread block deals with one tile. gridDim.x == batch_count.
+/// Each thread deals with one row.
+/// Called by tzset_kernel and tzset_batch_kernel.
+///
+/// @copydoc tzset
+///
+template <typename scalar_t>
+void tzset_func(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    // thread per row, if more rows than threads, loop by blockDim.x
+    for (int i = item_ct1.get_local_id(2); i < m;
+         i += item_ct1.get_local_range(2)) {
+        scalar_t* rowA = &A[ i ];
+
+        if (uplo == lapack::Uplo::Lower) {
+            for (int64_t j = 0; j <= i && j < n; ++j) { // lower
+                rowA[ j*lda ] = i == j ? diag_value : offdiag_value;
+            }
+        }
+        else {
+            for (int64_t j = n-1; j >= i; --j) { // upper
+                rowA[ j*lda ] = i == j ? diag_value : offdiag_value;
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise tile set.
+/// @copydoc tzset
+template <typename scalar_t>
+void tzset_kernel(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t* A, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    tzset_func(uplo, m, n, offdiag_value, diag_value, A, lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Kernel implementing element-wise batched tile set.
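+/// Each work-group handles one tile of the batch; the group index selects
+/// Aarray[k].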
+/// @copydoc tzset_batch
+template <typename scalar_t>
+void tzset_batch_kernel(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t offdiag_value,
+    scalar_t diag_value,
+    scalar_t** Aarray, int64_t lda, const sycl::nd_item<3> &item_ct1)
+{
+    tzset_func(uplo, m, n, offdiag_value, diag_value,
+               Aarray[item_ct1.get_group(2)], lda, item_ct1);
+}
+
+//------------------------------------------------------------------------------
+/// Element-wise trapezoidal tile set.
+/// Sets upper or lower part of A to
+/// diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] uplo
+///     Whether A is upper or lower trapezoidal.
+///
+/// @param[in] m
+///     Number of rows of A. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of A. n >= 0.
+///
+/// @param[in] offdiag_value
+///     Constant to set offdiagonal entries to.
+///
+/// @param[in] diag_value
+///     Constant to set diagonal entries to.
+///
+/// @param[out] A
+///     An m-by-n matrix stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of A. lda >= m.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t* A, int64_t lda,
+    blas::Queue& queue )
+{
+    /*
+    DPCT1093:166: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:70: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           tzset_kernel(uplo, m, n, offdiag_value, diag_value,
+                                        A, lda, item_ct1);
+                       });
+
+    /*
+    DPCT1010:167: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float* A, int64_t lda,
+    blas::Queue& queue );
+
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double* A, int64_t lda,
+    blas::Queue& queue );
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
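+//
+// Note: these casts assume std::complex<T> and sycl::vec<T, 2> share the same
+// object layout; a compile-time check (sketch only, not part of this patch)
+// could be:
+//     static_assert( sizeof(std::complex<double>) == sizeof(sycl::double2),
+//                    "std::complex<double> and sycl::double2 must match" );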
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>* A, int64_t lda,
+    blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)), (sycl::float2 *)A,
+          lda, queue);
+}
+
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>* A, int64_t lda,
+    blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)), (sycl::double2 *)A,
+          lda, queue);
+}
+
+//==============================================================================
+namespace batch {
+
+//------------------------------------------------------------------------------
+/// Batched routine for element-wise trapezoidal tile set.
+/// Sets upper or lower part of Aarray[k] to
+/// diag_value on the diagonal and offdiag_value on the off-diagonals.
+///
+/// @param[in] uplo
+///     Whether each Aarray[k] is upper or lower trapezoidal.
+///
+/// @param[in] m
+///     Number of rows of each tile. m >= 0.
+///
+/// @param[in] n
+///     Number of columns of each tile. n >= 0.
+///
+/// @param[in] offdiag_value
+///     Constant to set offdiagonal entries to.
+///
+/// @param[in] diag_value
+///     Constant to set diagonal entries to.
+///
+/// @param[out] Aarray
+///     Array in GPU memory of dimension batch_count, containing
+///     pointers to tiles, where each Aarray[k] is an m-by-n matrix
+///     stored in an lda-by-n array in GPU memory.
+///
+/// @param[in] lda
+///     Leading dimension of each tile in Aarray. lda >= m.
+///
+/// @param[in] batch_count
+///     Size of Aarray. batch_count >= 0.
+///
+/// @param[in] queue
+///     BLAS++ queue to execute in.
+///
+template <typename scalar_t>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    scalar_t const& offdiag_value,
+    scalar_t const& diag_value,
+    scalar_t** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    // quick return
+    if (batch_count == 0)
+        return;
+
+    /*
+    DPCT1093:168: The "queue.device()" device may be not the one intended for
+    use. Adjust the selected device if needed.
+    */
+    dpct::select_device(queue.device());
+
+    // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
+    int64_t nthreads = std::min( int64_t( 1024 ), m );
+
+    /*
+    DPCT1049:71: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    ((sycl::queue *)(&queue.stream()))
+        ->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, batch_count) *
+                                             sycl::range<3>(1, 1, nthreads),
+                                         sycl::range<3>(1, 1, nthreads)),
+                       [=](sycl::nd_item<3> item_ct1) {
+                           tzset_batch_kernel(uplo, m, n, offdiag_value,
+                                              diag_value, Aarray, lda,
+                                              item_ct1);
+                       });
+
+    /*
+    DPCT1010:169: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    dpct::err0 error = 0;
+    slate_assert(error == 0);
+}
+
+//------------------------------------------------------------------------------
+// Explicit instantiations.
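+//
+// Usage sketch (illustration only; mb, nb, d_tiles, ldda, batch_count, and
+// queue are assumed to be defined by the caller). Setting each
+// lower-trapezoidal double tile to identity:
+//     slate::device::batch::tzset( lapack::Uplo::Lower, mb, nb,
+//                                  0.0, 1.0, d_tiles, ldda,
+//                                  batch_count, queue );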
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    float const& offdiag_value,
+    float const& diag_value,
+    float** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue );
+
+template
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    double const& offdiag_value,
+    double const& diag_value,
+    double** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue );
+
+//------------------------------------------------------------------------------
+// Specializations to cast std::complex => sycl::float2 / sycl::double2.
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<float> const& offdiag_value,
+    std::complex<float> const& diag_value,
+    std::complex<float>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::float2(real(offdiag_value), imag(offdiag_value)),
+          sycl::float2(real(diag_value), imag(diag_value)),
+          (sycl::float2 **)Aarray, lda, batch_count, queue);
+}
+
+template <>
+void tzset(
+    lapack::Uplo uplo,
+    int64_t m, int64_t n,
+    std::complex<double> const& offdiag_value,
+    std::complex<double> const& diag_value,
+    std::complex<double>** Aarray, int64_t lda,
+    int64_t batch_count, blas::Queue& queue )
+{
+    tzset(uplo, m, n, sycl::double2(real(offdiag_value), imag(offdiag_value)),
+          sycl::double2(real(diag_value), imag(diag_value)),
+          (sycl::double2 **)Aarray, lda, batch_count, queue);
+}
+
+} // namespace batch
+} // namespace device
+} // namespace slate
diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp
new file mode 100644
index 000000000..6a3ded79c
--- /dev/null
+++ b/src/sycl/device_util.dp.hpp
@@ -0,0 +1,1342 @@
+// Copyright (c) 2017-2022, University of Tennessee. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+// This program is free software: you can redistribute it and/or modify it under
+// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
+
+#ifndef SLATE_DEVICE_UTIL_DP_HPP
+#define SLATE_DEVICE_UTIL_DP_HPP
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <cstdio>
+
+namespace slate {
+namespace device {
+
+//------------------------------------------------------------------------------
+/// max that propagates nan consistently:
+///     max_nan( 1, nan ) = nan
+///     max_nan( nan, 1 ) = nan
+template <typename real_t>
+
+inline real_t max_nan(real_t x, real_t y)
+{
+    return (sycl::isnan(y) || (y) >= (x) ? (y) : (x));
+}
+
+//------------------------------------------------------------------------------
+/// Max reduction of n-element array x, leaving total in x[0]. Propagates NaN
+/// values consistently.
+/// With k threads, can reduce array up to 2*k in size. Assumes number of
+/// threads <= 1024, which is the current max number of CUDA threads.
+///
+/// @param[in] n
+///     Size of array.
+///
+/// @param[in] tid
+///     Thread id.
+///
+/// @param[in] x
+///     Array of dimension n. On exit, x[0] = max(x[0], ..., x[n-1]);
+///     the rest of x is overwritten.
+///
+template <typename real_t>
+
+void max_nan_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
+{
+    /*
+    DPCT1065:0: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 1024) {
+        if (tid < 1024 && tid + 1024 < n) {
+            x[tid] = max_nan(x[tid], x[tid + 1024]);
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:1: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+ */ + if (n > 512) { + if (tid < 512 && tid + 512 < n) { + x[tid] = max_nan(x[tid], x[tid + 512]); + } item_ct1.barrier(); + } + /* + DPCT1065:2: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 256) { + if (tid < 256 && tid + 256 < n) { + x[tid] = max_nan(x[tid], x[tid + 256]); + } item_ct1.barrier(); + } + /* + DPCT1065:3: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 128) { + if (tid < 128 && tid + 128 < n) { + x[tid] = max_nan(x[tid], x[tid + 128]); + } item_ct1.barrier(); + } + /* + DPCT1065:4: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 64) { + if (tid < 64 && tid + 64 < n) { + x[tid] = max_nan(x[tid], x[tid + 64]); + } item_ct1.barrier(); + } + /* + DPCT1065:5: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 32) { + if (tid < 32 && tid + 32 < n) { + x[tid] = max_nan(x[tid], x[tid + 32]); + } item_ct1.barrier(); + } + /* + DPCT1065:6: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 16) { + if (tid < 16 && tid + 16 < n) { + x[tid] = max_nan(x[tid], x[tid + 16]); + } item_ct1.barrier(); + } + /* + DPCT1065:7: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 8) { + if (tid < 8 && tid + 8 < n) { + x[tid] = max_nan(x[tid], x[tid + 8]); + } item_ct1.barrier(); + } + /* + DPCT1065:8: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 4) { + if (tid < 4 && tid + 4 < n) { + x[tid] = max_nan(x[tid], x[tid + 4]); + } item_ct1.barrier(); + } + /* + DPCT1065:9: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 2) { + if (tid < 2 && tid + 2 < n) { + x[tid] = max_nan(x[tid], x[tid + 2]); + } item_ct1.barrier(); + } + /* + DPCT1065:10: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + if (n > 1) { + if (tid < 1 && tid + 1 < n) { + x[tid] = max_nan(x[tid], x[tid + 1]); + } item_ct1.barrier(); + } +} + +//------------------------------------------------------------------------------ +/// Sum reduction of n-element array x, leaving total in x[0]. +/// With k threads, can reduce array up to 2*k in size. Assumes number of +/// threads <= 1024 (which is current max number of CUDA threads). +/// +/// @param[in] n +/// Size of array. +/// +/// @param[in] tid +/// Thread id. +/// +/// @param[in] x +/// Array of dimension n. On exit, x[0] = sum(x[0], ..., x[n-1]); +/// rest of x is overwritten. 
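+///     For example, with 512 threads an array of up to n = 1024 elements is
+///     reduced: the first active pass adds x[tid + 512] into x[tid], and each
+///     later pass halves the active range.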
+///
+template <typename real_t>
+
+void sum_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
+{
+    /*
+    DPCT1065:11: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 1024) {
+        if (tid < 1024 && tid + 1024 < n) {
+            x[tid] += x[tid + 1024];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:12: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 512) {
+        if (tid < 512 && tid + 512 < n) {
+            x[tid] += x[tid + 512];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:13: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 256) {
+        if (tid < 256 && tid + 256 < n) {
+            x[tid] += x[tid + 256];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:14: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 128) {
+        if (tid < 128 && tid + 128 < n) {
+            x[tid] += x[tid + 128];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:15: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 64) {
+        if (tid < 64 && tid + 64 < n) {
+            x[tid] += x[tid + 64];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:16: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 32) {
+        if (tid < 32 && tid + 32 < n) {
+            x[tid] += x[tid + 32];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:17: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 16) {
+        if (tid < 16 && tid + 16 < n) {
+            x[tid] += x[tid + 16];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:18: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 8) {
+        if (tid < 8 && tid + 8 < n) {
+            x[tid] += x[tid + 8];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:19: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 4) {
+        if (tid < 4 && tid + 4 < n) {
+            x[tid] += x[tid + 4];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:20: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    if (n > 2) {
+        if (tid < 2 && tid + 2 < n) {
+            x[tid] += x[tid + 2];
+        } item_ct1.barrier();
+    }
+    /*
+    DPCT1065:21: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+ */ + if (n > 1) { + if (tid < 1 && tid + 1 < n) { + x[tid] += x[tid + 1]; + } item_ct1.barrier(); + } +} + +//============================================================================== +// real, imag, conj. + +#ifdef DPCT_COMPATIBILITY_TEMP + +// CUDA doesn't provide real, imag, conj. +inline double real(sycl::double2 x) { return x.x(); } +inline float real(sycl::float2 x) { return x.x(); } + +inline double imag(sycl::double2 x) { return x.y(); } +inline float imag(sycl::float2 x) { return x.y(); } + +inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); } +inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); } + +#else + +__host__ __device__ inline double real( rocblas_double_complex x ) { return x.real(); } +__host__ __device__ inline float real( rocblas_float_complex x ) { return x.real(); } + +__host__ __device__ inline double imag( rocblas_double_complex x ) { return x.imag(); } +__host__ __device__ inline float imag( rocblas_float_complex x ) { return x.imag(); } + +__host__ __device__ inline rocblas_double_complex conj( rocblas_double_complex x ) { return { x.real(), -x.imag() }; } +__host__ __device__ inline rocblas_float_complex conj( rocblas_float_complex x ) { return { x.real(), -x.imag() }; } + +#endif + +//---------------------------------------- +// Overloads for real numbers. + +/// @return real component of complex number x; x for real number. +/// @ingroup complex +inline double real( double x ) { return x; } +inline float real( float x ) { return x; } + +/// @return imaginary component of complex number x; 0 for real number. +/// @ingroup complex +inline double imag( double x ) { return 0; } +inline float imag( float x ) { return 0; } + +/// @return conjugate of complex number x; x for real number. +/// @ingroup complex +inline double conj( double x ) { return x; } +inline float conj( float x ) { return x; } + +//------------------------------------------------------------------------------ +/// Overloaded versions of absolute value on device. + +inline float abs(float x) +{ + return sycl::fabs(x); +} + +//---------------------------------------- + +inline double abs(double x) +{ + return sycl::fabs(x); +} + +//---------------------------------------- + +inline float abs(sycl::float2 x) +{ +#ifdef DPCT_COMPATIBILITY_TEMP + // CUDA has a good implementation. + return dpct::cabs(x); +#else + // For HIP, use our implementation that scales per LAPACK. + float a = real( x ); + float b = imag( x ); + float z, w, t; + if (isnan( a )) { + return a; + } + else if (isnan( b )) { + return b; + } + else { + a = fabsf(a); + b = fabsf(b); + w = max(a, b); + z = min(a, b); + if (z == 0) { + t = w; + } + else { + t = z/w; + t = 1 + t*t; + t = w * sqrtf(t); + } + return t; + } +#endif +} + +//---------------------------------------- + +inline double abs(sycl::double2 x) +{ +#ifdef DPCT_COMPATIBILITY_TEMP + // CUDA has a good implementation. + return dpct::cabs(x); +#else + // For HIP, use our implementation that scales per LAPACK. + double a = real( x ); + double b = imag( x ); + double z, w, t; + if (isnan( a )) { + return a; + } + else if (isnan( b )) { + return b; + } + else { + a = fabs(a); + b = fabs(b); + w = max(a, b); + z = min(a, b); + if (z == 0) { + t = w; + } + else { + t = z/w; + t = 1.0 + t*t; + t = w * sqrt(t); + } + return t; + } +#endif +} + +//------------------------------------------------------------------------------ +/// Square of number. 
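+/// Used by combine_sumsq() and add_sumsq() below, e.g.,
+/// sumsq1 + sumsq2 * sqr( scale2 / scale1 ).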
+/// @return x^2
+template <typename scalar_t>
+
+inline scalar_t sqr(scalar_t x)
+{
+    return x*x;
+}
+
+//------------------------------------------------------------------------------
+/// Adds two scaled, sum-of-squares representations.
+/// On exit, scale1 and sumsq1 are updated such that:
+///     scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2.
+template <typename real_t>
+
+void combine_sumsq(
+    real_t& scale1, real_t& sumsq1,
+    real_t scale2, real_t sumsq2 )
+{
+    if (scale1 > scale2) {
+        sumsq1 = sumsq1 + sumsq2*sqr(scale2 / scale1);
+        // scale1 stays same
+    }
+    else if (scale2 != 0) {
+        sumsq1 = sumsq1*sqr(scale1 / scale2) + sumsq2;
+        scale1 = scale2;
+    }
+}
+
+//------------------------------------------------------------------------------
+/// Adds new value to scaled, sum-of-squares representation.
+/// On exit, scale and sumsq are updated such that:
+///     scale^2 sumsq := scale^2 sumsq + (absx)^2
+template <typename real_t>
+
+void add_sumsq(
+    real_t& scale, real_t& sumsq,
+    real_t absx)
+{
+    if (scale < absx) {
+        sumsq = 1 + sumsq * sqr(scale / absx);
+        scale = absx;
+    }
+    else if (scale != 0) {
+        sumsq = sumsq + sqr(absx / scale);
+    }
+}
+
+//------------------------------------------------------------------------------
+/// @return ceil( x / y ), for integer type T.
+template <typename T>
+
+inline constexpr T ceildiv(T x, T y)
+{
+    return T((x + y - 1) / y);
+}
+
+//------------------------------------------------------------------------------
+/// @return ceil( x / y )*y, i.e., x rounded up to next multiple of y.
+template <typename T>
+
+inline constexpr T roundup(T x, T y)
+{
+    return T((x + y - 1) / y) * y;
+}
+
+//------------------------------------------------------------------------------
+/// Overloaded copy and precision conversion.
+/// Sets b = a, converting from type TA to type TB.
+template <typename TA, typename TB>
+
+inline void copy(TA a, TB& b)
+{
+    b = a;
+}
+
+/// Sets b = a, converting from complex-float to complex-double.
+
+inline void copy(sycl::float2 a, sycl::double2 &b)
+{
+    b = sycl::double2(real(a), imag(a));
+}
+
+/// Sets b = a, converting from complex-double to complex-float.
+
+inline void copy(sycl::double2 a, sycl::float2 &b)
+{
+    b = sycl::float2(real(a), imag(a));
+}
+
+/// Sets b = a, converting from float to complex-float.
+
+inline void copy(float a, sycl::float2 &b)
+{
+    b = sycl::float2(a, 0);
+}
+
+/// Sets b = a, converting from double to complex-double.
+
+inline void copy(double a, sycl::double2 &b)
+{
+    b = sycl::double2(a, 0);
+}
+
+//==============================================================================
+// CUDA doesn't provide operators, so define our own.
+// rocBLAS provides operators.
+//
+// complex-double
+
+#if defined( BLAS_HAVE_SYCL )
+
+// ---------- negate
+/*
+DPCT1011:83: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::double2 operator-(const sycl::double2 &a)
+{
+    return sycl::double2(-real(a), -imag(a));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:84: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) + real(b), imag(a) + imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:85: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) + s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:86: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator+(const double s, const sycl::double2 b) +{ + return sycl::double2(s + real(b), imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:87: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:88: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator += (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) + s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- subtract +/* +DPCT1011:89: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) - real(b), imag(a) - imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:90: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) - s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:91: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator-(const double s, const sycl::double2 b) +{ + return sycl::double2(s - real(b), -imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:92: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:93: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator -= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) - s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- multiply +/* +DPCT1011:94: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) +{ + return sycl::double2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:95: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:96: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const sycl::double2 a, const float s) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:97: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator*(const double s, const sycl::double2 a) +{ + return sycl::double2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:98: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) +{ + a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:99: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator *= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) * s, imag(a) * s); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- divide +/* From LAPACK DLADIV + * Performs complex division in real arithmetic, avoiding unnecessary overflow. + * + * a + i*b + * p + i*q = --------- + * c + i*d + */ +/* +DPCT1011:100: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) +{ + double a = real(x); + double b = imag(x); + double c = real(y); + double d = imag(y); + double e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = ( a + b*e ) / f; + q = ( b - a*e ) / f; + } + else { + e = c / d; + f = d + c*e; + p = ( b + a*e ) / f; + q = ( -a + b*e ) / f; + } + return sycl::double2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:101: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const sycl::double2 a, const double s) +{ + return sycl::double2(real(a) / s, imag(a) / s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:102: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 operator/(const double a, const sycl::double2 y) +{ + double c = real(y); + double d = imag(y); + double e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = a / f; + q = -a*e / f; + } + else { + e = c / d; + f = d + c*e; + p = a*e / f; + q = -a / f; + } + return sycl::double2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:103: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) +{ + a = dpct_operator_overloading::operator/(a, b); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:104: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::double2 &operator /= (sycl::double2 &a, const double s) +{ + a = sycl::double2(real(a) / s, imag(a) / s); + return a; +} +} // namespace dpct_operator_overloading + +//============================================================================== +// complex-float + +// ---------- negate +/* +DPCT1011:105: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 &a) +{ + return sycl::float2(-real(a), -imag(a)); +} +} // namespace dpct_operator_overloading + +// ---------- add +/* +DPCT1011:106: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) + real(b), imag(a) + imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:107: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) + s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:108: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator+(const float s, const sycl::float2 b) +{ + return sycl::float2(s + real(b), imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:109: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:110: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator += (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) + s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- subtract +/* +DPCT1011:111: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) - real(b), imag(a) - imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:112: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) - s, imag(a)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:113: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator-(const float s, const sycl::float2 b) +{ + return sycl::float2(s - real(b), -imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:114: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:115: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator -= (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) - s, imag(a)); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- multiply +/* +DPCT1011:116: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) +{ + return sycl::float2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:117: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:118: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator*(const float s, const sycl::float2 a) +{ + return sycl::float2(real(a) * s, imag(a) * s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:119: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) +{ + a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), + imag(a) * real(b) + real(a) * imag(b)); + return a; +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:120: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 &operator *= (sycl::float2 &a, const float s) +{ + a = sycl::float2(real(a) * s, imag(a) * s); + return a; +} +} // namespace dpct_operator_overloading + +// ---------- divide +/* From LAPACK DLADIV + * Performs complex division in real arithmetic, avoiding unnecessary overflow. + * + * a + i*b + * p + i*q = --------- + * c + i*d + */ +/* +DPCT1011:121: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) +{ + float a = real(x); + float b = imag(x); + float c = real(y); + float d = imag(y); + float e, f, p, q; + if (abs( d ) < abs( c )) { + e = d / c; + f = c + d*e; + p = ( a + b*e ) / f; + q = ( b - a*e ) / f; + } + else { + e = c / d; + f = d + c*e; + p = ( b + a*e ) / f; + q = ( -a + b*e ) / f; + } + return sycl::float2(p, q); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:122: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. +*/ +namespace dpct_operator_overloading { + +inline sycl::float2 operator/(const sycl::float2 a, const float s) +{ + return sycl::float2(real(a) / s, imag(a) / s); +} +} // namespace dpct_operator_overloading + +/* +DPCT1011:123: The tool detected overloaded operators for built-in vector types, +which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +standard operators instead. 
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 operator/(const float a, const sycl::float2 y)
+{
+    float c = real(y);
+    float d = imag(y);
+    float e, f, p, q;
+    if (abs( d ) < abs( c )) {
+        e = d / c;
+        f = c + d*e;
+        p =  a / f;
+        q = -a*e / f;
+    }
+    else {
+        e = c / d;
+        f = d + c*e;
+        p =  a*e / f;
+        q = -a / f;
+    }
+    return sycl::float2(p, q);
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:124: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b)
+{
+    a = dpct_operator_overloading::operator/(a, b);
+    return a;
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:125: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline sycl::float2 &operator /= (sycl::float2 &a, const float s)
+{
+    a = sycl::float2(real(a) / s, imag(a) / s);
+    return a;
+}
+} // namespace dpct_operator_overloading
+
+// ---------- equality
+/*
+DPCT1011:126: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const sycl::float2 a, const sycl::float2 b)
+{
+    return ( real(a) == real(b) &&
+             imag(a) == imag(b) );
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:127: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const sycl::float2 a, const float s)
+{
+    return ( real(a) == s &&
+             imag(a) == 0. );
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:128: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator == (const float s, const sycl::float2 a)
+{
+    return ( real(a) == s &&
+             imag(a) == 0. );
+}
+} // namespace dpct_operator_overloading
+
+// ---------- not equality
+/*
+DPCT1011:129: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const sycl::float2 a, const sycl::float2 b)
+{
+    return !(dpct_operator_overloading::operator == (a, b));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:130: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const sycl::float2 a, const float s)
+{
+    return !(dpct_operator_overloading::operator == (a, s));
+}
+} // namespace dpct_operator_overloading
+
+/*
+DPCT1011:131: The tool detected overloaded operators for built-in vector types,
+which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+standard operators instead.
+*/
+namespace dpct_operator_overloading {
+
+inline bool operator != (const float s, const sycl::float2 a)
+{
+    return !(dpct_operator_overloading::operator == (a, s));
+}
+} // namespace dpct_operator_overloading
+
+#endif // BLAS_HAVE_SYCL
+
+} // namespace device
+} // namespace slate
+
+#endif // SLATE_DEVICE_UTIL_DP_HPP

From 41cbf01fe58c18f1e5cd6e3f2a7dd2e7bec9e8c9 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 15 Nov 2023 23:49:48 +0000
Subject: [PATCH 02/10] Build for sycl using sycl/device_ kernels (instead of
 omptarget).

---
 GNUmakefile | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index aae7fa9e1..01e95f38a 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -147,9 +147,8 @@ omptarget = 0
 ifneq ($(cuda),1)
 ifneq ($(hip),1)
     ifeq (${gpu_backend},sycl)
-        # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
-        $(info Note: enabling omp-target-offload kernels)
-        omptarget = 1
+        # enable the SYCL kernels in SLATE for oneMKL-SYCL devices
+        sycl = 1
 
         # -Wno-unused-command-line-argument avoids
         # icpx warning: -Wl,-rpath,...: 'linker' input unused.
@@ -163,7 +162,18 @@ ifneq ($(hip),1)
         CXXFLAGS += -fsycl -fp-model=precise -Wno-unused-command-line-argument \
                     -Wno-c99-extensions -Wno-pass-failed
         LIBS += -lsycl
-    endif
+
+        # How should the SLATE kernels be compiled?
+        ifeq (${sycl_kernels},omptarget)  # src/omptarget kernels
+            # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
+            omptarget = 1
+        else  # src/sycl kernels - default/fall-through option
+            sycl_kernels = 1
+            CXXFLAGS += -fsycl-unnamed-lambda  # allow unnamed sycl lambda kernels
+            LDFLAGS += -fsycl -fsycl-unnamed-lambda  # allow unnamed sycl lambda kernels
+        endif
+
+    endif
 endif
 endif
 
@@ -210,8 +220,13 @@ endif
 ifeq ($(openmp),1)
     ifeq (${gpu_backend},sycl)
         # Intel icpx options for OpenMP offload.
-        CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-        LDFLAGS += -fiopenmp -fopenmp-targets=spir64
+        CXXFLAGS += -fiopenmp
+        LDFLAGS += -fiopenmp
+        ifeq (${omptarget},1)
+            # If SYCL + OpenMP-offload-kernels, specify omp device type
+            CXXFLAGS += -fopenmp-targets=spir64
+            LDFLAGS += -fopenmp-targets=spir64
+        endif
    else
        # Most other compilers recognize this.
CXXFLAGS += -fopenmp @@ -542,6 +557,10 @@ cuda_hdr := \ hip_src := $(patsubst src/cuda/%.cu,src/hip/%.hip.cc,$(cuda_src)) hip_hdr := $(patsubst src/cuda/%.cuh,src/hip/%.hip.hh,$(cuda_hdr)) +# SYCL implementations of device kernels +sycl_kernels_src := $(patsubst src/cuda/%.cu,src/sycl/%.dp.cpp,$(cuda_src)) +sycl_kernels_hdr := $(patsubst src/cuda/%.cuh,src/sycl/%.dp.hpp,$(cuda_hdr)) + # OpenMP implementations of device kernels omptarget_src := \ src/omptarget/device_geadd.cc \ @@ -564,6 +583,8 @@ ifeq (${cuda},1) libslate_src += ${cuda_src} else ifeq (${hip},1) libslate_src += ${hip_src} +else ifeq ($(sycl_kernels),1) + libslate_src += $(sycl_kernels_src) else # Used for both OpenMP offload (${omptarget} == 1) and as stubs for # CPU-only build. @@ -1322,6 +1343,9 @@ hooks: ${hooks} %.hip.o: %.hip.cc | $(hip_hdr) $(HIPCC) $(HIPCCFLAGS) -c $< -o $@ +%.dp.o: %.dp.cpp | $(sycl_kernels_hdr) + $(CXX) $(CXXFLAGS) -c $< -o $@ + %.o: %.cc $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -1462,6 +1486,12 @@ echo: @echo "---------- OMP target-offload kernel options" @echo "omptarget = '${omptarget}'" @echo "omptarget_src = ${omptarget_src}" + @echo "omptarget_hdr = ${omptarget_hdr}" + @echo + @echo "---------- SYCL device kernels" + @echo "sycl_kernels = '$(sycl_kernels)'" + @echo "sycl_kernels_src = '$(sycl_kernels_src)'" + @echo "sycl_kernels_hdr = '$(sycl_kernels_hdr)'" @echo @echo "---------- Fortran compiler" @echo "FC = $(FC)" From c47b9e8287c03b687c382336c41f9e33745fa920 Mon Sep 17 00:00:00 2001 From: Asim YarKhan Date: Thu, 16 Nov 2023 21:27:41 +0000 Subject: [PATCH 03/10] Update sycl-kernels to handle multiplication, need for complex (axpby, multiply_ab). --- src/sycl/device_geadd.dp.cpp | 5 +- src/sycl/device_gescale.dp.cpp | 3 +- src/sycl/device_gescale_row_col.dp.cpp | 9 +- src/sycl/device_tzadd.dp.cpp | 10 +- src/sycl/device_tzscale.dp.cpp | 8 +- src/sycl/device_util.dp.hpp | 1653 ++++++++++++------------ 6 files changed, 851 insertions(+), 837 deletions(-) diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp index 3344ec0b3..ebeddb9ed 100644 --- a/src/sycl/device_geadd.dp.cpp +++ b/src/sycl/device_geadd.dp.cpp @@ -54,10 +54,7 @@ void geadd_func( scalar_t* rowB = &B[ i ]; for (int64_t j = 0; j < n; ++j) - rowB[ j*ldb ] = (alpha * rowA[ j*lda ]) + (beta * rowB[ j*ldb ]); - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[ j*ldb ] = axpby( alpha, rowA[ j*lda ], beta, rowB[ j*ldb ] ); } } diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp index f76343c7f..afe24370a 100644 --- a/src/sycl/device_gescale.dp.cpp +++ b/src/sycl/device_gescale.dp.cpp @@ -35,8 +35,7 @@ void gescale_func( i += item_ct1.get_local_range(2)) { scalar_t* rowA = &A[ i ]; for (int64_t j = 0; j < n; ++j) - rowA[ j*lda ] = rowA[ j*lda ] * mul; - // rowA[j * lda] = dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[ j*lda ] = multiply_ax( mul, rowA[ j*lda ] ); } } diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp index ffb727c4d..a4b145d46 100644 --- a/src/sycl/device_gescale_row_col.dp.cpp +++ b/src/sycl/device_gescale_row_col.dp.cpp @@ -57,11 +57,10 @@ void gescale_row_col_batch_kernel( i += item_ct1.get_local_range(2)) { scalar_t* rowA = &tileA[ i ]; scalar_t2 ri = R[ i ]; - for (int64_t j = 0; j < n; ++j) - rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); - // rowA[j * lda] = 
dpct_operator_overloading::operator*( - // rowA[j * lda], - // dpct_operator_overloading::operator*(ri, C[j]))); + for (int64_t j = 0; j < n; ++j) { + rowA[ j*lda ] = multiply_ax( multiply_ax(ri, C[ j ]), rowA[ j*lda ] ); + // rowA[ j*lda ] = rowA[ j*lda ] * (ri * C[ j ]); + } } } diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp index e588fb4c9..158eb92ab 100644 --- a/src/sycl/device_tzadd.dp.cpp +++ b/src/sycl/device_tzadd.dp.cpp @@ -61,18 +61,12 @@ void tzadd_kernel( if (uplo == lapack::Uplo::Lower) { for (int64_t j = 0; j <= i && j < n; ++j) { // lower - rowB[j*ldb] = alpha * rowA[j*lda] + beta * rowB[ j*ldb ]; - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[j*ldb] = axpby( alpha, rowA[j*lda], beta, rowB[ j*ldb ] ); } } else { for (int64_t j = n-1; j >= i; --j) { // upper - rowB[j*ldb] = alpha * rowA[ j*lda ] + beta * rowB[ j*ldb ]; - // rowB[j * ldb] = dpct_operator_overloading::operator+( - // dpct_operator_overloading::operator*(alpha, rowA[j * lda]), - // dpct_operator_overloading::operator*(beta, rowB[j * ldb])); + rowB[j*ldb] = axpby( alpha, rowA[j*lda], beta, rowB[ j*ldb ] ); } } } diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp index 61df572f2..01bc55ea5 100644 --- a/src/sycl/device_tzscale.dp.cpp +++ b/src/sycl/device_tzscale.dp.cpp @@ -58,16 +58,12 @@ void tzscale_kernel( if (uplo == lapack::Uplo::Lower) { for (int64_t j = 0; j <= i && j < n; ++j) { // lower - rowA[j*lda] = rowA[j*lda] * mul; - // rowA[j * lda] = - // dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[j*lda] = multiply_ax( mul, rowA[j*lda] ); } } else { for (int64_t j = n-1; j >= i; --j) // upper - rowA[j*lda] = rowA[j*lda] * mul; - // rowA[j * lda] = - // dpct_operator_overloading::operator*(rowA[j * lda], mul); + rowA[j*lda] = multiply_ax( mul, rowA[j*lda] ); } } } diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp index 6a3ded79c..8c440ae50 100644 --- a/src/sycl/device_util.dp.hpp +++ b/src/sycl/device_util.dp.hpp @@ -353,7 +353,7 @@ inline double abs(double x) inline float abs(sycl::float2 x) { #ifdef DPCT_COMPATIBILITY_TEMP - // CUDA has a good implementation. + // Use DPCT routine return dpct::cabs(x); #else // For HIP, use our implementation that scales per LAPACK. @@ -389,7 +389,7 @@ inline float abs(sycl::float2 x) inline double abs(sycl::double2 x) { #ifdef DPCT_COMPATIBILITY_TEMP - // CUDA has a good implementation. + // Use DPCT routine return dpct::cabs(x); #else // For HIP, use our implementation that scales per LAPACK. @@ -424,7 +424,6 @@ inline double abs(sycl::double2 x) /// Square of number. /// @return x^2 template - inline scalar_t sqr(scalar_t x) { return x*x; @@ -435,7 +434,6 @@ inline scalar_t sqr(scalar_t x) /// On exit, scale1 and sumsq1 are updated such that: /// scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2. template - void combine_sumsq( real_t& scale1, real_t& sumsq1, real_t scale2, real_t sumsq2 ) @@ -455,7 +453,6 @@ void combine_sumsq( /// On exit, scale and sumsq are updated such that: /// scale^2 sumsq := scale^2 sumsq + (absx)^2 template - void add_sumsq( real_t& scale, real_t& sumsq, real_t absx) @@ -472,7 +469,6 @@ void add_sumsq( //------------------------------------------------------------------------------ /// @return ceil( x / y ), for integer type T. 
template - inline constexpr T ceildiv(T x, T y) { return T((x + y - 1) / y); @@ -481,7 +477,6 @@ inline constexpr T ceildiv(T x, T y) //------------------------------------------------------------------------------ /// @return ceil( x / y )*y, i.e., x rounded up to next multiple of y. template - inline constexpr T roundup(T x, T y) { return T((x + y - 1) / y) * y; @@ -491,850 +486,884 @@ inline constexpr T roundup(T x, T y) /// Overloaded copy and precision conversion. /// Sets b = a, converting from type TA to type TB. template - inline void copy(TA a, TB& b) { b = a; } /// Sets b = a, converting from complex-float to complex-double. - inline void copy(sycl::float2 a, sycl::double2 &b) { b = sycl::double2(real(a), imag(a)); } /// Sets b = a, converting from complex-double to complex-float. - inline void copy(sycl::double2 a, sycl::float2 &b) { b = sycl::float2(real(a), imag(a)); } /// Sets b = a, converting from float to complex-float. - inline void copy(float a, sycl::float2 &b) { b = sycl::float2(a, 0); } /// Sets b = a, converting from double to complex-double. - inline void copy(double a, sycl::double2 &b) { b = sycl::double2(a, 0); } -//============================================================================== -// CUDA doesn't provide operators, so define our own. -// rocBLAS provides operators. -// -// complex-double - -#if defined( BLAS_HAVE_SYCL ) - -// ---------- negate -/* -DPCT1011:83: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 &a) -{ - return sycl::double2(-real(a), -imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:84: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) + real(b), imag(a) + imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:85: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) + s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:86: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator+(const double s, const sycl::double2 b) -{ - return sycl::double2(s + real(b), imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:87: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). 
The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:88: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator += (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) + s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- subtract -/* -DPCT1011:89: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) - real(b), imag(a) - imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:90: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) - s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:91: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator-(const double s, const sycl::double2 b) -{ - return sycl::double2(s - real(b), -imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:92: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:93: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator -= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) - s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- multiply -/* -DPCT1011:94: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). 
The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) -{ - return sycl::double2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:95: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:96: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const sycl::double2 a, const float s) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:97: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator*(const double s, const sycl::double2 a) -{ - return sycl::double2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:98: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) -{ - a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:99: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator *= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) * s, imag(a) * s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- divide -/* From LAPACK DLADIV - * Performs complex division in real arithmetic, avoiding unnecessary overflow. - * - * a + i*b - * p + i*q = --------- - * c + i*d - */ -/* -DPCT1011:100: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) -{ - double a = real(x); - double b = imag(x); - double c = real(y); - double d = imag(y); - double e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = ( a + b*e ) / f; - q = ( b - a*e ) / f; - } - else { - e = c / d; - f = d + c*e; - p = ( b + a*e ) / f; - q = ( -a + b*e ) / f; - } - return sycl::double2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:101: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const sycl::double2 a, const double s) -{ - return sycl::double2(real(a) / s, imag(a) / s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:102: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 operator/(const double a, const sycl::double2 y) -{ - double c = real(y); - double d = imag(y); - double e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = a / f; - q = -a*e / f; - } - else { - e = c / d; - f = d + c*e; - p = a*e / f; - q = -a / f; - } - return sycl::double2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:103: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) -{ - a = dpct_operator_overloading::operator/(a, b); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:104: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::double2 &operator /= (sycl::double2 &a, const double s) -{ - a = sycl::double2(real(a) / s, imag(a) / s); - return a; -} -} // namespace dpct_operator_overloading - -//============================================================================== -// complex-float - -// ---------- negate -/* -DPCT1011:105: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 &a) -{ - return sycl::float2(-real(a), -imag(a)); -} -} // namespace dpct_operator_overloading - -// ---------- add -/* -DPCT1011:106: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) + real(b), imag(a) + imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:107: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) + s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:108: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator+(const float s, const sycl::float2 b) -{ - return sycl::float2(s + real(b), imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:109: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:110: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator += (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) + s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- subtract -/* -DPCT1011:111: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) - real(b), imag(a) - imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:112: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) - s, imag(a)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:113: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator-(const float s, const sycl::float2 b) -{ - return sycl::float2(s - real(b), -imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:114: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:115: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator -= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) - s, imag(a)); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- multiply -/* -DPCT1011:116: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) -{ - return sycl::float2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:117: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:118: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator*(const float s, const sycl::float2 a) -{ - return sycl::float2(real(a) * s, imag(a) * s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:119: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) -{ - a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), - imag(a) * real(b) + real(a) * imag(b)); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:120: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. 
Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator *= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) * s, imag(a) * s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- divide -/* From LAPACK DLADIV - * Performs complex division in real arithmetic, avoiding unnecessary overflow. - * - * a + i*b - * p + i*q = --------- - * c + i*d - */ -/* -DPCT1011:121: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) -{ - float a = real(x); - float b = imag(x); - float c = real(y); - float d = imag(y); - float e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = ( a + b*e ) / f; - q = ( b - a*e ) / f; - } - else { - e = c / d; - f = d + c*e; - p = ( b + a*e ) / f; - q = ( -a + b*e ) / f; - } - return sycl::float2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:122: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const sycl::float2 a, const float s) -{ - return sycl::float2(real(a) / s, imag(a) / s); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:123: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 operator/(const float a, const sycl::float2 y) -{ - float c = real(y); - float d = imag(y); - float e, f, p, q; - if (abs( d ) < abs( c )) { - e = d / c; - f = c + d*e; - p = a / f; - q = -a*e / f; - } - else { - e = c / d; - f = d + c*e; - p = a*e / f; - q = -a / f; - } - return sycl::float2(p, q); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:124: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b) -{ - a = dpct_operator_overloading::operator/(a, b); - return a; -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:125: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { - -inline sycl::float2 &operator /= (sycl::float2 &a, const float s) -{ - a = sycl::float2(real(a) / s, imag(a) / s); - return a; -} -} // namespace dpct_operator_overloading - -// ---------- equality -/* -DPCT1011:126: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator *= =(const sycl::float2 a, const sycl::float2 b) -{ - return ( real(a) == real(b) && - imag(a) == imag(b) ); -} -} // namespace dpct_operator_overloading - -/* -DPCT1011:127: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator *= =(const sycl::float2 a, const float s) +//------------------------------------------------------------------------------ +/// Overloaded versions of Ax+By on device, specified for complex +template +inline T axpby(T alpha, T x, T beta, T y) { - return ( real(a) == s && - imag(a) == 0. ); + return alpha*x + beta*y; } -} // namespace dpct_operator_overloading - -/* -DPCT1011:128: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { -inline bool operator *= =(const float s, const sycl::float2 a) -{ - return ( real(a) == s && - imag(a) == 0. ); -} -} // namespace dpct_operator_overloading - -// ---------- not equality -/* -DPCT1011:129: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { - -inline bool operator != (const sycl::float2 a, const sycl::float2 b) +inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x, + sycl::float2 beta, sycl::float2 y) { - return !(dpct_operator_overloading::operator *= =(a, b)); + return dpct::cmul(alpha, x) + dpct::cmul(beta, y); } -} // namespace dpct_operator_overloading - -/* -DPCT1011:130: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. -*/ -namespace dpct_operator_overloading { -inline bool operator != (const sycl::float2 a, const float s) +inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x, + sycl::double2 beta, sycl::double2 y) { - return !(dpct_operator_overloading::operator *= =(a, s)); + return dpct::cmul(alpha, x) + dpct::cmul(beta, y); } -} // namespace dpct_operator_overloading - -/* -DPCT1011:131: The tool detected overloaded operators for built-in vector types, -which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec -interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 -standard operators instead. 
-*/ -namespace dpct_operator_overloading { -inline bool operator != (const float s, const sycl::float2 a) -{ - return !(dpct_operator_overloading::operator *= =(a, s)); -} -} // namespace dpct_operator_overloading -#endif // BLAS_WITH_CUBLAS +//------------------------------------------------------------------------------ +/// Overloaded versions of multiply on device, specified for complex +template +inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x) +{ + return alpha * x; +} + +inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x) +{ + return dpct::cmul(alpha, x); +} + +inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x) +{ + return dpct::cmul(alpha, x); +} + +// //============================================================================== +// // CUDA doesn't provide operators, so define our own. +// // rocBLAS provides operators. +// // +// // complex-double + +// #if defined( BLAS_HAVE_SYCL ) + +// // ---------- negate +// /* +// DPCT1011:83: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 &a) +// { +// return sycl::double2(-real(a), -imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:84: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) + real(b), imag(a) + imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:85: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) + s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:86: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator+(const double s, const sycl::double2 b) +// { +// return sycl::double2(s + real(b), imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:87: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator += (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) + real(b), imag(a) + imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:88: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator += (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) + s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- subtract +// /* +// DPCT1011:89: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) - real(b), imag(a) - imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:90: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) - s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:91: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator-(const double s, const sycl::double2 b) +// { +// return sycl::double2(s - real(b), -imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:92: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator -= (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) - real(b), imag(a) - imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:93: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator -= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) - s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- multiply +// /* +// DPCT1011:94: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const sycl::double2 b) +// { +// return sycl::double2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:95: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:96: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const sycl::double2 a, const float s) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:97: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator*(const double s, const sycl::double2 a) +// { +// return sycl::double2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:98: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator *= (sycl::double2 &a, const sycl::double2 b) +// { +// a = sycl::double2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:99: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator *= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) * s, imag(a) * s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- divide +// /* From LAPACK DLADIV +// * Performs complex division in real arithmetic, avoiding unnecessary overflow. +// * +// * a + i*b +// * p + i*q = --------- +// * c + i*d +// */ +// /* +// DPCT1011:100: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const sycl::double2 x, const sycl::double2 y) +// { +// double a = real(x); +// double b = imag(x); +// double c = real(y); +// double d = imag(y); +// double e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = ( a + b*e ) / f; +// q = ( b - a*e ) / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = ( b + a*e ) / f; +// q = ( -a + b*e ) / f; +// } +// return sycl::double2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:101: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const sycl::double2 a, const double s) +// { +// return sycl::double2(real(a) / s, imag(a) / s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:102: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 operator/(const double a, const sycl::double2 y) +// { +// double c = real(y); +// double d = imag(y); +// double e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = a / f; +// q = -a*e / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = a*e / f; +// q = -a / f; +// } +// return sycl::double2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:103: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator /= (sycl::double2 &a, const sycl::double2 b) +// { +// a = dpct_operator_overloading::operator/(a, b); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:104: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::double2 &operator /= (sycl::double2 &a, const double s) +// { +// a = sycl::double2(real(a) / s, imag(a) / s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// //============================================================================== +// // complex-float + +// // ---------- negate +// /* +// DPCT1011:105: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 &a) +// { +// return sycl::float2(-real(a), -imag(a)); +// } +// } // namespace dpct_operator_overloading + +// // ---------- add +// /* +// DPCT1011:106: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) + real(b), imag(a) + imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:107: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) + s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:108: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator+(const float s, const sycl::float2 b) +// { +// return sycl::float2(s + real(b), imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:109: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator += (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) + real(b), imag(a) + imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:110: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator += (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) + s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- subtract +// /* +// DPCT1011:111: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) - real(b), imag(a) - imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:112: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) - s, imag(a)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:113: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator-(const float s, const sycl::float2 b) +// { +// return sycl::float2(s - real(b), -imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:114: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator -= (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) - real(b), imag(a) - imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:115: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator -= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) - s, imag(a)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- multiply +// /* +// DPCT1011:116: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const sycl::float2 a, const sycl::float2 b) +// { +// return sycl::float2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:117: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:118: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator*(const float s, const sycl::float2 a) +// { +// return sycl::float2(real(a) * s, imag(a) * s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:119: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator *= (sycl::float2 &a, const sycl::float2 b) +// { +// a = sycl::float2(real(a) * real(b) - imag(a) * imag(b), +// imag(a) * real(b) + real(a) * imag(b)); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:120: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator *= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) * s, imag(a) * s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- divide +// /* From LAPACK DLADIV +// * Performs complex division in real arithmetic, avoiding unnecessary overflow. +// * +// * a + i*b +// * p + i*q = --------- +// * c + i*d +// */ +// /* +// DPCT1011:121: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const sycl::float2 x, const sycl::float2 y) +// { +// float a = real(x); +// float b = imag(x); +// float c = real(y); +// float d = imag(y); +// float e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = ( a + b*e ) / f; +// q = ( b - a*e ) / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = ( b + a*e ) / f; +// q = ( -a + b*e ) / f; +// } +// return sycl::float2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:122: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const sycl::float2 a, const float s) +// { +// return sycl::float2(real(a) / s, imag(a) / s); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:123: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 operator/(const float a, const sycl::float2 y) +// { +// float c = real(y); +// float d = imag(y); +// float e, f, p, q; +// if (abs( d ) < abs( c )) { +// e = d / c; +// f = c + d*e; +// p = a / f; +// q = -a*e / f; +// } +// else { +// e = c / d; +// f = d + c*e; +// p = a*e / f; +// q = -a / f; +// } +// return sycl::float2(p, q); +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:124: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator /= (sycl::float2 &a, const sycl::float2 b) +// { +// a = dpct_operator_overloading::operator/(a, b); +// return a; +// } +// } // namespace dpct_operator_overloading + +// /* +// DPCT1011:125: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. +// */ +// namespace dpct_operator_overloading { + +// inline sycl::float2 &operator /= (sycl::float2 &a, const float s) +// { +// a = sycl::float2(real(a) / s, imag(a) / s); +// return a; +// } +// } // namespace dpct_operator_overloading + +// // ---------- equality +// /* +// DPCT1011:126: The tool detected overloaded operators for built-in vector types, +// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec +// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020 +// standard operators instead. 
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const sycl::float2 a, const sycl::float2 b)
+// {
+//     return ( real(a) == real(b) &&
+//              imag(a) == imag(b) );
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:127: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const sycl::float2 a, const float s)
+// {
+//     return ( real(a) == s &&
+//              imag(a) == 0. );
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:128: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator == (const float s, const sycl::float2 a)
+// {
+//     return ( real(a) == s &&
+//              imag(a) == 0. );
+// }
+// } // namespace dpct_operator_overloading
+
+// // ---------- not equality
+// /*
+// DPCT1011:129: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const sycl::float2 a, const sycl::float2 b)
+// {
+//     return !(dpct_operator_overloading::operator == (a, b));
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:130: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const sycl::float2 a, const float s)
+// {
+//     return !(dpct_operator_overloading::operator == (a, s));
+// }
+// } // namespace dpct_operator_overloading
+
+// /*
+// DPCT1011:131: The tool detected overloaded operators for built-in vector types,
+// which may conflict with the SYCL 2020 standard operators (see 4.14.2.1 Vec
+// interface). The tool inserted a namespace to avoid the conflict. Use SYCL 2020
+// standard operators instead.
+// */
+// namespace dpct_operator_overloading {
+
+// inline bool operator != (const float s, const sycl::float2 a)
+// {
+//     return !(dpct_operator_overloading::operator == (a, s));
+// }
+// } // namespace dpct_operator_overloading
+
+// #endif // BLAS_WITH_CUBLAS
 
 } // namespace device
 } // namespace slate

From 974e9c77d5aa65055171b800c389df327d580b45 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Thu, 16 Nov 2023 21:29:10 +0000
Subject: [PATCH 04/10] Better separation for sycl build and
 omptarget-vs-sycl-kernels.
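
The kernel flavor is selected with the user-facing sycl_kernels variable,
which now drives the internal use_omptarget_kernels / use_sycl_kernels
flags. A sketch of the intended invocations (assuming an otherwise
configured build; exact options may differ per site):

    # default for gpu_backend=sycl: the dpct-generated kernels in src/sycl
    make gpu_backend=sycl

    # opt back into the OpenMP target-offload kernels in src/omptarget
    make gpu_backend=sycl sycl_kernels=omptarget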
---
 GNUmakefile | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile
index 01e95f38a..d35192878 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -143,7 +143,8 @@ ifneq ($(cuda),1)
     endif
 endif
 
-omptarget = 0
+use_omptarget_kernels = 0
+use_sycl_kernels = 0
 ifneq ($(cuda),1)
     ifneq ($(hip),1)
         ifeq (${gpu_backend},sycl)
@@ -166,9 +167,9 @@ ifneq ($(hip),1)
             # How should the slate kernels be compiled
             ifeq (${sycl_kernels},omptarget) # src/omptarget kernels
                 # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
-                omptarget = 1
+                use_omptarget_kernels = 1
             else # src/sycl kernels - default/fall-through option
-                sycl_kernels = 1
+                use_sycl_kernels = 1
                 CXXFLAGS += -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
                 LDFLAGS += -fsycl -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
             endif
@@ -222,7 +223,7 @@ ifeq ($(openmp),1)
         # Intel icpx options for OpenMP offload.
         CXXFLAGS += -fiopenmp
         LDFLAGS += -fiopenmp
-        ifeq (${omptarget},1)
+        ifeq (${use_omptarget_kernels},1)
             # If SYCL + OpenMP-offload-kernels, specify omp device type
             CXXFLAGS += -fopenmp-targets=spir64
             LDFLAGS += -fopenmp-targets=spir64
@@ -583,7 +584,7 @@ ifeq (${cuda},1)
     libslate_src += ${cuda_src}
 else ifeq (${hip},1)
     libslate_src += ${hip_src}
-else ifeq ($(sycl_kernels),1)
+else ifeq ($(use_sycl_kernels),1)
     libslate_src += $(sycl_kernels_src)
 else
     # Used for both OpenMP offload (${omptarget} == 1) and as stubs for
@@ -1484,12 +1485,12 @@ echo:
 	@echo "sycl = '$(sycl)'"
 	@echo
 	@echo "---------- OMP target-offload kernel options"
-	@echo "omptarget = '${omptarget}'"
+	@echo "omptarget = '${use_omptarget_kernels}'"
 	@echo "omptarget_src = ${omptarget_src}"
 	@echo "omptarget_hdr = ${omptarget_hdr}"
 	@echo
 	@echo "---------- SYCL device kernels"
-	@echo "sycl_kernels = '$(sycl_kernels)'"
+	@echo "sycl_kernels = '$(use_sycl_kernels)'"
 	@echo "sycl_kernels_src = '$(sycl_kernels_src)'"
 	@echo "sycl_kernels_hdr = '$(sycl_kernels_hdr)'"
 	@echo

From 21d4025798553865589e8f6a0c15043b29261ef5 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Mon, 20 Nov 2023 19:02:30 +0000
Subject: [PATCH 05/10] For sycl kernels (generated by dpct tool), remove dependency on dpct/dpct.hpp header.

---
 src/sycl/device_geadd.dp.cpp           | 26 --------
 src/sycl/device_gecopy.dp.cpp          | 13 ----
 src/sycl/device_genorm.dp.cpp          | 13 ----
 src/sycl/device_gescale.dp.cpp         | 25 --------
 src/sycl/device_gescale_row_col.dp.cpp | 13 ----
 src/sycl/device_geset.dp.cpp           | 25 --------
 src/sycl/device_henorm.dp.cpp          | 13 ----
 src/sycl/device_synorm.dp.cpp          | 25 --------
 src/sycl/device_transpose.dp.cpp       | 49 ---------------
 src/sycl/device_trnorm.dp.cpp          | 13 ----
 src/sycl/device_tzadd.dp.cpp           | 13 ----
 src/sycl/device_tzcopy.dp.cpp          | 13 ----
 src/sycl/device_tzscale.dp.cpp         | 13 ----
 src/sycl/device_tzset.dp.cpp           | 25 --------
 src/sycl/device_util.dp.hpp            | 87 +++++++++++++++++++-------
 15 files changed, 63 insertions(+), 303 deletions(-)

diff --git a/src/sycl/device_geadd.dp.cpp b/src/sycl/device_geadd.dp.cpp
index ebeddb9ed..53ee3e69d 100644
--- a/src/sycl/device_geadd.dp.cpp
+++ b/src/sycl/device_geadd.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -129,12 +128,6 @@ void geadd(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:146: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
- */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -150,13 +143,6 @@ void geadd(
             geadd_kernel(m, n, alpha, A, lda, beta, B, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:147: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -257,12 +243,6 @@ void geadd(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:148: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -280,12 +260,6 @@ void geadd(
                               Barray, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:149: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gecopy.dp.cpp b/src/sycl/device_gecopy.dp.cpp
index 660946f00..299363c12 100644
--- a/src/sycl/device_gecopy.dp.cpp
+++ b/src/sycl/device_gecopy.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -107,12 +106,6 @@ void gecopy(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:152: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:59: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -127,12 +120,6 @@ void gecopy(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:153: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp
index bb20deaa9..45136ca52 100644
--- a/src/sycl/device_genorm.dp.cpp
+++ b/src/sycl/device_genorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -427,12 +426,6 @@ void genorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:144: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     if (scope == NormScope::Matrix) {
 
         //---------
@@ -610,12 +603,6 @@ void genorm(
         slate_not_implemented("The norm scope isn't yet supported.");
     }
 
-    /*
-    DPCT1010:145: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gescale.dp.cpp b/src/sycl/device_gescale.dp.cpp
index afe24370a..57d04b203 100644
--- a/src/sycl/device_gescale.dp.cpp
+++ b/src/sycl/device_gescale.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -98,12 +97,6 @@ void gescale(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:154: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -121,12 +114,6 @@ void gescale(
             gescale_kernel(m, n, mul, A, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:155: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -241,12 +228,6 @@ void gescale(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:156: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -266,12 +247,6 @@ void gescale(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:157: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_gescale_row_col.dp.cpp b/src/sycl/device_gescale_row_col.dp.cpp
index a4b145d46..cd24eaeef 100644
--- a/src/sycl/device_gescale_row_col.dp.cpp
+++ b/src/sycl/device_gescale_row_col.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -194,12 +193,6 @@ void gescale_row_col_batch(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:140: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -249,12 +242,6 @@ void gescale_row_col_batch(
         });
     }
 
-    /*
-    DPCT1010:141: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_geset.dp.cpp b/src/sycl/device_geset.dp.cpp
index f82d58fdf..713a9d3e6 100644
--- a/src/sycl/device_geset.dp.cpp
+++ b/src/sycl/device_geset.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -105,12 +104,6 @@ void geset(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:134: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -127,12 +120,6 @@ void geset(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:135: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -226,12 +213,6 @@ void geset(
     if (m == 0 || n == 0)
         return;
 
-    /*
-    DPCT1093:136: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -249,12 +230,6 @@ void geset(
                 Aarray, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:137: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp
index 3a3ea4569..d7a829185 100644
--- a/src/sycl/device_henorm.dp.cpp
+++ b/src/sycl/device_henorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -327,12 +326,6 @@ void henorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:142: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -431,12 +424,6 @@ void henorm(
         }
     }
 
-    /*
-    DPCT1010:143: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp
index d57370262..26e6e4648 100644
--- a/src/sycl/device_synorm.dp.cpp
+++ b/src/sycl/device_synorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -319,12 +318,6 @@ void synorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:172: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -423,12 +416,6 @@ void synorm(
         }
     }
 
-    /*
-    DPCT1010:173: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 const int ib = 32;
@@ -594,12 +581,6 @@ void synormOffdiag(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:174: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // one norm
     if (norm == lapack::Norm::One || norm == lapack::Norm::Inf) {
@@ -631,12 +612,6 @@ void synormOffdiag(
         slate_not_implemented("Only Norm::One and Norm::Inf is supported.");
     }
 
-    /*
-    DPCT1010:175: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_transpose.dp.cpp b/src/sycl/device_transpose.dp.cpp
index 33d7007eb..01ed4d7d6 100644
--- a/src/sycl/device_transpose.dp.cpp
+++ b/src/sycl/device_transpose.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -328,12 +327,6 @@ void transpose(
         return;
     assert(lda >= n);
 
-    /*
-    DPCT1093:158: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int nt = ceildiv( n, int64_t(ib) );
     assert(nt <= 65535); // CUDA limitation
 
@@ -393,12 +386,6 @@ void transpose(
         });
     });
 
-    /*
-    DPCT1010:159: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -434,12 +421,6 @@ void transpose_batch(
         return;
     assert(lda >= n);
 
-    /*
-    DPCT1093:160: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int nt = ceildiv( n, int64_t(ib) );
     assert(nt <= 65535); // CUDA limitation
     assert(batch_count <= 2147483647); // CUDA limitation, 2^31 - 1
@@ -501,12 +482,6 @@ void transpose_batch(
         });
     });
 
-    /*
-    DPCT1010:161: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -564,12 +539,6 @@ void transpose(
     assert(lda >= m);
     assert(ldat >= n);
 
-    /*
-    DPCT1093:162: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     int mt = ceildiv( m, int64_t(NB) );
     assert(mt <= 65535); // CUDA limitation
     int nt = ceildiv( n, int64_t(NB) );
@@ -622,12 +591,6 @@ void transpose(
         });
     });
 
-    /*
-    DPCT1010:163: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -678,12 +641,6 @@ void transpose_batch(
     assert(lda >= m);
     assert(ldat >= n);
 
-    /*
-    DPCT1093:164: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
- */
-    dpct::select_device(queue.device());
-
     int mt = ceildiv( m, int64_t(NB) );
     assert(mt <= 65535); // CUDA limitation
     int nt = ceildiv( n, int64_t(NB) );
@@ -738,12 +695,6 @@ void transpose_batch(
         });
     });
 
-    /*
-    DPCT1010:165: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp
index f0b802937..9f2e94939 100644
--- a/src/sycl/device_trnorm.dp.cpp
+++ b/src/sycl/device_trnorm.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -447,12 +446,6 @@ void trnorm(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:150: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     //---------
     // max norm
     if (norm == lapack::Norm::Max) {
@@ -576,12 +569,6 @@ void trnorm(
         }
     }
 
-    /*
-    DPCT1010:151: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzadd.dp.cpp b/src/sycl/device_tzadd.dp.cpp
index 158eb92ab..b10ec9bce 100644
--- a/src/sycl/device_tzadd.dp.cpp
+++ b/src/sycl/device_tzadd.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -129,12 +128,6 @@ void tzadd(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:138: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -149,12 +142,6 @@ void tzadd(
                               Barray, ldb, item_ct1);
         });
 
-    /*
-    DPCT1010:139: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzcopy.dp.cpp b/src/sycl/device_tzcopy.dp.cpp
index 586ba697f..7a2dd5341 100644
--- a/src/sycl/device_tzcopy.dp.cpp
+++ b/src/sycl/device_tzcopy.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -120,12 +119,6 @@ void tzcopy(
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
-    /*
-    DPCT1093:170: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     /*
     DPCT1049:72: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
@@ -140,12 +133,6 @@ void tzcopy(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:171: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp
index 01bc55ea5..afb082173 100644
--- a/src/sycl/device_tzscale.dp.cpp
+++ b/src/sycl/device_tzscale.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -116,12 +115,6 @@ void tzscale(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:132: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int64_t nthreads = std::min( int64_t( 1024 ), m );
 
@@ -139,12 +132,6 @@ void tzscale(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:133: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_tzset.dp.cpp b/src/sycl/device_tzset.dp.cpp
index 932d1c901..786201076 100644
--- a/src/sycl/device_tzset.dp.cpp
+++ b/src/sycl/device_tzset.dp.cpp
@@ -4,7 +4,6 @@
 // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include "slate/Exception.hh"
 #include "slate/internal/device.hh"
 
@@ -115,12 +114,6 @@ void tzset(
     scalar_t* A, int64_t lda,
     blas::Queue& queue )
 {
-    /*
-    DPCT1093:166: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int nthreads = std::min( int64_t( 1024 ), m );
 
@@ -137,12 +130,6 @@ void tzset(
                 A, lda, item_ct1);
         });
 
-    /*
-    DPCT1010:167: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
@@ -242,12 +229,6 @@ void tzset(
     if (batch_count == 0)
         return;
 
-    /*
-    DPCT1093:168: The "queue.device()" device may be not the one intended for
-    use. Adjust the selected device if needed.
-    */
-    dpct::select_device(queue.device());
-
     // Max threads/block=1024 for current CUDA compute capability (<= 7.5)
     int nthreads = std::min( int64_t( 1024 ), m );
 
@@ -266,12 +247,6 @@ void tzset(
                 item_ct1);
         });
 
-    /*
-    DPCT1010:169: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
- */
-    dpct::err0 error = 0;
-    slate_assert(error == 0);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/sycl/device_util.dp.hpp b/src/sycl/device_util.dp.hpp
index 8c440ae50..b0d6694f7 100644
--- a/src/sycl/device_util.dp.hpp
+++ b/src/sycl/device_util.dp.hpp
@@ -7,7 +7,8 @@
 #define SLATE_DEVICE_UTIL_CUH
 
 #include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
+// #include <dpct/dpct.hpp>
+#define DPCT_COMPATIBILITY_TEMP
 #include <cstdint>
 
 namespace slate {
@@ -41,7 +42,6 @@ inline real_t max_nan(real_t x, real_t y)
 /// the rest of x is overwritten.
 ///
 template <typename real_t>
-
 void max_nan_reduce(int n, int tid, real_t* x, const sycl::nd_item<3> &item_ct1)
 {
     /*
@@ -299,8 +299,32 @@ inline float real(sycl::float2 x) { return x.x(); }
 inline double imag(sycl::double2 x) { return x.y(); }
 inline float imag(sycl::float2 x) { return x.y(); }
 
-inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); }
-inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); }
+// inline sycl::double2 conj(sycl::double2 x) { return dpct::conj(x); }
+// inline sycl::float2 conj(sycl::float2 x) { return dpct::conj(x); }
+//------------------------------------------------------------------------------
+/// Computes the complex conjugate of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+// Taken from dpct util.hpp
+template <typename T>
+sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
+    std::complex<T> t(x[0], x[1]);
+    t = std::conj(t);
+    return sycl::vec<T, 2>(t.real(), t.imag());
+}
+
+/// Computes the magnitude of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+// Taken from dpct util.hpp
+template <typename T>
+T cabs(sycl::vec<T, 2> x) {
+    std::complex<T> t(x[0], x[1]);
+    return std::abs(t);
+}
+
 
 #else
 
@@ -354,7 +378,8 @@ inline float abs(sycl::float2 x)
 {
 #ifdef DPCT_COMPATIBILITY_TEMP
     // Use DPCT routine
-    return dpct::cabs(x);
+    //return dpct::cabs(x);
+    return cabs(x);
 #else
     // For HIP, use our implementation that scales per LAPACK.
     float a = real( x );
@@ -390,7 +415,8 @@ inline double abs(sycl::double2 x)
 {
 #ifdef DPCT_COMPATIBILITY_TEMP
     // Use DPCT routine
-    return dpct::cabs(x);
+    // return dpct::cabs(x);
+    return cabs(x);
#else
     // For HIP, use our implementation that scales per LAPACK.
     double a = real( x );
@@ -516,44 +542,57 @@ inline void copy(double a, sycl::double2 &b)
 }
 
 //------------------------------------------------------------------------------
-/// Overloaded versions of Ax+By on device, specified for complex
+/// Computes the multiplication of two complex numbers.
+/// \tparam T Complex element type
+/// \param [in] x The first input complex number
+/// \param [in] y The second input complex number
+/// \returns The result
 template <typename T>
-inline T axpby(T alpha, T x, T beta, T y)
-{
-    return alpha*x + beta*y;
+sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
+    std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
+    t1 = t1 * t2;
+    return sycl::vec<T, 2>(t1.real(), t1.imag());
 }
 
-inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x,
-                          sycl::float2 beta, sycl::float2 y)
+//------------------------------------------------------------------------------
+/// Overloaded versions of multiply on device, specified for complex
+template <typename scalar_t, typename scalar_t2>
+inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x)
 {
-    return dpct::cmul(alpha, x) + dpct::cmul(beta, y);
+    return alpha * x;
 }
 
-inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x,
-                           sycl::double2 beta, sycl::double2 y)
+inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x)
 {
-    return dpct::cmul(alpha, x) + dpct::cmul(beta, y);
+    return cmul(alpha, x);
 }
 
+inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x)
+{
+    return cmul(alpha, x);
+}
 //------------------------------------------------------------------------------
-/// Overloaded versions of multiply on device, specified for complex
-template <typename scalar_t, typename scalar_t2>
-inline scalar_t multiply_ax(scalar_t2 alpha, scalar_t x)
+/// Overloaded versions of Ax+By on device, specified for complex
+template <typename T>
+inline T axpby(T alpha, T x, T beta, T y)
 {
-    return alpha * x;
+    return alpha*x + beta*y;
 }
 
-inline sycl::float2 multiply_ax(sycl::float2 alpha, sycl::float2 x)
+inline sycl::float2 axpby(sycl::float2 alpha, sycl::float2 x,
+                          sycl::float2 beta, sycl::float2 y)
 {
-    return dpct::cmul(alpha, x);
+    return cmul(alpha, x) + cmul(beta, y);
 }
 
-inline sycl::double2 multiply_ax(sycl::double2 alpha, sycl::double2 x)
+inline sycl::double2 axpby(sycl::double2 alpha, sycl::double2 x,
+                           sycl::double2 beta, sycl::double2 y)
 {
-    return dpct::cmul(alpha, x);
+    return cmul(alpha, x) + cmul(beta, y);
 }
 
+
 // //==============================================================================
 // // CUDA doesn't provide operators, so define our own.
 // // rocBLAS provides operators.

From 83d345bef0e502fd546a3afc3f69a33c3dc240c8 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Tue, 21 Nov 2023 15:26:25 +0000
Subject: [PATCH 06/10] In sycl device code, rename some shadowed variables.
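
The inner reduction indices below are renamed from chunk to chunk1 so they
no longer reuse a name already bound in an enclosing scope, which compilers
flag with shadow warnings (e.g., -Wshadow). A reduced, hypothetical C++
illustration of the pattern, not taken from the SLATE sources:

    // Hypothetical sketch: naming the inner index 'chunk' again would
    // shadow the outer loop variable; 'chunk1' avoids the warning.
    #include <cstdio>

    int main()
    {
        for (int chunk = 0; chunk < 2; ++chunk) {
            for (int chunk1 = 0; chunk1 < 3; ++chunk1) {
                std::printf("outer %d inner %d\n", chunk, chunk1);
            }
        }
        return 0;
    }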
--- src/sycl/device_genorm.dp.cpp | 6 +++--- src/sycl/device_henorm.dp.cpp | 6 +++--- src/sycl/device_synorm.dp.cpp | 6 +++--- src/sycl/device_trnorm.dp.cpp | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/sycl/device_genorm.dp.cpp b/src/sycl/device_genorm.dp.cpp index 45136ca52..b4d9b0fac 100644 --- a/src/sycl/device_genorm.dp.cpp +++ b/src/sycl/device_genorm.dp.cpp @@ -300,9 +300,9 @@ void genorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { tile_scale = row_scale[0]; tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < m; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_henorm.dp.cpp b/src/sycl/device_henorm.dp.cpp index d7a829185..fb172764b 100644 --- a/src/sycl/device_henorm.dp.cpp +++ b/src/sycl/device_henorm.dp.cpp @@ -254,9 +254,9 @@ void henorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < n; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_synorm.dp.cpp b/src/sycl/device_synorm.dp.cpp index 26e6e4648..ab08d073e 100644 --- a/src/sycl/device_synorm.dp.cpp +++ b/src/sycl/device_synorm.dp.cpp @@ -246,9 +246,9 @@ void synorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < n; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < n; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; diff --git a/src/sycl/device_trnorm.dp.cpp b/src/sycl/device_trnorm.dp.cpp index 9f2e94939..816cf7ae1 100644 --- a/src/sycl/device_trnorm.dp.cpp +++ b/src/sycl/device_trnorm.dp.cpp @@ -364,9 +364,9 @@ void trnorm_fro_kernel( if (item_ct1.get_local_id(2) == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; - for (int64_t chunk = 1; - chunk < item_ct1.get_local_range(2) && chunk < m; ++chunk) { - combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk], row_sumsq[chunk]); + for (int64_t chunk1 = 1; + chunk1 < item_ct1.get_local_range(2) && chunk1 < m; ++chunk1) { + combine_sumsq(tile_scale, tile_sumsq, row_scale[chunk1], row_sumsq[chunk1]); } tiles_values[item_ct1.get_group(2) * 2 + 0] = tile_scale; From e98380672664f3bc87de1d293236e0dc7b9a213a Mon Sep 17 00:00:00 2001 From: Asim YarKhan Date: Tue, 21 Nov 2023 17:07:22 +0000 Subject: [PATCH 07/10] Allow CI testing for Intel GPUs. 
---
 .github/workflows/test.sh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/test.sh b/.github/workflows/test.sh
index 655ecf4f3..df203d52c 100755
--- a/.github/workflows/test.sh
+++ b/.github/workflows/test.sh
@@ -12,10 +12,6 @@ err=0
 
 export OMP_NUM_THREADS=8
 
-# Currently, OpenMP offload tests don't work on our Intel GPU.
-# CI checks only compilation.
-if [ "${device}" != "gpu_intel" ]; then
-
 print "======================================== Unit tests"
 cd unit_test
 
@@ -85,7 +81,5 @@ if [ "${maker}" = "make" ]; then
     fi
 fi
 
-fi # device != gpu_intel
-
 print "======================================== Finished test"
 exit ${err}

From f7c775a1564e14d0c590af136b4a008d13ae1ca4 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Wed, 22 Nov 2023 23:54:08 +0000
Subject: [PATCH 08/10] For sycl-kernels, update CMakeLists.txt.

---
 CMakeLists.txt | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2ca27b2a..15761171c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -364,28 +364,41 @@ if (gpu_backend MATCHES "^(sycl|auto)$")
         # Intel-IntelLLVM compiler while compiling omptarget offload
         # routines. (the compiler uses fast floating point mode by
         # default).
-        target_compile_options( slate PRIVATE "-fp-model=precise" )
+        target_compile_options( slate PRIVATE
+            "$<$<COMPILE_LANGUAGE:CXX>:-fp-model=precise>" )
 
-        # -Wno-unused-command-line-argument avoids
-        # icpx warning: -Wl,-rpath,...: 'linker' input unused.
-        #
         # -Wno-c99-extensions avoids
         # icpx warning: '_Complex' is a C99 extension.
-        #
-        # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
-        # icpx warning: loop not vectorized.
-        #
         target_compile_options( slate PRIVATE
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>" )
-
-        # Intel icpx options for OpenMP offload.
-        target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
-        target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )
-
-        # Source files are set below after CUDA and HIP.
+            $<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions> )
+
+        if (sycl_kernels MATCHES "^(omptarget)$") # src/omptarget kernels
+            message( "sycl_kernels = omptarget" )
+            # Enable the OpenMP omptarget offload kernels in SLATE for oneMKL-SYCL devices
+            file( GLOB libslate_omptarget_src CONFIGURE_DEPENDS src/omptarget/*.cc )
+            target_sources( slate PRIVATE ${libslate_omptarget_src} )
+            # -Wno-unused-command-line-argument avoids
+            # icpx warning: -Wl,-rpath,...: 'linker' input unused.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument> )
+            # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
+            # icpx warning: loop not vectorized.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed> )
+            # specify the OpenMP offload target
+            target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
+            target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )
+        else() # src/sycl kernels - default/fall-through option
+            message( "sycl_kernels = sycl" )
+            file( GLOB libslate_sycl_src CONFIGURE_DEPENDS src/sycl/*.dp.cpp )
+            target_sources( slate PRIVATE ${libslate_sycl_src} )
+            target_compile_options( slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl> )
+            target_compile_options(
+                slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl-unnamed-lambda> )
+            target_link_options( slate PRIVATE "-fsycl" )
+            target_link_options( slate PRIVATE "-fsycl-unnamed-lambda" )
+        endif()
 
         target_link_libraries( slate PUBLIC -lmkl_sycl -lsycl -lOpenCL )
         message( STATUS "Building SYCL support" )
@@ -669,7 +682,7 @@ endif()
 
 #-------------------------------------------------------------------------------
 # Files for OpenMP offload or CPU-only builds.
-if (NOT "${gpu_backend}" MATCHES "^(cuda|hip)$")
+if (NOT "${gpu_backend}" MATCHES "^(cuda|hip|sycl)$")
     file(
         GLOB libslate_omptarget_src
         CONFIGURE_DEPENDS # glob at build time

From 7935dcb2e8f4a8014e8f432219a3312d8a2fe62c Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Mon, 27 Nov 2023 19:36:39 +0000
Subject: [PATCH 09/10] The Intel C++ flag "Wno-unused-command-line-argument" is set in Make/CMake.

---
 .github/workflows/configure.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/configure.sh b/.github/workflows/configure.sh
index 0b71dfeb0..eb45750cf 100755
--- a/.github/workflows/configure.sh
+++ b/.github/workflows/configure.sh
@@ -42,10 +42,7 @@ if [ "${maker}" = "make" ]; then
         || exit 10
 
 elif [ "${maker}" = "cmake" ]; then
-    # Intel icpx needs -Wno-unused-command-line-argument to avoid
-    # warnings: 'linker' input unused, which prevent CMake finding OpenMP.
     cmake -Dcolor=no \
-          -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
           -DCMAKE_INSTALL_PREFIX=${top}/install \
           -Dgpu_backend=${gpu_backend} .. \
           || exit 12

From c4ec23e6b97c0b221badf2b988a6a156eb731192 Mon Sep 17 00:00:00 2001
From: Asim YarKhan
Date: Thu, 21 Dec 2023 16:56:10 +0000
Subject: [PATCH 10/10] In sycl/device_tzscale, add missing namespace batch.

---
 src/sycl/device_tzscale.dp.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sycl/device_tzscale.dp.cpp b/src/sycl/device_tzscale.dp.cpp
index afb082173..2ecc61106 100644
--- a/src/sycl/device_tzscale.dp.cpp
+++ b/src/sycl/device_tzscale.dp.cpp
@@ -67,6 +67,9 @@ void tzscale_kernel(
     }
 }
 
+//==============================================================================
+namespace batch {
+
 //------------------------------------------------------------------------------
 /// Batched routine for element-wise trapezoidal tile scale.
 /// Sets upper or lower part of
@@ -116,5 +119,6 @@ void tzscale(
                  batch_count, queue);
 }
 
+} // namespace batch
 } // namespace device
 } // namespace slate
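
The dpct-free complex helpers introduced in patch 05 (conj, cabs, cmul,
axpby) all round-trip through std::complex, so their behavior can be
sanity-checked with a small host-only program. A minimal sketch, assuming a
SYCL 2020 compiler such as icpx with -fsycl; the file name and the
re-declared cmul are illustrative only, not part of the series:

    // check.cpp - hypothetical host-side sanity check, not part of the patches.
    // Compile (assumption): icpx -fsycl check.cpp -o check
    #include <sycl/sycl.hpp>
    #include <cassert>
    #include <cmath>
    #include <complex>

    // Same shape as the cmul helper in src/sycl/device_util.dp.hpp above.
    template <typename T>
    sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y)
    {
        std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
        t1 = t1 * t2;
        return sycl::vec<T, 2>(t1.real(), t1.imag());
    }

    int main()
    {
        sycl::float2 a(1.0f, 2.0f), b(3.0f, -1.0f);
        sycl::float2 c = cmul(a, b);  // (1 + 2i)(3 - i) = 5 + 5i
        assert(std::fabs(c[0] - 5.0f) < 1e-5f);
        assert(std::fabs(c[1] - 5.0f) < 1e-5f);
        return 0;
    }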