icl-utk-edu · ayarkhan · Nov 15, 2023 · Nov 15, 2023 · Nov 16, 2023 · Nov 16, 2023
diff --git a/.github/workflows/configure.sh b/.github/workflows/configure.sh
@@ -42,10 +42,7 @@ if [ "${maker}" = "make" ]; then
          || exit 10
 
 elif [ "${maker}" = "cmake" ]; then
-    # Intel icpx needs -Wno-unused-command-line-argument to avoid
-    # warnings: 'linker' input unused, which prevent CMake finding OpenMP.
     cmake -Dcolor=no \
-          -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
           -DCMAKE_INSTALL_PREFIX=${top}/install \
           -Dgpu_backend=${gpu_backend} .. \
           || exit 12

diff --git a/.github/workflows/test.sh b/.github/workflows/test.sh
@@ -12,10 +12,6 @@ err=0
 
 export OMP_NUM_THREADS=8
 
-# Currently, OpenMP offload tests don't work on our Intel GPU.
-# CI checks only compilation.
-if [ "${device}" != "gpu_intel" ]; then
-
 print "======================================== Unit tests"
 cd unit_test
 
@@ -85,7 +81,5 @@ if [ "${maker}" = "make" ]; then
     fi
 fi
 
-fi  # device != gpu_intel
-
 print "======================================== Finished test"
 exit ${err}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -364,28 +364,41 @@ if (gpu_backend MATCHES "^(sycl|auto)$")
         # Intel-IntelLLVM compiler while compiling omptarget offload
         # routines. (the compiler uses fast floating point mode by
         # default).
-        target_compile_options( slate PRIVATE "-fp-model=precise" )
+        target_compile_options( slate PRIVATE
+            "$<$<COMPILE_LANGUAGE:CXX>:-fp-model=precise>" )
 
-        # -Wno-unused-command-line-argument avoids
-        # icpx warning: -Wl,-rpath,...: 'linker' input unused.
-        #
         # -Wno-c99-extensions avoids
         # icpx warning: '_Complex' is a C99 extension.
-        #
-        # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
-        # icpx warning: loop not vectorized.
-        #
         target_compile_options(
             slate PRIVATE
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions>"
-            "$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>" )
-
-        # Intel icpx options for OpenMP offload.
-        target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
-        target_link_options(    slate PRIVATE "-fopenmp-targets=spir64" )
-
-        # Source files are set below after CUDA and HIP.
+            $<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions> )
+
+        if (sycl_kernels MATCHES "^(omptarget)$") # src/omptarget kernels
+            message( "sycl_kernels = omptarget" )
+            # Enable the OpenMP omptarget offload kernels in SLATE for oneMKL-SYCL devices
+            file( GLOB libslate_omptarget_src CONFIGURE_DEPENDS src/omptarget/*.cc )
+            target_sources( slate PRIVATE ${libslate_omptarget_src} )
+            # -Wno-unused-command-line-argument avoids
+            # icpx warning: -Wl,-rpath,...: 'linker' input unused.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument> )
+            # -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
+            # icpx warning: loop not vectorized.
+            target_compile_options( slate PRIVATE
+                $<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed> )
+            # specify the OpenMP offload target
+            target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
+            target_link_options(    slate PRIVATE "-fopenmp-targets=spir64" )
+        else() # src/sycl kernels - default/fall-through option
+            message( "sycl_kernels = sycl" )
+            file( GLOB libslate_sycl_src CONFIGURE_DEPENDS src/sycl/*.dp.cpp )
+            target_sources( slate PRIVATE ${libslate_sycl_src} )
+            target_compile_options( slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl> )
+            target_compile_options(
+                slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl-unnamed-lambda> )
+            target_link_options( slate PRIVATE "-fsycl" )
+            target_link_options( slate PRIVATE "-fsycl-unnamed-lambda" )
+        endif()
 
         target_link_libraries( slate PUBLIC -lmkl_sycl -lsycl -lOpenCL )
         message( STATUS "Building SYCL support" )
@@ -669,7 +682,7 @@ endif()
 
 #-------------------------------------------------------------------------------
 # Files for OpenMP offload or CPU-only builds.
-if (NOT "${gpu_backend}" MATCHES "^(cuda|hip)$")
+if (NOT "${gpu_backend}" MATCHES "^(cuda|hip|sycl)$")
     file(
         GLOB libslate_omptarget_src
         CONFIGURE_DEPENDS  # glob at build time

diff --git a/GNUmakefile b/GNUmakefile
@@ -143,13 +143,13 @@ ifneq ($(cuda),1)
     endif
 endif
 
-omptarget = 0
+use_omptarget_kernels = 0
+use_sycl_kernels = 0
 ifneq ($(cuda),1)
 ifneq ($(hip),1)
     ifeq (${gpu_backend},sycl)
-        # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
-        $(info Note: enabling omp-target-offload kernels)
-        omptarget = 1
+        # enable the kernels in SLATE for oneMKL-SYCL devices
+        sycl = 1
 
         # -Wno-unused-command-line-argument avoids
         # icpx warning: -Wl,-rpath,...: 'linker' input unused.
@@ -163,7 +163,18 @@ ifneq ($(hip),1)
         CXXFLAGS += -fsycl -fp-model=precise -Wno-unused-command-line-argument \
                     -Wno-c99-extensions -Wno-pass-failed
         LIBS += -lsycl
-    endif
+
+        # How should the slate kernels be compiled
+        ifeq (${sycl_kernels},omptarget) # src/omptarget kernels
+            # enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
+		    use_omptarget_kernels = 1
+        else # src/sycl kernels - default/fall-through option
+			use_sycl_kernels = 1
+            CXXFLAGS += -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
+            LDFLAGS += -fsycl -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
+        endif
+
+   endif
 endif
 endif
 
@@ -210,8 +221,13 @@ endif
 ifeq ($(openmp),1)
     ifeq (${gpu_backend},sycl)
         # Intel icpx options for OpenMP offload.
-        CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-        LDFLAGS  += -fiopenmp -fopenmp-targets=spir64
+        CXXFLAGS += -fiopenmp
+        LDFLAGS  += -fiopenmp
+	    ifeq (${use_omptarget_kernels},1)
+            # If SYCL + OpenMP-offload-kernels, specify omp device type
+	        CXXFLAGS += -fopenmp-targets=spir64
+		    LDFLAGS  += -fopenmp-targets=spir64
+        endif
     else
         # Most other compilers recognize this.
         CXXFLAGS += -fopenmp
@@ -542,6 +558,10 @@ cuda_hdr := \
 hip_src := $(patsubst src/cuda/%.cu,src/hip/%.hip.cc,$(cuda_src))
 hip_hdr := $(patsubst src/cuda/%.cuh,src/hip/%.hip.hh,$(cuda_hdr))
 
+# SYCL implementations of device kernels
+sycl_kernels_src := $(patsubst src/cuda/%.cu,src/sycl/%.dp.cpp,$(cuda_src))
+sycl_kernels_hdr := $(patsubst src/cuda/%.cuh,src/sycl/%.dp.hpp,$(cuda_hdr))
+
 # OpenMP implementations of device kernels
 omptarget_src := \
         src/omptarget/device_geadd.cc \
@@ -564,6 +584,8 @@ ifeq (${cuda},1)
     libslate_src += ${cuda_src}
 else ifeq (${hip},1)
     libslate_src += ${hip_src}
+else ifeq ($(use_sycl_kernels),1)
+    libslate_src += $(sycl_kernels_src)
 else
     # Used for both OpenMP offload (${omptarget} == 1) and as stubs for
     # CPU-only build.
@@ -1322,6 +1344,9 @@ hooks: ${hooks}
 %.hip.o: %.hip.cc | $(hip_hdr)
 	$(HIPCC) $(HIPCCFLAGS) -c $< -o $@
 
+%.dp.o: %.dp.cpp | $(sycl_kernels_hdr)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 %.o: %.cc
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -1460,8 +1485,14 @@ echo:
 	@echo "sycl          = '$(sycl)'"
 	@echo
 	@echo "---------- OMP target-offload kernel options"
-	@echo "omptarget     = '${omptarget}'"
+	@echo "omptarget     = '${use_omptarget_kernels}'"
 	@echo "omptarget_src = ${omptarget_src}"
+	@echo "omptarget_hdr = ${omptarget_hdr}"
+	@echo
+	@echo "---------- SYCL device kernels"
+	@echo "sycl_kernels  = '$(use_sycl_kernels)'"
+	@echo "sycl_kernels_src  = '$(sycl_kernels_src)'"
+	@echo "sycl_kernels_hdr  = '$(sycl_kernels_hdr)'"
 	@echo
 	@echo "---------- Fortran compiler"
 	@echo "FC            = $(FC)"

diff --git a/include/slate/internal/device.hh b/include/slate/internal/device.hh
@@ -68,17 +68,31 @@
     };
 
     } // namespace blas
-#endif // #elif defined( BLAS_HAVE_ROCBLAS )
+
+#elif defined( BLAS_HAVE_SYCL )
+    #include <sycl/sycl.hpp>
+    namespace blas {
+
+    template <typename T>
+    struct blas::real_type_traits< sycl::vec<T, 2> > {
+        using real_t = T;
+    };
+
+    } // namespace blas
+
+#endif // #defined( BLAS_HAVE_{CUBLAS,ROCBLAS,SYCL} )
 
 namespace slate {
 
 /// @namespace slate::device
 /// GPU device implementations of kernels.
 namespace device {
 
-// Use omp-target-kernels when OneMKL-SYCL is used
+// Use when SYCL and oneMKL are used
 #if defined( BLAS_HAVE_SYCL )
-    #define SLATE_HAVE_OMPTARGET
+    // todo: make this build automatically
+    // Manually uncomment to compile OMP target-offload kernels
+    // #define SLATE_HAVE_OMPTARGET
 #endif
 
 // Simplify checking for GPU device support (CUDA / ROCm / SYCL).