Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SYCL kernels 2 #146

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
3 changes: 0 additions & 3 deletions .github/workflows/configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@ if [ "${maker}" = "make" ]; then
|| exit 10

elif [ "${maker}" = "cmake" ]; then
# Intel icpx needs -Wno-unused-command-line-argument to avoid
# warnings: 'linker' input unused, which prevent CMake finding OpenMP.
cmake -Dcolor=no \
-DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
-DCMAKE_INSTALL_PREFIX=${top}/install \
-Dgpu_backend=${gpu_backend} .. \
|| exit 12
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ err=0

export OMP_NUM_THREADS=8

# Currently, OpenMP offload tests don't work on our Intel GPU.
# CI checks only compilation.
if [ "${device}" != "gpu_intel" ]; then

print "======================================== Unit tests"
cd unit_test

Expand Down Expand Up @@ -85,7 +81,5 @@ if [ "${maker}" = "make" ]; then
fi
fi

fi # device != gpu_intel

print "======================================== Finished test"
exit ${err}
49 changes: 31 additions & 18 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -364,28 +364,41 @@ if (gpu_backend MATCHES "^(sycl|auto)$")
# Intel-IntelLLVM compiler while compiling omptarget offload
# routines. (the compiler uses fast floating point mode by
# default).
target_compile_options( slate PRIVATE "-fp-model=precise" )
target_compile_options( slate PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:-fp-model=precise>" )

# -Wno-unused-command-line-argument avoids
# icpx warning: -Wl,-rpath,...: 'linker' input unused.
#
# -Wno-c99-extensions avoids
# icpx warning: '_Complex' is a C99 extension.
#
# -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
# icpx warning: loop not vectorized.
#
target_compile_options(
slate PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument>"
"$<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions>"
"$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed>" )

# Intel icpx options for OpenMP offload.
target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )

# Source files are set below after CUDA and HIP.
$<$<COMPILE_LANGUAGE:CXX>:-Wno-c99-extensions> )

if (sycl_kernels MATCHES "^(omptarget)$") # src/omptarget kernels
message( "sycl_kernels = omptarget" )
# Enable the OpenMP omptarget offload kernels in SLATE for oneMKL-SYCL devices
file( GLOB libslate_omptarget_src CONFIGURE_DEPENDS src/omptarget/*.cc )
target_sources( slate PRIVATE ${libslate_omptarget_src} )
# -Wno-unused-command-line-argument avoids
# icpx warning: -Wl,-rpath,...: 'linker' input unused.
target_compile_options( slate PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-command-line-argument> )
# -Wno-pass-failed avoids (on src/omptarget/device_transpose.cc)
# icpx warning: loop not vectorized.
target_compile_options( slate PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-Wno-pass-failed> )
# specify the OpenMP offload target
target_compile_options( slate PRIVATE "-fopenmp-targets=spir64" )
target_link_options( slate PRIVATE "-fopenmp-targets=spir64" )
else() # src/sycl kernels - default/fall-through option
message( "sycl_kernels = sycl" )
file( GLOB libslate_sycl_src CONFIGURE_DEPENDS src/sycl/*.dp.cpp )
target_sources( slate PRIVATE ${libslate_sycl_src} )
target_compile_options( slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl> )
target_compile_options(
slate PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -fsycl-unnamed-lambda> )
target_link_options( slate PRIVATE "-fsycl" )
target_link_options( slate PRIVATE "-fsycl-unnamed-lambda" )
endif()

target_link_libraries( slate PUBLIC -lmkl_sycl -lsycl -lOpenCL )
message( STATUS "Building SYCL support" )
Expand Down Expand Up @@ -669,7 +682,7 @@ endif()

#-------------------------------------------------------------------------------
# Files for OpenMP offload or CPU-only builds.
if (NOT "${gpu_backend}" MATCHES "^(cuda|hip)$")
if (NOT "${gpu_backend}" MATCHES "^(cuda|hip|sycl)$")
file(
GLOB libslate_omptarget_src
CONFIGURE_DEPENDS # glob at build time
Expand Down
47 changes: 39 additions & 8 deletions GNUmakefile
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ ifneq ($(cuda),1)
endif
endif

omptarget = 0
use_omptarget_kernels = 0
use_sycl_kernels = 0
ifneq ($(cuda),1)
ifneq ($(hip),1)
ifeq (${gpu_backend},sycl)
# enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
$(info Note: enabling omp-target-offload kernels)
omptarget = 1
# enable the kernels in SLATE for oneMKL-SYCL devices
sycl = 1

# -Wno-unused-command-line-argument avoids
# icpx warning: -Wl,-rpath,...: 'linker' input unused.
Expand All @@ -163,7 +163,18 @@ ifneq ($(hip),1)
CXXFLAGS += -fsycl -fp-model=precise -Wno-unused-command-line-argument \
-Wno-c99-extensions -Wno-pass-failed
LIBS += -lsycl
endif

# How should the slate kernels be compiled
ifeq (${sycl_kernels},omptarget) # src/omptarget kernels
# enable the omptarget offload kernels in SLATE for oneMKL-SYCL devices
use_omptarget_kernels = 1
else # src/sycl kernels - default/fall-through option
use_sycl_kernels = 1
CXXFLAGS += -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
LDFLAGS += -fsycl -fsycl-unnamed-lambda # allow unnamed sycl lambda kernels
endif

endif
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Makefiles, use spaces for if-else-endif blocks. Use tabs only for rules.

endif
endif

Expand Down Expand Up @@ -210,8 +221,13 @@ endif
ifeq ($(openmp),1)
ifeq (${gpu_backend},sycl)
# Intel icpx options for OpenMP offload.
CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
LDFLAGS += -fiopenmp -fopenmp-targets=spir64
CXXFLAGS += -fiopenmp
LDFLAGS += -fiopenmp
ifeq (${use_omptarget_kernels},1)
# If SYCL + OpenMP-offload-kernels, specify omp device type
CXXFLAGS += -fopenmp-targets=spir64
LDFLAGS += -fopenmp-targets=spir64
endif
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Spaces.

else
# Most other compilers recognize this.
CXXFLAGS += -fopenmp
Expand Down Expand Up @@ -542,6 +558,10 @@ cuda_hdr := \
hip_src := $(patsubst src/cuda/%.cu,src/hip/%.hip.cc,$(cuda_src))
hip_hdr := $(patsubst src/cuda/%.cuh,src/hip/%.hip.hh,$(cuda_hdr))

# SYCL implementations of device kernels
sycl_kernels_src := $(patsubst src/cuda/%.cu,src/sycl/%.dp.cpp,$(cuda_src))
sycl_kernels_hdr := $(patsubst src/cuda/%.cuh,src/sycl/%.dp.hpp,$(cuda_hdr))

# OpenMP implementations of device kernels
omptarget_src := \
src/omptarget/device_geadd.cc \
Expand All @@ -564,6 +584,8 @@ ifeq (${cuda},1)
libslate_src += ${cuda_src}
else ifeq (${hip},1)
libslate_src += ${hip_src}
else ifeq ($(use_sycl_kernels),1)
libslate_src += $(sycl_kernels_src)
else
# Used for both OpenMP offload (${omptarget} == 1) and as stubs for
# CPU-only build.
Expand Down Expand Up @@ -1322,6 +1344,9 @@ hooks: ${hooks}
%.hip.o: %.hip.cc | $(hip_hdr)
$(HIPCC) $(HIPCCFLAGS) -c $< -o $@

%.dp.o: %.dp.cpp | $(sycl_kernels_hdr)
$(CXX) $(CXXFLAGS) -c $< -o $@

%.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@

Expand Down Expand Up @@ -1460,8 +1485,14 @@ echo:
@echo "sycl = '$(sycl)'"
@echo
@echo "---------- OMP target-offload kernel options"
@echo "omptarget = '${omptarget}'"
@echo "omptarget = '${use_omptarget_kernels}'"
@echo "omptarget_src = ${omptarget_src}"
@echo "omptarget_hdr = ${omptarget_hdr}"
@echo
@echo "---------- SYCL device kernels"
@echo "sycl_kernels = '$(use_sycl_kernels)'"
@echo "sycl_kernels_src = '$(sycl_kernels_src)'"
@echo "sycl_kernels_hdr = '$(sycl_kernels_hdr)'"
@echo
@echo "---------- Fortran compiler"
@echo "FC = $(FC)"
Expand Down
20 changes: 17 additions & 3 deletions include/slate/internal/device.hh
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,31 @@
};

} // namespace blas
#endif // #elif defined( BLAS_HAVE_ROCBLAS )

#elif defined( BLAS_HAVE_SYCL )
#include <sycl/sycl.hpp>
namespace blas {

template <typename T>
struct blas::real_type_traits< sycl::vec<T, 2> > {
using real_t = T;
};

} // namespace blas

#endif // #defined( BLAS_HAVE_{CUBLAS,ROCBLAS,SYCL} )

namespace slate {

/// @namespace slate::device
/// GPU device implementations of kernels.
namespace device {

// Use omp-target-kernels when OneMKL-SYCL is used
// Use when SYCL and oneMKL are used
#if defined( BLAS_HAVE_SYCL )
#define SLATE_HAVE_OMPTARGET
// todo: make this build automatically
// Manually uncomment to compile OMP target-offload kernels
// #define SLATE_HAVE_OMPTARGET
#endif

// Simplify checking for GPU device support (CUDA / ROCm / SYCL).
Expand Down
Loading
Loading