Add Target.single_subkernel_is_entrypoint so that targets which only
support a single subkernel declare every kernel argument on the generated
entrypoint.

See https://github.com/inducer/loopy/issues/648

When the property is True, the C-family get_function_declaration raises
LoopyError for kernels with more than one subkernel and uses all of
kernel.args instead of the subkernel's passed names. The new test checks
that code generated for lp.CTarget() still declares the unused arguments
b, c, d and e.

NOTE(review): the whitespace of the context lines was reconstructed from a
whitespace-collapsed copy of this patch. If a hunk fails to match, apply
with "git apply --ignore-whitespace" and confirm against the tree.
NOTE(review): in loopy/target/ispc.py, get_kernel_call still enters the
get_subkernel_arg_info branch when single_subkernel_is_entrypoint is True,
while get_function_declaration switches to kernel.args in that case.
Confirm that both agree on the argument list.

diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 5bb1043b9..c078ae9a2 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -49,6 +49,7 @@
 
 from typing import (Any, Tuple, Generic, TypeVar, Sequence, ClassVar, Optional,
         TYPE_CHECKING)
+import abc
 
 if TYPE_CHECKING:
     from loopy.typing import ExpressionT
@@ -159,6 +160,18 @@ def get_kernel_executor(self, kernel, *args, **kwargs):
         """
         raise NotImplementedError()
 
+    @property
+    @abc.abstractmethod
+    def single_subkernel_is_entrypoint(self) -> bool:
+        r"""
+        Returns *True* if *self* does NOT support generating code for
+        linearized kernels with more than one
+        :class:`~loopy.schedule.CallKernel`\ s. This guarantees the
+        :class:`~loopy.schedule.CallKernel` for which we generate code is the
+        entrypoint kernel. This also allows the target to skip the invoker
+        level code.
+        """
+
 
 class ASTBuilderBase(Generic[ASTType]):
     """An interface for generating (host or device) ASTs.
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index b83b89bc5..b0fc8fdac 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -849,8 +849,15 @@ def get_function_declaration(
 
             # subkernel launches occur only as part of entrypoint kernels for now
             from loopy.schedule.tools import get_subkernel_arg_info
+            from loopy.kernel.tools import get_subkernels
             skai = get_subkernel_arg_info(kernel, subkernel_name)
-            passed_names = skai.passed_names
+            if self.target.single_subkernel_is_entrypoint:
+                if len(get_subkernels(kernel)) > 1:
+                    raise LoopyError(f"Kernel '{kernel.name}' has more than one"
+                                     f" subkernel, not allowed in {self.target}.")
+                passed_names = [arg.name for arg in kernel.args]
+            else:
+                passed_names = skai.passed_names
             written_names = skai.written_names
         else:
             name = Value("static void", name)
@@ -1342,6 +1349,10 @@ def get_dtype_registry(self):
         fill_registry_with_c99_complex_types(result)
         return DTypeRegistryWrapper(result)
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return True
+
 
 class CASTBuilder(CFamilyASTBuilder):
     def preamble_generators(self):
@@ -1385,6 +1396,10 @@ def get_host_ast_builder(self):
         # enable host code generation
         return CFamilyASTBuilder(self)
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return False
+
 # }}}
 
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index e97c84b00..fba6b3007 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -255,6 +255,10 @@ def vector_dtype(self, base, count):
 
     # }}}
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return False
+
 # }}}
 
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 2fbd6bcf8..ec390085a 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -198,6 +198,10 @@ def get_dtype_registry(self):
 
     # }}}
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return True
+
 
 class ISPCASTBuilder(CFamilyASTBuilder):
     # {{{ top-level codegen
@@ -222,7 +226,9 @@ def get_function_declaration(
             # subkernel launches occur only as part of entrypoint kernels for now
             from loopy.schedule.tools import get_subkernel_arg_info
             skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
-            passed_names = skai.passed_names
+            passed_names = ([arg.name for arg in kernel.args]
+                            if self.target.single_subkernel_is_entrypoint
+                            else skai.passed_names)
             written_names = skai.written_names
         else:
             passed_names = [arg.name for arg in kernel.args]
@@ -263,7 +269,8 @@ def get_kernel_call(self, codegen_state: CodeGenerationState,
                         "assert(programCount == (%s))"
                         % ecm(lsize[0], PREC_NONE)))
 
-        if codegen_state.is_entrypoint:
+        if (codegen_state.is_entrypoint and
+                self.target.single_subkernel_is_entrypoint):
             # subkernel launches occur only as part of entrypoint kernels for now
             from loopy.schedule.tools import get_subkernel_arg_info
             skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index d548cf71d..5112abdcd 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -598,6 +598,10 @@ def is_vector_dtype(self, dtype):
     def vector_dtype(self, base, count):
         return NumpyType(vec.types[base.numpy_dtype, count])
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return False
+
 # }}}
 
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 92f4bbd96..8d7154f23 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -645,6 +645,10 @@ def with_device(self, device):
                 "stop working in 2022.", DeprecationWarning, stacklevel=2)
         return self
 
+    @property
+    def single_subkernel_is_entrypoint(self) -> bool:
+        return False
+
 # }}}
 
 
diff --git a/test/test_target.py b/test/test_target.py
index 13b81502b..7e9e7eba8 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -777,6 +777,24 @@ def test_passing_bajillions_of_svm_args(ctx_factory, with_gbarrier):
         assert (res[f"c{iargset}"].get() == iargset * multiplier + iargset).all()
 
 
+def test_non_executable_targets_respect_args():
+    # See https://github.com/inducer/loopy/issues/648
+    t_unit = lp.make_kernel(
+        "{ : }",
+        """
+        a[0] = 1729
+        """,
+        [lp.GlobalArg("a,b,c,d,e",
+                      shape=(10,),
+                      dtype="float64")],
+        target=lp.CTarget()
+    )
+    code_str = lp.generate_code_v2(t_unit).device_code()
+
+    for var in ["b", "c", "d", "e"]:
+        assert f"double const *__restrict__ {var}" in code_str
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])