From 8d8601814cd8da836e618d955592f594fcc1ffbf Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Tue, 12 Nov 2024 17:07:54 -0600 Subject: [PATCH] [tuner]:Add a utility function to query supported MMA intrinsics Signed-off-by: Bangtian Liu --- .../iree/compiler/Codegen/LLVMGPU/BUILD.bazel | 1 + .../compiler/Codegen/LLVMGPU/CMakeLists.txt | 1 + .../iree/compiler/Codegen/LLVMGPU/Passes.td | 5 ++ .../LLVMGPU/TestLLVMGPUQueryMMAPass.cpp | 48 +++++++++++++++ .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 1 + .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 + .../Codegen/LLVMGPU/test/test_query_mma.mlir | 60 +++++++++++++++++++ .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 24 ++++++++ .../iree/compiler/Codegen/Utils/GPUUtils.h | 5 ++ 9 files changed, 146 insertions(+) create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index fa9277a00b11..3159126442c8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -112,6 +112,7 @@ iree_compiler_cc_library( "ROCDLKernelConfig.cpp", "ROCDLLowerExecutableTarget.cpp", "ROCDLSelectLoweringStrategy.cpp", + "TestLLVMGPUQueryMMAPass.cpp", "Verifiers.cpp", ], hdrs = [ diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index b9a962b27d41..fef9ca37e6aa 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -97,6 +97,7 @@ iree_cc_library( "ROCDLKernelConfig.cpp" "ROCDLLowerExecutableTarget.cpp" "ROCDLSelectLoweringStrategy.cpp" + "TestLLVMGPUQueryMMAPass.cpp" "Verifiers.cpp" DEPS ::PassHeaders diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td index a7a8e5643c6e..06d9960f180f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td @@ -163,4 +163,9 @@ def TestLLVMGPUScalarizeMathOpPass : let summary = "Test pass for several legalization patterns."; } +def TestLLVMGPUQueryMMAPass : + Pass<"iree-test-llvmgpu-query-mma", "ModuleOp"> { + let summary = "Test pass for querying the supported mma intrinsic instructions."; +} + #endif // IREE_CODEGEN_LLVMGPU_PASSES diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp new file mode 100644 index 000000000000..fcce9ae77467 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp @@ -0,0 +1,48 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/LLVMGPU/Passes.h" +#include "iree/compiler/Codegen/Utils/GPUUtils.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "iree-test-llvmgpu-query-mma" + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_TESTLLVMGPUQUERYMMAPASS +#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc" + +static void printMMAVector(SmallVector &mmaAttrs, + const std::string &extraMessage = {}) { + llvm::outs() << "Printing MMA Collection" << extraMessage + << ", size: " << mmaAttrs.size() << "\n"; + for (const auto &mma : mmaAttrs) { + llvm::outs() << mma << " "; + } + llvm::outs() << "\n"; +} + +namespace { + +struct TestLLVMGPUQueryMMAPass final + : impl::TestLLVMGPUQueryMMAPassBase { + void runOnOperation() override { + ModuleOp moduleOp = getOperation(); + SmallVector mmaCollecton; + // Print mma vector before collection. + printMMAVector(mmaCollecton, + " Before querying supported mma instrinsic instructions"); + // Collect mma intrinsic instructions. + QueryMMAIntrinsics(moduleOp, mmaCollecton); + // Print mma vector after collection. + printMMAVector(mmaCollecton, + " After querying supported mma instrinsic instructions"); + } +}; +} // namespace +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 3a0d7d3da800..0256a74f2ecd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -55,6 +55,7 @@ iree_lit_test_suite( "promote_matmul_to_fit_mma.mlir", "tensor_pad.mlir", "tensorcore_vectorization.mlir", + "test_query_mma.mlir", "transform_dialect_bufferize.mlir", "transform_dialect_eliminate_gpu_barriers.mlir", "transform_dialect_hoist_allocs.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index f628010b1acd..635a49df1694 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -52,6 +52,7 @@ iree_lit_test_suite( "rocdl_pipeline_test.mlir" "tensor_pad.mlir" "tensorcore_vectorization.mlir" + "test_query_mma.mlir" "transform_dialect_bufferize.mlir" "transform_dialect_eliminate_gpu_barriers.mlir" "transform_dialect_hoist_allocs.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir new file mode 100644 index 000000000000..b5b761349093 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir @@ -0,0 +1,60 @@ +// RUN: iree-opt --split-input-file --iree-test-llvmgpu-query-mma %s | FileCheck %s + +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{abi = "hip", iree.gpu.target = #iree_gpu.target, , , +, , , +, , ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], +max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, +max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, +simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}> +#pipeline_layout = #hal.pipeline.layout]> +module { + hal.executable private @main { + hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) { + hal.executable.export public @entry_point layout(#pipeline_layout) + builtin.module { + func.func @fn() { + return + } + } + } + } +} + +// CHECK: Printing MMA Collection Before querying supported mma instrinsic instructions, size: 0 +// CHECK: Printing MMA Collection After querying supported mma instrinsic instructions, size: 9 +// CHECK: MFMA_F32_16x16x4_F32 +// CHECK-SAME: MFMA_F32_16x16x16_F16 +// CHECK-SAME: MFMA_F32_32x32x8_F16 +// CHECK-SAME: MFMA_F32_16x16x16_BF16 +// CHECK-SAME: MFMA_F32_32x32x8_BF16 +// CHECK-SAME: MFMA_F32_16x16x32_F8E4M3FNUZ +// CHECK-SAME: MFMA_F32_16x16x32_F8E5M2FNUZ +// CHECK-SAME: MFMA_I32_16x16x32_I8 +// CHECK-SAME: MFMA_I32_32x32x16_I8 +// CHECK-LABEL: func.func @fn + +// ----- + +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> +#pipeline_layout = #hal.pipeline.layout]> +module { + hal.executable private @main { + hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) { + hal.executable.export public @entry_point layout(#pipeline_layout) + builtin.module { + func.func @fn_empty() { + return + } + } + } + } +} + +// CHECK: Printing MMA Collection Before querying supported mma instrinsic instructions, size: 0 +// CHECK: Printing MMA Collection After querying supported mma instrinsic instructions, size: 0 +// CHECK-LABEL: func.func @fn_empty diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index e996aba997b1..c23bc23d8df3 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinTypes.h" @@ -1028,4 +1029,27 @@ std::optional getGPUSubgroupSize(mlir::FunctionOpInterface func) { return std::nullopt; } +void QueryMMAIntrinsics(mlir::ModuleOp moduleOp, + SmallVector &mmaAttrs) { + IREE::GPU::TargetAttr target; + + // Walk through all `func::FuncOp` operations in `moduleOp`. + moduleOp.walk([&](func::FuncOp funcOp) { + if (auto attr = getGPUTargetAttr(funcOp)) { + // Store the target attribute if found. + target = attr; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (target) { + // Append each MMA attribute from the target's `Wgp` configuration to + // `mmaAttrs`. + for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { + mmaAttrs.emplace_back(mma); + } + } +} + } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h index 4e7c108f7c19..053c5cee14dc 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h @@ -206,6 +206,11 @@ IREE::GPU::TargetAttr getGPUTargetAttr(Operation *op); /// Returns std::nullopt if none found. std::optional getGPUSubgroupSize(mlir::FunctionOpInterface func); +/// Returns supported MMA intrinsic instructions based on the GPU target +/// description stored in `moduleOp` and populates them in `mmaAttrs`. +void QueryMMAIntrinsics(mlir::ModuleOp moduleOp, + SmallVector &mmaAttrs); + } // namespace mlir::iree_compiler #endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_