[tuner]:Add a utility function to query supported MMA intrinsics

Signed-off-by: Bangtian Liu <[email protected]>
iree-org · Nov 12, 2024 · 8d86018 · 8d86018
1 parent fa6aa1c
commit 8d86018
Show file tree

Hide file tree

Showing 9 changed files with 146 additions and 0 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -112,6 +112,7 @@ iree_compiler_cc_library(
         "ROCDLKernelConfig.cpp",
         "ROCDLLowerExecutableTarget.cpp",
         "ROCDLSelectLoweringStrategy.cpp",
+        "TestLLVMGPUQueryMMAPass.cpp",
         "Verifiers.cpp",
     ],
     hdrs = [

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -97,6 +97,7 @@ iree_cc_library(
     "ROCDLKernelConfig.cpp"
     "ROCDLLowerExecutableTarget.cpp"
     "ROCDLSelectLoweringStrategy.cpp"
+    "TestLLVMGPUQueryMMAPass.cpp"
     "Verifiers.cpp"
   DEPS
     ::PassHeaders

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -163,4 +163,9 @@ def TestLLVMGPUScalarizeMathOpPass :
   let summary = "Test pass for several legalization patterns.";
 }
 
+def TestLLVMGPUQueryMMAPass :
+    Pass<"iree-test-llvmgpu-query-mma", "ModuleOp"> {
+  let summary = "Test pass for querying the supported mma intrinsic instructions.";
+}
+
 #endif // IREE_CODEGEN_LLVMGPU_PASSES
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp
@@ -0,0 +1,48 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "iree-test-llvmgpu-query-mma"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_TESTLLVMGPUQUERYMMAPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
+static void printMMAVector(SmallVector<IREE::GPU::MMAAttr> &mmaAttrs,
+                           const std::string &extraMessage = {}) {
+  llvm::outs() << "Printing MMA Collection" << extraMessage
+               << ", size: " << mmaAttrs.size() << "\n";
+  for (const auto &mma : mmaAttrs) {
+    llvm::outs() << mma << " ";
+  }
+  llvm::outs() << "\n";
+}
+
+namespace {
+
+struct TestLLVMGPUQueryMMAPass final
+    : impl::TestLLVMGPUQueryMMAPassBase<TestLLVMGPUQueryMMAPass> {
+  void runOnOperation() override {
+    ModuleOp moduleOp = getOperation();
+    SmallVector<IREE::GPU::MMAAttr> mmaCollecton;
+    // Print mma vector before collection.
+    printMMAVector(mmaCollecton,
+                   " Before querying supported mma instrinsic instructions");
+    // Collect mma intrinsic instructions.
+    QueryMMAIntrinsics(moduleOp, mmaCollecton);
+    // Print mma vector after collection.
+    printMMAVector(mmaCollecton,
+                   " After querying supported mma instrinsic instructions");
+  }
+};
+} // namespace
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -55,6 +55,7 @@ iree_lit_test_suite(
             "promote_matmul_to_fit_mma.mlir",
             "tensor_pad.mlir",
             "tensorcore_vectorization.mlir",
+            "test_query_mma.mlir",
             "transform_dialect_bufferize.mlir",
             "transform_dialect_eliminate_gpu_barriers.mlir",
             "transform_dialect_hoist_allocs.mlir",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -52,6 +52,7 @@ iree_lit_test_suite(
     "rocdl_pipeline_test.mlir"
     "tensor_pad.mlir"
     "tensorcore_vectorization.mlir"
+    "test_query_mma.mlir"
     "transform_dialect_bufferize.mlir"
     "transform_dialect_eliminate_gpu_barriers.mlir"
     "transform_dialect_hoist_allocs.mlir"

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir
@@ -0,0 +1,60 @@
+// RUN: iree-opt --split-input-file --iree-test-llvmgpu-query-mma %s | FileCheck %s
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
+wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8,
+subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32,
+mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>,
+<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+<MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
+max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128,
+simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+module {
+  hal.executable private @main {
+    hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
+      hal.executable.export public @entry_point layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn() {
+          return
+        }
+      }
+    }
+  }
+}
+
+// CHECK: Printing MMA Collection Before querying supported mma instrinsic instructions, size: 0
+// CHECK: Printing MMA Collection After querying supported mma instrinsic instructions, size: 9
+// CHECK: MFMA_F32_16x16x4_F32
+// CHECK-SAME: MFMA_F32_16x16x16_F16
+// CHECK-SAME: MFMA_F32_32x32x8_F16
+// CHECK-SAME: MFMA_F32_16x16x16_BF16
+// CHECK-SAME: MFMA_F32_32x32x8_BF16
+// CHECK-SAME: MFMA_F32_16x16x32_F8E4M3FNUZ
+// CHECK-SAME: MFMA_F32_16x16x32_F8E5M2FNUZ
+// CHECK-SAME: MFMA_I32_16x16x32_I8
+// CHECK-SAME: MFMA_I32_32x32x16_I8
+// CHECK-LABEL: func.func @fn
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+module {
+  hal.executable private @main {
+    hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
+      hal.executable.export public @entry_point layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn_empty() {
+          return
+        }
+      }
+    }
+  }
+}
+
+// CHECK: Printing MMA Collection Before querying supported mma instrinsic instructions, size: 0
+// CHECK: Printing MMA Collection After querying supported mma instrinsic instructions, size: 0
+// CHECK-LABEL: func.func @fn_empty
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -1028,4 +1029,27 @@ std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func) {
   return std::nullopt;
 }
 
+void QueryMMAIntrinsics(mlir::ModuleOp moduleOp,
+                        SmallVector<IREE::GPU::MMAAttr> &mmaAttrs) {
+  IREE::GPU::TargetAttr target;
+
+  // Walk through all `func::FuncOp` operations in `moduleOp`.
+  moduleOp.walk([&](func::FuncOp funcOp) {
+    if (auto attr = getGPUTargetAttr(funcOp)) {
+      // Store the target attribute if found.
+      target = attr;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  if (target) {
+    // Append each MMA attribute from the target's `Wgp` configuration to
+    // `mmaAttrs`.
+    for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
+      mmaAttrs.emplace_back(mma);
+    }
+  }
+}
+
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -206,6 +206,11 @@ IREE::GPU::TargetAttr getGPUTargetAttr(Operation *op);
 /// Returns std::nullopt if none found.
 std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func);
 
+/// Returns supported MMA intrinsic instructions based on the GPU target
+/// description stored in `moduleOp` and populates them in `mmaAttrs`.
+void QueryMMAIntrinsics(mlir::ModuleOp moduleOp,
+                        SmallVector<IREE::GPU::MMAAttr> &mmaAttrs);
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_