A few things were needed:

* Populate required fields in KnownTargets.cpp.
* Support the case where the intrinsic vector operand size is greater than the load instruction size (here it is 16xf16 = 256 bits).
* `buildMmaOperation` creates `vector.insert_strided_slice` ops to insert the new accumulator vectors into the accumulator tile. In doing so, it was relying on the implicit expand-shape semantics of `vector.insert_strided_slice`, in ways that worked for the shapes we had seen on CDNA3 but not here. Solved by explicitly expanding shapes with `vector.shape_cast` ops (see the IR sketch after this list).
* In the thread-distribution code (populateOperandXxx), we needed to account for the nuance between two distinct thread grids: "layout" vs. "distribution". On RDNA3, there is a distribution-only dimension that is not reflected in the layout-centric `TileSwizzle`s.
* On RDNA3, some float arithmetic is strongly non-IEEE754-compliant: even exactly representable small integral values, on which float arithmetic should be exact, show epsilon-sized numerical errors. Addressed by comparing against a tolerance (see the sketch after this list).
* Fix a bug: the doubly-nullable type `std::optional<IREE::GPU::DataTiledMMAAttr>` tricked us; changed to plain `IREE::GPU::DataTiledMMAAttr` (see the C++ sketch after this list).

---------

Signed-off-by: Benoit Jacob <[email protected]>
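Here is a minimal IR sketch of the `vector.shape_cast` fix; the shapes are hypothetical, not the actual CDNA3/RDNA3 tile sizes. Instead of letting `vector.insert_strided_slice` implicitly expand the rank of the inserted vector, the vector is first reshaped to the full rank of the accumulator tile:

// Hypothetical shapes: expand the intrinsic's 8xf32 result to the tile's
// rank first, so vector.insert_strided_slice no longer relies on implicit
// rank expansion.
%expanded = vector.shape_cast %acc_vec : vector<8xf32> to vector<1x1x8x1xf32>
%updated = vector.insert_strided_slice %expanded, %acc_tile
    {offsets = [2, 3, 0, 0], strides = [1, 1, 1, 1]}
    : vector<1x1x8x1xf32> into vector<4x4x8x2xf32>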
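A minimal sketch of the tolerance-based comparison, assuming a plain relative-error check; the helper name and tolerance value are illustrative, not the actual test-utility code:

#include <algorithm>
#include <cmath>

// Accept results within a small relative error instead of exact equality,
// since RDNA3 WMMA shows epsilon-sized errors even on exactly-representable
// integral values. The tolerance value is illustrative.
bool approximatelyEqual(float actual, float expected, float relTol = 1e-4f) {
  return std::abs(actual - expected) <=
         relTol * std::max(std::abs(expected), 1.0f);
}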
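And a sketch of the doubly-nullable pitfall; function names here are hypothetical. MLIR attributes are themselves nullable, so wrapping one in `std::optional` creates two distinct empty states that are easy to confuse:

#include <optional>
// Assumes the IREE header defining IREE::GPU::DataTiledMMAAttr.

// Pitfall: an engaged optional can still hold a null attribute.
void useDoublyNullable(std::optional<IREE::GPU::DataTiledMMAAttr> mma) {
  if (mma.has_value()) {
    // *mma may still be a null (default-constructed) attribute here.
  }
}

// Fix: pass the attribute directly; one null state, one check.
void useSinglyNullable(IREE::GPU::DataTiledMMAAttr mma) {
  if (mma) {
    // mma is guaranteed non-null here.
  }
}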
Showing 8 changed files with 201 additions and 30 deletions.
60 changes: 60 additions & 0 deletions
compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx1100.mlir
@@ -0,0 +1,60 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" \
// RUN:   --iree-gpu-test-target=gfx1100 \
// RUN:   --split-input-file %s | FileCheck %s

#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
#pipeline_layout_3 = #hal.pipeline.layout<constants = 3, bindings = [
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>
]>
func.func @matmul_lowering_WMMA_F32_16x16x16_F16() {
  %c0 = arith.constant 0 : index
  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
  %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
      : !flow.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_lhs>>{%M, %K}
  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
      : !flow.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_rhs>>{%K, %N}
  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
      : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
      : !flow.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_lhs>>{%M, %K}
      -> tensor<?x?xf16, #encoding_lhs>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
      : !flow.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_rhs>>{%K, %N}
      -> tensor<?x?xf16, #encoding_rhs>
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
      : !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
      -> tensor<?x?xf32, #encoding_result>
  %6 = linalg.matmul
      ins(%3, %4 : tensor<?x?xf16, #encoding_lhs>,
                   tensor<?x?xf16, #encoding_rhs>)
      outs(%5 : tensor<?x?xf32, #encoding_result>)
      -> tensor<?x?xf32, #encoding_result>
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
      : tensor<?x?xf32, #encoding_result>
      -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
  return
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
// CHECK:     func.func @matmul_lowering_WMMA_F32_16x16x16_F16
// CHECK-DAG:   %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0)
// CHECK-DAG:   %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1)
// CHECK-DAG:   %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2)
// CHECK-DAG:   %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor<?x?x4x1x16x16xf16>
// CHECK-DAG:   %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor<?x?x4x1x16x16xf16>
// CHECK-DAG:   %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor<?x?x4x4x8x2x16xf32>
// CHECK:       %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]]
// CHECK-SAME:    indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]],
// CHECK-SAME:    iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>]
// CHECK-SAME:    kind = #iree_gpu.data_tiled_mma_layout<intrinsic = WMMA_F32_16x16x16_F16, unroll_m = 4, unroll_n_to_subgroups = 4>
// CHECK:     flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]]
File renamed without changes.