Skip to content

Commit

Permalink
[GPU] Move tile and distribute pass before packing to intrinsic for …
Browse files Browse the repository at this point in the history
…TileAndFuse pipeline (#19053)

We want to first distribute to workgroups so that we can promote
operands to handle cases that are unaligned to the intrinsic before we
concretize the mma shapes.

Signed-off-by: Nirvedh Meshram <[email protected]>
  • Loading branch information
nirvedhmeshram authored Nov 14, 2024
1 parent 8391943 commit f828914
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,19 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
for (auto [i, mDim] : llvm::enumerate(mDims)) {
workgroupTileSizes[mDim] =
schedule->mSubgroupCounts[i] * schedule->mTileSizes[i];
// Multiply by the intrinsic shape for the innermost dim as we distribute
// to workgroups before packing to intrinsic.
if (i == mDims.size() - 1)
workgroupTileSizes[mDim] *= schedule->mSize;
subgroupTileSizes[mDim] = schedule->mTileSizes[i];
}
for (auto [i, nDim] : llvm::enumerate(nDims)) {
workgroupTileSizes[nDim] =
schedule->nSubgroupCounts[i] * schedule->nTileSizes[i];
// Multiply by the intrinsic shape for the innermost dim as we distribute
// to workgroups before packing to intrinsic.
if (i == nDims.size() - 1)
workgroupTileSizes[nDim] *= schedule->nSize;
subgroupTileSizes[nDim] = schedule->nTileSizes[i];
}

Expand Down
6 changes: 3 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,9 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createConvolutionToIGEMMPass());
}

tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
/*convertToDpsOptions=*/std::nullopt);

// Step 1. Promote matmul operands and pack to intrinsic shapes.
funcPassManager.addPass(createGPUPromoteMatmulOperandsPass());
funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass());
Expand All @@ -357,9 +360,6 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
}
funcPassManager.addPass(createPropagateReshapesByExpansionPass());

tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
/*convertToDpsOptions=*/std::nullopt);

// Step 2. Tile and fuse tileable ops to reduction loops.
{
GPUApplyTilingLevelPassOptions options;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func.func @nhwc_conv_mfma() {
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
// CHECK-SAME: workgroup = [1, 2, 2, 4, 0]
// CHECK-SAME: workgroup = [1, 2, 32, 64, 0]

// -----

Expand Down Expand Up @@ -55,7 +55,7 @@ func.func @nchw_conv_mfma() {
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
// CHECK-SAME: workgroup = [1, 4, 2, 2, 0]
// CHECK-SAME: workgroup = [1, 64, 2, 32, 0]

// -----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
// CHECK-SAME: workgroup = [1, 1, 4, 4, 0]
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

// -----

Expand Down Expand Up @@ -72,7 +72,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 2, 2, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

// -----

Expand Down Expand Up @@ -107,7 +107,7 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1]
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [1, 2, 1, 1, 2, 0, 0]
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]

// -----

Expand All @@ -132,7 +132,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 2]
// CHECK-SAME: subgroup = [4, 4, 0]
// CHECK-SAME: workgroup = [8, 8, 0]
// CHECK-SAME: workgroup = [128, 128, 0]

// -----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
use_igemm_convolution = true>
}>
#config = #iree_gpu.lowering_config<{
workgroup = [1, 4, 1, 16, 0],
workgroup = [1, 4, 16, 256, 0],
reduction = [0, 0, 0, 0, 2],
subgroup = [1, 4, 1, 4, 0],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [4, 4, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [2, 2],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
Expand Down Expand Up @@ -140,7 +140,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [4, 4, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [2, 2],
mma_kind = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
Expand Down Expand Up @@ -207,7 +207,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [4, 4, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [2, 2],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>,
Expand Down Expand Up @@ -263,7 +263,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [4, 4, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [2, 2],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
Expand Down Expand Up @@ -319,7 +319,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [2, 2, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [1, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>,
Expand Down Expand Up @@ -375,7 +375,7 @@ hal.executable public @main {
#hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
workgroup = [4, 4, 0],
workgroup = [64, 64, 0],
reduction = [0, 0, 2],
subgroup = [2, 2],
mma_kind = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>,
Expand Down Expand Up @@ -578,7 +578,7 @@ hal.executable public @main {
mma_kind = #iree_gpu.mma_layout<WMMA_I32_16x16x16_I8>,
reduction = [0, 0, 4],
subgroup = [2, 4, 0],
workgroup = [4, 8, 0],
workgroup = [64, 128, 0],
promote_operands = [0, 1]
}>

Expand Down

0 comments on commit f828914

Please sign in to comment.