diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index bba55216cba3..8aaabfa07053 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -281,11 +281,19 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   for (auto [i, mDim] : llvm::enumerate(mDims)) {
     workgroupTileSizes[mDim] =
         schedule->mSubgroupCounts[i] * schedule->mTileSizes[i];
+    // Multiply by the intrinsic shape for the innermost dim as we distribute
+    // to workgroups before packing to intrinsic.
+    if (i == mDims.size() - 1)
+      workgroupTileSizes[mDim] *= schedule->mSize;
     subgroupTileSizes[mDim] = schedule->mTileSizes[i];
   }
   for (auto [i, nDim] : llvm::enumerate(nDims)) {
     workgroupTileSizes[nDim] =
         schedule->nSubgroupCounts[i] * schedule->nTileSizes[i];
+    // Multiply by the intrinsic shape for the innermost dim as we distribute
+    // to workgroups before packing to intrinsic.
+    if (i == nDims.size() - 1)
+      workgroupTileSizes[nDim] *= schedule->nSize;
     subgroupTileSizes[nDim] = schedule->nTileSizes[i];
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 33f3ae90373e..932f7d016f89 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -341,6 +341,9 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
     funcPassManager.addPass(createConvolutionToIGEMMPass());
   }
 
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
+                               /*convertToDpsOptions=*/std::nullopt);
+
   // Step 1. Promote matmul operands and pack to intrinsic shapes.
   funcPassManager.addPass(createGPUPromoteMatmulOperandsPass());
   funcPassManager.addPass(IREE::GPU::createPackToIntrinsicsPass());
@@ -357,9 +360,6 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   }
   funcPassManager.addPass(createPropagateReshapesByExpansionPass());
 
-  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
-                               /*convertToDpsOptions=*/std::nullopt);
-
   // Step 2. Tile and fuse tileable ops to reduction loops.
   {
     GPUApplyTilingLevelPassOptions options;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
index 4a45d88ce980..ca22f60bf7d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
@@ -26,7 +26,7 @@ func.func @nhwc_conv_mfma() {
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
-// CHECK-SAME: workgroup = [1, 2, 2, 4, 0]
+// CHECK-SAME: workgroup = [1, 2, 32, 64, 0]
 
 // -----
 
@@ -55,7 +55,7 @@ func.func @nchw_conv_mfma() {
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 8]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
-// CHECK-SAME: workgroup = [1, 4, 2, 2, 0]
+// CHECK-SAME: workgroup = [1, 64, 2, 32, 0]
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index a0a1f22b8fc9..2c6926a97136 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4]
 // CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
-// CHECK-SAME: workgroup = [1, 1, 4, 4, 0]
+// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
 
 // -----
 
@@ -72,7 +72,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
 // CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
-// CHECK-SAME: workgroup = [2, 2, 2, 2, 0, 0]
+// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]
 
 // -----
 
@@ -107,7 +107,7 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor, %rhs: t
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 0, 1, 1]
 // CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
-// CHECK-SAME: workgroup = [1, 2, 1, 1, 2, 0, 0]
+// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]
 
 // -----
 
@@ -132,7 +132,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 2]
 // CHECK-SAME: subgroup = [4, 4, 0]
-// CHECK-SAME: workgroup = [8, 8, 0]
+// CHECK-SAME: workgroup = [128, 128, 0]
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
index bec9a29c9544..0763eb4683b9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -17,7 +17,7 @@
     use_igemm_convolution = true>
 }>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [1, 4, 1, 16, 0],
+  workgroup = [1, 4, 16, 256, 0],
   reduction = [0, 0, 0, 0, 2],
   subgroup = [1, 4, 1, 4, 0],
   mma_kind = #iree_gpu.mma_layout,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 10e1016d8c09..ab9606b8698c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -73,7 +73,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [4, 4, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -140,7 +140,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [4, 4, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -207,7 +207,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [4, 4, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -263,7 +263,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [4, 4, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -319,7 +319,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [2, 2, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [1, 1],
   mma_kind = #iree_gpu.mma_layout,
@@ -375,7 +375,7 @@ hal.executable public @main {
   #hal.pipeline.binding
 ]>
 #config = #iree_gpu.lowering_config<{
-  workgroup = [4, 4, 0],
+  workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
   mma_kind = #iree_gpu.mma_layout,
@@ -578,7 +578,7 @@ hal.executable public @main {
   mma_kind = #iree_gpu.mma_layout,
   reduction = [0, 0, 4],
   subgroup = [2, 4, 0],
-  workgroup = [4, 8, 0],
+  workgroup = [64, 128, 0],
   promote_operands = [0, 1]
 }>
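
Note on the updated test expectations: they all follow from the new computation in ConfigUtils.cpp. Because distribution to workgroups now happens before packing to intrinsics, the innermost M/N workgroup tile becomes subgroupCount * subgroupTileSize * intrinsicSize instead of subgroupCount * subgroupTileSize. Below is a minimal standalone sketch of that arithmetic for the M dims; the Schedule struct is a hypothetical stand-in for the real schedule type, keeping only the field names the patch uses.

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the real MMA schedule; only the fields read by
// the patched loop are modeled, with names mirroring the patch.
struct Schedule {
  std::vector<int64_t> mSubgroupCounts;
  std::vector<int64_t> mTileSizes;
  int64_t mSize; // M extent of the MMA intrinsic, e.g. 16 for a 16x16x16 MFMA
};

// Mirrors the patched loop: each workgroup tile is subgroupCount * tileSize,
// and the innermost dim additionally covers the intrinsic extent because
// workgroup distribution now happens before packing to intrinsics.
std::vector<int64_t> workgroupTileSizesForM(const Schedule &s) {
  std::vector<int64_t> sizes(s.mTileSizes.size());
  for (size_t i = 0; i < s.mTileSizes.size(); ++i) {
    sizes[i] = s.mSubgroupCounts[i] * s.mTileSizes[i];
    if (i == s.mTileSizes.size() - 1)
      sizes[i] *= s.mSize;
  }
  return sizes;
}

int main() {
  // mfma_matmul_1024x1024x1024 above: subgroup tile 4, two subgroups on M,
  // 16x16x16 intrinsic -> workgroup M tile 2 * 4 * 16 = 128 (was 2 * 4 = 8).
  Schedule s{{2}, {4}, 16};
  std::cout << workgroupTileSizesForM(s)[0] << "\n"; // prints 128
}

The same arithmetic explains the other CHECK updates, e.g. expanded_matmul_transpose_b (4 -> 64 with a 16-wide intrinsic) and the pipeline test with subgroup = [1, 1] (2 -> 64 with a 32-wide intrinsic).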