diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
index d64c661df8c0a..5582a63ac5581 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
@@ -32,54 +32,55 @@ namespace {
 /// parameterized by the thread grid.
 static SmallVector<Value> computeSIMDIndex(const LayoutIterator::State &state,
                                            LayoutAttr layout, Value laneId,
+                                           int64_t subgroupSize,
                                            RewriterBase &rewriter) {
-  MLIRContext *ctx = layout.getContext();
-  AffineExpr threadX, threadY, threadZ;
-  bindSymbols(ctx, threadX, threadY, threadZ);
+  Location loc = laneId.getLoc();
+
+  auto [laneDimX, laneDimY, laneDimZ] = layout.getLaneGrid();
+  int64_t gridsPerSubgroup =
+      llvm::divideCeil(subgroupSize, laneDimX * laneDimY * laneDimZ);
+  // Note: we add an extra leading entry to the delinearization basis in case
+  // the vector layout requires fewer lanes than are present in the subgroup.
+  // Otherwise, we would, for example, construct delinearizations with the
+  // basis (1, 1, 16) when there are 32 lanes, which would simplify to no
+  // delinearization at all. The extra leading term captures the lanes left
+  // over once the layout's lane grid has been covered.
+  auto reversedLaneGrid = rewriter.create<affine::AffineDelinearizeIndexOp>(
+      loc, laneId,
+      ArrayRef<int64_t>{gridsPerSubgroup, laneDimZ, laneDimY, laneDimX});

   SmallVector<Value> simdIndex;
+  // Calculate the index for each dim separately.
   for (PerDimLayoutAttr dimLayout : layout.getLayouts()) {
-    AffineExpr offset = getAffineConstantExpr(0, ctx);
-    AffineExpr stride = getAffineConstantExpr(1, ctx);
-    for (auto [label, shape] : llvm::reverse(
-             llvm::zip(dimLayout.getLabels(), dimLayout.getShapes()))) {
+    SmallVector<Value> linearizeVals;
+    for (LayoutDimensionAttr label : dimLayout.getLabels()) {
       int64_t position = state.lookup(label.getValue()).getPosition();

+      // Note: indices are into a reversed lane grid that has an extra leading
+      // term we must ignore (so the X coordinate is result #3 and the Z
+      // coordinate is result #1).
       switch (label.getValue()) {
       case LayoutDimension::LANEX:
-        offset = offset + stride * threadX;
+        linearizeVals.push_back(reversedLaneGrid.getResult(3));
         break;
       case LayoutDimension::LANEY:
-        offset = offset + stride * threadY;
+        linearizeVals.push_back(reversedLaneGrid.getResult(2));
         break;
       case LayoutDimension::LANEZ:
-        offset = offset + stride * threadZ;
+        linearizeVals.push_back(reversedLaneGrid.getResult(1));
         break;
       default:
-        offset = offset + stride * getAffineConstantExpr(position, ctx);
+        linearizeVals.push_back(
+            rewriter.createOrFold<arith::ConstantIndexOp>(loc, position));
         break;
       }
-      stride = stride * getAffineConstantExpr(shape, ctx);
     }
-    auto [laneDimX, laneDimY, laneDimZ] = layout.getLaneGrid();
-    SmallVector<Value> laneGrid = {
-        rewriter.create<arith::ConstantIndexOp>(laneId.getLoc(), laneDimZ),
-        rewriter.create<arith::ConstantIndexOp>(laneId.getLoc(), laneDimY),
-        rewriter.create<arith::ConstantIndexOp>(laneId.getLoc(), laneDimX)};
-    FailureOr<SmallVector<Value>> maybeReversedLaneGridVals =
-        affine::delinearizeIndex(rewriter, laneId.getLoc(), laneId, laneGrid);
-    assert(succeeded(maybeReversedLaneGridVals) &&
-           "Failed to delinearize lane index");
-    SmallVector<Value> laneGridVals = {(*maybeReversedLaneGridVals)[2],
-                                       (*maybeReversedLaneGridVals)[1],
-                                       (*maybeReversedLaneGridVals)[0]};
-    // Compute the index for the dim.
-    AffineMap indexMap = AffineMap::get(0, 3, offset);
-    Value index = rewriter.create<affine::AffineApplyOp>(
-        rewriter.getUnknownLoc(), indexMap, laneGridVals);
+    Value index = rewriter.create<affine::AffineLinearizeIndexOp>(
+        rewriter.getUnknownLoc(), linearizeVals, dimLayout.getShapes(),
+        /*disjoint=*/true);
     simdIndex.push_back(index);
   }
@@ -199,8 +200,9 @@ struct DistributeXferLayoutAttr : OpDistributionPattern<OpTy> {
                 "expected vector::TransferReadOp or vector::TransferWriteOp");

   DistributeXferLayoutAttr(MLIRContext *context, Value laneId,
-                           PatternBenefit benefit = 1)
-      : OpDistributionPattern<OpTy>(context, benefit), laneId(laneId) {}
+                           int64_t subgroupSize, PatternBenefit benefit = 1)
+      : OpDistributionPattern<OpTy>(context, benefit), laneId(laneId),
+        subgroupSize(subgroupSize) {}

   VectorValue accessMemory(OpTy xferOp, VectorValue accumulator,
                            LayoutAttr vectorLayout,
@@ -237,7 +239,7 @@ struct DistributeXferLayoutAttr : OpDistributionPattern<OpTy> {
                                  llvm::SmallBitVector &projectedDims,
                                  RewriterBase &rewriter) const {
     SmallVector<Value> simdIndices =
-        computeSIMDIndex(state, memoryLayout, laneId, rewriter);
+        computeSIMDIndex(state, memoryLayout, laneId, subgroupSize, rewriter);

     SmallVector<Value> memoryIndices(indices);
     // The memory layout has some projected leading dims that indices doesn't.
@@ -272,6 +274,7 @@ struct DistributeXferLayoutAttr : OpDistributionPattern<OpTy> {
   }

   Value laneId;
+  int64_t subgroupSize;
 };

 struct DistributeTransferReadLayoutAttr final
@@ -1118,10 +1121,11 @@ void populateGPUDistributionPatterns(RewritePatternSet &patterns) {
 }

 void populateGPUDistributionLayoutAttrPatterns(Value laneId,
+                                               int64_t subgroupSize,
                                                RewritePatternSet &patterns) {
   patterns
       .add<DistributeTransferReadLayoutAttr, DistributeTransferWriteLayoutAttr>(
-          patterns.getContext(), laneId);
+          patterns.getContext(), laneId, subgroupSize);
   patterns.add<DistributeBroadcastLayoutAttr, DistributeTransposeLayoutAttr>(
       patterns.getContext());
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h
index 87303844853ff..4044f7a46efaa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h
@@ -32,6 +32,7 @@ void populateDropSharedMemoryDeallocOpPatterns(RewritePatternSet &patterns);
 void populateGPUDistributionPatterns(RewritePatternSet &patterns);

 void populateGPUDistributionLayoutAttrPatterns(Value laneId,
+                                               int64_t subgroupSize,
                                                RewritePatternSet &patterns);

 void populateGPUReductionDistributionPatterns(RewritePatternSet &patterns,
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
index cf47ca9d47b5d..8bafaf16361b6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
@@ -197,9 +197,6 @@ builtin.module attributes { transform.with_named_sequence } {
 #layout_row_major = #iree_vector_ext.layout<<[BATCHX, LANEY], [2, 8]>, <[BATCHY, LANEX, VECTORX], [2, 1, 8]>>
 #layout_col_major = #iree_vector_ext.layout<<[BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[BATCHY, LANEX], [2, 8]>>

-// TODO: Use affine min tricks based on the grid size to elide the mod.
-// Note that this IR is invalid if subgroup size != 8.
-
 func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) {
   %c0 = arith.constant 0 : index
   %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
@@ -208,24 +205,23 @@ func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc:
       : vector<16x16xf16>, memref<64x64xf16>
   func.return
 }

-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 mod 8)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)>
 // CHECK-LABEL: @distribute_transfer_write_row_major
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
 // CHECK-DAG: %[[LANEID:.+]] = gpu.thread_id x
-// CHECK: %[[VEC_LANE_Y:.+]] = affine.apply #[[$MAP0]]()[%[[LANEID]]]
+// CHECK: %[[SPLIT_ID:.+]]:2 = affine.delinearize_index %[[LANEID]] into (8, 8)
 // CHECK: %[[DIST_SRC_VEC:.+]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<2x2x8xf16>
 // CHECK: %[[BATCH_0_0:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 0] : vector<8xf16> from vector<2x2x8xf16>
-// CHECK: vector.store %[[BATCH_0_0]], %{{.*}}[%[[VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16>
+// CHECK: vector.store %[[BATCH_0_0]], %{{.*}}[%[[SPLIT_ID]]#1, %[[C0]]] : memref<64x64xf16>, vector<8xf16>

-// CHECK: %[[NEXT_VEC_LANE_Y:.+]] = affine.apply #[[$MAP1]]()[%[[LANEID]]]
+// CHECK: %[[NEXT_VEC_LANE_Y:.+]] = affine.linearize_index disjoint [%[[C1]], %[[SPLIT_ID]]#1] by (2, 8) : index
 // CHECK: %[[BATCH_1_0:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 0] : vector<8xf16> from vector<2x2x8xf16>
 // CHECK: vector.store %[[BATCH_1_0]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16>

 // CHECK: %[[BATCH_0_1:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 1] : vector<8xf16> from vector<2x2x8xf16>
-// CHECK: vector.store %[[BATCH_0_1]], %{{.*}}[%[[VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16>
+// CHECK: vector.store %[[BATCH_0_1]], %{{.*}}[%[[SPLIT_ID]]#1, %[[C8]]] : memref<64x64xf16>, vector<8xf16>

 // CHECK: %[[BATCH_1_1:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 1] : vector<8xf16> from vector<2x2x8xf16>
 // CHECK: vector.store %[[BATCH_1_1]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16>
@@ -560,8 +556,6 @@ builtin.module attributes { transform.with_named_sequence } {
 #layoutB2 = #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [1, 1, 16]>, <[ BATCHY, LANEX], [1, 16]>>
 #layoutC2 = #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>, <[ BATCHY, LANEX], [1, 16]>>

-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + (s0 floordiv 32) * 16)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 16)>
 // CHECK-LABEL: func.func @resolve_wmma_layout_conflict_with_shared_memory
 func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf16>,
                                                            %14 : vector<16x16xf16>,
@@ -607,19 +601,18 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1
 // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
 // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[VEC_INIT:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x16xf16

-// CHECK: %[[VEC_INIT:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x16xf16
-// CHECK: %[[TID_X:.+]] = gpu.thread_id x
-// CHECK: %[[TID_Y:.+]] = gpu.thread_id y
-// CHECK: %[[TID_Z:.+]] = gpu.thread_id z
-// CHECK: %[[SUBGROUP_OFFSET:.+]] = affine.apply #[[$MAP0]]()[%[[TID_X]], %[[TID_Y]], %[[TID_Z]]]
+// CHECK: %[[TIDX:.+]] = gpu.thread_id x
+// CHECK: %[[TIDY:.+]] = gpu.thread_id y
+// CHECK: %[[SUBGROUP_OFFSET:.+]] = affine.linearize_index disjoint [%[[TIDY]], %[[C0]]] by (2, 16)
 // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<16x32xf16, #gpu.address_space<workgroup>>
 // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][0, %[[SUBGROUP_OFFSET]]] [16, 16] [1, 1]
-// CHECK: %[[HALF_LANE_ID:.+]] = affine.apply #[[$MAP1]]()[%[[TID_X]]]
-// CHECK-COUNT-8: vector.store %{{.+}}, %[[SUBVIEW]][%{{.+}}, %[[HALF_LANE_ID]]]
+// CHECK: %[[SPLIT_LANE_ID:.+]]:2 = affine.delinearize_index %[[TIDX]] into (2, 16)
+// CHECK-COUNT-8: vector.store %{{.+}}, %[[SUBVIEW]][%{{.+}}, %[[SPLIT_LANE_ID]]#1]

 // CHECK-AFTER: gpu.barrier
-// CHECK: %[[LANE_OFFSET:.+]] = arith.addi %[[SUBGROUP_OFFSET]], %[[HALF_LANE_ID]]
+// CHECK: %[[LANE_OFFSET:.+]] = arith.addi %[[SUBGROUP_OFFSET]], %[[SPLIT_LANE_ID]]#1
 // CHECK: %[[LOAD0:.+]] = vector.load %[[ALLOC]][%[[C0]], %[[LANE_OFFSET]]]
 // CHECK: %[[INSERT0:.+]] = vector.insert_strided_slice %[[LOAD0]], %[[VEC_INIT]] {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<1x1x16xf16>
 // CHECK: %[[LOAD1:.+]] = vector.load %[[ALLOC]][%[[C1]], %[[LANE_OFFSET]]]
@@ -636,7 +629,7 @@ func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf1
 builtin.module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true} : !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true, subgroup_size = 32 : i64} : !transform.any_op
     transform.yield
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index ac8ae7386f55f..d8c5f1c8081f3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -1113,18 +1113,17 @@ transform_dialect::TestGpuVectorDistribution::applyToOne(
   rewriter.setInsertionPointToStart(&target.getFunctionBody().front());
   // This is a test op so we unsafely use thread_id x as the lane ID. In
   // general this should linearize the thread IDs based on the workgroup size
-  // and divide by the subgroup size. i.e.
+  // and take the result modulo the subgroup size, i.e.
   //
-  // lane_id = (tid_x + tid_y * dim_x + tid_z * dim_y * dim_x) / subgroup_size;
+  // lane_id = (tid_x + tid_y * dim_x + tid_z * dim_y * dim_x) % subgroup_size;
   Value laneId =
       rewriter.create<gpu::ThreadIdOp>(target.getLoc(), gpu::Dimension::x);
+  int64_t subgroupSize = getSubgroupSize();

   populateGPUDistributionPatterns(patterns);
-  populateGPUDistributionLayoutAttrPatterns(laneId, patterns);
+  populateGPUDistributionLayoutAttrPatterns(laneId, subgroupSize, patterns);
   populateGPUReductionDistributionPatterns(patterns);
-  // For testing we use subgroup size = 64.
-  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId,
-                                                /*subgroupSize=*/64);
+  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize);
   populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns);
   if (getExperimental())
     populateGPULayoutResolutionDistributionPatterns(patterns);
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
index 5219b4a2da9c9..0c05178043c88 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
@@ -631,7 +631,8 @@ def TestGpuVectorDistribution :
   }];

   let arguments = (ins TransformHandleTypeInterface:$target,
-                       DefaultValuedOptionalAttr<BoolAttr, "false">:$experimental);
+                       DefaultValuedOptionalAttr<BoolAttr, "false">:$experimental,
+                       DefaultValuedOptionalAttr<I64Attr, "64">:$subgroup_size);
   let results = (outs);

   let assemblyFormat = [{ $target attr-dict `:` type($target)}];
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
index 6e8558c7995ad..54b5612b5d1e9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
@@ -34,7 +34,7 @@ class ContractionVectorLayoutOptions : public VectorLayoutOptions {
                                   int64_t subgroupSize)
       : VectorLayoutOptions(root), patterns(root->getContext()) {
     populateGPUDistributionPatterns(patterns);
-    populateGPUDistributionLayoutAttrPatterns(laneId, patterns);
+    populateGPUDistributionLayoutAttrPatterns(laneId, subgroupSize, patterns);
     populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId,
                                                   subgroupSize);
     populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 3dd0c128008e8..beca786675a48 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -1473,13 +1473,13 @@ transform_dialect::AMDGPUDistributeVectorsOp::applyToOne(
   rewriter.setInsertionPointToStart(&target.getFunctionBody().front());
   Value laneId =
       rewriter.create<gpu::ThreadIdOp>(target.getLoc(), gpu::Dimension::x);
+  int64_t subgroupSize = getSubgroupSize();

   populateGPUDistributionPatterns(patterns);
-  populateGPUDistributionLayoutAttrPatterns(laneId, patterns);
+  populateGPUDistributionLayoutAttrPatterns(laneId, subgroupSize, patterns);
   populateGPUReductionDistributionPatterns(patterns);
   // For testing we use subgroup size = 64.
-  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId,
-                                                /*subgroupSize=*/64);
+  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize);
   populateAMDGPUDistributionPatterns(patterns);
   populateGPULayoutResolutionDistributionPatterns(patterns);
   if (failed(distributeVectorOps(target, patterns, options))) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index ac3e7eef75136..4f1a8c8f163e2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -699,7 +699,8 @@ def AMDGPUDistributeVectorsOp :
   }];

   let arguments = (ins TransformHandleTypeInterface:$target,
-                       UnitAttr:$test_conversion);
+                       UnitAttr:$test_conversion,
+                       DefaultValuedOptionalAttr<I64Attr, "64">:$subgroup_size);
   let results = (outs TransformHandleTypeInterface:$result);

   let assemblyFormat = [{
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 8c4e6fb26b8a7..414145e8aac12 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 8c4e6fb26b8a7e85ef825e6250b205ab0a1d516a
+Subproject commit 414145e8aac12684caa37bfb57fc15cc64e592da
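
For illustration only: the arithmetic that the new affine.delinearize_index / affine.linearize_index pair expresses for the #layout_row_major test above can be sketched as the standalone C++ program below. It assumes a subgroup size of 64 (the value the tests previously hard-coded) and that layout's lane grid of (X, Y, Z) = (1, 8, 1); every name in the sketch is invented for the example and is not part of the patch.

// Illustrative sketch, not part of the patch: the index arithmetic performed
// by the delinearize/linearize pair emitted by the new computeSIMDIndex,
// specialized to the #layout_row_major test. Assumes subgroup size 64 and a
// lane grid of (X, Y, Z) = (1, 8, 1).
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t subgroupSize = 64;
  const int64_t laneDimX = 1, laneDimY = 8, laneDimZ = 1;
  // Extra leading basis entry so the delinearization covers the whole
  // subgroup even when the layout uses fewer lanes (64 / (1 * 8 * 1) = 8).
  const int64_t gridsPerSubgroup =
      subgroupSize / (laneDimX * laneDimY * laneDimZ);
  const std::array<int64_t, 4> basis = {gridsPerSubgroup, laneDimZ, laneDimY,
                                        laneDimX};

  for (int64_t laneId = 0; laneId < subgroupSize; ++laneId) {
    // affine.delinearize_index: split laneId into (overflow, z, y, x), with
    // the innermost dimension varying fastest.
    std::array<int64_t, 4> coord;
    int64_t rest = laneId;
    for (int i = 3; i >= 0; --i) {
      coord[i] = rest % basis[i];
      rest /= basis[i];
    }
    // affine.linearize_index disjoint [batchX, laneY] by (2, 8): the row this
    // lane writes for BATCHX = 0 and BATCHX = 1.
    const int64_t row0 = 0 * 8 + coord[2];
    const int64_t row1 = 1 * 8 + coord[2];
    std::printf("lane %2lld -> rows %lld, %lld\n", (long long)laneId,
                (long long)row0, (long long)row1);
  }
  return 0;
}

Running the sketch shows each lane writing rows laneId % 8 and laneId % 8 + 8, which matches the affine maps (s0 mod 8 and s0 mod 8 + 8) that the removed CHECK lines used to spell out.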