'memref.alloca' op expected no unbounded stack allocations #19481

Closed
AmosLewis opened this issue Dec 13, 2024 · 0 comments · Fixed by llvm/torch-mlir#3876

Labels
bug 🐞 Something isn't working

AmosLewis (Contributor) commented Dec 13, 2024

What happened?

The test passes the torch-mlir e2e suite but fails when compiled with IREE:
iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb

cumsumbool.torch.mlir:7:10: error: 'memref.alloca' op expected no unbounded stack allocations
    %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64>
         ^
cumsumbool.torch.mlir:7:10: note: see current operation: %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
cumsumbool.torch.mlir:8:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64>
         ^
cumsumbool.torch.mlir:8:10: note: see current operation: 
"hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg3: !hal.device, %arg4: index):
    %23 = "arith.constant"() <{value = 1 : index}> : () -> index
    %24 = "arith.constant"() <{value = 1 : index}> : () -> index
    %25 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%23, %24, %25) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 1 : index}> : () -> index
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %3 = "arith.constant"() <{value = 0 : index}> : () -> index
      %4 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %6 = "arith.extui"(%4) : (i32) -> i64
      %7 = "arith.extui"(%5) : (i32) -> i64
      %8 = "arith.shli"(%7, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %9 = "arith.ori"(%6, %8) : (i64, i64) -> i64
      %10 = "arith.index_castui"(%9) : (i64) -> index
      %11 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi8>
      "memref.assume_alignment"(%11) <{alignment = 64 : i32}> : (memref<?xi8>) -> ()
      %12 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64>
      "memref.assume_alignment"(%12) <{alignment = 64 : i32}> : (memref<?xi64>) -> ()
      %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg2: index):
        %20 = "memref.load"(%11, %arg2) <{nontemporal = false}> : (memref<?xi8>, index) -> i8
        %21 = "arith.trunci"(%20) : (i8) -> i1
        %22 = "arith.extui"(%21) : (i1) -> i64
        "memref.store"(%22, %13, %arg2) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg1: index):
        "memref.store"(%1, %12, %arg1) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg0: index):
        %14 = "arith.cmpi"(%arg0, %3) <{predicate = 0 : i64}> : (index, index) -> i1
        "scf.if"(%14) ({
          %19 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          "memref.store"(%19, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
          "scf.yield"() : () -> ()
        }, {
          %15 = "arith.subi"(%arg0, %0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %16 = "memref.load"(%12, %15) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          %17 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          %18 = "arith.addi"(%16, %17) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
          "memref.store"(%18, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
          "scf.yield"() : () -> ()
        }) : (i1) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
failed to translate executables

Steps to reproduce your issue

iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb

cumsumbool.torch.mlir

module {
  func.func @main_graph(%arg0: !torch.vtensor<[?],i1>) -> !torch.vtensor<[?],si64> attributes {torch.onnx_meta.ir_version = 9 : si64, torch.onnx_meta.opset_version = 20 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.6.0"} {
    %int0 = torch.constant.int 0
    %none = torch.constant.none
    %int4 = torch.constant.int 4
    %false = torch.constant.bool false
    %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64>
    %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64>
    return %1 : !torch.vtensor<[?],si64>
  }
}
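
For reference, the two torch ops in this module correspond to the following eager PyTorch computation (a minimal sketch written here to illustrate the graph, not the original export script; the dtype code 4 in the IR is torch.int64):

import torch

# Cast a dynamically sized bool tensor to int64 (torch.aten.to.dtype),
# then take an inclusive running sum along dim 0 (torch.aten.cumsum).
x = torch.tensor([0, 0, 1, 1, 0, 0], dtype=torch.bool)
y = x.to(torch.int64)
z = torch.cumsum(y, dim=0)
print(z)  # tensor([0, 0, 1, 2, 2, 2])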

cumsumbool.linalg.mlir

#map = affine_map<(d0) -> (d0)>
module {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<?xi64> {
    %c0_i64 = arith.constant 0 : i64
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?xi1>
    %0 = tensor.empty(%dim) : tensor<?xi64>
    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg0 : tensor<?xi1>) outs(%0 : tensor<?xi64>) {
    ^bb0(%in: i1, %out: i64):
      %7 = arith.extui %in : i1 to i64
      linalg.yield %7 : i64
    } -> tensor<?xi64>
    %dim_0 = tensor.dim %1, %c0 : tensor<?xi64>
    %2 = tensor.empty(%dim_0) : tensor<?xi64>
    %3 = linalg.fill ins(%c0_i64 : i64) outs(%2 : tensor<?xi64>) -> tensor<?xi64>
    %4 = tensor.empty() : tensor<i64>
    %5 = linalg.fill ins(%c0_i64 : i64) outs(%4 : tensor<i64>) -> tensor<i64>
    %6:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%1 : tensor<?xi64>) outs(%3, %5 : tensor<?xi64>, tensor<i64>) {
    ^bb0(%arg1: i64, %arg2: i64):
      %7 = arith.addi %arg1, %arg2 : i64
      tm_tensor.yield %7 : i64
    } -> tensor<?xi64>, tensor<i64>
    return %6#0 : tensor<?xi64>
  }
}

If you want to test with torch-mlir:

class NonzeroLongModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    @export
    @annotate_args(
        [
            None,
            ([-1], torch.bool, True),
        ]
    )
    def forward(self, x):
        return x.long()


@register_test_case(module_factory=lambda: NonzeroLongModule())
def NonzeroLongModule_basic(module, tu: TestUtils):
    module.forward(torch.tensor([0, 0, 1, 1, 0, 0], dtype=torch.bool))

python -m e2e_testing.main --config=onnx -v --filter NonzeroCumsumBoolModule
PASS - "NonzeroCumsumBoolModule_basic"
Summary:
    Passed: 1

What component(s) does this issue relate to?

Compiler

Version information

commit cb593895d297ae2e79cef6fef0b7157096cb5b4a (HEAD -> main, upstream/main)
Author: MaheshRavishankar <[email protected]>
Date:   Fri Dec 6 15:09:28 2024 -0800
    [Codegen][LLVMGPU] Avoid long compilation times of warp reduction pipeline (#19381)

Additional context

Found this error while decomposing torch.nonzero for the model migraphx_onnx-model-zoo__gpt2-10; see llvm/torch-mlir#3876.
