'memref.alloca' op expected no unbounded stack allocations #19481

Closed
AmosLewis opened this issue Dec 13, 2024 · 0 comments · Fixed by llvm/torch-mlir#3876

Labels
bug 🐞 Something isn't working

AmosLewis (Contributor) commented Dec 13, 2024

What happened?

The test passes the torch-mlir e2e suite but fails when compiled with IREE:
iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb

cumsumbool.torch.mlir:7:10: error: 'memref.alloca' op expected no unbounded stack allocations
    %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64>
         ^
cumsumbool.torch.mlir:7:10: note: see current operation: %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
cumsumbool.torch.mlir:8:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64>
         ^
cumsumbool.torch.mlir:8:10: note: see current operation: 
"hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg3: !hal.device, %arg4: index):
    %23 = "arith.constant"() <{value = 1 : index}> : () -> index
    %24 = "arith.constant"() <{value = 1 : index}> : () -> index
    %25 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%23, %24, %25) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 1 : index}> : () -> index
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %3 = "arith.constant"() <{value = 0 : index}> : () -> index
      %4 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %6 = "arith.extui"(%4) : (i32) -> i64
      %7 = "arith.extui"(%5) : (i32) -> i64
      %8 = "arith.shli"(%7, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %9 = "arith.ori"(%6, %8) : (i64, i64) -> i64
      %10 = "arith.index_castui"(%9) : (i64) -> index
      %11 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi8>
      "memref.assume_alignment"(%11) <{alignment = 64 : i32}> : (memref<?xi8>) -> ()
      %12 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64>
      "memref.assume_alignment"(%12) <{alignment = 64 : i32}> : (memref<?xi64>) -> ()
      %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg2: index):
        %20 = "memref.load"(%11, %arg2) <{nontemporal = false}> : (memref<?xi8>, index) -> i8
        %21 = "arith.trunci"(%20) : (i8) -> i1
        %22 = "arith.extui"(%21) : (i1) -> i64
        "memref.store"(%22, %13, %arg2) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg1: index):
        "memref.store"(%1, %12, %arg1) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %10, %0) ({
      ^bb0(%arg0: index):
        %14 = "arith.cmpi"(%arg0, %3) <{predicate = 0 : i64}> : (index, index) -> i1
        "scf.if"(%14) ({
          %19 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          "memref.store"(%19, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
          "scf.yield"() : () -> ()
        }, {
          %15 = "arith.subi"(%arg0, %0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %16 = "memref.load"(%12, %15) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          %17 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          %18 = "arith.addi"(%16, %17) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
          "memref.store"(%18, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
          "scf.yield"() : () -> ()
        }) : (i1) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
failed to translate executables

Steps to reproduce your issue

iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb

cumsumbool.torch.mlir

module {
  func.func @main_graph(%arg0: !torch.vtensor<[?],i1>) -> !torch.vtensor<[?],si64> attributes {torch.onnx_meta.ir_version = 9 : si64, torch.onnx_meta.opset_version = 20 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.6.0"} {
    %int0 = torch.constant.int 0
    %none = torch.constant.none
    %int4 = torch.constant.int 4
    %false = torch.constant.bool false
    %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64>
    %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64>
    return %1 : !torch.vtensor<[?],si64>
  }
}
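
For reference, the two torch ops in this module correspond to the following eager PyTorch computation (a minimal sketch written here to illustrate the graph, not the original export script; the dtype code 4 in the IR is torch.int64):

import torch

# Cast a dynamically sized bool tensor to int64 (torch.aten.to.dtype),
# then take an inclusive running sum along dim 0 (torch.aten.cumsum).
x = torch.tensor([0, 0, 1, 1, 0, 0], dtype=torch.bool)
y = x.to(torch.int64)
z = torch.cumsum(y, dim=0)
print(z)  # tensor([0, 0, 1, 2, 2, 2])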

cumsumbool.linalg.mlir

#map = affine_map<(d0) -> (d0)>
module {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<?xi64> {
    %c0_i64 = arith.constant 0 : i64
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?xi1>
    %0 = tensor.empty(%dim) : tensor<?xi64>
    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg0 : tensor<?xi1>) outs(%0 : tensor<?xi64>) {
    ^bb0(%in: i1, %out: i64):
      %7 = arith.extui %in : i1 to i64
      linalg.yield %7 : i64
    } -> tensor<?xi64>
    %dim_0 = tensor.dim %1, %c0 : tensor<?xi64>
    %2 = tensor.empty(%dim_0) : tensor<?xi64>
    %3 = linalg.fill ins(%c0_i64 : i64) outs(%2 : tensor<?xi64>) -> tensor<?xi64>
    %4 = tensor.empty() : tensor<i64>
    %5 = linalg.fill ins(%c0_i64 : i64) outs(%4 : tensor<i64>) -> tensor<i64>
    %6:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%1 : tensor<?xi64>) outs(%3, %5 : tensor<?xi64>, tensor<i64>) {
    ^bb0(%arg1: i64, %arg2: i64):
      %7 = arith.addi %arg1, %arg2 : i64
      tm_tensor.yield %7 : i64
    } -> tensor<?xi64>, tensor<i64>
    return %6#0 : tensor<?xi64>
  }
}

If you want to test with torch-mlir:

class NonzeroLongModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    @export
    @annotate_args(
        [
            None,
            ([-1], torch.bool, True),
        ]
    )
    def forward(self, x):
        return x.long()


@register_test_case(module_factory=lambda: NonzeroLongModule())
def NonzeroLongModule_basic(module, tu: TestUtils):
    module.forward(torch.tensor([0, 0, 1, 1, 0, 0], dtype=torch.bool))

python -m e2e_testing.main --config=onnx -v --filter NonzeroCumsumBoolModule
PASS - "NonzeroCumsumBoolModule_basic"
Summary:
    Passed: 1

What component(s) does this issue relate to?

Compiler

Version information

commit cb593895d297ae2e79cef6fef0b7157096cb5b4a (HEAD -> main, upstream/main)
Author: MaheshRavishankar <[email protected]>
Date:   Fri Dec 6 15:09:28 2024 -0800
    [Codegen][LLVMGPU] Avoid long compilation times of warp reduction pipeline (#19381)

Additional context

Found this error while decomposing torch.nonzero for the model migraphx_onnx-model-zoo__gpt2-10; see llvm/torch-mlir#3876.
