
'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes #875

Closed
pdhirajkumarprasad opened this issue Nov 4, 2024 · 4 comments

@pdhirajkumarprasad

For the given IR:

module {
  func.func @main_graph(%arg2: !torch.vtensor<[?,384,?],f32>) -> !torch.vtensor<[?,384,?],f32>  attributes {torch.onnx_meta.ir_version = 6 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.1.0"} {
    %1 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384x384x3xf32>} : () -> !torch.vtensor<[384,384,3],f32> 
    %2 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0.0> : tensor<384xf32>} : () -> !torch.vtensor<[384],f32> 
    %3 = torch.operator "onnx.Conv"(%arg2, %1, %2) {torch.onnx.dilations = [1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [3 : si64], torch.onnx.pads = [1 : si64, 1 : si64], torch.onnx.strides = [2 : si64]} : (!torch.vtensor<[?,384,?],f32>, !torch.vtensor<[384,384,3],f32>, !torch.vtensor<[384],f32>) -> !torch.vtensor<[?,384,?],f32> 
    return %3 : !torch.vtensor<[?,384,?],f32>
  }
}

I am getting the following error:

 error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes

Command: iree-compile --iree-hal-target-backends=llvm-cpu -o abc.vmfb model.torch_onnx.mlir

@vinayakdsci

Compilation fails on this dispatch:

hal.executable public @main_graph$async_dispatch_1 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "icelake-server", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,-nf,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,-rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,-shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @main_graph$async_dispatch_1_elementwise_Dx384xD_f32 ordinal(0) layout(#hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant dense<0.000000e+00> : tensor<384x384x3xf32>
        %cst_0 = arith.constant 0.000000e+00 : f32
        %0 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
        %1 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
        %2 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
        %3 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
        %4 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
        %5 = hal.interface.constant.load layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
        %9 = arith.ori %6, %8 : i64
        %10 = arith.index_castui %9 : i64 to index
        %11 = arith.extui %2 : i32 to i64
        %12 = arith.extui %3 : i32 to i64
        %13 = arith.shli %12, %c32_i64 : i64
        %14 = arith.ori %11, %13 : i64
        %15 = arith.index_castui %14 : i64 to index
        %16 = arith.extui %4 : i32 to i64
        %17 = arith.extui %5 : i32 to i64
        %18 = arith.shli %17, %c32_i64 : i64
        %19 = arith.ori %16, %18 : i64
        %20 = arith.index_castui %19 : i64 to index
        %21:3 = util.assume.int 
            %10<umin = 0, umax = 9007199254740991>, 
            %15<umin = 2, umax = 9007199254740993>, 
            %20<umin = 0, umax = 4503599627370496>
          : index, index, index
        %22 = flow.dispatch.workload.ordinal %21#0, 0 : index
        %23 = flow.dispatch.workload.ordinal %21#1, 1 : index
        %24 = flow.dispatch.workload.ordinal %21#2, 2 : index
        %25 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23}
        %26 = hal.interface.binding.subspan layout(<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        %27 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0], sizes = [%22, 384, %23], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x384x?xf32>>{%22, %23} -> tensor<?x384x?xf32>
        %28 = tensor.empty(%22, %24) : tensor<?x384x?xf32>
        %29 = linalg.fill ins(%cst_0 : f32) outs(%28 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %30 = linalg.conv_1d_ncw_fcw {dilations = dense<1> : vector<1xi64>, strides = dense<2> : vector<1xi64>} ins(%27, %cst : tensor<?x384x?xf32>, tensor<384x384x3xf32>) outs(%29 : tensor<?x384x?xf32>) -> tensor<?x384x?xf32>
        %31 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%30 : tensor<?x384x?xf32>) outs(%28 : tensor<?x384x?xf32>) {
        ^bb0(%in: f32, %out: f32):
          %32 = arith.addf %in, %cst_0 : f32
          linalg.yield %32 : f32
        } -> tensor<?x384x?xf32>
        flow.dispatch.tensor.store %31, %26, offsets = [0, 0, 0], sizes = [%22, 384, %24], strides = [1, 1, 1] : tensor<?x384x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x384x?xf32>>{%22, %24}
        return
      }
    }
  }
}

The error message on this dispatch is:

./debug_875/module_main_graph$async_dispatch_1.mlir:9:6: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 524288 bytes
      func.func @main_graph$async_dispatch_1_elementwise_Dx384xD_f32() {
     ^
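
For anyone reproducing this, a per-dispatch source file like the one referenced above can be obtained by dumping executable sources during compilation. A minimal sketch, assuming the same input file and target as in the original report (the debug_875 directory name is only illustrative):

# Dump each dispatch's source MLIR into debug_875/ while compiling the full model.
iree-compile --iree-hal-target-backends=llvm-cpu \
  --iree-hal-dump-executable-sources-to=debug_875 \
  -o abc.vmfb model.torch_onnx.mlir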

@pashu123

pashu123 commented Nov 5, 2024

Added a patch for such cases: iree-org/iree#19027

@vinayakdsci

> Added a patch for such cases: iree-org/iree#19027

Thanks! The failing dispatch compiles successfully with the changes in the linked PR.
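
For completeness, one way to re-check just the dumped dispatch in isolation (a sketch only: it assumes the sources were dumped with --iree-hal-dump-executable-sources-to as above, and that the iree-compile build in use supports --compile-mode=hal-executable; exact flags may vary by IREE version):

# Compile only the dumped hal.executable source; the quotes protect the '$' in the filename.
iree-compile --compile-mode=hal-executable \
  'debug_875/module_main_graph$async_dispatch_1.mlir' \
  -o /dev/null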

@pdhirajkumarprasad
Author

Closing this, as the issue is tracked by iree-org/iree#19027.
