We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
The test passes in the torch-mlir e2e suite but fails with IREE. iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb
iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb
cumsumbool.torch.mlir:7:10: error: 'memref.alloca' op expected no unbounded stack allocations %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64> ^ cumsumbool.torch.mlir:7:10: note: see current operation: %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64> cumsumbool.torch.mlir:8:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64> ^ cumsumbool.torch.mlir:8:10: note: see current operation: "hal.executable.variant"() ({ "hal.executable.export"() ({ ^bb0(%arg3: !hal.device, %arg4: index): %23 = "arith.constant"() <{value = 1 : index}> : () -> index %24 = "arith.constant"() <{value = 1 : index}> : () -> index %25 = "arith.constant"() <{value = 1 : index}> : () -> index "hal.return"(%23, %24, %25) : (index, index, index) -> () }) {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> () "builtin.module"() ({ "func.func"() <{function_type = () -> (), sym_name = "main_graph$async_dispatch_0_scan_Dxi64_dispatch_tensor_store"}> ({ %0 = "arith.constant"() <{value = 1 : index}> : () -> index %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64 %2 = "arith.constant"() <{value = 32 : i64}> : () -> 
i64 %3 = "arith.constant"() <{value = 0 : index}> : () -> index %4 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32 %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32 %6 = "arith.extui"(%4) : (i32) -> i64 %7 = "arith.extui"(%5) : (i32) -> i64 %8 = "arith.shli"(%7, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64 %9 = "arith.ori"(%6, %8) : (i64, i64) -> i64 %10 = "arith.index_castui"(%9) : (i64) -> index %11 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi8> "memref.assume_alignment"(%11) <{alignment = 64 : i32}> : (memref<?xi8>) -> () %12 = "hal.interface.binding.subspan"(%3, %10) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64> "memref.assume_alignment"(%12) <{alignment = 64 : i32}> : (memref<?xi64>) -> () %13 = "memref.alloca"(%10) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64> "scf.for"(%3, %10, %0) ({ ^bb0(%arg2: index): %20 = "memref.load"(%11, %arg2) <{nontemporal = false}> : (memref<?xi8>, index) 
-> i8 %21 = "arith.trunci"(%20) : (i8) -> i1 %22 = "arith.extui"(%21) : (i1) -> i64 "memref.store"(%22, %13, %arg2) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.for"(%3, %10, %0) ({ ^bb0(%arg1: index): "memref.store"(%1, %12, %arg1) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.for"(%3, %10, %0) ({ ^bb0(%arg0: index): %14 = "arith.cmpi"(%arg0, %3) <{predicate = 0 : i64}> : (index, index) -> i1 "scf.if"(%14) ({ %19 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64 "memref.store"(%19, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> () "scf.yield"() : () -> () }, { %15 = "arith.subi"(%arg0, %0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index %16 = "memref.load"(%12, %15) <{nontemporal = false}> : (memref<?xi64>, index) -> i64 %17 = "memref.load"(%13, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64 %18 = "arith.addi"(%16, %17) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64 "memref.store"(%18, %12, %arg0) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> () "scf.yield"() : () -> () }) : (i1) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "func.return"() : () -> () }) : () -> () }) : () -> () "hal.executable.variant_end"() : () -> () }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> () failed to translate executables
cumsumbool.torch.mlir
module { func.func @main_graph(%arg0: !torch.vtensor<[?],i1>) -> !torch.vtensor<[?],si64> attributes {torch.onnx_meta.ir_version = 9 : si64, torch.onnx_meta.opset_version = 20 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.6.0"} { %int0 = torch.constant.int 0 %none = torch.constant.none %int4 = torch.constant.int 4 %false = torch.constant.bool false %0 = torch.aten.to.dtype %arg0, %int4, %false, %false, %none : !torch.vtensor<[?],i1>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?],si64> %1 = torch.aten.cumsum %0, %int0, %none : !torch.vtensor<[?],si64>, !torch.int, !torch.none -> !torch.vtensor<[?],si64> return %1 : !torch.vtensor<[?],si64> } }
cumsumbool.linalg.mlir
#map = affine_map<(d0) -> (d0)> module { ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64> func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<?xi64> { %c0_i64 = arith.constant 0 : i64 %c0 = arith.constant 0 : index %dim = tensor.dim %arg0, %c0 : tensor<?xi1> %0 = tensor.empty(%dim) : tensor<?xi64> %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg0 : tensor<?xi1>) outs(%0 : tensor<?xi64>) { ^bb0(%in: i1, %out: i64): %7 = arith.extui %in : i1 to i64 linalg.yield %7 : i64 } -> tensor<?xi64> %dim_0 = tensor.dim %1, %c0 : tensor<?xi64> %2 = tensor.empty(%dim_0) : tensor<?xi64> %3 = linalg.fill ins(%c0_i64 : i64) outs(%2 : tensor<?xi64>) -> tensor<?xi64> %4 = tensor.empty() : tensor<i64> %5 = linalg.fill ins(%c0_i64 : i64) outs(%4 : tensor<i64>) -> tensor<i64> %6:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%1 : tensor<?xi64>) outs(%3, %5 : tensor<?xi64>, tensor<i64>) { ^bb0(%arg1: i64, %arg2: i64): %7 = arith.addi %arg1, %arg2 : i64 tm_tensor.yield %7 : i64 } -> tensor<?xi64>, tensor<i64> return %6#0 : tensor<?xi64> } }
If you want to test with torch-mlir:
class NonzeroLongModule(torch.nn.Module): def __init__(self): super().__init__() @export @annotate_args( [ None, ([-1], torch.bool, True), ] ) def forward(self, x): return x.long() @register_test_case(module_factory=lambda: NonzeroLongModule()) def NonzeroLongModule_basic(module, tu: TestUtils): module.forward(torch.tensor([0, 0, 1, 1, 0, 0], dtype=torch.bool))
python -m e2e_testing.main --config=onnx -v --filter NonzeroCumsumBoolModule PASS - "NonzeroCumsumBoolModule_basic" Summary: Passed: 1
Compiler
commit cb593895d297ae2e79cef6fef0b7157096cb5b4a (HEAD -> main, upstream/main) Author: MaheshRavishankar <[email protected]> Date: Fri Dec 6 15:09:28 2024 -0800 [Codegen][LLVMGPU] Avoid long compilation times of warp reduction pipeline (#19381)
Found this error while decomposing torch.nonzero for the model migraphx_onnx-model-zoo__gpt2-10; see llvm/torch-mlir#3876
The text was updated successfully, but these errors were encountered:
pashu123
Successfully merging a pull request may close this issue.
What happened?
The test passes in the torch-mlir e2e suite but fails with IREE.
iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb
Steps to reproduce your issue
iree-compile --iree-hal-target-backends=llvm-cpu cumsumbool.torch.mlir -o model.vmfb
cumsumbool.torch.mlir
cumsumbool.linalg.mlir
If you want to test with torch-mlir:
What component(s) does this issue relate to?
Compiler
Version information
Additional context
Found this error while decomposing torch.nonzero for the model migraphx_onnx-model-zoo__gpt2-10; see llvm/torch-mlir#3876
The text was updated successfully, but these errors were encountered: