From 47561149c8f3cae1150c82ef824be390c62cb273 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 17 Jun 2023 22:27:53 +0100 Subject: [PATCH] riscv64: Implement a few misc SIMD instructions (#6598) * riscv64: Add immediate rule to `gen_vec_mask` * riscv64: Implement `scalar_to_vector` * riscv64: Implement vector `select` * riscv64: Implement SIMD `iabs` * wasmtime: Enable SIMD memory64 tests for riscv64 * cranelift: Update targets for `simd-select` tests --- build.rs | 9 - cranelift/codegen/src/isa/riscv64/inst.isle | 12 - .../codegen/src/isa/riscv64/inst_vector.isle | 11 +- cranelift/codegen/src/isa/riscv64/lower.isle | 35 ++- .../codegen/src/isa/riscv64/lower/isle.rs | 2 +- .../filetests/isa/riscv64/simd-iabs.clif | 166 +++++++++++ .../isa/riscv64/simd-insertlane.clif | 112 ++++---- .../isa/riscv64/simd-scalartovector.clif | 159 ++++++++++ .../filetests/isa/riscv64/simd-select.clif | 272 ++++++++++++++++++ .../filetests/runtests/simd-iabs.clif | 1 + .../runtests/simd-scalartovector-aarch64.clif | 1 + .../runtests/simd-scalartovector.clif | 1 + .../filetests/runtests/simd-select.clif | 65 +++++ 13 files changed, 753 insertions(+), 93 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-select.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-select.clif diff --git a/build.rs b/build.rs index 28dceb3ee1f6..10e15a3b578b 100644 --- a/build.rs +++ b/build.rs @@ -228,11 +228,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { return true; } - // The memory64 testsuite has a single SIMD test that we don't pass yet. - if testname == "simd" && testsuite == "memory64" { - return true; - } - let known_failure = [ "canonicalize_nan", "cvt_from_uint", @@ -249,18 +244,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_i16x8_arith2", "simd_i16x8_cmp", "simd_i16x8_q15mulr_sat_s", - "simd_i32x4_arith2", "simd_i32x4_cmp", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", - "simd_i64x2_arith2", "simd_i64x2_cmp", "simd_i8x16_arith2", "simd_i8x16_cmp", "simd_load", - "simd_load_zero", "simd_splat", - "v128_select", ] .contains(&testname); diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index a02145caa417..ec70bbe361d9 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -2673,18 +2673,6 @@ (gen_select_reg (IntCC.SignedGreaterThan) x y x y)) -(decl lower_iabs (Type XReg) XReg) - -; I64 and lower -; Generate the following code: -; sext.{b,h,w} a0, a0 -; neg a1, a0 -; max a0, a0, a1 -(rule (lower_iabs (fits_in_64 ty) val) - (let ((extended XReg (sext val ty $I64)) - (negated XReg (rv_neg extended))) - (max $I64 extended negated))) - (decl gen_trapif (XReg TrapCode) InstOutput) (rule (gen_trapif test trap_code) diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index baef0578a1a1..3ec6c58f976a 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -1006,14 +1006,17 @@ ;; Build a vector mask from a u64 -;; TODO: We should merge this with the `vconst` rules, and take advantage of -;; the other existing `vconst` rules. One example is using `vmv.v.i` which -;; can represent some of these masks. 
+;; TODO(#6571): We should merge this with the `vconst` rules, and take advantage of +;; the other existing `vconst` rules. (decl gen_vec_mask (u64) VReg) +;; When the immediate fits in a 5-bit immediate, we can use `vmv.v.i` directly. +(rule 1 (gen_vec_mask (imm5_from_u64 imm)) + (rv_vmv_vi imm (vstate_from_type $I64X2))) + ;; Materialize the mask into an X register, and move it into the bottom of ;; the vector register. -(rule (gen_vec_mask mask) +(rule 0 (gen_vec_mask mask) (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 93e0bda7fd92..acd5d8db65f3 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1556,9 +1556,23 @@ (load_ra)) ;;; Rules for `iabs` ;;;;;;;;;;;;; -(rule - (lower (has_type (fits_in_64 ty) (iabs x))) - (lower_iabs ty x)) + +;; I64 and lower +;; Generate the following code: +;; sext.{b,h,w} a0, a0 +;; neg a1, a0 +;; max a0, a0, a1 +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (iabs x))) + (let ((extended XReg (sext x ty $I64)) + (negated XReg (rv_neg extended))) + (max $I64 extended negated))) + +;; For vectors we generate the same code, but with vector instructions +;; we can skip the sign extension, since the vector unit will only process +;; Element Sized chunks. +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (iabs x))) + (let ((negated VReg (rv_vneg_v x (unmasked) ty))) + (rv_vmax_vv x negated (unmasked) ty))) ;;;; Rules for calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1842,3 +1856,18 @@ (xor VReg (rv_vxor_vv x y (unmasked) ty)) (rhs VReg (rv_vssrl_vi xor one (unmasked) ty))) (rv_vadd_vv lhs rhs (unmasked) ty))) + +;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x))) + (if (ty_vector_not_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vxm zero x mask ty))) + +(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x))) + (if (ty_vector_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (elem VReg (rv_vfmv_sf x ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vvm zero elem mask ty))) diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index c51d294fc55d..c03a93640131 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -224,7 +224,7 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> } else { vec![self.temp_writable_reg(I64), self.temp_writable_reg(I64)] } - } else if ty.is_float() { + } else if ty.is_float() || ty.is_vector() { vec![self.temp_writable_reg(ty)] } else { unimplemented!("ty:{:?}", ty) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif new file mode 100644 index 000000000000..b14ebfa34f23 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif @@ -0,0 +1,166 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %iabs_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; 
vmax.vv v6,v1,v4 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif index 0f8c14dfb6f2..975dbbf2def2 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif @@ -102,10 +102,9 @@ block0(v0: i32x4, v1: i32): ; mv fp,sp ; block0: ; 
vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,4 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vxm v9,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,4 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -121,13 +120,12 @@ block0(v0: i32x4, v1: i32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 4 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x30, 0x02, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0xd7, 0x43, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x84, 0x05, 0x02 +; .byte 0xa7, 0x83, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -146,10 +144,9 @@ block0(v0: i64x2, v1: i64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vxm v9,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -165,12 +162,11 @@ block0(v0: i64x2, v1: i64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x43, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x84, 0x05, 0x02 +; .byte 0xa7, 0x83, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -189,10 +185,9 @@ block0(v0: f64x2, v1: f64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -208,12 +203,11 @@ block0(v0: f64x2, v1: f64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -232,10 +226,9 @@ block0(v0: f64x2, v1: f64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,2 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -251,12 +244,11 @@ block0(v0: f64x2, v1: f64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 2 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 
0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x30, 0x01, 0x5e +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -275,10 +267,9 @@ block0(v0: f32x4, v1: f32): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -294,13 +285,12 @@ block0(v0: f32x4, v1: f32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0xb0, 0x00, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -319,10 +309,9 @@ block0(v0: f32x4, v1: f32): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,2 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -338,13 +327,12 @@ block0(v0: f32x4, v1: f32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 2 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x30, 0x01, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -453,10 +441,9 @@ block0(v0: i32x4): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a1,4 -; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vim v8,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,4 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v6,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -472,13 +459,12 @@ block0(v0: i32x4): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a1, zero, 4 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x30, 0x02, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xb4, 0x17, 0x5c +; .byte 0x57, 0xb3, 0x17, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -498,10 +484,9 @@ block0(v0: i64x2): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a1,1 -; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vim v8,v1,-9,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v6,v1,-9,v0.t #avl=2, 
#vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -517,12 +502,11 @@ block0(v0: i64x2): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a1, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xe0, 0x05, 0x42 -; .byte 0x57, 0xb4, 0x1b, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0xb3, 0x1b, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif b/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif new file mode 100644 index 000000000000..b588dd312189 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif @@ -0,0 +1,159 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %scalartovector_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i16(i16) -> i16x8 { +block0(v0: i16): + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=8, #vtype=(e16, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=4, #vtype=(e32, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=2, #vtype=(e64, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_f32(f32) -> f32x4 
{ +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=4, #vtype=(e32, m1, ta, ma) +; vfmv.s.f v5,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v9,v3,v5,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0xd7, 0x52, 0x05, 0x42 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x84, 0x32, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ret + +function %scalartovector_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = scalar_to_vector.f64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=2, #vtype=(e64, m1, ta, ma) +; vfmv.s.f v5,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v9,v3,v5,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0xd7, 0x52, 0x05, 0x42 +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x84, 0x32, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-select.clif b/cranelift/filetests/filetests/isa/riscv64/simd-select.clif new file mode 100644 index 000000000000..c2596fd50d7b --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-select.clif @@ -0,0 +1,272 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %select_i64x2(i64, i64x2, i64x2) -> i64x2 { +block0(v0: i64, v1: i64x2, v2: i64x2): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_i64x2 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i32x4(i32, i32x4, i32x4) -> i32x4 { +block0(v0: i32, v1: i32x4, v2: i32x4): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; slli a4,a0,32 +; srli a6,a4,32 +; select_i32x4 v11,v2,v4##condition=a6 +; vse8.v v11,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; slli a4, a0, 0x20 +; srli a6, a4, 0x20 +; beqz a6, 0xc +; .byte 0xd7, 0x35, 0x20, 0x9e +; j 8 +; .byte 
0xd7, 0x35, 0x40, 0x9e +; .byte 0xa7, 0x85, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i16x8(i16, i16x8, i16x8) -> i16x8 { +block0(v0: i16, v1: i16x8, v2: i16x8): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; slli a4,a0,48 +; srli a6,a4,48 +; select_i16x8 v11,v2,v4##condition=a6 +; vse8.v v11,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; slli a4, a0, 0x30 +; srli a6, a4, 0x30 +; beqz a6, 0xc +; .byte 0xd7, 0x35, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x35, 0x40, 0x9e +; .byte 0xa7, 0x85, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i8x16(i8, i8x16, i8x16) -> i8x16 { +block0(v0: i8, v1: i8x16, v2: i8x16): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; andi a4,a0,255 +; select_i8x16 v9,v2,v4##condition=a4 +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; andi a4, a0, 0xff +; beqz a4, 0xc +; .byte 0xd7, 0x34, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x34, 0x40, 0x9e +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_f64x2(i64, f64x2, f64x2) -> f64x2 { +block0(v0: i64, v1: f64x2, v2: f64x2): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_f64x2 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_f32x4(i64, f32x4, f32x4) -> f32x4 { +block0(v0: i64, v1: f32x4, v2: f32x4): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_f32x4 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; 
ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-iabs.clif b/cranelift/filetests/filetests/runtests/simd-iabs.clif index 2fa0ee71c348..63242f1920da 100644 --- a/cranelift/filetests/filetests/runtests/simd-iabs.clif +++ b/cranelift/filetests/filetests/runtests/simd-iabs.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %iabs_i8x16(i8x16) -> i8x16 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif index 0721599a2112..f6aaf71efbfe 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif @@ -2,6 +2,7 @@ test run test interpret target aarch64 target s390x +target riscv64 has_v ; i8 and i16 are invalid source sizes for x86_64 function %scalartovector_i8(i8) -> i8x16 { diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif index 7891693f3f51..2ff203d644d2 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif @@ -5,6 +5,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %scalartovector_i32(i32) -> i32x4 { block0(v0: i32): diff --git a/cranelift/filetests/filetests/runtests/simd-select.clif b/cranelift/filetests/filetests/runtests/simd-select.clif new file mode 100644 index 000000000000..63112860a77a --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-select.clif @@ -0,0 +1,65 @@ +test interpret +test run +target aarch64 +target s390x +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + +function %select_i64x2(i64, i64x2, i64x2) -> i64x2 { +block0(v0: i64, v1: i64x2, v2: i64x2): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i64x2(0, [1 2], [3 4]) == [3 4] +; run: %select_i64x2(1, [1 2], [3 4]) == [1 2] +; run: %select_i64x2(-1, [1 2], [3 4]) == [1 2] + +function %select_i32x4(i64, i32x4, i32x4) -> i32x4 { +block0(v0: i64, v1: i32x4, v2: i32x4): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i32x4(0, [1 2 3 4], [5 6 7 8]) == [5 6 7 8] +; run: %select_i32x4(1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4] +; run: %select_i32x4(-1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4] + +function %select_i16x8(i64, i16x8, i16x8) -> i16x8 { +block0(v0: i64, v1: i16x8, v2: i16x8): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i16x8(0, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 13 14 15 16] +; run: %select_i16x8(1, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] +; run: %select_i16x8(-1, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] + +function %select_i8x16(i64, i8x16, i8x16) -> i8x16 { +block0(v0: i64, v1: i8x16, v2: i8x16): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i8x16(0, [1 2 3 4 5 6 7 8 9 10 
11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32] +; run: %select_i8x16(1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] +; run: %select_i8x16(-1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + +function %select_f64x2(i64, f64x2, f64x2) -> f64x2 { +block0(v0: i64, v1: f64x2, v2: f64x2): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_f64x2(0, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x3.0 0x4.0] +; run: %select_f64x2(1, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x1.0 0x2.0] +; run: %select_f64x2(-1, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x1.0 0x2.0] + +function %select_f32x4(i64, f32x4, f32x4) -> f32x4 { +block0(v0: i64, v1: f32x4, v2: f32x4): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_f32x4(0, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0] +; run: %select_f32x4(1, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x1.0 0x2.0 0x3.0 0x4.0] +; run: %select_f32x4(-1, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x1.0 0x2.0 0x3.0 0x4.0] +
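
The lowerings added in this patch are covered by the new filetests above. As a quick standalone illustration (a minimal sketch: the function names and lane values below are illustrative and not part of the patch), the vector `iabs` and vector `select` paths can be exercised with a CLIF run test along these lines:

test interpret
test run
target riscv64 has_v

; Illustrative only: absolute value is applied lane-wise, so no scalar
; sign-extension is needed before the `vneg.v`/`vmax.vv` pair.
function %iabs_i32x4_sketch(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = iabs v0
    return v1
}
; run: %iabs_i32x4_sketch([-1 2 -3 4]) == [1 2 3 4]

; Illustrative only: a zero condition selects the second vector operand,
; any non-zero condition selects the first.
function %select_i32x4_sketch(i64, i32x4, i32x4) -> i32x4 {
block0(v0: i64, v1: i32x4, v2: i32x4):
    v3 = select v0, v1, v2
    return v3
}
; run: %select_i32x4_sketch(0, [1 2 3 4], [5 6 7 8]) == [5 6 7 8]
; run: %select_i32x4_sketch(1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4]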