From 579918c2d643f78b31a257758e29286b2c413845 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Tue, 6 Jun 2023 21:05:29 +0100 Subject: [PATCH] riscv64: Implement SIMD `swizzle` and `shuffle` (#6515) * riscv64: Implement SIMD `swizzle` * riscv64: Implement SIMD `shuffle` * wasmtime: Enable more RISC-V SIMD tests * riscv64: Add TODO issue numbers * riscv64: Fix trailing newline issues --- build.rs | 1 - cranelift/codegen/src/isa/riscv64/inst.isle | 8 +- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 28 +++- .../codegen/src/isa/riscv64/inst/vector.rs | 30 ++++- .../codegen/src/isa/riscv64/inst_vector.isle | 40 +++++- cranelift/codegen/src/isa/riscv64/lower.isle | 33 ++++- cranelift/codegen/src/machinst/isle.rs | 7 + cranelift/codegen/src/prelude_lower.isle | 5 + .../filetests/isa/riscv64/simd-shuffle.clif | 61 +++++++++ .../filetests/isa/riscv64/simd-swizzle.clif | 121 ++++++++++++++++++ .../filetests/runtests/simd-shuffle.clif | 1 + .../filetests/runtests/simd-swizzle.clif | 20 +++ 12 files changed, 339 insertions(+), 16 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-shuffle.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-swizzle.clif diff --git a/build.rs b/build.rs index 1ffef3f41376..e887ae0a0eee 100644 --- a/build.rs +++ b/build.rs @@ -266,7 +266,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_i8x16_arith2", "simd_i8x16_cmp", "simd_int_to_int_extend", - "simd_lane", "simd_load", "simd_load_extend", "simd_load_zero", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index dad404d352a6..a414ad826ceb 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -1566,13 +1566,19 @@ (extern constructor imm5_from_i8 imm5_from_i8) ;; Extractor that matches a `Value` equivalent to a replicated Imm5 on all lanes. -;; TODO: Try matching vconst here as well +;; TODO(#6527): Try matching vconst here as well (decl replicated_imm5 (Imm5) Value) (extractor (replicated_imm5 n) (def_inst (splat (iconst (u64_from_imm64 (imm5_from_u64 n)))))) ;; UImm5 Helpers +;; Extractor that matches a `Value` equivalent to a replicated UImm5 on all lanes. +;; TODO(#6527): Try matching vconst here as well +(decl replicated_uimm5 (UImm5) Value) +(extractor (replicated_uimm5 n) + (def_inst (splat (uimm5_from_value n)))) + ;; Helper to go directly from a `Value`, when it's an `iconst`, to an `UImm5`. (decl uimm5_from_value (UImm5) Value) (extractor (uimm5_from_value n) diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 3ec0e04c0da2..5e3cc27b90c2 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -654,17 +654,39 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(vs1); collector.reg_use(vs2); - collector.reg_def(vd); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_src_dst_overlaps() { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + vec_mask_operands(mask, collector); } &Inst::VecAluRRImm5 { - vd, vs2, ref mask, .. + op, + vd, + vs2, + ref mask, + .. } => { debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); debug_assert_eq!(vs2.class(), RegClass::Vector); collector.reg_use(vs2); - collector.reg_def(vd); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_src_dst_overlaps() { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + vec_mask_operands(mask, collector); } &Inst::VecAluRR { diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index afedeb70dc60..f68e6b558b0b 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -296,6 +296,7 @@ impl VecAluOpRRR { VecAluOpRRR::VssubuVV | VecAluOpRRR::VssubuVX => 0b100010, VecAluOpRRR::VssubVV | VecAluOpRRR::VssubVX => 0b100011, VecAluOpRRR::VfsgnjnVV => 0b001001, + VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX => 0b001100, VecAluOpRRR::VmsltVX => 0b011011, } } @@ -318,7 +319,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VminVV | VecAluOpRRR::VmaxuVV | VecAluOpRRR::VmaxVV - | VecAluOpRRR::VmergeVVM => VecOpCategory::OPIVV, + | VecAluOpRRR::VmergeVVM + | VecAluOpRRR::VrgatherVV => VecOpCategory::OPIVV, VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV @@ -343,7 +345,8 @@ impl VecAluOpRRR { | VecAluOpRRR::VmaxVX | VecAluOpRRR::VslidedownVX | VecAluOpRRR::VmergeVXM - | VecAluOpRRR::VmsltVX => VecOpCategory::OPIVX, + | VecAluOpRRR::VmsltVX + | VecAluOpRRR::VrgatherVX => VecOpCategory::OPIVX, VecAluOpRRR::VfaddVV | VecAluOpRRR::VfsubVV | VecAluOpRRR::VfmulVV @@ -368,6 +371,14 @@ impl VecAluOpRRR { _ => unreachable!(), } } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX => true, + _ => false, + } + } } impl fmt::Display for VecAluOpRRR { @@ -408,6 +419,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VmergeVIM => 0b010111, VecAluOpRRImm5::VsadduVI => 0b100000, VecAluOpRRImm5::VsaddVI => 0b100001, + VecAluOpRRImm5::VrgatherVI => 0b001100, } } @@ -424,7 +436,8 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VslidedownVI | VecAluOpRRImm5::VmergeVIM | VecAluOpRRImm5::VsadduVI - | VecAluOpRRImm5::VsaddVI => VecOpCategory::OPIVI, + | VecAluOpRRImm5::VsaddVI + | VecAluOpRRImm5::VrgatherVI => VecOpCategory::OPIVI, } } @@ -433,7 +446,8 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VsllVI | VecAluOpRRImm5::VsrlVI | VecAluOpRRImm5::VsraVI - | VecAluOpRRImm5::VslidedownVI => true, + | VecAluOpRRImm5::VslidedownVI + | VecAluOpRRImm5::VrgatherVI => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VandVI @@ -444,6 +458,14 @@ impl VecAluOpRRImm5 { | VecAluOpRRImm5::VsaddVI => false, } } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRRImm5::VrgatherVI => true, + _ => false, + } + } } impl fmt::Display for VecAluOpRRImm5 { diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 94cc758ab04a..126551791e83 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -117,6 +117,7 @@ (VmergeVVM) (VredmaxuVS) (VredminuVS) + (VrgatherVV) ;; Vector-Scalar Opcodes (VaddVX) @@ -145,6 +146,7 @@ (VfrdivVF) (VmergeVXM) (VfmergeVFM) + (VrgatherVX) (VmsltVX) )) @@ -163,6 +165,7 @@ (VxorVI) (VslidedownVI) (VmergeVIM) + (VrgatherVI) )) ;; Imm only ALU Ops @@ -718,6 +721,25 @@ (rule (rv_vredmaxu_vs vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VredmaxuVS) vs2 vs1 mask vstate)) +;; Helper for emitting the `vrgather.vv` instruction. +;; +;; vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; +(decl rv_vrgather_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vrgather_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrgatherVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrgather.vx` instruction. +;; +;; vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] +(decl rv_vrgather_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vrgather_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VrgatherVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vrgather.vi` instruction. +(decl rv_vrgather_vi (VReg UImm5 VecOpMasking VState) VReg) +(rule (rv_vrgather_vi vs2 imm mask vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VrgatherVI) vs2 imm mask vstate)) + ;; Helper for emitting the `vmslt.vx` (Vector Mask Set Less Than) instruction. (decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg) (rule (rv_vmslt_vx vs2 vs1 mask vstate) @@ -757,4 +779,20 @@ ;; Materialize the mask into an X register, and move it into the bottom of ;; the vector register. (rule (gen_vec_mask mask) - (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) \ No newline at end of file + (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) + + +;; Loads a `VCodeConstant` value into a vector register. For some special `VCodeConstant`s +;; we can use a dedicated instruction, otherwise we load the value from the pool. +;; +;; Type is the preferred type to use when loading the constant. +(decl gen_constant (Type VCodeConstant) VReg) + +;; The fallback case is to load the constant from the pool. +(rule (gen_constant ty n) + (vec_load + (element_width_from_type ty) + (VecAMode.UnitStride (gen_const_amode n)) + (mem_flags_trusted) + (unmasked) + ty)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 85cb307c23b4..de6f1d3a793f 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -12,12 +12,7 @@ ;; ;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (ty_vec_fits_in_register ty) (vconst n))) - (vec_load - (element_width_from_type ty) - (VecAMode.UnitStride (gen_const_amode (const_to_vconst n))) - (mem_flags_trusted) - (unmasked) - ty)) + (gen_constant ty (const_to_vconst n))) ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1407,3 +1402,29 @@ ;; use the original type as a VState and avoid a state change. (x_mask XReg (rv_vmv_xs mask (vstate_from_type $I64X2)))) (gen_andi x_mask (ty_lane_mask ty)))) + +;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x y))) + (rv_vrgather_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x (splat y)))) + (rv_vrgather_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x (replicated_uimm5 y)))) + (rv_vrgather_vi x y (unmasked) ty)) + +;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Use a vrgather to load all 0-15 lanes from x. And then modify the mask to load all +;; 16-31 lanes from y. Finally, use a vor to combine the two vectors. +;; +;; vrgather will insert a 0 for lanes that are out of bounds, so we can let it load +;; negative and out of bounds indexes. +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I8X16) (shuffle x y (vconst_from_immediate mask)))) + (if-let neg16 (imm5_from_i8 -16)) + (let ((x_mask VReg (gen_constant ty mask)) + (x_lanes VReg (rv_vrgather_vv x x_mask (unmasked) ty)) + (y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty)) + (y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty))) + (rv_vor_vv x_lanes y_lanes (unmasked) ty))) diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 8064439a8f8e..088c04a8db0e 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -342,6 +342,13 @@ macro_rules! isle_lower_prelude_methods { Some(u128::from_le_bytes(bytes.try_into().ok()?)) } + #[inline] + fn vconst_from_immediate(&mut self, imm: Immediate) -> Option { + Some(self.lower_ctx.use_constant(VCodeConstantData::Generated( + self.lower_ctx.get_immediate_data(imm).clone(), + ))) + } + #[inline] fn vec_mask_from_immediate(&mut self, imm: Immediate) -> Option { let data = self.lower_ctx.get_immediate_data(imm); diff --git a/cranelift/codegen/src/prelude_lower.isle b/cranelift/codegen/src/prelude_lower.isle index 297caab2ba7d..4d44745618af 100644 --- a/cranelift/codegen/src/prelude_lower.isle +++ b/cranelift/codegen/src/prelude_lower.isle @@ -854,6 +854,11 @@ (decl u128_from_immediate (u128) Immediate) (extern extractor u128_from_immediate u128_from_immediate) +;; Extracts an `Immediate` as a `VCodeConstant`. + +(decl vconst_from_immediate (VCodeConstant) Immediate) +(extern extractor vconst_from_immediate vconst_from_immediate) + ;; Accessor for `Constant` as u128. (decl u128_from_constant (u128) Constant) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-shuffle.clif b/cranelift/filetests/filetests/isa/riscv64/simd-shuffle.clif new file mode 100644 index 000000000000..2868ef62a182 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-shuffle.clif @@ -0,0 +1,61 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v6,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vrgather.vv v8,v1,v6 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vi v10,v6,-16 #avl=16, #vtype=(e8, m1, ta, ma) +; vrgather.vv v12,v3,v10 #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v14,v8,v12 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; auipc t6, 0 +; addi t6, t6, 0x3c +; .byte 0x07, 0x83, 0x0f, 0x02 +; .byte 0x57, 0x04, 0x13, 0x32 +; .byte 0x57, 0x35, 0x68, 0x02 +; .byte 0x57, 0x06, 0x35, 0x32 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; lb zero, 0x1a1(t5) +; .byte 0x04, 0x06, 0x0c, 0x0b +; auipc s10, 0x4180 +; .byte 0x02, 0x0f, 0x11, 0x05 + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-swizzle.clif b/cranelift/filetests/filetests/isa/riscv64/simd-swizzle.clif new file mode 100644 index 000000000000..6c5fc12fbcdc --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-swizzle.clif @@ -0,0 +1,121 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %swizzle_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = swizzle v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vrgather.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x32 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swizzle_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = swizzle v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vrgather.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x32 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swizzle_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 2 + v2 = splat.i8x16 v1 + v3 = swizzle v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vrgather.vi v4,v1,2 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x11, 0x32 +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index cdf2adfc5caf..e6aa4b6af8eb 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi +target riscv64gc has_v function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif index 89fcc1b50813..2592e65fb311 100644 --- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif +++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif @@ -5,6 +5,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64gc has_v function %swizzle_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -13,3 +14,22 @@ block0(v0: i8x16, v1: i8x16): } ; run: %swizzle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == [1 10 16 2 7 14 8 12 11 9 0 13 5 3 4 6] +function %swizzle_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = swizzle v0, v2 + return v3 +} +; run: %swizzle_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 5) == [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6] +; run: %swizzle_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 99) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + + +function %swizzle_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 2 + v2 = splat.i8x16 v1 + v3 = swizzle v0, v2 + return v3 +} +; run: %swizzle_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] +