From 752c7ea4dd83f9cb247e00cadf9d1958c0bdf9b6 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 17 May 2023 20:08:44 +0100 Subject: [PATCH] riscv64: Add `extractlane` and `splat` instructions (#6397) * riscv64: Add `vslidedown.v{x,i}` instructions * riscv64: Add `v{f,}mv` instructions These instructions move values from vectors into other register types and vice-versa. * riscv64: Add `extractlane` lowerings * riscv64: Add `vmv.v.*` instructions * riscv64: Implement `splat` * riscv64: Add `vmv.v.i` instruction * riscv64: Remove unused `imm5_zero` * wasmtime: Enable more RISC-V SIMD tests * cranelift: Enable ssse3 tests for `fadd-splat` testsuite * riscv64: Update splat TODO comment --- build.rs | 6 - cranelift/codegen/src/isa/riscv64/inst.isle | 20 + .../codegen/src/isa/riscv64/inst/emit.rs | 13 + .../codegen/src/isa/riscv64/inst/encode.rs | 45 +- .../codegen/src/isa/riscv64/inst/imms.rs | 6 + cranelift/codegen/src/isa/riscv64/inst/mod.rs | 43 +- .../codegen/src/isa/riscv64/inst/vector.rs | 164 ++++++- .../codegen/src/isa/riscv64/inst_vector.isle | 119 +++++ cranelift/codegen/src/isa/riscv64/lower.isle | 20 + .../codegen/src/isa/riscv64/lower/isle.rs | 13 + .../isa/riscv64/simd-extractlane.clif | 446 ++++++++++++++++++ .../filetests/isa/riscv64/simd-splat.clif | 206 ++++++++ .../filetests/runtests/simd-extractlane.clif | 1 + .../filetests/runtests/simd-fadd-splat.clif | 27 ++ .../filetests/runtests/simd-splat.clif | 19 +- 15 files changed, 1116 insertions(+), 32 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-extractlane.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-splat.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-fadd-splat.clif diff --git a/build.rs b/build.rs index b0d64c190a0c..5825af1be0e3 100644 --- a/build.rs +++ b/build.rs @@ -253,14 +253,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_load64_lane", "simd_load8_lane", "simd_load_extend", - 
"simd_load_splat", "simd_load_zero", "simd_splat", - "simd_store16_lane", - "simd_store32_lane", - "simd_store64_lane", - "simd_store8_lane", - "spillslot_size_fuzzbug", "v128_select", ] .contains(&testname); diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 2a6ef131ea68..9a4f99b8e36c 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -337,6 +337,18 @@ (imm Imm5) (vstate VState)) + (VecAluRR + (op VecAluOpRR) + (vd WritableReg) + (vs Reg) + (vstate VState)) + + (VecAluRImm5 + (op VecAluOpRImm5) + (vd WritableReg) + (imm Imm5) + (vstate VState)) + (VecSetState (rd WritableReg) (vstate VState)) @@ -1353,6 +1365,14 @@ (extractor (replicated_imm5 n) (def_inst (splat (iconst (u64_from_imm64 (imm5_from_u64 n)))))) +;; UImm5 Helpers + +;; Extract a `UImm5` from an `u8`. +(decl pure partial uimm5_from_u8 (UImm5) u8) +(extern extractor uimm5_from_u8 uimm5_from_u8) + +(decl uimm5_bitcast_to_imm5 (UImm5) Imm5) +(extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5) ;; Float Helpers diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index df7567918bf4..d4c03c1ea053 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -467,7 +467,9 @@ impl Inst { // VecSetState does not expect any vstate, rather it updates it. Inst::VecSetState { .. } => None, + Inst::VecAluRR { vstate, .. } | Inst::VecAluRRR { vstate, .. } | + Inst::VecAluRImm5 { vstate, .. } | Inst::VecAluRRImm5 { vstate, .. } | // TODO: Unit-stride loads and stores only need the AVL to be correct, not // the full vtype. A future optimization could be to decouple these two when @@ -2818,6 +2820,17 @@ impl MachInstEmit for Inst { sink.put4(encode_valu_imm(op, vd, imm, vs2, VecOpMasking::Disabled)); } + &Inst::VecAluRR { op, vd, vs, .. 
} => { + let vs = allocs.next(vs); + let vd = allocs.next_writable(vd); + + sink.put4(encode_valu_rr(op, vd, vs, VecOpMasking::Disabled)); + } + &Inst::VecAluRImm5 { op, vd, imm, .. } => { + let vd = allocs.next_writable(vd); + + sink.put4(encode_valu_r_imm(op, vd, imm, VecOpMasking::Disabled)); + } &Inst::VecSetState { rd, ref vstate } => { let rd = allocs.next_writable(rd); diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs index 2a479867578c..e52d05aa48a1 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/encode.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/encode.rs @@ -9,7 +9,8 @@ use super::{Imm12, Imm5, UImm5, VType}; use crate::isa::riscv64::inst::reg_to_gpr_num; use crate::isa::riscv64::lower::isle::generated_code::{ - VecAluOpRRImm5, VecAluOpRRR, VecElementWidth, VecOpCategory, VecOpMasking, + VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecElementWidth, VecOpCategory, + VecOpMasking, }; use crate::machinst::isle::WritableReg; use crate::Reg; @@ -145,6 +146,48 @@ pub fn encode_valu_imm( ) } +pub fn encode_valu_rr(op: VecAluOpRR, vd: WritableReg, vs: Reg, masking: VecOpMasking) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + + let (vs1, vs2) = if op.vs_is_vs2_encoded() { + (op.aux_encoding(), reg_to_gpr_num(vs)) + } else { + (reg_to_gpr_num(vs), op.aux_encoding()) + }; + + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + vs1, + vs2, + funct7, + ) +} + +pub fn encode_valu_r_imm( + op: VecAluOpRImm5, + vd: WritableReg, + imm: Imm5, + masking: VecOpMasking, +) -> u32 { + let funct7 = (op.funct6() << 1) | masking.encode(); + + // This is true for this opcode, not sure if there are any other ones. 
+ debug_assert_eq!(op, VecAluOpRImm5::VmvVI); + let vs1 = imm.bits() as u32; + let vs2 = op.aux_encoding(); + + encode_r_type_bits( + op.opcode(), + reg_to_gpr_num(vd.to_reg()), + op.funct3(), + vs1, + vs2, + funct7, + ) +} + /// Encodes a Vector CFG Imm instruction. /// /// See: https://github.com/riscv/riscv-v-spec/blob/master/vcfg-format.adoc diff --git a/cranelift/codegen/src/isa/riscv64/inst/imms.rs b/cranelift/codegen/src/isa/riscv64/inst/imms.rs index 7ef4f52451ea..2f9b544b15ed 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/imms.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/imms.rs @@ -143,6 +143,12 @@ impl Imm5 { } } + pub fn from_bits(value: u8) -> Imm5 { + assert_eq!(value & 0x1f, value); + let signed = ((value << 3) as i8) >> 3; + Imm5 { value: signed } + } + /// Bits for encoding. pub fn bits(&self) -> u8 { self.value as u8 & 0x1f diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index d654e77765a6..4a13ca4c62cb 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -642,6 +642,18 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(vs2); collector.reg_def(vd); } + &Inst::VecAluRR { op, vd, vs, .. } => { + debug_assert_eq!(vd.to_reg().class(), op.dst_regclass()); + debug_assert_eq!(vs.class(), op.src_regclass()); + + collector.reg_use(vs); + collector.reg_def(vd); + } + &Inst::VecAluRImm5 { vd, .. } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + + collector.reg_def(vd); + } &Inst::VecSetState { rd, .. } => { collector.reg_def(rd); } @@ -1585,7 +1597,36 @@ impl Inst { let vs2_s = format_reg(vs2, allocs); let vd_s = format_reg(vd.to_reg(), allocs); - format!("{} {},{},{} {}", op, vd_s, vs2_s, imm, vstate) + // Some opcodes interpret the immediate as unsigned, lets show the + // correct number here. 
+ let imm_s = if op.imm_is_unsigned() { + format!("{}", imm.bits()) + } else { + format!("{}", imm) + }; + + format!("{} {},{},{} {}", op, vd_s, vs2_s, imm_s, vstate) + } + &Inst::VecAluRR { + op, + vd, + vs, + ref vstate, + } => { + let vs_s = format_reg(vs, allocs); + let vd_s = format_reg(vd.to_reg(), allocs); + + format!("{} {},{} {}", op, vd_s, vs_s, vstate) + } + &Inst::VecAluRImm5 { + op, + vd, + imm, + ref vstate, + } => { + let vd_s = format_reg(vd.to_reg(), allocs); + + format!("{} {},{} {}", op, vd_s, imm, vstate) } &Inst::VecSetState { rd, ref vstate } => { let rd_s = format_reg(rd.to_reg(), allocs); diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 603da8690ea4..a45cf39b3369 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -1,8 +1,8 @@ use crate::isa::riscv64::inst::AllocationConsumer; use crate::isa::riscv64::inst::EmitState; use crate::isa::riscv64::lower::isle::generated_code::{ - VecAMode, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode, - VecOpCategory, VecOpMasking, VecTailMode, + VecAMode, VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, + VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, }; use crate::machinst::RegClass; use crate::Reg; @@ -260,6 +260,7 @@ impl VecAluOpRRR { VecAluOpRRR::VandVV => 0b001001, VecAluOpRRR::VorVV => 0b001010, VecAluOpRRR::VxorVV => 0b001011, + VecAluOpRRR::VslidedownVX => 0b001111, } } @@ -273,9 +274,10 @@ impl VecAluOpRRR { VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => { VecOpCategory::OPMVV } - VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX => { - VecOpCategory::OPIVX - } + VecAluOpRRR::VaddVX + | VecAluOpRRR::VsubVX + | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VslidedownVX => VecOpCategory::OPIVX, } } @@ -305,13 +307,30 @@ impl VecAluOpRRImm5 { 0x57 } pub fn 
funct3(&self) -> u32 { - VecOpCategory::OPIVI.encode() + self.category().encode() } + pub fn funct6(&self) -> u32 { // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VslidedownVI => 0b001111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VslidedownVI => { + VecOpCategory::OPIVI + } + } + } + + pub fn imm_is_unsigned(&self) -> bool { + match self { + VecAluOpRRImm5::VslidedownVI => true, + VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI => false, } } } @@ -325,6 +344,139 @@ impl fmt::Display for VecAluOpRRImm5 { } } +impl VecAluOpRR { + pub fn opcode(&self) -> u32 { + // Vector Opcode + 0x57 + } + + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => { + 0b010000 + } + VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRR::VmvSX => VecOpCategory::OPMVX, + VecAluOpRR::VmvXS => VecOpCategory::OPMVV, + VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, + VecAluOpRR::VfmvFS => VecOpCategory::OPFVV, + VecAluOpRR::VmvVV => VecOpCategory::OPIVV, + VecAluOpRR::VmvVX => VecOpCategory::OPIVX, + } + } + + /// Returns the auxiliary encoding field for the instruction, if any. 
+ pub fn aux_encoding(&self) -> u32 { + match self { + // VRXUNARY0 + VecAluOpRR::VmvSX => 0b00000, + // VWXUNARY0 + VecAluOpRR::VmvXS => 0b00000, + // VRFUNARY0 + VecAluOpRR::VfmvSF => 0b00000, + // VWFUNARY0 + VecAluOpRR::VfmvFS => 0b00000, + // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: + // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. + VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, + } + } + + /// Most of these opcodes have the source register encoded in the VS2 field and + /// the `aux_encoding` field in VS1. However some special snowflakes have it the + /// other way around. As far as I can tell only vmv.v.* are backwards. + pub fn vs_is_vs2_encoded(&self) -> bool { + match self { + VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => true, + VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => false, + } + } + + pub fn dst_regclass(&self) -> RegClass { + match self { + VecAluOpRR::VfmvSF + | VecAluOpRR::VmvSX + | VecAluOpRR::VmvVV + | VecAluOpRR::VmvVX + | VecAluOpRR::VfmvVF => RegClass::Vector, + VecAluOpRR::VmvXS => RegClass::Int, + VecAluOpRR::VfmvFS => RegClass::Float, + } + } + + pub fn src_regclass(&self) -> RegClass { + match self { + VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV => RegClass::Vector, + VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, + VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, + } + } +} + +impl fmt::Display for VecAluOpRR { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + VecAluOpRR::VmvSX => "vmv.s.x", + VecAluOpRR::VmvXS => "vmv.x.s", + VecAluOpRR::VfmvSF => "vfmv.s.f", + VecAluOpRR::VfmvFS => "vfmv.f.s", + VecAluOpRR::VmvVV => "vmv.v.v", + VecAluOpRR::VmvVX => "vmv.v.x", + VecAluOpRR::VfmvVF => "vfmv.v.f", + }) + } +} + +impl VecAluOpRImm5 { + pub fn opcode(&self) -> u32 { + // 
Vector Opcode + 0x57 + } + pub fn funct3(&self) -> u32 { + self.category().encode() + } + + pub fn funct6(&self) -> u32 { + // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc + match self { + VecAluOpRImm5::VmvVI => 0b010111, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRImm5::VmvVI => VecOpCategory::OPIVI, + } + } + + /// Returns the auxiliary encoding field for the instruction, if any. + pub fn aux_encoding(&self) -> u32 { + match self { + // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: + // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. + VecAluOpRImm5::VmvVI => 0, + } + } +} + +impl fmt::Display for VecAluOpRImm5 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + VecAluOpRImm5::VmvVI => "vmv.v.i", + }) + } +} + impl VecAMode { pub fn get_base_register(&self) -> Option { match self { diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index ee02f7b7c503..bfbe10e958cd 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -97,14 +97,39 @@ (VaddVX) (VsubVX) (VrsubVX) + (VslidedownVX) )) ;; Register-Imm ALU Ops (type VecAluOpRRImm5 (enum + ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VslidedownVI) )) +;; Imm only ALU Ops +(type VecAluOpRImm5 (enum + (VmvVI) +)) + +;; These are all of the special cases that have weird encodings. They are all +;; single source, single destination instructions, and usually use one of +;; the two source registers as auxiliary encoding space. +(type VecAluOpRR (enum + (VmvSX) + (VmvXS) + (VfmvSF) + (VfmvFS) + ;; vmv.v* is special in that vs2 must be v0 (and is ignored) otherwise the instruction is illegal. 
+ (VmvVV) + (VmvVX) + (VfmvVF) +)) + +;; Returns the canonical destination type for a VecAluOpRR. +(decl pure vec_alu_rr_dst_type (VecAluOpRR) Type) +(extern constructor vec_alu_rr_dst_type vec_alu_rr_dst_type) ;; Vector Addressing Mode @@ -139,9 +164,15 @@ (rule (element_width_from_type ty) (if-let $I32 (lane_type ty)) (VecElementWidth.E32)) +(rule (element_width_from_type ty) + (if-let $F32 (lane_type ty)) + (VecElementWidth.E32)) (rule (element_width_from_type ty) (if-let $I64 (lane_type ty)) (VecElementWidth.E64)) +(rule (element_width_from_type ty) + (if-let $F64 (lane_type ty)) + (VecElementWidth.E64)) (decl pure min_vec_reg_size () u64) (extern constructor min_vec_reg_size min_vec_reg_size) @@ -172,6 +203,27 @@ (_ Unit (emit (MInst.VecAluRRImm5 op vd vs2 imm vstate)))) vd)) +;; Helper for emitting `MInst.VecAluRRImm5` instructions where the immediate +;; is zero extended instead of sign extended. +(decl vec_alu_rr_uimm5 (VecAluOpRRImm5 Reg UImm5 VState) Reg) +(rule (vec_alu_rr_uimm5 op vs2 imm vstate) + (vec_alu_rr_imm5 op vs2 (uimm5_bitcast_to_imm5 imm) vstate)) + +;; Helper for emitting `MInst.VecAluRR` instructions, which use one of the +;; source register fields as auxiliary encoding space. +(decl vec_alu_rr (VecAluOpRR Reg VState) Reg) +(rule (vec_alu_rr op vs vstate) + (let ((vd WritableReg (temp_writable_reg (vec_alu_rr_dst_type op))) + (_ Unit (emit (MInst.VecAluRR op vd vs vstate)))) + vd)) + +;; Helper for emitting `MInst.VecAluRImm5` instructions. +(decl vec_alu_r_imm5 (VecAluOpRImm5 Imm5 VState) Reg) +(rule (vec_alu_r_imm5 op imm vstate) + (let ((vd WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.VecAluRImm5 op vd imm vstate)))) + vd)) + ;; Helper for emitting `MInst.VecLoad` instructions.
(decl vec_load (VecElementWidth VecAMode MemFlags VState) Reg) (rule (vec_load eew from flags vstate) @@ -254,3 +306,70 @@ (decl rv_vxor_vv (Reg Reg VState) Reg) (rule (rv_vxor_vv vs2 vs1 vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate)) + +;; Helper for emitting the `vslidedown.vx` instruction. +;; `vslidedown` moves all elements in the vector down by n elements. +;; The top most elements are up to the tail policy. +(decl rv_vslidedown_vx (Reg Reg VState) Reg) +(rule (rv_vslidedown_vx vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VslidedownVX) vs2 vs1 vstate)) + +;; Helper for emitting the `vslidedown.vi` instruction. +;; Unlike other `vi` instructions the immediate is zero extended. +(decl rv_vslidedown_vi (Reg UImm5 VState) Reg) +(rule (rv_vslidedown_vi vs2 imm vstate) + (vec_alu_rr_uimm5 (VecAluOpRRImm5.VslidedownVI) vs2 imm vstate)) + +;; Helper for emitting the `vmv.x.s` instruction. +;; This instruction copies the first element of the source vector to the destination X register. +(decl rv_vmv_xs (Reg VState) Reg) +(rule (rv_vmv_xs vs vstate) + (vec_alu_rr (VecAluOpRR.VmvXS) vs vstate)) + +;; Helper for emitting the `vfmv.f.s` instruction. +;; This instruction copies the first element of the source vector to the destination F register. +(decl rv_vfmv_fs (Reg VState) Reg) +(rule (rv_vfmv_fs vs vstate) + (vec_alu_rr (VecAluOpRR.VfmvFS) vs vstate)) + +;; Helper for emitting the `vmv.v.x` instruction. +;; This instruction splats the X register into all elements of the destination vector. +(decl rv_vmv_vx (Reg VState) Reg) +(rule (rv_vmv_vx vs vstate) + (vec_alu_rr (VecAluOpRR.VmvVX) vs vstate)) + +;; Helper for emitting the `vfmv.v.f` instruction. +;; This instruction splats the F register into all elements of the destination vector. +(decl rv_vfmv_vf (Reg VState) Reg) +(rule (rv_vfmv_vf vs vstate) + (vec_alu_rr (VecAluOpRR.VfmvVF) vs vstate)) + +;; Helper for emitting the `vmv.v.i` instruction.
+;; This instruction splats the immediate value into all elements of the destination vector. +(decl rv_vmv_vi (Imm5 VState) Reg) +(rule (rv_vmv_vi imm vstate) + (vec_alu_r_imm5 (VecAluOpRImm5.VmvVI) imm vstate)) + +;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl gen_extractlane (Type Reg u8) Reg) + +;; When extracting lane 0 for floats, we can use `vfmv.f.s` directly. +(rule 3 (gen_extractlane (ty_vec_fits_in_register ty) src 0) + (if (ty_vector_float ty)) + (rv_vfmv_fs src ty)) + +;; When extracting lane 0 for integers, we can use `vmv.x.s` directly. +(rule 2 (gen_extractlane (ty_vec_fits_in_register ty) src 0) + (if (ty_vector_not_float ty)) + (rv_vmv_xs src ty)) + +;; In the general case, we must first use a `vslidedown` to place the correct lane +;; in index 0, and then use the appropriate `vmv` instruction. +;; If the index fits into a 5-bit immediate, we can emit a `vslidedown.vi`. +(rule 1 (gen_extractlane (ty_vec_fits_in_register ty) src (uimm5_from_u8 idx)) + (gen_extractlane ty (rv_vslidedown_vi src idx ty) 0)) + +;; Otherwise lower it into an X register.
+(rule 0 (gen_extractlane (ty_vec_fits_in_register ty) src idx) + (gen_extractlane ty (rv_vslidedown_vx src (imm $I64 idx) ty) 0)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index cf30e6e89b0f..7dc69850ebd0 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1030,3 +1030,23 @@ (rule (lower (call_indirect sig_ref val inputs)) (gen_call_indirect sig_ref val inputs)) + +;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx))) + (gen_extractlane ty x idx)) + +;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type ty (splat n @ (value_type (ty_scalar_float _))))) + (rv_vfmv_vf n ty)) + +(rule 1 (lower (has_type ty (splat n @ (value_type (ty_int_ref_scalar_64 _))))) + (rv_vmv_vx n ty)) + +(rule 2 (lower (has_type ty (splat (iconst (u64_from_imm64 (imm5_from_u64 imm)))))) + (rv_vmv_vi imm ty)) + +;; TODO: We can splat out more patterns by using for example a vmv.v.i i8x16 for +;; a i64x2 const with a compatible bit pattern. The AArch64 Backend does something +;; similar in its splat rules. diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index 1094dfce2113..8dc6c6675903 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -6,6 +6,7 @@ pub mod generated_code; use generated_code::{Context, ExtendOp, MInst}; // Types that the generated ISLE code uses via `use super::*`. +use self::generated_code::VecAluOpRR; use super::{writable_zero_reg, zero_reg}; use crate::isa::riscv64::abi::Riscv64ABICaller; use crate::isa::riscv64::Riscv64Backend; @@ -205,6 +206,14 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> Imm5::maybe_from_i8(i8::try_from(arg0 as i64).ok()?) 
} #[inline] + fn uimm5_bitcast_to_imm5(&mut self, arg0: UImm5) -> Imm5 { + Imm5::from_bits(arg0.bits() as u8) + } + #[inline] + fn uimm5_from_u8(&mut self, arg0: u8) -> Option { + UImm5::maybe_from_u8(arg0) + } + #[inline] fn writable_zero_reg(&mut self) -> WritableReg { writable_zero_reg() } @@ -455,6 +464,10 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> None } } + + fn vec_alu_rr_dst_type(&mut self, op: &VecAluOpRR) -> Type { + MInst::canonical_type_for_rc(op.dst_regclass()) + } } /// The main entry point for lowering with ISLE. diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-extractlane.clif b/cranelift/filetests/filetests/isa/riscv64/simd-extractlane.clif new file mode 100644 index 000000000000..1cd78cd557be --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-extractlane.clif @@ -0,0 +1,446 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %extractlane_i8x16_idx_0(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.x.s a0,v0 #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x25, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i16x8_idx_0(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.x.s a0,v0 #avl=8, #vtype=(e16, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 
+; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x25, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i32x4_idx_0(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.x.s a0,v0 #avl=4, #vtype=(e32, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x25, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i64x2_idx_0(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.x.s a0,v0 #avl=2, #vtype=(e64, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x25, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_f32x4_idx_0(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmv.f.s fa0,v0 #avl=4, #vtype=(e32, m1, ta, ma) +; ld 
ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x15, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_f64x2_idx_0(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane v0, 0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmv.f.s fa0,v0 #avl=2, #vtype=(e64, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x15, 0x00, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i8x16_idx_1(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.x.s a0,v2 #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x25, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i16x8_idx_1(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; 
sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=8, #vtype=(e16, m1, ta, ma) +; vmv.x.s a0,v2 #avl=8, #vtype=(e16, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x25, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i32x4_idx_1(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmv.x.s a0,v2 #avl=4, #vtype=(e32, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x25, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_i64x2_idx_1(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmv.x.s a0,v2 #avl=2, #vtype=(e64, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi 
t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x25, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_f32x4_idx_1(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=4, #vtype=(e32, m1, ta, ma) +; vfmv.f.s fa0,v2 #avl=4, #vtype=(e32, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x15, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %extractlane_f64x2_idx_1(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v0,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v2,v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmv.f.s fa0,v2 #avl=2, #vtype=(e64, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb1, 0x00, 0x3e +; .byte 0x57, 0x15, 0x20, 0x42 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-splat.clif b/cranelift/filetests/filetests/isa/riscv64/simd-splat.clif new file mode 100644 index 000000000000..af0099034fac --- /dev/null +++ 
b/cranelift/filetests/filetests/isa/riscv64/simd-splat.clif @@ -0,0 +1,206 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %splat_i8x16(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x41, 0x05, 0x5e +; .byte 0xa7, 0x81, 0x05, 0x02 +; ret + +function %splat_i16x8(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x41, 0x05, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 +; ret + +function %splat_i32x4(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x41, 0x05, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 +; ret + +function %splat_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x41, 0x05, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 +; ret + +function %splat_const_i8x16() -> i8x16 { +block0: + v0 = iconst.i8 2 + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.i v2,2 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; 
Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x31, 0x01, 0x5e +; .byte 0x27, 0x01, 0x05, 0x02 +; ret + +function %splat_const_i16x8() -> i16x8 { +block0: + v0 = iconst.i16 2 + v1 = splat.i16x8 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.i v2,2 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x31, 0x01, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 +; ret + +function %splat_const_i32x4() -> i32x4 { +block0: + v0 = iconst.i32 2 + v1 = splat.i32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.i v2,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x31, 0x01, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 +; ret + +function %splat_const_i64x2() -> i64x2 { +block0: + v0 = iconst.i64 2 + v1 = splat.i64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.i v2,2 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x31, 0x01, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 +; ret + +function %splat_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; VCode: +; block0: +; vfmv.v.f v3,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v3,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x51, 0x05, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x01, 0x05, 0x02 +; ret + +function %splat_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; VCode: +; block0: +; vfmv.v.f v3,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v3,0(a0) 
#avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x51, 0x05, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x01, 0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-extractlane.clif b/cranelift/filetests/filetests/runtests/simd-extractlane.clif index 016f504edb35..032079c0da0c 100644 --- a/cranelift/filetests/filetests/runtests/simd-extractlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-extractlane.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %extractlane_4(i8x16) -> i8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif new file mode 100644 index 000000000000..4e74ba91b850 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif @@ -0,0 +1,27 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx + +function %splat_f32x4_2(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = f32const 0x1.5 + v2 = splat.f32x4 v1 + v3 = fadd v0, v2 + return v3 +} +; run: %splat_f32x4_2([0x0.0 NaN 0x1.0 0x2.0]) == [0x1.5 NaN 0x2.5 0x3.5] + +function %splat_f64x2_2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = f64const 0x7.5 + v2 = splat.f64x2 v1 + v3 = fadd v0, v2 + return v3 +} +; run: %splat_f64x2_2([0x0.0 0x1.0]) == [0x7.5 0x8.5] \ No newline at end of file diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index de2b49fd4adb..c07ef8833c10 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 
has_sse3 has_ssse3 has_sse41 has_avx target x86_64 has_sse3 has_ssse3 has_sse41 has_avx has_avx2 +target riscv64 has_v function %splat_i8x16(i8) -> i8x16 { block0(v0: i8): @@ -127,24 +128,6 @@ block0(v0: i64x2): } ; run: %splat_i64x2_2([-1 0]) == [-2 -1] -function %splat_f32x4_2(f32x4) -> f32x4 { -block0(v0: f32x4): - v1 = f32const 0x1.5 - v2 = splat.f32x4 v1 - v3 = fadd v0, v2 - return v3 -} -; run: %splat_f32x4_2([0x0.0 NaN 0x1.0 0x2.0]) == [0x1.5 NaN 0x2.5 0x3.5] - -function %splat_f64x2_2(f64x2) -> f64x2 { -block0(v0: f64x2): - v1 = f64const 0x7.5 - v2 = splat.f64x2 v1 - v3 = fadd v0, v2 - return v3 -} -; run: %splat_f64x2_2([0x0.0 0x1.0]) == [0x7.5 0x8.5] - function %load_splat_i8x16(i8) -> i8x16 { ss0 = explicit_slot 8