From d6b48256cb034039ea433691eafee636f84eceb5 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 30 Aug 2023 20:34:46 +0100 Subject: [PATCH] riscv64: Implement vector floating point rounding instructions (#6920) * riscv64: Add CSR Instructions * riscv64: Add float to int vector instructions * cranelift: Split vector rounding mode tests * riscv64: Implement float rounding ops for vectors * riscv64: Update tests --- build.rs | 2 - cranelift/codegen/src/isa/riscv64/inst.isle | 80 ++++++++++- .../codegen/src/isa/riscv64/inst/args.rs | 76 ++++++++++ .../codegen/src/isa/riscv64/inst/emit.rs | 13 ++ .../codegen/src/isa/riscv64/inst/encode.rs | 43 ++++-- cranelift/codegen/src/isa/riscv64/inst/mod.rs | 37 ++++- .../codegen/src/isa/riscv64/inst/vector.rs | 52 ++++++- .../codegen/src/isa/riscv64/inst_vector.isle | 100 +++++++++++++ cranelift/codegen/src/isa/riscv64/lower.isle | 28 ++-- .../codegen/src/isa/riscv64/lower/isle.rs | 5 + .../filetests/isa/riscv64/simd-ceil.clif | 131 ++++++++++++++++++ .../filetests/isa/riscv64/simd-floor.clif | 131 ++++++++++++++++++ .../filetests/isa/riscv64/simd-nearest.clif | 131 ++++++++++++++++++ .../filetests/isa/riscv64/simd-trunc.clif | 123 ++++++++++++++++ .../filetests/filetests/runtests/ceil.clif | 19 +-- .../filetests/filetests/runtests/floor.clif | 19 +-- .../filetests/filetests/runtests/nearest.clif | 19 +-- .../filetests/runtests/simd-ceil.clif | 25 ++++ .../filetests/runtests/simd-floor.clif | 25 ++++ .../filetests/runtests/simd-nearest.clif | 25 ++++ .../filetests/runtests/simd-trunc.clif | 25 ++++ .../filetests/filetests/runtests/trunc.clif | 19 +-- 22 files changed, 1028 insertions(+), 100 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-ceil.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-floor.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-nearest.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-trunc.clif create mode 
100644 cranelift/filetests/filetests/runtests/simd-ceil.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-floor.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-nearest.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-trunc.clif diff --git a/build.rs b/build.rs index 6cdb1f3a80f0..4ffc8894c58b 100644 --- a/build.rs +++ b/build.rs @@ -259,8 +259,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "cvt_from_uint", "issue_3327_bnot_lowering", "simd_conversions", - "simd_f32x4_rounding", - "simd_f64x2_rounding", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_load", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 5a489ee812f0..624579eef987 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -59,6 +59,20 @@ (rs Reg) (imm12 Imm12)) + ;; A CSR Reading or Writing instruction with a register source and a register destination. + (CsrReg + (op CsrRegOP) + (rd WritableReg) + (rs Reg) + (csr CSR)) + + ;; A CSR Writing instruction with an immediate source and a register destination. 
+ (CsrImm + (op CsrImmOP) + (rd WritableReg) + (imm UImm5) + (csr CSR)) + ;; An load (Load (rd WritableReg) @@ -689,6 +703,30 @@ (Bseti) )) +(type CsrRegOP (enum + ;; Atomic Read/Write CSR + (CsrRW) + ;; Atomic Read and Set Bits in CSR + (CsrRS) + ;; Atomic Read and Clear Bits in CSR + (CsrRC) +)) + +(type CsrImmOP (enum + ;; Atomic Read/Write CSR (Immediate Source) + (CsrRWI) + ;; Atomic Read and Set Bits in CSR (Immediate Source) + (CsrRSI) + ;; Atomic Read and Clear Bits in CSR (Immediate Source) + (CsrRCI) +)) + +;; Enum of the known CSR registers +(type CSR (enum + ;; Floating-Point Dynamic Rounding Mode + (Frm) +)) + (type FRM (enum ;; Round to Nearest, ties to Even @@ -706,6 +744,10 @@ (Fcsr) )) +(decl pure frm_bits (FRM) UImm5) +(extern constructor frm_bits frm_bits) +(convert FRM UImm5 frm_bits) + (type FFlagsException (enum ;; Invalid Operation (NV) @@ -1508,6 +1550,30 @@ (alu_rrr (AluOPRRR.Packw) rs1 rs2)) +;; `Zicsr` Extension Instructions + +;; Helper for emitting the `csrrwi` instruction. +(decl rv_csrrwi (CSR UImm5) XReg) +(rule (rv_csrrwi csr imm) + (csr_imm (CsrImmOP.CsrRWI) csr imm)) + +;; This is a special case of `csrrwi` when the CSR is the `frm` CSR. +(decl rv_fsrmi (FRM) XReg) +(rule (rv_fsrmi frm) (rv_csrrwi (CSR.Frm) frm)) + + +;; Helper for emitting the `csrw` instruction. This is a special case of +;; `csrrw` where the destination register is always `x0`. +(decl rv_csrw (CSR XReg) Unit) +(rule (rv_csrw csr rs) + (csr_reg_dst_zero (CsrRegOP.CsrRW) csr rs)) + +;; This is a special case of `csrw` when the CSR is the `frm` CSR. +(decl rv_fsrm (XReg) Unit) +(rule (rv_fsrm rs) (rv_csrw (CSR.Frm) rs)) + + + ;; Generate a mask for the bit-width of the given type @@ -1686,7 +1752,6 @@ (_ Unit (emit (MInst.FpuRRR op (gen_default_frm) dst src1 src2)))) dst)) - ;; Helper for emitting `MInst.FpuRRRR` instructions. 
(decl fpu_rrrr (FpuOPRRRR Type Reg Reg Reg) Reg) (rule (fpu_rrrr op ty src1 src2 src3) @@ -1710,7 +1775,6 @@ (_ Unit (emit (MInst.AluRRImm12 op dst src (imm12_zero))))) dst)) - ;; Helper for emitting the `Lui` instruction. ;; TODO: This should be something like `emit_u_type`. And should share the ;; `MInst` with `auipc` since these instructions share the U-Type format. @@ -1720,6 +1784,18 @@ (_ Unit (emit (MInst.Lui dst imm)))) dst)) +;; Helper for emitting `MInst.CsrImm` instructions. +(decl csr_imm (CsrImmOP CSR UImm5) XReg) +(rule (csr_imm op csr imm) + (let ((dst WritableXReg (temp_writable_xreg)) + (_ Unit (emit (MInst.CsrImm op dst imm csr)))) + dst)) + +;; Helper for emitting a `MInst.CsrReg` instruction that writes the result to x0. +(decl csr_reg_dst_zero (CsrRegOP CSR XReg) Unit) +(rule (csr_reg_dst_zero op csr rs) + (emit (MInst.CsrReg op (writable_zero_reg) rs csr))) + (decl select_addi (Type) AluOPRRI) diff --git a/cranelift/codegen/src/isa/riscv64/inst/args.rs b/cranelift/codegen/src/isa/riscv64/inst/args.rs index 46827fab1e3a..40ebbc94fd1f 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/args.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/args.rs @@ -1810,3 +1810,79 @@ pub(crate) fn f64_cvt_to_int_bounds(signed: bool, out_bits: u8) -> (f64, f64) { _ => unreachable!(), } } + +impl CsrRegOP { + pub(crate) fn funct3(self) -> u32 { + match self { + CsrRegOP::CsrRW => 0b001, + CsrRegOP::CsrRS => 0b010, + CsrRegOP::CsrRC => 0b011, + } + } + + pub(crate) fn opcode(self) -> u32 { + 0b1110011 + } + + pub(crate) fn name(self) -> &'static str { + match self { + CsrRegOP::CsrRW => "csrrw", + CsrRegOP::CsrRS => "csrrs", + CsrRegOP::CsrRC => "csrrc", + } + } +} + +impl Display for CsrRegOP { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} + +impl CsrImmOP { + pub(crate) fn funct3(self) -> u32 { + match self { + CsrImmOP::CsrRWI => 0b101, + CsrImmOP::CsrRSI => 0b110, + CsrImmOP::CsrRCI => 0b111, + } + } + + pub(crate) fn 
opcode(self) -> u32 { + 0b1110011 + } + + pub(crate) fn name(self) -> &'static str { + match self { + CsrImmOP::CsrRWI => "csrrwi", + CsrImmOP::CsrRSI => "csrrsi", + CsrImmOP::CsrRCI => "csrrci", + } + } +} + +impl Display for CsrImmOP { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} + +impl CSR { + pub(crate) fn bits(self) -> Imm12 { + Imm12::from_bits(match self { + CSR::Frm => 0x0002, + }) + } + + pub(crate) fn name(self) -> &'static str { + match self { + CSR::Frm => "frm", + } + } +} + +impl Display for CSR { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + write!(f, "{}", self.name()) + } +} diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index f86569b24323..10ab5649412f 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -331,6 +331,8 @@ impl Inst { | Inst::AluRRR { .. } | Inst::FpuRRR { .. } | Inst::AluRRImm12 { .. } + | Inst::CsrReg { .. } + | Inst::CsrImm { .. } | Inst::Load { .. } | Inst::Store { .. } | Inst::Args { .. } @@ -595,6 +597,17 @@ impl MachInstEmit for Inst { | alu_op.imm12(imm12) << 20; sink.put4(x); } + &Inst::CsrReg { op, rd, rs, csr } => { + let rs = allocs.next(rs); + let rd = allocs.next_writable(rd); + + sink.put4(encode_csr_reg(op, rd, rs, csr)); + } + &Inst::CsrImm { op, rd, csr, imm } => { + let rd = allocs.next_writable(rd); + + sink.put4(encode_csr_imm(op, rd, csr, imm)); + } &Inst::Load { rd, op, diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs index cff00cecabbf..b1e17c57a22d 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/encode.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/encode.rs @@ -6,7 +6,7 @@ //! Some instructions especially in extensions have slight variations from //! the base RISC-V specification. 
-use super::{Imm12, Imm5, UImm5, VType}; +use super::*; use crate::isa::riscv64::inst::reg_to_gpr_num; use crate::isa::riscv64::lower::isle::generated_code::{ VecAluOpRImm5, VecAluOpRR, VecAluOpRRImm5, VecAluOpRRR, VecAluOpRRRImm5, VecAluOpRRRR, @@ -53,21 +53,30 @@ pub fn encode_r_type( ) } -/// Encode an I-type instruction. -/// /// Layout: /// 0-------6-7-------11-12------14-15------19-20------------------31 /// | Opcode | rd | width | rs1 | Offset[11:0] | -pub fn encode_i_type(opcode: u32, rd: WritableReg, width: u32, rs1: Reg, offset: Imm12) -> u32 { +fn encode_i_type_bits(opcode: u32, rd: u32, funct3: u32, rs1: u32, offset: u32) -> u32 { let mut bits = 0; bits |= unsigned_field_width(opcode, 7); - bits |= reg_to_gpr_num(rd.to_reg()) << 7; - bits |= unsigned_field_width(width, 3) << 12; - bits |= reg_to_gpr_num(rs1) << 15; - bits |= unsigned_field_width(offset.as_u32(), 12) << 20; + bits |= unsigned_field_width(rd, 5) << 7; + bits |= unsigned_field_width(funct3, 3) << 12; + bits |= unsigned_field_width(rs1, 5) << 15; + bits |= unsigned_field_width(offset, 12) << 20; bits } +/// Encode an I-type instruction. +pub fn encode_i_type(opcode: u32, rd: WritableReg, width: u32, rs1: Reg, offset: Imm12) -> u32 { + encode_i_type_bits( + opcode, + reg_to_gpr_num(rd.to_reg()), + width, + reg_to_gpr_num(rs1), + offset.as_u32(), + ) +} + /// Encode an S-type instruction. /// /// Layout: @@ -297,3 +306,21 @@ pub fn encode_vmem_store( // with different names on the fields. encode_vmem_load(opcode, vs3, width, rs1, sumop, masking, mop, nf) } + +// The CSR Reg instruction is really just an I type instruction with the CSR in +// the immediate field. +pub fn encode_csr_reg(op: CsrRegOP, rd: WritableReg, rs: Reg, csr: CSR) -> u32 { + encode_i_type(op.opcode(), rd, op.funct3(), rs, csr.bits()) +} + +// The CSR Imm instruction is an I type instruction with the CSR in +// the immediate field and the value to be set in the `rs1` field. 
+pub fn encode_csr_imm(op: CsrImmOP, rd: WritableReg, csr: CSR, imm: UImm5) -> u32 { + encode_i_type_bits( + op.opcode(), + reg_to_gpr_num(rd.to_reg()), + op.funct3(), + imm.bits(), + csr.bits().as_u32(), + ) +} diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 6a136d9b6a74..d0378a4aa15b 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -55,8 +55,9 @@ pub(crate) type VecWritableReg = Vec>; // Instructions (top level): definition pub use crate::isa::riscv64::lower::isle::generated_code::{ - AluOPRRI, AluOPRRR, AtomicOP, FClassResult, FFlagsException, FloatRoundOP, FloatSelectOP, - FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, FRM, + AluOPRRI, AluOPRRR, AtomicOP, CsrImmOP, CsrRegOP, FClassResult, FFlagsException, FloatRoundOP, + FloatSelectOP, FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, CSR, + FRM, }; use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRImm5, VecAluOpRRR}; @@ -399,6 +400,13 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan collector.reg_use(rs); collector.reg_def(rd); } + &Inst::CsrReg { rd, rs, .. } => { + collector.reg_use(rs); + collector.reg_def(rd); + } + &Inst::CsrImm { rd, .. } => { + collector.reg_def(rd); + } &Inst::Load { rd, from, .. 
} => { if let Some(r) = from.get_allocatable_register() { collector.reg_use(r); @@ -1512,6 +1520,31 @@ impl Inst { } } } + &Inst::CsrReg { op, rd, rs, csr } => { + let rs_s = format_reg(rs, allocs); + let rd_s = format_reg(rd.to_reg(), allocs); + + match (op, csr, rd) { + (CsrRegOP::CsrRW, CSR::Frm, rd) if rd.to_reg() == zero_reg() => { + format!("fsrm {rs_s}") + } + _ => { + format!("{op} {rd_s},{csr},{rs_s}") + } + } + } + &Inst::CsrImm { op, rd, csr, imm } => { + let rd_s = format_reg(rd.to_reg(), allocs); + + match (op, csr, rd) { + (CsrImmOP::CsrRWI, CSR::Frm, rd) if rd.to_reg() != zero_reg() => { + format!("fsrmi {rd_s},{imm}") + } + _ => { + format!("{op} {rd_s},{csr},{imm}") + } + } + } &Inst::Load { rd, op, diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 8124a37c05af..ec3e99fd7390 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -752,6 +752,12 @@ impl VecAluOpRR { | VecAluOpRR::VsextVF8 => 0b010010, VecAluOpRR::VfsqrtV => 0b010011, VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, + VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV => 0b010010, } } @@ -766,7 +772,14 @@ impl VecAluOpRR { | VecAluOpRR::VsextVF4 | VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, - VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV, + VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV => VecOpCategory::OPFVV, VecAluOpRR::VmvVV => VecOpCategory::OPIVV, VecAluOpRR::VmvVX => VecOpCategory::OPIVX, } @@ -792,6 +805,13 @@ impl VecAluOpRR { VecAluOpRR::VsextVF4 => 0b00101, VecAluOpRR::VzextVF2 => 
0b00110, VecAluOpRR::VsextVF2 => 0b00111, + // VFUNARY0 + VecAluOpRR::VfcvtxufV => 0b00000, + VecAluOpRR::VfcvtxfV => 0b00001, + VecAluOpRR::VfcvtrtzxufV => 0b00110, + VecAluOpRR::VfcvtrtzxfV => 0b00111, + VecAluOpRR::VfcvtfxuV => 0b00010, + VecAluOpRR::VfcvtfxV => 0b00011, // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, @@ -811,7 +831,13 @@ impl VecAluOpRR { | VecAluOpRR::VzextVF8 | VecAluOpRR::VsextVF2 | VecAluOpRR::VsextVF4 - | VecAluOpRR::VsextVF8 => true, + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV => true, VecAluOpRR::VmvSX | VecAluOpRR::VfmvSF | VecAluOpRR::VmvVV @@ -833,7 +859,13 @@ impl VecAluOpRR { | VecAluOpRR::VzextVF8 | VecAluOpRR::VsextVF2 | VecAluOpRR::VsextVF4 - | VecAluOpRR::VsextVF8 => RegClass::Vector, + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV => RegClass::Vector, VecAluOpRR::VmvXS => RegClass::Int, VecAluOpRR::VfmvFS => RegClass::Float, } @@ -850,7 +882,13 @@ impl VecAluOpRR { | VecAluOpRR::VzextVF8 | VecAluOpRR::VsextVF2 | VecAluOpRR::VsextVF4 - | VecAluOpRR::VsextVF8 => RegClass::Vector, + | VecAluOpRR::VsextVF8 + | VecAluOpRR::VfcvtxufV + | VecAluOpRR::VfcvtxfV + | VecAluOpRR::VfcvtrtzxufV + | VecAluOpRR::VfcvtrtzxfV + | VecAluOpRR::VfcvtfxuV + | VecAluOpRR::VfcvtfxV => RegClass::Vector, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, } @@ -887,6 +925,12 @@ impl fmt::Display for VecAluOpRR { VecAluOpRR::VmvVV => "vmv.v.v", VecAluOpRR::VmvVX => "vmv.v.x", 
VecAluOpRR::VfmvVF => "vfmv.v.f", + VecAluOpRR::VfcvtxufV => "vfcvt.xu.f.v", + VecAluOpRR::VfcvtxfV => "vfcvt.x.f.v", + VecAluOpRR::VfcvtrtzxufV => "vfcvt.rtz.xu.f.v", + VecAluOpRR::VfcvtrtzxfV => "vfcvt.rtz.x.f.v", + VecAluOpRR::VfcvtfxuV => "vfcvt.f.xu.v", + VecAluOpRR::VfcvtfxV => "vfcvt.f.x.v", }) } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index dc0618aec1ed..63db6c751146 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -285,6 +285,12 @@ (VzextVF2) (VzextVF4) (VzextVF8) + (VfcvtxufV) + (VfcvtxfV) + (VfcvtrtzxufV) + (VfcvtrtzxfV) + (VfcvtfxuV) + (VfcvtfxV) )) ;; Returns the canonical destination type for a VecAluOpRRImm5. @@ -1014,6 +1020,46 @@ (rule (rv_vfsqrt_v vs mask vstate) (vec_alu_rr (VecAluOpRR.VfsqrtV) vs mask vstate)) +;; Helper for emitting the `vfcvt.xu.f.v` instruction. +;; This instruction converts a float to an unsigned integer. +(decl rv_vfcvt_xu_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_xu_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtxufV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.x.f.v` instruction. +;; This instruction converts a float to a signed integer. +(decl rv_vfcvt_x_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_x_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtxfV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.rtz.xu.f.v` instruction. +;; This instruction converts a float to an unsigned integer +;; using the Round to Zero (RTZ) rounding mode and ignoring +;; the currently set FRM rounding mode. +(decl rv_vfcvt_rtz_xu_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_rtz_xu_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtrtzxufV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.rtz.x.f.v` instruction. +;; This instruction converts a float to a signed integer. 
+;; using the Round to Zero (RTZ) rounding mode and ignoring +;; the currently set FRM rounding mode. +(decl rv_vfcvt_rtz_x_f_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_rtz_x_f_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtrtzxfV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.f.xu.v` instruction. +;; This instruction converts an unsigned integer to a float. +(decl rv_vfcvt_f_xu_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_f_xu_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtfxuV) vs mask vstate)) + +;; Helper for emitting the `vfcvt.f.x.v` instruction. +;; This instruction converts a signed integer to a float. +(decl rv_vfcvt_f_x_v (VReg VecOpMasking VState) VReg) +(rule (rv_vfcvt_f_x_v vs mask vstate) + (vec_alu_rr (VecAluOpRR.VfcvtfxV) vs mask vstate)) + ;; Helper for emitting the `vslidedown.vx` instruction. ;; `vslidedown` moves all elements in the vector down by n elements. ;; The top most elements are up to the tail policy. @@ -1771,3 +1817,57 @@ (rule 0 (gen_fcmp_mask (ty_vec_fits_in_register ty) (FloatCC.UnorderedOrLessThanOrEqual) x y) (rv_vmnot_m (gen_fcmp_mask ty (FloatCC.GreaterThan) x y) ty)) + + +;; Emits a `vfcvt.x.f.v` instruction with the given rounding mode. +(decl gen_vfcvt_x_f (VReg FRM VState) VReg) + +;; We have a special instruction for RTZ +(rule 1 (gen_vfcvt_x_f x (FRM.RTZ) vstate) + (rv_vfcvt_rtz_x_f_v x (unmasked) vstate)) + +;; In the general case we need to first switch into the appropriate rounding mode. 
+(rule 0 (gen_vfcvt_x_f x frm vstate) + (let (;; Set the rounding mode and save the current mode + (saved_frm XReg (rv_fsrmi frm)) + (res VReg (rv_vfcvt_x_f_v x (unmasked) vstate)) + ;; Restore the previous rounding mode + (_ Unit (rv_fsrm saved_frm))) + res)) + + +;; Returns the bit pattern of the float (2^23 for F32, 2^52 for F64) at or above which every value is already an integer +(decl float_int_max (Type) u64) +(rule (float_int_max $F32) 0x4B000000) +(rule (float_int_max $F64) 0x4330000000000000) + +;; Builds the instruction sequence to round a vector register to FRM +(decl gen_vec_round (VReg FRM Type) VReg) + +;; For floating-point round operations, if the input is NaN, +/-infinity, or +/-0, the +;; same input is returned as the rounded result; this differs from behavior of +;; RISCV fcvt instructions (which round out-of-range values to the nearest +;; max or min value), therefore special handling is needed for these values. +(rule (gen_vec_round x frm (ty_vec_fits_in_register ty)) + (let ((scalar_ty Type (lane_type ty)) + ;; if x is NaN/+-Infinity/+-Zero or if the exponent is larger than # of bits + ;; in mantissa, the result is the same as src, build a mask for those cases. + ;; (There is an additional fixup for NaN's at the end) + (abs VReg (rv_vfabs_v x (unmasked) ty)) + (max FReg (imm scalar_ty (float_int_max scalar_ty))) + (exact VReg (rv_vmflt_vf abs max (unmasked) ty)) + + ;; The rounding is performed by converting from float to integer, with the + ;; desired rounding mode. And then converting back with the default rounding + ;; mode. + (int VReg (gen_vfcvt_x_f x frm ty)) + (cvt VReg (rv_vfcvt_f_x_v int (unmasked) ty)) + ;; Copy the sign bit from the original value. + (signed VReg (rv_vfsgnj_vv cvt x (unmasked) ty)) + + ;; We want to return an arithmetic nan if the input is a canonical nan. + ;; Convert them by adding 0.0 to the input. 
+ (float_zero FReg (gen_bitcast (zero_reg) (float_int_of_same_size scalar_ty) scalar_ty)) + (corrected_nan VReg (rv_vfadd_vf x float_zero (unmasked) ty))) + ;; Merge the original value if it does not need rounding, or the rounded value + (rv_vmerge_vvm corrected_nan signed exact ty))) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 66aa3abd5259..4e991a75459c 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1652,25 +1652,33 @@ (gen_bitcast v in_ty out_ty)) ;;;;; Rules for `ceil`;;;;;;;;; -(rule - (lower (has_type ty (ceil x))) - (gen_float_round (FloatRoundOP.Ceil) x ty) -) +(rule 0 (lower (has_type (ty_scalar_float ty) (ceil x))) + (gen_float_round (FloatRoundOP.Ceil) x ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ceil x))) + (gen_vec_round x (FRM.RUP) ty)) ;;;;; Rules for `floor`;;;;;;;;; -(rule - (lower (has_type ty (floor x))) +(rule 0 (lower (has_type (ty_scalar_float ty) (floor x))) (gen_float_round (FloatRoundOP.Floor) x ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (floor x))) + (gen_vec_round x (FRM.RDN) ty)) + ;;;;; Rules for `trunc`;;;;;;;;; -(rule - (lower (has_type ty (trunc x))) +(rule 0 (lower (has_type (ty_scalar_float ty) (trunc x))) (gen_float_round (FloatRoundOP.Trunc) x ty)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (trunc x))) + (gen_vec_round x (FRM.RTZ) ty)) + ;;;;; Rules for `nearest`;;;;;;;;; -(rule - (lower (has_type ty (nearest x))) +(rule 0 (lower (has_type (ty_scalar_float ty) (nearest x))) (gen_float_round (FloatRoundOP.Nearest) x ty)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (nearest x))) + (gen_vec_round x (FRM.RNE) ty)) + ;;;;; Rules for `select_spectre_guard`;;;;;;;;; diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index 4ce9a4c49836..81ecba07d7c4 100644 --- 
a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -358,6 +358,11 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> fn gen_default_frm(&mut self) -> OptionFloatRoundingMode { None } + + fn frm_bits(&mut self, frm: &FRM) -> UImm5 { + UImm5::maybe_from_u8(frm.bits()).unwrap() + } + fn gen_select_reg(&mut self, cc: &IntCC, a: XReg, b: XReg, rs1: Reg, rs2: Reg) -> Reg { let rd = self.temp_writable_reg(MInst::canonical_type_for_rc(rs1.class())); self.emit(&MInst::SelectReg { diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-ceil.clif b/cranelift/filetests/filetests/isa/riscv64/simd-ceil.clif new file mode 100644 index 000000000000..3cc3b9b21a46 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-ceil.clif @@ -0,0 +1,131 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %ceil_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = ceil v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; lui a3,307200 +; fmv.w.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrmi t4,3 +; vfcvt.x.f.v v14,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=4, #vtype=(e32, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fmv.w.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x92, 
0x10, 0x2a +; lui a3, 0x4b000 +; fmv.w.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 3 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.w.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %ceil_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; ld a3,[const(0)] +; fmv.d.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrmi t4,3 +; vfcvt.x.f.v v14,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=2, #vtype=(e64, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fmv.d.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; auipc t6, 0 +; addi t6, t6, 0x4c +; ld a3, 0(t6) +; fmv.d.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 3 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.d.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x30, 
0x43 + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-floor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-floor.clif new file mode 100644 index 000000000000..6ecdcc7c88f0 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-floor.clif @@ -0,0 +1,131 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %floor_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = floor v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; lui a3,307200 +; fmv.w.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrmi t4,2 +; vfcvt.x.f.v v14,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=4, #vtype=(e32, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fmv.w.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; lui a3, 0x4b000 +; fmv.w.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 2 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.w.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %floor_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = floor v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd 
fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; ld a3,[const(0)] +; fmv.d.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrmi t4,2 +; vfcvt.x.f.v v14,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=2, #vtype=(e64, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fmv.d.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; auipc t6, 0 +; addi t6, t6, 0x4c +; ld a3, 0(t6) +; fmv.d.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 2 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.d.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x30, 0x43 + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-nearest.clif b/cranelift/filetests/filetests/isa/riscv64/simd-nearest.clif new file mode 100644 index 000000000000..359fdf6a1c90 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-nearest.clif @@ -0,0 +1,131 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %nearest_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = nearest v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; 
vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; lui a3,307200 +; fmv.w.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrmi t4,0 +; vfcvt.x.f.v v14,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=4, #vtype=(e32, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fmv.w.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; lui a3, 0x4b000 +; fmv.w.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 0 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.w.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %nearest_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = nearest v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; ld a3,[const(0)] +; fmv.d.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrmi t4,0 +; vfcvt.x.f.v v14,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fsrm t4 +; vfcvt.f.x.v v17,v14 #avl=2, #vtype=(e64, m1, ta, ma) +; vfsgnj.vv v19,v17,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fmv.d.x ft1,zero +; vfadd.vf v23,v1,ft1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v25,v23,v19,v0.t #avl=2, 
#vtype=(e64, m1, ta, ma) +; vse8.v v25,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; auipc t6, 0 +; addi t6, t6, 0x4c +; ld a3, 0(t6) +; fmv.d.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; fsrmi t4, 0 +; .byte 0x57, 0x97, 0x10, 0x4a +; fsrm t4 +; .byte 0xd7, 0x98, 0xe1, 0x4a +; .byte 0xd7, 0x99, 0x10, 0x23 +; fmv.d.x ft1, zero +; .byte 0xd7, 0xdb, 0x10, 0x02 +; .byte 0xd7, 0x8c, 0x79, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x0c, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x30, 0x43 + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-trunc.clif b/cranelift/filetests/filetests/isa/riscv64/simd-trunc.clif new file mode 100644 index 000000000000..515643924a38 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-trunc.clif @@ -0,0 +1,123 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %trunc_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = trunc v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; lui a3,307200 +; fmv.w.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vfcvt.rtz.x.f.v v12,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vfcvt.f.x.v v14,v12 #avl=4, #vtype=(e32, m1, ta, ma) +; vfsgnj.vv v16,v14,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; fmv.w.x ft10,zero +; vfadd.vf v20,v1,ft10 #avl=4, #vtype=(e32, m1, ta, ma) +; vmerge.vvm v22,v20,v16,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v22,0(a0) #avl=16, #vtype=(e8, m1, 
ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; lui a3, 0x4b000 +; fmv.w.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; .byte 0x57, 0x96, 0x13, 0x4a +; .byte 0x57, 0x97, 0xc1, 0x4a +; .byte 0x57, 0x98, 0xe0, 0x22 +; fmv.w.x ft10, zero +; .byte 0x57, 0x5a, 0x1f, 0x02 +; .byte 0x57, 0x0b, 0x48, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0b, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %trunc_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = trunc v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfabs.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; ld a3,[const(0)] +; fmv.d.x fa0,a3 +; vmflt.vf v0,v4,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vfcvt.rtz.x.f.v v12,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfcvt.f.x.v v14,v12 #avl=2, #vtype=(e64, m1, ta, ma) +; vfsgnj.vv v16,v14,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; fmv.d.x ft10,zero +; vfadd.vf v20,v1,ft10 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v22,v20,v16,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v22,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x92, 0x10, 0x2a +; auipc t6, 0 +; addi t6, t6, 0x44 +; ld a3, 0(t6) +; fmv.d.x fa0, a3 +; .byte 0x57, 0x50, 0x45, 0x6e +; .byte 0x57, 0x96, 0x13, 0x4a +; .byte 0x57, 0x97, 0xc1, 0x4a +; .byte 0x57, 0x98, 0xe0, 0x22 
+; fmv.d.x ft10, zero +; .byte 0x57, 0x5a, 0x1f, 0x02 +; .byte 0x57, 0x0b, 0x48, 0x5d +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x0b, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x30, 0x43 + diff --git a/cranelift/filetests/filetests/runtests/ceil.clif b/cranelift/filetests/filetests/runtests/ceil.clif index 78c0734a1db4..5848ba260449 100644 --- a/cranelift/filetests/filetests/runtests/ceil.clif +++ b/cranelift/filetests/filetests/runtests/ceil.clif @@ -6,8 +6,7 @@ target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 target s390x -;; FIXME: needs support for vectors -;;target riscv64 +target riscv64 function %ceil_f32(f32) -> f32 { block0(v0: f32): @@ -150,19 +149,3 @@ block0(v0: f64): ; run: %ceil_is_nan_f64(-sNaN:0x1) == 1 ; run: %ceil_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %ceil_is_nan_f64(-sNaN:0x4000000000001) == 1 - -function %ceil_f32x4(f32x4) -> f32x4 { -block0(v0: f32x4): - v1 = ceil v0 - return v1 -} -; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1] -; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] - -function %ceil_f64x2(f64x2) -> f64x2 { -block0(v0: f64x2): - v1 = ceil v0 - return v1 -} -; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0] -; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/floor.clif b/cranelift/filetests/filetests/runtests/floor.clif index b283a907c8ca..bde2a77907aa 100644 --- a/cranelift/filetests/filetests/runtests/floor.clif +++ b/cranelift/filetests/filetests/runtests/floor.clif @@ -6,8 +6,7 @@ target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 target s390x -;; FIXME: needs support for vectors -;;target riscv64 +target riscv64 function %floor_f32(f32) -> f32 { block0(v0: f32): @@ -150,19 +149,3 @@ block0(v0: f64): ; run: %floor_is_nan_f64(-sNaN:0x1) == 1 ; run: %floor_is_nan_f64(+sNaN:0x4000000000001) 
== 1 ; run: %floor_is_nan_f64(-sNaN:0x4000000000001) == 1 - -function %floor_f32x4(f32x4) -> f32x4 { -block0(v0: f32x4): - v1 = floor v0 - return v1 -} -; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] -; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1] - -function %floor_f64x2(f64x2) -> f64x2 { -block0(v0: f64x2): - v1 = floor v0 - return v1 -} -; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] -; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/nearest.clif b/cranelift/filetests/filetests/runtests/nearest.clif index a45dc8361b23..a5ce1aacb2e0 100644 --- a/cranelift/filetests/filetests/runtests/nearest.clif +++ b/cranelift/filetests/filetests/runtests/nearest.clif @@ -6,8 +6,7 @@ target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 target s390x -;; FIXME: needs support for vectors -;;target riscv64 +target riscv64 function %nearest_f32(f32) -> f32 { block0(v0: f32): @@ -150,19 +149,3 @@ block0(v0: f64): ; run: %near_is_nan_f64(-sNaN:0x1) == 1 ; run: %near_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %near_is_nan_f64(-sNaN:0x4000000000001) == 1 - -function %nearest_f32x4(f32x4) -> f32x4 { -block0(v0: f32x4): - v1 = nearest v0 - return v1 -} -; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1] -; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1] - -function %nearest_f64x2(f64x2) -> f64x2 { -block0(v0: f64x2): - v1 = nearest v0 - return v1 -} -; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] -; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/simd-ceil.clif b/cranelift/filetests/filetests/runtests/simd-ceil.clif new file mode 100644 index 000000000000..1abebd9f23a2 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-ceil.clif @@ -0,0 +1,25 @@ +test interpret +test run +target x86_64 +target 
x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target aarch64 +target s390x +target riscv64 has_v + +function %ceil_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = ceil v0 + return v1 +} +; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1] +; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] + +function %ceil_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} +; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0] +; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/simd-floor.clif b/cranelift/filetests/filetests/runtests/simd-floor.clif new file mode 100644 index 000000000000..8193f3fa1a06 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-floor.clif @@ -0,0 +1,25 @@ +test interpret +test run +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target aarch64 +target s390x +target riscv64 has_v + +function %floor_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = floor v0 + return v1 +} +; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] +; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1] + +function %floor_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = floor v0 + return v1 +} +; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/simd-nearest.clif b/cranelift/filetests/filetests/runtests/simd-nearest.clif new file mode 100644 index 000000000000..bcd4993ffd42 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-nearest.clif @@ -0,0 +1,25 @@ +test interpret +test run +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target aarch64 +target s390x +target riscv64 has_v + +function %nearest_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = nearest v0 + 
return v1 +} +; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1] +; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1] + +function %nearest_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = nearest v0 + return v1 +} +; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/simd-trunc.clif b/cranelift/filetests/filetests/runtests/simd-trunc.clif new file mode 100644 index 000000000000..91fb8e1bde0e --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-trunc.clif @@ -0,0 +1,25 @@ +test interpret +test run +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target aarch64 +target s390x +target riscv64 has_v + +function %trunc_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = trunc v0 + return v1 +} +; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] +; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] + +function %trunc_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = trunc v0 + return v1 +} +; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/trunc.clif b/cranelift/filetests/filetests/runtests/trunc.clif index a40d3326faef..37bcb61d07b5 100644 --- a/cranelift/filetests/filetests/runtests/trunc.clif +++ b/cranelift/filetests/filetests/runtests/trunc.clif @@ -6,8 +6,7 @@ target x86_64 sse42 target x86_64 sse42 has_avx target aarch64 target s390x -;; FIXME: needs support for vectors -;;target riscv64 +target riscv64 function %trunc_f32(f32) -> f32 { block0(v0: f32): @@ -150,19 +149,3 @@ block0(v0: f64): ; run: %trunc_is_nan_f64(-sNaN:0x1) == 1 ; run: %trunc_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %trunc_is_nan_f64(-sNaN:0x4000000000001) == 1 - -function %trunc_f32x4(f32x4) -> 
f32x4 { -block0(v0: f32x4): - v1 = trunc v0 - return v1 -} -; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] -; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] - -function %trunc_f64x2(f64x2) -> f64x2 { -block0(v0: f64x2): - v1 = trunc v0 - return v1 -} -; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] -; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]