diff --git a/build.rs b/build.rs index 5825af1be0e3..682187eaf17c 100644 --- a/build.rs +++ b/build.rs @@ -217,12 +217,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_boolean", "simd_conversions", "simd_f32x4", - "simd_f32x4_arith", "simd_f32x4_cmp", "simd_f32x4_pmin_pmax", "simd_f32x4_rounding", "simd_f64x2", - "simd_f64x2_arith", "simd_f64x2_cmp", "simd_f64x2_pmin_pmax", "simd_f64x2_rounding", diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 4a13ca4c62cb..6324484eebbb 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -1580,10 +1580,13 @@ impl Inst { // Note: vs2 and vs1 here are opposite to the standard scalar ordering. // This is noted in Section 10.1 of the RISC-V Vector spec. - match (op, vs1) { - (VecAluOpRRR::VrsubVX, vs1) if vs1 == zero_reg() => { + match (op, vs2, vs1) { + (VecAluOpRRR::VrsubVX, _, vs1) if vs1 == zero_reg() => { format!("vneg.v {},{} {}", vd_s, vs2_s, vstate) } + (VecAluOpRRR::VfsgnjnVV, vs2, vs1) if vs2 == vs1 => { + format!("vfneg.v {},{} {}", vd_s, vs2_s, vstate) + } _ => format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate), } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index a45cf39b3369..8d21ef9ebe57 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -251,16 +251,26 @@ impl VecAluOpRRR { pub fn funct6(&self) -> u32 { // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc match self { - VecAluOpRRR::VaddVV | VecAluOpRRR::VaddVX => 0b000000, - VecAluOpRRR::VsubVV | VecAluOpRRR::VsubVX => 0b000010, + VecAluOpRRR::VaddVV + | VecAluOpRRR::VaddVX + | VecAluOpRRR::VfaddVV + | VecAluOpRRR::VfaddVF => 0b000000, + VecAluOpRRR::VsubVV + | VecAluOpRRR::VsubVX + | VecAluOpRRR::VfsubVV + | VecAluOpRRR::VfsubVF => 0b000010, VecAluOpRRR::VrsubVX => 0b000011, VecAluOpRRR::VmulVV => 0b100101, VecAluOpRRR::VmulhVV => 0b100111, - VecAluOpRRR::VmulhuVV => 0b100100, + VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, VecAluOpRRR::VandVV => 0b001001, VecAluOpRRR::VorVV => 0b001010, VecAluOpRRR::VxorVV => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, + VecAluOpRRR::VfrsubVF => 0b100111, + VecAluOpRRR::VfdivVV | VecAluOpRRR::VfdivVF => 0b100000, + VecAluOpRRR::VfrdivVF => 0b100001, + VecAluOpRRR::VfsgnjnVV => 0b001001, } } @@ -278,6 +288,17 @@ impl VecAluOpRRR { | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX | VecAluOpRRR::VslidedownVX => VecOpCategory::OPIVX, + VecAluOpRRR::VfaddVV + | VecAluOpRRR::VfsubVV + | VecAluOpRRR::VfmulVV + | VecAluOpRRR::VfdivVV + | VecAluOpRRR::VfsgnjnVV => VecOpCategory::OPFVV, + VecAluOpRRR::VfaddVF + | VecAluOpRRR::VfsubVF + | VecAluOpRRR::VfrsubVF + | VecAluOpRRR::VfmulVF + | VecAluOpRRR::VfdivVF + | VecAluOpRRR::VfrdivVF => VecOpCategory::OPFVF, } } @@ -360,6 +381,7 @@ impl VecAluOpRR { VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => { 0b010000 } + VecAluOpRR::VfsqrtV => 0b010011, VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, } } @@ -369,7 +391,7 @@ impl VecAluOpRR { VecAluOpRR::VmvSX => VecOpCategory::OPMVX, VecAluOpRR::VmvXS => VecOpCategory::OPMVV, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, - VecAluOpRR::VfmvFS => VecOpCategory::OPFVV, + VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV, 
VecAluOpRR::VmvVV => VecOpCategory::OPIVV, VecAluOpRR::VmvVX => VecOpCategory::OPIVX, } @@ -386,6 +408,8 @@ impl VecAluOpRR { VecAluOpRR::VfmvSF => 0b00000, // VWFUNARY0 VecAluOpRR::VfmvFS => 0b00000, + // VFUNARY1 + VecAluOpRR::VfsqrtV => 0b00000, // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, @@ -397,7 +421,11 @@ impl VecAluOpRR { /// other way around. As far as I can tell only vmv.v.* are backwards. pub fn vs_is_vs2_encoded(&self) -> bool { match self { - VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => true, + VecAluOpRR::VmvSX + | VecAluOpRR::VmvXS + | VecAluOpRR::VfmvSF + | VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV => true, VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => false, } } @@ -408,7 +436,8 @@ impl VecAluOpRR { | VecAluOpRR::VmvSX | VecAluOpRR::VmvVV | VecAluOpRR::VmvVX - | VecAluOpRR::VfmvVF => RegClass::Vector, + | VecAluOpRR::VfmvVF + | VecAluOpRR::VfsqrtV => RegClass::Vector, VecAluOpRR::VmvXS => RegClass::Int, VecAluOpRR::VfmvFS => RegClass::Float, } @@ -416,7 +445,9 @@ impl VecAluOpRR { pub fn src_regclass(&self) -> RegClass { match self { - VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV => RegClass::Vector, + VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV | VecAluOpRR::VfsqrtV => { + RegClass::Vector + } VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, } @@ -430,6 +461,7 @@ impl fmt::Display for VecAluOpRR { VecAluOpRR::VmvXS => "vmv.x.s", VecAluOpRR::VfmvSF => "vfmv.s.f", VecAluOpRR::VfmvFS => "vfmv.f.s", + VecAluOpRR::VfsqrtV => "vfsqrt.v", VecAluOpRR::VmvVV => "vmv.v.v", VecAluOpRR::VmvVX => "vmv.v.x", VecAluOpRR::VfmvVF => "vfmv.v.f", diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index bfbe10e958cd..dcd8c2bdeb86 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -92,12 +92,23 @@ (VandVV) (VorVV) (VxorVV) + (VfaddVV) + (VfsubVV) + (VfmulVV) + (VfdivVV) + (VfsgnjnVV) ;; Vector-Scalar Opcodes (VaddVX) (VsubVX) (VrsubVX) (VslidedownVX) + (VfaddVF) + (VfsubVF) + (VfrsubVF) + (VfmulVF) + (VfdivVF) + (VfrdivVF) )) ;; Register-Imm ALU Ops @@ -125,6 +136,7 @@ (VmvVV) (VmvVX) (VfmvVF) + (VfsqrtV) )) ;; Returns the canonical destination type for a VecAluOpRRImm5. @@ -307,6 +319,73 @@ (rule (rv_vxor_vv vs2 vs1 vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate)) +;; Helper for emitting the `vfadd.vv` instruction. +(decl rv_vfadd_vv (Reg Reg VState) Reg) +(rule (rv_vfadd_vv vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVV) vs2 vs1 vstate)) + +;; Helper for emitting the `vfadd.vf` instruction. +(decl rv_vfadd_vf (Reg Reg VState) Reg) +(rule (rv_vfadd_vf vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VfaddVF) vs2 vs1 vstate)) + +;; Helper for emitting the `vfsub.vv` instruction. +(decl rv_vfsub_vv (Reg Reg VState) Reg) +(rule (rv_vfsub_vv vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVV) vs2 vs1 vstate)) + +;; Helper for emitting the `vfsub.vf` instruction. +(decl rv_vfsub_vf (Reg Reg VState) Reg) +(rule (rv_vfsub_vf vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VfsubVF) vs2 vs1 vstate)) + +;; Helper for emitting the `vfrsub.vf` instruction. 
+(decl rv_vfrsub_vf (Reg Reg VState) Reg)
+(rule (rv_vfrsub_vf vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfrsubVF) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfmul.vv` instruction.
+(decl rv_vfmul_vv (Reg Reg VState) Reg)
+(rule (rv_vfmul_vv vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfmulVV) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfmul.vf` instruction.
+(decl rv_vfmul_vf (Reg Reg VState) Reg)
+(rule (rv_vfmul_vf vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfmulVF) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfdiv.vv` instruction.
+(decl rv_vfdiv_vv (Reg Reg VState) Reg)
+(rule (rv_vfdiv_vv vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfdivVV) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfdiv.vf` instruction.
+(decl rv_vfdiv_vf (Reg Reg VState) Reg)
+(rule (rv_vfdiv_vf vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfdivVF) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfrdiv.vf` instruction.
+(decl rv_vfrdiv_vf (Reg Reg VState) Reg)
+(rule (rv_vfrdiv_vf vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfrdivVF) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfsgnjn.vv` ("Floating Point Sign Injection Negated") instruction.
+;; The output of this instruction is `vs2` with the negated sign bit from `vs1`.
+(decl rv_vfsgnjn_vv (Reg Reg VState) Reg)
+(rule (rv_vfsgnjn_vv vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VfsgnjnVV) vs2 vs1 vstate))
+
+;; Helper for emitting the `vfneg.v` instruction.
+;; This instruction is a mnemonic for `vfsgnjn.vv vd, vs, vs`.
+(decl rv_vfneg_v (Reg VState) Reg)
+(rule (rv_vfneg_v vs vstate) (rv_vfsgnjn_vv vs vs vstate))
+
+;; Helper for emitting the `vfsqrt.v` instruction.
+;; This instruction computes the element-wise square root of the source vector.
+(decl rv_vfsqrt_v (Reg VState) Reg)
+(rule (rv_vfsqrt_v vs vstate)
+  (vec_alu_rr (VecAluOpRR.VfsqrtV) vs vstate))
+
 ;; Helper for emitting the `vslidedown.vx` instruction.
 ;; `vslidedown` moves all elements in the vector down by n elements.
 ;; The top most elements are up to the tail policy.
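The `rv_vfsgnjn_vv` / `rv_vfneg_v` helpers above (and the `vfneg.v` pretty-printing case added in `inst/mod.rs`) rely on the sign-injection semantics from the RISC-V Vector spec: each result lane takes its magnitude from `vs2` and the negation of `vs1`'s sign bit. A minimal per-lane sketch in Rust, assuming f32 lanes; `vfsgnjn_lane_f32` is illustrative and not a function in this patch:

fn vfsgnjn_lane_f32(vs2: f32, vs1: f32) -> f32 {
    // Take the magnitude (all bits except the sign) from vs2...
    let magnitude = vs2.to_bits() & 0x7fff_ffff;
    // ...and the *negated* sign bit from vs1.
    let negated_sign = !vs1.to_bits() & 0x8000_0000;
    f32::from_bits(magnitude | negated_sign)
}

fn main() {
    // vs2 == vs1 is exactly `vfneg.v`: every lane's sign is flipped.
    assert_eq!(vfsgnjn_lane_f32(2.0, 2.0), -2.0);
    // Otherwise the magnitude comes from vs2 and the (negated) sign from vs1.
    assert_eq!(vfsgnjn_lane_f32(-2.0, -1.0), 2.0);
}

With `vs2 == vs1` every lane's sign is flipped, which is why the disassembly special case prints `vfneg.v` for that operand pattern.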
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 7dc69850ebd0..d28e30344d60 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -584,9 +584,12 @@ (rv_fabs ty x)) ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty (fneg x))) +(rule 0 (lower (has_type (ty_scalar_float ty) (fneg x))) (rv_fneg ty x)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fneg x))) + (rv_vfneg_v x ty)) + ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (fcopysign x y))) (rv_fsgnj ty x y)) @@ -597,9 +600,11 @@ ;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty (sqrt x))) +(rule 0 (lower (has_type (ty_scalar_float ty) (sqrt x))) (rv_fsqrt ty x)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqrt x))) + (rv_vfsqrt_v x ty)) ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 @@ -706,18 +711,65 @@ ;;;;; Rules for for float arithmetic -(rule (lower (has_type ty (fadd x y))) + + +;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_scalar_float ty) (fadd x y))) (rv_fadd ty x y)) -(rule (lower (has_type ty (fsub x y))) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fadd x y))) + (rv_vfadd_vv x y ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fadd x (splat y)))) + (rv_vfadd_vf x y ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fadd (splat x) y))) + (rv_vfadd_vf y x ty)) + + +;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_scalar_float ty) (fsub x y))) (rv_fsub ty x y)) -(rule (lower (has_type ty (fmul x y))) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fsub x y))) + (rv_vfsub_vv x y ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fsub x (splat y)))) + (rv_vfsub_vf x y ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fsub (splat x) y))) + (rv_vfrsub_vf y x ty)) + +;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_scalar_float ty) (fmul x y))) (rv_fmul ty x y)) -(rule (lower (has_type ty (fdiv x y))) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fmul x y))) + (rv_vfmul_vv x y ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fmul x (splat y)))) + (rv_vfmul_vf x y ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fmul (splat x) y))) + (rv_vfmul_vf y x ty)) + + +;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule 0 (lower (has_type (ty_scalar_float ty) (fdiv x y))) (rv_fdiv ty x y)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x y))) + (rv_vfdiv_vv x y ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (fdiv x (splat y)))) + (rv_vfdiv_vf x y ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (fdiv (splat x) y))) + (rv_vfrdiv_vf y x ty)) + +;;;; Rules for `fmin/fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (rule (lower (has_type ty (fmin x y))) (gen_float_select (FloatSelectOP.Min) x y ty)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fadd.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fadd.clif new file mode 100644 index 000000000000..a4535c6e8715 --- /dev/null +++ 
b/cranelift/filetests/filetests/isa/riscv64/simd-fadd.clif @@ -0,0 +1,249 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %fadd_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fadd v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x93, 0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fadd_splat_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fadd v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fadd_splat_reverse_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fadd v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fadd_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fadd v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x93, 
0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fadd_splat_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fadd v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fadd_splat_reverse_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fadd v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfadd.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fdiv.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fdiv.clif new file mode 100644 index 000000000000..a7bb956cb390 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-fdiv.clif @@ -0,0 +1,249 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %fdiv_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fdiv v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfdiv.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x93, 0x11, 0x82 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fdiv_splat_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fdiv v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfdiv.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; 
block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x82 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fdiv_splat_reverse_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fdiv v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfrdiv.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x86 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fdiv_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fdiv v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfdiv.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x93, 0x11, 0x82 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fdiv_splat_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fdiv v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfdiv.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x82 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fdiv_splat_reverse_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fdiv v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfrdiv.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 
+; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x86 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fmul.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fmul.clif new file mode 100644 index 000000000000..de5aef4d7bc2 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-fmul.clif @@ -0,0 +1,249 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %fmul_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmul v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmul.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x93, 0x11, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fmul_splat_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fmul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmul.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fmul_splat_reverse_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fmul v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmul.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fmul_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmul v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, 
m1, ta, ma) +; vfmul.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x93, 0x11, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fmul_splat_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fmul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmul.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fmul_splat_reverse_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fmul v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfmul.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fneg.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fneg.clif new file mode 100644 index 000000000000..21c66e0a6e07 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-fneg.clif @@ -0,0 +1,83 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %fneg_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x92, 0x10, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fneg_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + 
return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x92, 0x10, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-fsub.clif b/cranelift/filetests/filetests/isa/riscv64/simd-fsub.clif new file mode 100644 index 000000000000..a9e57567ae7d --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-fsub.clif @@ -0,0 +1,249 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %fsub_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fsub v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsub.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x93, 0x11, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fsub_splat_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fsub v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsub.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fsub_splat_reverse_f32x4(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = splat.f32x4 v1 + v3 = fsub v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfrsub.vf v5,v1,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x52, 0x15, 
0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fsub_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fsub v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsub.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x93, 0x11, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fsub_splat_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fsub v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsub.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %fsub_splat_reverse_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = splat.f64x2 v1 + v3 = fsub v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfrsub.vf v5,v1,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x52, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-sqrt.clif b/cranelift/filetests/filetests/isa/riscv64/simd-sqrt.clif new file mode 100644 index 000000000000..a0b3b698516b --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-sqrt.clif @@ -0,0 +1,83 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %sqrt_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = sqrt v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsqrt.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 
8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x12, 0x10, 0x4e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqrt_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = sqrt v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vfsqrt.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x12, 0x10, 0x4e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif index 4e74ba91b850..da8c526ca1df 100644 --- a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif @@ -7,6 +7,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %splat_f32x4_2(f32x4) -> f32x4 { block0(v0: f32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-fadd.clif b/cranelift/filetests/filetests/runtests/simd-fadd.clif new file mode 100644 index 000000000000..60764d4a87c2 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fadd.clif @@ -0,0 +1,28 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + + +function %fadd_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fadd v0, v1 + return v2 +} +; run: %fadd_f32x4([0x0.5 0x1.5 0x1.1p10 0x1.4cccccp0], [0x1.0 0x2.9 0x1.400000p1 0x1.800000p0]) == [0x1.5 0x1.fp1 0x1.10ap10 0x1.666666p1] +; run: %fadd_f32x4([0x0.0 -0x0.0 -0x0.0 0x0.0], [-0x0.0 0x0.0 +Inf -Inf]) == [0x0.0 0x0.0 +Inf -Inf] + + +function %fadd_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fadd v0, v1 + return v2 +} +; run: %fadd_f64x2([0x0.5 0x1.5], [0x1.0 0x2.9]) == [0x1.5 0x1.fp1] +; run: %fadd_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [0x0.0 0x0.0] diff --git a/cranelift/filetests/filetests/runtests/simd-fdiv.clif b/cranelift/filetests/filetests/runtests/simd-fdiv.clif new file mode 100644 index 000000000000..3a6381e47275 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fdiv.clif @@ -0,0 +1,26 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + + +function %fdiv_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fdiv v0, v1 + return v2 +} +; run: %fdiv_f32x4([0x1.5 0x1.5 0x1.5 0x1.5], [0x2.9 0x2.9 0x2.9 0x2.9]) == [0x1.063e70p-1 0x1.063e70p-1 0x1.063e70p-1 0x1.063e70p-1] + + +function %fdiv_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fdiv v0, v1 + return v2 +} +; run: 
%fdiv_f64x2([0x1.5 0x1.5], [0x2.9 0x2.9]) == [0x1.063e7063e7064p-1 0x1.063e7063e7064p-1] diff --git a/cranelift/filetests/filetests/runtests/simd-fmul.clif b/cranelift/filetests/filetests/runtests/simd-fmul.clif new file mode 100644 index 000000000000..4f0d5eb68d98 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fmul.clif @@ -0,0 +1,26 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + + +function %fmul_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmul v0, v1 + return v2 +} +; run: %fmul_f32x4([0x1.5 0x1.5 0x1.5 0x1.5], [0x2.9 0x2.9 0x2.9 0x2.9]) == [0x1.ae8p1 0x1.ae8p1 0x1.ae8p1 0x1.ae8p1] + + +function %fmul_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmul v0, v1 + return v2 +} +; run: %fmul_f64x2([0x1.5 0x1.5], [0x2.9 0x2.9]) == [0x1.ae8p1 0x1.ae8p1] diff --git a/cranelift/filetests/filetests/runtests/simd-fneg.clif b/cranelift/filetests/filetests/runtests/simd-fneg.clif new file mode 100644 index 000000000000..643c4f9c3ea7 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fneg.clif @@ -0,0 +1,24 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + +function %fneg_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} +; run: %fneg_f32x4([0x9.0 0x9.0 0x9.0 0x9.0]) == [-0x9.0 -0x9.0 -0x9.0 -0x9.0] + +function %fneg_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + return v1 +} +; run: %fneg_f64x2([0x9.0 0x9.0]) == [-0x9.0 -0x9.0] \ No newline at end of file diff --git a/cranelift/filetests/filetests/runtests/simd-fsub.clif b/cranelift/filetests/filetests/runtests/simd-fsub.clif new file mode 100644 index 000000000000..b04affda5668 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fsub.clif @@ -0,0 +1,26 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + + +function %fsub_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fsub v0, v1 + return v2 +} +; run: %fsub_f32x4([0x0.5 0x0.5 0x0.5 0x0.5], [0x1.0 0x1.0 0x1.0 0x1.0]) == [-0x1.6p-1 -0x1.6p-1 -0x1.6p-1 -0x1.6p-1] + + +function %fsub_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fsub v0, v1 + return v2 +} +; run: %fsub_f64x2([0x0.5 0x0.5], [0x1.0 0x1.0]) == [-0x1.6p-1 -0x1.6p-1] diff --git a/cranelift/filetests/filetests/runtests/simd-sqrt.clif b/cranelift/filetests/filetests/runtests/simd-sqrt.clif new file mode 100644 index 000000000000..10152fbb7117 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-sqrt.clif @@ -0,0 +1,26 @@ +test run +target aarch64 +target s390x +target x86_64 ssse3 has_sse41=false +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + + +function %sqrt_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = sqrt v0 + return v1 +} +; run: %sqrt_f32x4([0x9.0 0x9.0 0x9.0 0x9.0]) == [0x3.0 0x3.0 0x3.0 0x3.0] + +function %sqrt_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = sqrt v0 + return v1 +} +; run: %sqrt_f64x2([0x9.0 0x9.0]) == [0x3.0 0x3.0] +
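The raw `.byte` sequences in the precise-output tests above are the 32-bit OP-V instruction words assembled from the fields this patch adds in `vector.rs`: `funct6`, the operand category (`funct3`), the `vm` bit, and the register specifiers. A minimal sketch of that composition, assuming the standard RVV OP-V field layout; `encode_opv` is illustrative and not a function in this patch:

fn encode_opv(funct6: u32, vm: u32, vs2: u32, vs1: u32, funct3: u32, vd: u32) -> u32 {
    // Major opcode shared by all OP-V (vector) instructions.
    const OPCODE_OP_V: u32 = 0b101_0111;
    (funct6 << 26)
        | (vm << 25)
        | (vs2 << 20)
        | (vs1 << 15)
        | (funct3 << 12)
        | (vd << 7)
        | OPCODE_OP_V
}

fn main() {
    // vfadd.vv v6,v1,v3: funct6=0b000000, funct3=0b001 (OPFVV), unmasked (vm=1).
    assert_eq!(
        encode_opv(0b000000, 1, 1, 3, 0b001, 6).to_le_bytes(),
        [0x57, 0x93, 0x11, 0x02]
    );
    // vfsqrt.v v4,v1: funct6=0b010011 (VFUNARY1), vs1=0b00000 selects vfsqrt,
    // funct3=0b001 (OPFVV); the source vector register is encoded in vs2.
    assert_eq!(
        encode_opv(0b010011, 1, 1, 0, 0b001, 4).to_le_bytes(),
        [0x57, 0x12, 0x10, 0x4e]
    );
}

The second check also illustrates why `vs_is_vs2_encoded` returns true for `VfsqrtV`: the source vector goes in the `vs2` field while `vs1` carries the VFUNARY1 selector `0b00000`.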