From 0e9ce4c231b4b88ce79a1639fbbb5e8bd672d3c3 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Mon, 19 Jun 2023 15:05:37 +0100 Subject: [PATCH] riscv64: Implement SIMD `sqmul_round_sat` and `splat+mul` instructions (#6602) * riscv64: Add splat versions of multiplication instructions * riscv64: Implement `sqmul_round_sat` --- build.rs | 3 - .../codegen/src/isa/riscv64/inst/vector.rs | 14 +- .../codegen/src/isa/riscv64/inst_vector.isle | 30 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 29 +++ .../filetests/isa/riscv64/simd-imul.clif | 158 ++++++++++++++++ .../filetests/isa/riscv64/simd-smulhi.clif | 159 +++++++++++++++++ .../isa/riscv64/simd-sqmulroundsat.clif | 168 ++++++++++++++++++ .../filetests/isa/riscv64/simd-umulhi.clif | 160 +++++++++++++++++ .../runtests/simd-sqmulroundsat-aarch64.clif | 10 ++ .../runtests/simd-sqmulroundsat.clif | 11 ++ 10 files changed, 736 insertions(+), 6 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif diff --git a/build.rs b/build.rs index 10e15a3b578b..0c858e889390 100644 --- a/build.rs +++ b/build.rs @@ -241,14 +241,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_f64x2_cmp", "simd_f64x2_pmin_pmax", "simd_f64x2_rounding", - "simd_i16x8_arith2", "simd_i16x8_cmp", - "simd_i16x8_q15mulr_sat_s", "simd_i32x4_cmp", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_cmp", - "simd_i8x16_arith2", "simd_i8x16_cmp", "simd_load", "simd_splat", diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index a2d1cf5a0036..2d0e83eb68e4 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -318,8 +318,12 @@ impl VecAluOpRRR { | VecAluOpRRR::VfsubVF => 0b000010, VecAluOpRRR::VrsubVX => 0b000011, VecAluOpRRR::VmulVV | VecAluOpRRR::VmulVX => 0b100101, - VecAluOpRRR::VmulhVV => 0b100111, - VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, + VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhVX => 0b100111, + VecAluOpRRR::VmulhuVV + | VecAluOpRRR::VmulhuVX + | VecAluOpRRR::VfmulVV + | VecAluOpRRR::VfmulVF => 0b100100, + VecAluOpRRR::VsmulVV | VecAluOpRRR::VsmulVX => 0b100111, VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101, VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000, VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001, @@ -365,6 +369,7 @@ impl VecAluOpRRR { | VecAluOpRRR::VsubVV | VecAluOpRRR::VssubVV | VecAluOpRRR::VssubuVV + | VecAluOpRRR::VsmulVV | VecAluOpRRR::VsllVV | VecAluOpRRR::VsrlVV | VecAluOpRRR::VsraVV @@ -399,7 +404,9 @@ impl VecAluOpRRR { | VecAluOpRRR::VwsubuVX | VecAluOpRRR::VwsubuWX | VecAluOpRRR::VwsubWX - | VecAluOpRRR::VmulVX => VecOpCategory::OPMVX, + | VecAluOpRRR::VmulVX + | VecAluOpRRR::VmulhVX + | VecAluOpRRR::VmulhuVX => VecOpCategory::OPMVX, VecAluOpRRR::VaddVX | VecAluOpRRR::VsaddVX | VecAluOpRRR::VsadduVX @@ -407,6 +414,7 @@ impl VecAluOpRRR { | VecAluOpRRR::VssubVX | VecAluOpRRR::VssubuVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VsmulVX | VecAluOpRRR::VsllVX | VecAluOpRRR::VsrlVX | VecAluOpRRR::VsraVX diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 3ec6c58f976a..fa8d08564b48 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -107,6 +107,7 @@ (VmulVV) (VmulhVV) (VmulhuVV) + (VsmulVV) (VsllVV) (VsrlVV) (VsraVV) @@ -145,6 +146,9 @@ (VssubVX) 
(VssubuVX) (VmulVX) + (VmulhVX) + (VmulhuVX) + (VsmulVX) (VsllVX) (VsrlVX) (VsraVX) @@ -569,11 +573,37 @@ (rule (rv_vmulh_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vmulh.vx` instruction. +(decl rv_vmulh_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulh_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `vmulhu.vv` instruction. (decl rv_vmulhu_vv (VReg VReg VecOpMasking VState) VReg) (rule (rv_vmulhu_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vmulhu.vx` instruction. +(decl rv_vmulhu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulhu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vv` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1)) +(decl rv_vsmul_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vx` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1)) +(decl rv_vsmul_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `sll.vv` instruction. (decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg) (rule (rv_vsll_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index acd5d8db65f3..6978d0785d8e 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -420,6 +420,12 @@ (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y))) (rv_vmul_vv x y (unmasked) ty)) +(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (imul (splat x) y))) + (rv_vmul_vx y x (unmasked) ty)) + +(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (imul x (splat y)))) + (rv_vmul_vx x y (unmasked) ty)) + ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y))) (lower_smlhi ty (sext x ty $I64) (sext y ty $I64))) @@ -427,6 +433,12 @@ (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y))) (rv_vmulh_vv x y (unmasked) ty)) +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (smulhi (splat x) y))) + (rv_vmulh_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x (splat y)))) + (rv_vmulh_vx x y (unmasked) ty)) + ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y))) (lower_umlhi ty (zext x ty $I64) (zext y ty $I64))) @@ -434,6 +446,12 @@ (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y))) (rv_vmulhu_vv x y (unmasked) ty)) +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (umulhi (splat x) y))) + (rv_vmulhu_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x (splat y)))) + (rv_vmulhu_vx x y (unmasked) ty)) + ;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_32 ty) (udiv x y))) @@ -1871,3 +1889,14 @@ (elem VReg (rv_vfmv_sf x ty)) (mask VReg (gen_vec_mask 
1))) (rv_vmerge_vvm zero elem mask ty))) + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x y))) + (rv_vsmul_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x (splat y)))) + (rv_vsmul_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat (splat x) y))) + (rv_vsmul_vx y x (unmasked) ty)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif index 0ad03c98a944..8a4ea2d956d8 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif @@ -169,3 +169,161 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %imul_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) 
#avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif index fa9d7cf36dc0..4dd615d06f63 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif @@ -169,3 +169,162 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret + +function %smulhi_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 
v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif new file mode 100644 index 000000000000..ee2519db6ac2 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif @@ -0,0 +1,168 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %sqmulrs_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sqmul_round_sat v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x83, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sqmul_round_sat v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x83, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; 
block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x42, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif index c17f81d91496..a4e4002c4852 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif @@ -169,3 +169,163 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret + + +function %umulhi_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) 
+; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif index 91554360b664..6980a122e7e5 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif @@ -3,6 +3,7 @@ test run target aarch64 target s390x ;; x86_64 hasn't implemented this for `i32x4` +target riscv64 has_v function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): @@ -11,3 +12,12 @@ block0(v0: i32x4, v1: i32x4): } ; run: %sqmulrs_i32x4([1000 2000 3000 4000], [10000 100000 1000000 10000000]) == [0 0 1 19] ; run: %sqmulrs_i32x4([2147483647 -2147483648 -2147483648 0], [2147483647 -2147483648 2147483647 0]) == [2147483646 2147483647 -2147483647 0] + +function %sqmulrs_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} +; run: %sqmulrs_splat_i32x4([1 2 3 4], 1500000000) == [1 1 2 3] +; run: %sqmulrs_splat_i32x4([2147483647 2147483647 -2147483648 0], 4) == [4 4 -4 0] diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif index 672cf5c4244c..3869442cfbb4 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif @@ -5,6 +5,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %sqmulrs_i16x8(i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8): @@ -13,3 +14,13 @@ block0(v0: i16x8, v1: i16x8): } ; run: %sqmulrs_i16x8([1 2 3 4 5 6 7 8], [1 10 100 1000 10000 15000 20000 25000]) == [0 0 0 0 2 3 4 6] ; run: %sqmulrs_i16x8([32767 32767 -32768 -32768 -32768 -32768 0 0], [32767 32767 -32768 -32768 32767 32767 0 0]) == [32766 32766 32767 32767 -32767 -32767 0 0] + + +function %sqmulrs_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} +; run: %sqmulrs_splat_i16x8([1 2 3 4 5 6 7 8], 15000) == [0 1 1 2 2 3 3 4] +; run: 
%sqmulrs_splat_i16x8([32767 32767 -32768 -32768 -32768 -32768 0 0], 4) == [4 4 -4 -4 -4 -4 0 0]
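
The run tests above pin down the lane semantics that `vsmul` is being used for: multiply, add the Q-format rounding constant, shift, then saturate. As a cross-check of the expected values, a minimal scalar sketch of the i16 case (illustrative only; `sqmul_round_sat_i16` is a made-up name and not part of the patch):

    // Scalar model of `sqmul_round_sat` on one i16 lane, matching the run tests:
    // q = sat16((a * b + 2^14) >> 15), i.e. Q15 multiply with round-to-nearest-up.
    fn sqmul_round_sat_i16(a: i16, b: i16) -> i16 {
        let product = i32::from(a) * i32::from(b);
        let rounded = (product + (1 << 14)) >> 15;
        rounded.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
    }

    fn main() {
        // Values taken from the run tests above.
        assert_eq!(sqmul_round_sat_i16(5, 10000), 2);
        assert_eq!(sqmul_round_sat_i16(32767, 32767), 32766);
        assert_eq!(sqmul_round_sat_i16(-32768, -32768), 32767); // rounds to 2^15, then clamps
        assert_eq!(sqmul_round_sat_i16(-32768, 4), -4);
    }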