riscv64: Implement SIMD sqmul_round_sat and splat+mul instructions (bytecodealliance#6602)

* riscv64: Add splat versions of multiplication instructions

* riscv64: Implement `sqmul_round_sat`
afonso360 authored Jun 19, 2023
1 parent 4756114 commit 0e9ce4c
Showing 10 changed files with 736 additions and 6 deletions.
3 changes: 0 additions & 3 deletions build.rs
@@ -241,14 +241,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"simd_f64x2_cmp",
"simd_f64x2_pmin_pmax",
"simd_f64x2_rounding",
"simd_i16x8_arith2",
"simd_i16x8_cmp",
"simd_i16x8_q15mulr_sat_s",
"simd_i32x4_cmp",
"simd_i32x4_trunc_sat_f32x4",
"simd_i32x4_trunc_sat_f64x2",
"simd_i64x2_cmp",
"simd_i8x16_arith2",
"simd_i8x16_cmp",
"simd_load",
"simd_splat",
14 changes: 11 additions & 3 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -318,8 +318,12 @@ impl VecAluOpRRR {
| VecAluOpRRR::VfsubVF => 0b000010,
VecAluOpRRR::VrsubVX => 0b000011,
VecAluOpRRR::VmulVV | VecAluOpRRR::VmulVX => 0b100101,
- VecAluOpRRR::VmulhVV => 0b100111,
- VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100,
+ VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhVX => 0b100111,
+ VecAluOpRRR::VmulhuVV
+ | VecAluOpRRR::VmulhuVX
+ | VecAluOpRRR::VfmulVV
+ | VecAluOpRRR::VfmulVF => 0b100100,
+ VecAluOpRRR::VsmulVV | VecAluOpRRR::VsmulVX => 0b100111,
VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101,
VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000,
VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001,
@@ -365,6 +369,7 @@ impl VecAluOpRRR {
| VecAluOpRRR::VsubVV
| VecAluOpRRR::VssubVV
| VecAluOpRRR::VssubuVV
+ | VecAluOpRRR::VsmulVV
| VecAluOpRRR::VsllVV
| VecAluOpRRR::VsrlVV
| VecAluOpRRR::VsraVV
@@ -399,14 +404,17 @@ impl VecAluOpRRR {
| VecAluOpRRR::VwsubuVX
| VecAluOpRRR::VwsubuWX
| VecAluOpRRR::VwsubWX
- | VecAluOpRRR::VmulVX => VecOpCategory::OPMVX,
+ | VecAluOpRRR::VmulVX
+ | VecAluOpRRR::VmulhVX
+ | VecAluOpRRR::VmulhuVX => VecOpCategory::OPMVX,
VecAluOpRRR::VaddVX
| VecAluOpRRR::VsaddVX
| VecAluOpRRR::VsadduVX
| VecAluOpRRR::VsubVX
| VecAluOpRRR::VssubVX
| VecAluOpRRR::VssubuVX
| VecAluOpRRR::VrsubVX
+ | VecAluOpRRR::VsmulVX
| VecAluOpRRR::VsllVX
| VecAluOpRRR::VsrlVX
| VecAluOpRRR::VsraVX
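Note that `funct6` values repeat across rows above — `vmulh` and `vsmul` both use `0b100111`, `vmul` and `vsll` both use `0b100101` — without conflict, because the operand category (OPM vs. OPI, assigned in the second hunk) selects a different `funct3`, so the final instruction words differ. A minimal sketch of how these fields pack into a 32-bit word under the standard V-extension layout; this mirrors, but is not, the backend's real emission path:

```rust
// RVV ALU word layout (standard V-extension encoding assumed):
//   funct6[31:26] | vm[25] | vs2[24:20] | vs1/rs1[19:15] | funct3[14:12] | vd[11:7] | 0b1010111
fn encode_vec_alu(funct6: u32, vm: u32, vs2: u32, vs1: u32, funct3: u32, vd: u32) -> u32 {
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (vs1 << 15) | (funct3 << 12) | (vd << 7) | 0x57
}

fn main() {
    // `vmul.vx v5,v1,a0` (unmasked, vm=1): funct6=0b100101, OPMVX => funct3=0b110.
    // This reproduces the `.byte 0xd7, 0x62, 0x15, 0x96` lines in the
    // disassembled filetests further down.
    let word = encode_vec_alu(0b100101, 1, 1, 10, 0b110, 5);
    assert_eq!(word.to_le_bytes(), [0xd7, 0x62, 0x15, 0x96]);
}
```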
30 changes: 30 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -107,6 +107,7 @@
(VmulVV)
(VmulhVV)
(VmulhuVV)
(VsmulVV)
(VsllVV)
(VsrlVV)
(VsraVV)
@@ -145,6 +146,9 @@
(VssubVX)
(VssubuVX)
(VmulVX)
(VmulhVX)
(VmulhuVX)
(VsmulVX)
(VsllVX)
(VsrlVX)
(VsraVX)
@@ -569,11 +573,37 @@
(rule (rv_vmulh_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulh.vx` instruction.
(decl rv_vmulh_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vmulh_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulhu.vv` instruction.
(decl rv_vmulhu_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vmulhu_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulhu.vx` instruction.
(decl rv_vmulhu_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vmulhu_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhuVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vsmul.vv` instruction.
;;
;; Signed saturating and rounding fractional multiply
;; # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1))
(decl rv_vsmul_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vsmul_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VsmulVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vsmul.vx` instruction.
;;
;; Signed saturating and rounding fractional multiply
;; # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1))
(decl rv_vsmul_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vsmul_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VsmulVX) vs2 vs1 mask vstate))

;; Helper for emitting the `sll.vv` instruction.
(decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vsll_vv vs2 vs1 mask vstate)
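The `clip(roundoff_signed(..., SEW-1))` pseudo-code in the `vsmul` comments above is the RVV spec's fixed-point multiply. A scalar sketch of those semantics at SEW=16 — the shape Wasm's `i16x8.q15mulr_sat_s` needs — assuming the `rnu` (round-to-nearest-up) rounding mode; this models the spec pseudo-code, it is not backend code:

```rust
// Scalar model of `vsmul` at SEW=16 (assumes vxrm = rnu, round-to-nearest-up).
fn vsmul_e16(a: i16, b: i16) -> i16 {
    let prod = a as i32 * b as i32;          // full-width product
    let rounded = (prod + (1 << 14)) >> 15;  // roundoff_signed(prod, SEW-1)
    // clip(): saturate into i16; only i16::MIN * i16::MIN actually overflows.
    rounded.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    assert_eq!(vsmul_e16(i16::MIN, i16::MIN), i16::MAX); // would be +32768: saturates
    assert_eq!(vsmul_e16(0x4000, 0x4000), 0x2000);       // 0.5 * 0.5 = 0.25 in Q15
}
```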
29 changes: 29 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -420,20 +420,38 @@
(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y)))
(rv_vmul_vv x y (unmasked) ty))

(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (imul (splat x) y)))
(rv_vmul_vx y x (unmasked) ty))

(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (imul x (splat y))))
(rv_vmul_vx x y (unmasked) ty))

;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y)))
(lower_smlhi ty (sext x ty $I64) (sext y ty $I64)))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y)))
(rv_vmulh_vv x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (smulhi (splat x) y)))
(rv_vmulh_vx y x (unmasked) ty))

(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x (splat y))))
(rv_vmulh_vx x y (unmasked) ty))

;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y)))
(lower_umlhi ty (zext x ty $I64) (zext y ty $I64)))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y)))
(rv_vmulhu_vv x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (umulhi (splat x) y)))
(rv_vmulhu_vx y x (unmasked) ty))

(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x (splat y))))
(rv_vmulhu_vx x y (unmasked) ty))

;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type (fits_in_32 ty) (udiv x y)))
@@ -1871,3 +1889,14 @@
(elem VReg (rv_vfmv_sf x ty))
(mask VReg (gen_vec_mask 1)))
(rv_vmerge_vvm zero elem mask ty)))

;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x y)))
(rv_vsmul_vv x y (unmasked) ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x (splat y))))
(rv_vsmul_vx x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat (splat x) y)))
(rv_vsmul_vx y x (unmasked) ty))
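Rules 1 and 2 funnel both splat positions into the same `vsmul.vx` with the operands swapped, which is sound because the operation is commutative — the same justification behind the `imul`/`smulhi`/`umulhi` splat rules earlier in this file. A hypothetical standalone check, not part of the commit:

```rust
// Q15 rounding saturating multiply, as in the scalar model above.
fn q15(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32 + (1 << 14)) >> 15)
        .clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    // Operand order never changes the result, so `(splat x, y)` and
    // `(x, splat y)` may share one vector-scalar lowering.
    for &(a, b) in &[(1234, -5678), (i16::MIN, i16::MIN), (0x7fff, i16::MIN)] {
        assert_eq!(q15(a, b), q15(b, a));
    }
}
```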
158 changes: 158 additions & 0 deletions cranelift/filetests/filetests/isa/riscv64/simd-imul.clif
@@ -169,3 +169,161 @@ block0(v0: i64x2, v1: i64x2):
; addi sp, sp, 0x10
; ret

function %imul_splat_i8x16(i8x16, i8) -> i8x16 {
block0(v0: i8x16, v1: i8):
v2 = splat.i8x16 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i16x8(i16x8, i16) -> i16x8 {
block0(v0: i16x8, v1: i16):
v2 = splat.i16x8 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = splat.i32x4 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i64x2(i64x2, i64) -> i64x2 {
block0(v0: i64x2, v1: i64):
v2 = splat.i64x2 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
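In these disassemblies the harness prints raw `.byte` sequences for the vector instructions (`vsetivli`, `vle8.v`, `vmul.vx`, `vse8.v`) its disassembler cannot decode. A hypothetical decoder — standard V-extension encoding assumed, not part of the test suite — showing how the `#avl`/`#vtype` annotations in the VCode correspond to the `vsetivli` words:

```rust
// Decode the fields of a `vsetivli` word (illustrative helper only).
fn decode_vsetivli(word: u32) -> (u32, u32, u32) {
    assert_eq!(word & 0x7f, 0x57);          // OP-V major opcode
    assert_eq!((word >> 12) & 0x7, 0b111);  // funct3 for vset{i}vli
    let avl = (word >> 15) & 0x1f;          // uimm[4:0]: application vector length
    let vtype = (word >> 20) & 0x3ff;       // zimm[9:0]
    let sew = 8 << ((vtype >> 3) & 0x7);    // element width in bits
    let lmul = vtype & 0x7;                 // 0 => m1
    (avl, sew, lmul)
}

fn main() {
    // `.byte 0x57, 0x70, 0x84, 0xcc` from the i16x8 test above corresponds
    // to `#avl=8, #vtype=(e16, m1, ta, ma)`.
    let word = u32::from_le_bytes([0x57, 0x70, 0x84, 0xcc]);
    assert_eq!(decode_vsetivli(word), (8, 16, 0));
}
```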

