From 0e9ce4c231b4b88ce79a1639fbbb5e8bd672d3c3 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Mon, 19 Jun 2023 15:05:37 +0100 Subject: [PATCH] riscv64: Implement SIMD `sqmul_round_sat` and `splat+mul` instructions (#6602) * riscv64: Add splat versions of multiplication instructions * riscv64: Implement `sqmul_round_sat` --- build.rs | 3 - .../codegen/src/isa/riscv64/inst/vector.rs | 14 +- .../codegen/src/isa/riscv64/inst_vector.isle | 30 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 29 +++ .../filetests/isa/riscv64/simd-imul.clif | 158 ++++++++++++++++ .../filetests/isa/riscv64/simd-smulhi.clif | 159 +++++++++++++++++ .../isa/riscv64/simd-sqmulroundsat.clif | 168 ++++++++++++++++++ .../filetests/isa/riscv64/simd-umulhi.clif | 160 +++++++++++++++++ .../runtests/simd-sqmulroundsat-aarch64.clif | 10 ++ .../runtests/simd-sqmulroundsat.clif | 11 ++ 10 files changed, 736 insertions(+), 6 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif diff --git a/build.rs b/build.rs index 10e15a3b578b..0c858e889390 100644 --- a/build.rs +++ b/build.rs @@ -241,14 +241,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_f64x2_cmp", "simd_f64x2_pmin_pmax", "simd_f64x2_rounding", - "simd_i16x8_arith2", "simd_i16x8_cmp", - "simd_i16x8_q15mulr_sat_s", "simd_i32x4_cmp", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_cmp", - "simd_i8x16_arith2", "simd_i8x16_cmp", "simd_load", "simd_splat", diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index a2d1cf5a0036..2d0e83eb68e4 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -318,8 +318,12 @@ impl VecAluOpRRR { | VecAluOpRRR::VfsubVF => 0b000010, VecAluOpRRR::VrsubVX => 0b000011, VecAluOpRRR::VmulVV | VecAluOpRRR::VmulVX => 0b100101, - VecAluOpRRR::VmulhVV => 0b100111, - VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, + VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhVX => 0b100111, + VecAluOpRRR::VmulhuVV + | VecAluOpRRR::VmulhuVX + | VecAluOpRRR::VfmulVV + | VecAluOpRRR::VfmulVF => 0b100100, + VecAluOpRRR::VsmulVV | VecAluOpRRR::VsmulVX => 0b100111, VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101, VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000, VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001, @@ -365,6 +369,7 @@ impl VecAluOpRRR { | VecAluOpRRR::VsubVV | VecAluOpRRR::VssubVV | VecAluOpRRR::VssubuVV + | VecAluOpRRR::VsmulVV | VecAluOpRRR::VsllVV | VecAluOpRRR::VsrlVV | VecAluOpRRR::VsraVV @@ -399,7 +404,9 @@ impl VecAluOpRRR { | VecAluOpRRR::VwsubuVX | VecAluOpRRR::VwsubuWX | VecAluOpRRR::VwsubWX - | VecAluOpRRR::VmulVX => VecOpCategory::OPMVX, + | VecAluOpRRR::VmulVX + | VecAluOpRRR::VmulhVX + | VecAluOpRRR::VmulhuVX => VecOpCategory::OPMVX, VecAluOpRRR::VaddVX | VecAluOpRRR::VsaddVX | VecAluOpRRR::VsadduVX @@ -407,6 +414,7 @@ impl VecAluOpRRR { | VecAluOpRRR::VssubVX | VecAluOpRRR::VssubuVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VsmulVX | VecAluOpRRR::VsllVX | VecAluOpRRR::VsrlVX | VecAluOpRRR::VsraVX diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 3ec6c58f976a..fa8d08564b48 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -107,6 +107,7 @@ (VmulVV) (VmulhVV) (VmulhuVV) + (VsmulVV) (VsllVV) (VsrlVV) (VsraVV) @@ -145,6 +146,9 @@ (VssubVX) 
(VssubuVX) (VmulVX) + (VmulhVX) + (VmulhuVX) + (VsmulVX) (VsllVX) (VsrlVX) (VsraVX) @@ -569,11 +573,37 @@ (rule (rv_vmulh_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vmulh.vx` instruction. +(decl rv_vmulh_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulh_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `vmulhu.vv` instruction. (decl rv_vmulhu_vv (VReg VReg VecOpMasking VState) VReg) (rule (rv_vmulhu_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vmulhu.vx` instruction. +(decl rv_vmulhu_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vmulhu_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VmulhuVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vv` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1)) +(decl rv_vsmul_vv (VReg VReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vv vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVV) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vsmul.vx` instruction. +;; +;; Signed saturating and rounding fractional multiply +;; # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1)) +(decl rv_vsmul_vx (VReg XReg VecOpMasking VState) VReg) +(rule (rv_vsmul_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VsmulVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `sll.vv` instruction. (decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg) (rule (rv_vsll_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index acd5d8db65f3..6978d0785d8e 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -420,6 +420,12 @@ (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y))) (rv_vmul_vv x y (unmasked) ty)) +(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (imul (splat x) y))) + (rv_vmul_vx y x (unmasked) ty)) + +(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (imul x (splat y)))) + (rv_vmul_vx x y (unmasked) ty)) + ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y))) (lower_smlhi ty (sext x ty $I64) (sext y ty $I64))) @@ -427,6 +433,12 @@ (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y))) (rv_vmulh_vv x y (unmasked) ty)) +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (smulhi (splat x) y))) + (rv_vmulh_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x (splat y)))) + (rv_vmulh_vx x y (unmasked) ty)) + ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y))) (lower_umlhi ty (zext x ty $I64) (zext y ty $I64))) @@ -434,6 +446,12 @@ (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y))) (rv_vmulhu_vv x y (unmasked) ty)) +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (umulhi (splat x) y))) + (rv_vmulhu_vx y x (unmasked) ty)) + +(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x (splat y)))) + (rv_vmulhu_vx x y (unmasked) ty)) + ;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_32 ty) (udiv x y))) @@ -1871,3 +1889,14 @@ (elem VReg (rv_vfmv_sf x ty)) (mask VReg (gen_vec_mask 
1))) (rv_vmerge_vvm zero elem mask ty))) + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x y))) + (rv_vsmul_vv x y (unmasked) ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x (splat y)))) + (rv_vsmul_vx x y (unmasked) ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat (splat x) y))) + (rv_vsmul_vx y x (unmasked) ty)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif index 0ad03c98a944..8a4ea2d956d8 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif @@ -169,3 +169,161 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %imul_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %imul_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = imul v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) 
#avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif index fa9d7cf36dc0..4dd615d06f63 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif @@ -169,3 +169,162 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret + +function %smulhi_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %smulhi_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 
v1 + v3 = smulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif new file mode 100644 index 000000000000..ee2519db6ac2 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-sqmulroundsat.clif @@ -0,0 +1,168 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %sqmulrs_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sqmul_round_sat v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x83, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sqmul_round_sat v0, v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x83, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; 
block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x42, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %sqmulrs_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif index c17f81d91496..a4e4002c4852 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif @@ -169,3 +169,163 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret + + +function %umulhi_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) 
+; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %umulhi_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = umulhi v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x62, 0x15, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif index 91554360b664..6980a122e7e5 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif @@ -3,6 +3,7 @@ test run target aarch64 target s390x ;; x86_64 hasn't implemented this for `i32x4` +target riscv64 has_v function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): @@ -11,3 +12,12 @@ block0(v0: i32x4, v1: i32x4): } ; run: %sqmulrs_i32x4([1000 2000 3000 4000], [10000 100000 1000000 10000000]) == [0 0 1 19] ; run: %sqmulrs_i32x4([2147483647 -2147483648 -2147483648 0], [2147483647 -2147483648 2147483647 0]) == [2147483646 2147483647 -2147483647 0] + +function %sqmulrs_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} +; run: %sqmulrs_splat_i32x4([1 2 3 4], 1500000000) == [1 1 2 3] +; run: %sqmulrs_splat_i32x4([2147483647 2147483647 -2147483648 0], 4) == [4 4 -4 0] diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif index 672cf5c4244c..3869442cfbb4 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif @@ -5,6 +5,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %sqmulrs_i16x8(i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8): @@ -13,3 +14,13 @@ block0(v0: i16x8, v1: i16x8): } ; run: %sqmulrs_i16x8([1 2 3 4 5 6 7 8], [1 10 100 1000 10000 15000 20000 25000]) == [0 0 0 0 2 3 4 6] ; run: %sqmulrs_i16x8([32767 32767 -32768 -32768 -32768 -32768 0 0], [32767 32767 -32768 -32768 32767 32767 0 0]) == [32766 32766 32767 32767 -32767 -32767 0 0] + + +function %sqmulrs_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = sqmul_round_sat v0, v2 + return v3 +} +; run: %sqmulrs_splat_i16x8([1 2 3 4 5 6 7 8], 15000) == [0 1 1 2 2 3 3 4] +; run: 
%sqmulrs_splat_i16x8([32767 32767 -32768 -32768 -32768 -32768 0 0], 4) == [4 4 -4 -4 -4 -4 0 0]
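
The run tests above pin down the lane semantics that `vsmul` is being used for: multiply, add the Q-format rounding constant, shift, then saturate. As a cross-check of the expected values, a minimal scalar sketch of the i16 case (illustrative only; `sqmul_round_sat_i16` is a made-up name and not part of the patch):

    // Scalar model of `sqmul_round_sat` on one i16 lane, matching the run tests:
    // q = sat16((a * b + 2^14) >> 15), i.e. Q15 multiply with round-to-nearest-up.
    fn sqmul_round_sat_i16(a: i16, b: i16) -> i16 {
        let product = i32::from(a) * i32::from(b);
        let rounded = (product + (1 << 14)) >> 15;
        rounded.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
    }

    fn main() {
        // Values taken from the run tests above.
        assert_eq!(sqmul_round_sat_i16(5, 10000), 2);
        assert_eq!(sqmul_round_sat_i16(32767, 32767), 32766);
        assert_eq!(sqmul_round_sat_i16(-32768, -32768), 32767); // rounds to 2^15, then clamps
        assert_eq!(sqmul_round_sat_i16(-32768, 4), -4);
    }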