From 1d0565ba878928359ce9eb441d4e6a676b15834d Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Thu, 8 Jun 2023 10:39:12 +0100 Subject: [PATCH] riscv64: Implement `{u,s}widen_{low,high}` and `load+extend` instructions (#6534) * riscv64: Add SIMD Load+Extends * riscv64: Add SIMD `{u,s}widen_{low,high}` * riscv64: Add `gen_slidedown_half` This isn't really necessary yet, but we are going to make a lot of use for it in the widening arithmetic instructions, so might as well add it now. * riscv64: Add multi widen SIMD instructions * riscv64: Typo Fix --- build.rs | 7 - cranelift/codegen/src/isa/riscv64/inst.isle | 4 + cranelift/codegen/src/isa/riscv64/inst/mod.rs | 11 +- .../codegen/src/isa/riscv64/inst/vector.rs | 71 ++++- .../codegen/src/isa/riscv64/inst_vector.isle | 56 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 95 +++++++ cranelift/codegen/src/isle_prelude.rs | 5 + cranelift/codegen/src/prelude.isle | 4 + .../isa/riscv64/simd-load-extend.clif | 142 ++++++++++ .../isa/riscv64/simd-load-splat.clif | 141 ++++++++++ .../isa/riscv64/simd-swiden_high.clif | 257 +++++++++++++++++ .../isa/riscv64/simd-swiden_low.clif | 242 ++++++++++++++++ .../isa/riscv64/simd-uwiden_high.clif | 258 ++++++++++++++++++ .../isa/riscv64/simd-uwiden_low.clif | 243 +++++++++++++++++ .../filetests/runtests/simd-swidenhigh.clif | 27 ++ .../filetests/runtests/simd-swidenlow.clif | 28 ++ .../filetests/runtests/simd-uwidenhigh.clif | 28 ++ .../filetests/runtests/simd-uwidenlow.clif | 34 ++- 18 files changed, 1636 insertions(+), 17 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif diff --git a/build.rs b/build.rs index e887ae0a0eee..9cb05298cd64 100644 --- a/build.rs +++ b/build.rs @@ -234,11 +234,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { } let known_failure = [ - "almost_extmul", "canonicalize_nan", "cvt_from_uint", "issue_3327_bnot_lowering", - "simd_align", "simd_conversions", "simd_f32x4", "simd_f32x4_cmp", @@ -251,23 +249,18 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_i16x8_arith2", "simd_i16x8_cmp", "simd_i16x8_extadd_pairwise_i8x16", - "simd_i16x8_extmul_i8x16", "simd_i16x8_q15mulr_sat_s", "simd_i32x4_arith2", "simd_i32x4_cmp", "simd_i32x4_dot_i16x8", "simd_i32x4_extadd_pairwise_i16x8", - "simd_i32x4_extmul_i16x8", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_arith2", "simd_i64x2_cmp", - "simd_i64x2_extmul_i32x4", "simd_i8x16_arith2", "simd_i8x16_cmp", - "simd_int_to_int_extend", "simd_load", - "simd_load_extend", "simd_load_zero", "simd_splat", "v128_select", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index a414ad826ceb..f0a79b9d5dbb 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -1592,6 +1592,10 @@ (decl pure partial uimm5_from_u64 (UImm5) u64) (extern extractor uimm5_from_u64 uimm5_from_u64) +;; Convert a `u64` into an `UImm5` +(decl pure partial u64_to_uimm5 (u64) UImm5) +(rule (u64_to_uimm5 (uimm5_from_u64 n)) n) + (decl uimm5_bitcast_to_imm5 
(UImm5) Imm5) (extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5) diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 5e3cc27b90c2..79c1fbfd9fd3 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -700,7 +700,16 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan debug_assert_eq!(vs.class(), op.src_regclass()); collector.reg_use(vs); - collector.reg_def(vd); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_src_dst_overlaps() { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + vec_mask_operands(mask, collector); } &Inst::VecAluRImm5 { vd, ref mask, .. } => { diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index f68e6b558b0b..7136ccd2d6a8 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -498,6 +498,12 @@ impl VecAluOpRR { VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => { 0b010000 } + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => 0b010010, VecAluOpRR::VfsqrtV => 0b010011, VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, } @@ -506,7 +512,13 @@ impl VecAluOpRR { pub fn category(&self) -> VecOpCategory { match self { VecAluOpRR::VmvSX => VecOpCategory::OPMVX, - VecAluOpRR::VmvXS => VecOpCategory::OPMVV, + VecAluOpRR::VmvXS + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV, VecAluOpRR::VmvVV => VecOpCategory::OPIVV, @@ -527,6 +539,13 @@ impl VecAluOpRR { VecAluOpRR::VfmvFS => 0b00000, // VFUNARY1 VecAluOpRR::VfsqrtV => 0b00000, + // VXUNARY0 + VecAluOpRR::VzextVF8 => 0b00010, + VecAluOpRR::VsextVF8 => 0b00011, + VecAluOpRR::VzextVF4 => 0b00100, + VecAluOpRR::VsextVF4 => 0b00101, + VecAluOpRR::VzextVF2 => 0b00110, + VecAluOpRR::VsextVF2 => 0b00111, // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, @@ -538,7 +557,15 @@ impl VecAluOpRR { /// other way around. As far as I can tell only vmv.v.* are backwards. 
pub fn vs_is_vs2_encoded(&self) -> bool { match self { - VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true, + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => true, VecAluOpRR::VmvSX | VecAluOpRR::VfmvSF | VecAluOpRR::VmvVV @@ -554,7 +581,13 @@ impl VecAluOpRR { | VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF - | VecAluOpRR::VfsqrtV => RegClass::Vector, + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => RegClass::Vector, VecAluOpRR::VmvXS => RegClass::Int, VecAluOpRR::VfmvFS => RegClass::Float, } @@ -562,13 +595,33 @@ impl VecAluOpRR { pub fn src_regclass(&self) -> RegClass { match self { - VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV | VecAluOpRR::VfsqrtV => { - RegClass::Vector - } + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VmvVV + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => RegClass::Vector, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, } } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => true, + _ => false, + } + } } impl fmt::Display for VecAluOpRR { @@ -579,6 +632,12 @@ impl fmt::Display for VecAluOpRR { VecAluOpRR::VfmvSF => "vfmv.s.f", VecAluOpRR::VfmvFS => "vfmv.f.s", VecAluOpRR::VfsqrtV => "vfsqrt.v", + VecAluOpRR::VzextVF2 => "vzext.vf2", + VecAluOpRR::VzextVF4 => "vzext.vf4", + VecAluOpRR::VzextVF8 => "vzext.vf8", + VecAluOpRR::VsextVF2 => "vsext.vf2", + VecAluOpRR::VsextVF4 => "vsext.vf4", + VecAluOpRR::VsextVF8 => "vsext.vf8", VecAluOpRR::VmvVV => "vmv.v.v", VecAluOpRR::VmvVX => "vmv.v.x", VecAluOpRR::VfmvVF => "vfmv.v.f", diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 126551791e83..bd021c7cf458 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -186,6 +186,12 @@ (VmvVX) (VfmvVF) (VfsqrtV) + (VsextVF2) + (VsextVF4) + (VsextVF8) + (VzextVF2) + (VzextVF4) + (VzextVF8) )) ;; Returns the canonical destination type for a VecAluOpRRImm5. @@ -745,6 +751,42 @@ (rule (rv_vmslt_vx vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate)) +;; Helper for emitting the `vzext.vf2` instruction. +;; Zero-extend SEW/2 source to SEW destination +(decl rv_vzext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF2) vs mask vstate)) + +;; Helper for emitting the `vzext.vf4` instruction. +;; Zero-extend SEW/4 source to SEW destination +(decl rv_vzext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF4) vs mask vstate)) + +;; Helper for emitting the `vzext.vf8` instruction. 
+;; Zero-extend SEW/8 source to SEW destination +(decl rv_vzext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF8) vs mask vstate)) + +;; Helper for emitting the `vsext.vf2` instruction. +;; Sign-extend SEW/2 source to SEW destination +(decl rv_vsext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF2) vs mask vstate)) + +;; Helper for emitting the `vsext.vf4` instruction. +;; Sign-extend SEW/4 source to SEW destination +(decl rv_vsext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF4) vs mask vstate)) + +;; Helper for emitting the `vsext.vf8` instruction. +;; Sign-extend SEW/8 source to SEW destination +(decl rv_vsext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF8) vs mask vstate)) + ;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl gen_extractlane (Type VReg u8) Reg) @@ -796,3 +838,17 @@ (mem_flags_trusted) (unmasked) ty)) + + +;; Emits a vslidedown instruction that moves half the lanes down. +(decl gen_slidedown_half (Type VReg) VReg) + +;; If the lane count can fit in a 5-bit immediate, we can use `vslidedown.vi`. +(rule 1 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let (uimm5_from_u64 amt) (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vi src amt (unmasked) ty)) + +;; Otherwise lower it into an X register. +(rule 0 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let amt (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vx src (imm $I64 amt) (unmasked) ty)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index de6f1d3a793f..4a2d14503751 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1087,6 +1087,51 @@ (let ((eew VecElementWidth (element_width_from_type ty))) (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty))) +;;;;; Rules for Load + Extend Combos ;;;;;;;;; + +;; These rules cover the special loads that load a 64bit value and do some sort of extension. +;; We don't have any special instructions to do this, so just load the 64 bits as a vector, and +;; do a SEW/2 extension. This only reads half width elements from the source vector register +;; extends it, and writes the back the full register. 
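+;;
+;; As a rough sketch, `uload16x4` is expected to lower to the following pair of
+;; instructions (register numbers as in the `simd-load-extend.clif` filetest
+;; added by this patch):
+;;
+;;   vle64.v   v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma)
+;;   vzext.vf2 v5,v3    #avl=4, #vtype=(e32, m1, ta, ma)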
+ +(decl gen_load64_extend (Type ExtendOp MemFlags XReg Offset32) VReg) + +(rule (gen_load64_extend ty (ExtendOp.Signed) flags addr offset) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state))) + (rv_vsext_vf2 loaded (unmasked) ty))) + +(rule (gen_load64_extend ty (ExtendOp.Zero) flags addr offset) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state))) + (rv_vzext_vf2 loaded (unmasked) ty))) + +;;;;; Rules for `uload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (uload8x8 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `uload16x4`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (uload16x4 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `uload32x2`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (uload32x2 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `sload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (sload8x8 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + +;;;;; Rules for `sload16x4`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (sload16x4 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + +;;;;; Rules for `sload32x2`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (sload32x2 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + ;;;;; Rules for `istore8`;;;;;;;;; (rule (lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset)) @@ -1428,3 +1473,53 @@ (y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty)) (y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty))) (rv_vor_vv x_lanes y_lanes (unmasked) ty))) + +;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a signed extension. +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high x @ (value_type in_ty)))) + (rv_vsext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high (swiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a zero extension. 
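+;;
+;; For example, `uwiden_high` on an `i16x8` slides the upper four lanes down and
+;; then zero-extends them (register numbers as in the `simd-uwiden_high.clif`
+;; filetest added by this patch):
+;;
+;;   vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma)
+;;   vzext.vf2     v6,v4   #avl=4, #vtype=(e32, m1, ta, ma)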
+(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high x @ (value_type in_ty)))) + (rv_vzext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high (uwiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low x))) + (rv_vsext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low x)))) + (rv_vsext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low (swiden_low x))))) + (rv_vsext_vf8 x (unmasked) out_ty)) + +;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low x))) + (rv_vzext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low x)))) + (rv_vzext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low (uwiden_low x))))) + (rv_vzext_vf8 x (unmasked) out_ty)) diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 2c252ddb54d4..0a21c3750f2e 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -237,6 +237,11 @@ macro_rules! isle_common_prelude_methods { u64::MAX >> shift } + #[inline] + fn ty_lane_count(&mut self, ty: Type) -> u64 { + ty.lane_count() as u64 + } + #[inline] fn ty_umin(&mut self, _ty: Type) -> u64 { 0 diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 2f910f399c23..8757978827f2 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -265,6 +265,10 @@ (decl pure ty_lane_mask (Type) u64) (extern constructor ty_lane_mask ty_lane_mask) +;; Get the number of lanes for a given type. +(decl pure ty_lane_count (Type) u64) +(extern constructor ty_lane_count ty_lane_count) + ;; Get the byte width of a given type. 
(decl pure ty_bytes (Type) u16) (extern constructor ty_bytes ty_bytes) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif b/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif new file mode 100644 index 000000000000..9342ca8d3e5a --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif @@ -0,0 +1,142 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %uload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %uload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %uload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 
0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif b/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif new file mode 100644 index 000000000000..70b8072cf7a0 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif @@ -0,0 +1,141 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %load_splat_i8x16(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; VCode: +; block0: +; lb a2,0(a0) +; vmv.v.x v4,a2 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lb a2, 0(a0) +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i16x8(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; VCode: +; block0: +; lh a2,0(a0) +; vmv.v.x v4,a2 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lh a2, 0(a0) +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i32x4(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; VCode: +; block0: +; lw a2,0(a0) +; vmv.v.x v4,a2 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lw a2, 0(a0) +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; VCode: +; block0: +; ld a2,0(a0) +; vmv.v.x v4,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ld a2, 0(a0) +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_f32x4(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; VCode: +; block0: +; flw ft4,0(a0) +; vfmv.v.f v4,ft4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; flw ft4, 0(a0) +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x52, 0x02, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_f64x2(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; VCode: +; block0: +; fld ft4,0(a0) +; vfmv.v.f v4,ft4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; fld ft4, 0(a0) +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x52, 0x02, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif new file mode 100644 index 000000000000..8c2e09f04d57 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif @@ -0,0 +1,257 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %swidenhigh_i8x16(i8x16) 
-> i16x8 { +block0(v0: i8x16): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x14, 0x3e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x12, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x32, 0x11, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,12 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x16, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa3, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 
0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,6 #avl=8, #vtype=(e16, m1, ta, ma) +; vsext.vf4 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x13, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + v3 = swiden_high v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,14 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf8 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x17, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x41, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif new file mode 100644 index 000000000000..4885cf96e09f --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif @@ -0,0 +1,242 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %swidenlow_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; 
+; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa2, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + v3 = swiden_low v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf8 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 
0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x11, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif new file mode 100644 index 000000000000..63849e265c36 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif @@ -0,0 +1,258 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %uwidenhigh_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x14, 0x3e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x12, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x32, 0x11, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, 
m1, ta, ma) +; vslidedown.vi v4,v1,12 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x16, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x23, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,6 #avl=8, #vtype=(e16, m1, ta, ma) +; vzext.vf4 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x13, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + v3 = uwiden_high v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,14 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf8 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x17, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x41, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif new file mode 100644 index 000000000000..5c273814c539 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif @@ -0,0 +1,243 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %uwidenlow_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 
0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x22, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = 
uwiden_low v0 + v2 = uwiden_low v1 + v3 = uwiden_low v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf8 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x11, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif index 813826e817c7..3a9ca809a520 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %swidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -27,3 +28,29 @@ block0(v0: i32x4): return v1 } ; run: %swidenhigh_i32x4([1 -2 3 -4]) == [3 -4] + +function %swidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} +; run: %swidenhigh_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [13 -14 15 -16] + +function %swidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} +; run: %swidenhigh_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [7 -8] + + +function %swidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + v3 = swiden_high v2 + return v3 +} +; run: %swidenhigh_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [15 -16] diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif index c671f3781ebe..e1d54353dee6 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %swidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -27,3 +28,30 @@ block0(v0: i32x4): return v1 } ; run: %swidenlow_i32x4([1 -2 3 -4]) == [1 -2] + + +function %swidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} +; run: %swidenlow_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 -2 3 -4] + +function %swidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} +; run: %swidenlow_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [1 -2] + + +function %swidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + v3 = swiden_low v2 + return v3 +} +; run: %swidenlow_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 -2] diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif index 56b68a3830c8..f636b3cb07e7 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %uwidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -30,3 +31,30 @@ block0(v0: i32x4): } ; run: %uwidenhigh_i32x4([1 2 3 4]) == [3 4] ; run: %uwidenhigh_i32x4([4 5 6 -1]) == [6 0xffffffff] + + +function %uwidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} +; run: %uwidenhigh_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [13 0xF2 15 0xF0] + +function %uwidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} +; run: %uwidenhigh_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [7 0xFFF8] + + +function %uwidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + v3 = uwiden_high v2 + return v3 +} +; run: %uwidenhigh_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [15 0xF0] diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif index e5b2801b8410..5b2a1963e971 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif @@ -6,24 +6,52 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %uwidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] +; run: %uwidenlow_i8x16([1 -2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [1 0xFE 3 4 5 6 7 8] function %uwidenlow_i16x8(i16x8) -> i32x4 { block0(v0: i16x8): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i16x8([1 2 3 4 5 6 7 8]) == [1 2 3 4] +; run: %uwidenlow_i16x8([1 -2 3 4 5 6 7 8]) == [1 0xFFFE 3 4] function %uwidenlow_i32x4(i32x4) -> i64x2 { block0(v0: i32x4): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i32x4([1 2 3 4]) == [1 2] +; run: %uwidenlow_i32x4([1 -2 3 4]) == [1 0xFFFFFFFE] + + +function %uwidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} +; run: %uwidenlow_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 0xFE 3 0xFC] + +function %uwidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} +; run: %uwidenlow_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [1 0xFFFE] + + +function %uwidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + v3 = uwiden_low v2 + return v3 +} +; run: %uwidenlow_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 0xFE]