From 47561149c8f3cae1150c82ef824be390c62cb273 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 17 Jun 2023 22:27:53 +0100 Subject: [PATCH] riscv64: Implement a few misc SIMD instructions (#6598) * riscv64: Add immediate rule to `gen_vec_mask` * riscv64: Implement `scalar_to_vector` * riscv64: Implement vector `select` * riscv64: Implement SIMD `iabs` * wasmtime: Enable SIMD memory64 tests for riscv64 * cranelift: Update targets for `simd-select` tests --- build.rs | 9 - cranelift/codegen/src/isa/riscv64/inst.isle | 12 - .../codegen/src/isa/riscv64/inst_vector.isle | 11 +- cranelift/codegen/src/isa/riscv64/lower.isle | 35 ++- .../codegen/src/isa/riscv64/lower/isle.rs | 2 +- .../filetests/isa/riscv64/simd-iabs.clif | 166 +++++++++++ .../isa/riscv64/simd-insertlane.clif | 112 ++++---- .../isa/riscv64/simd-scalartovector.clif | 159 ++++++++++ .../filetests/isa/riscv64/simd-select.clif | 272 ++++++++++++++++++ .../filetests/runtests/simd-iabs.clif | 1 + .../runtests/simd-scalartovector-aarch64.clif | 1 + .../runtests/simd-scalartovector.clif | 1 + .../filetests/runtests/simd-select.clif | 65 +++++ 13 files changed, 753 insertions(+), 93 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-select.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-select.clif diff --git a/build.rs b/build.rs index 28dceb3ee1f6..10e15a3b578b 100644 --- a/build.rs +++ b/build.rs @@ -228,11 +228,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { return true; } - // The memory64 testsuite has a single SIMD test that we don't pass yet. - if testname == "simd" && testsuite == "memory64" { - return true; - } - let known_failure = [ "canonicalize_nan", "cvt_from_uint", @@ -249,18 +244,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_i16x8_arith2", "simd_i16x8_cmp", "simd_i16x8_q15mulr_sat_s", - "simd_i32x4_arith2", "simd_i32x4_cmp", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", - "simd_i64x2_arith2", "simd_i64x2_cmp", "simd_i8x16_arith2", "simd_i8x16_cmp", "simd_load", - "simd_load_zero", "simd_splat", - "v128_select", ] .contains(&testname); diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index a02145caa417..ec70bbe361d9 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -2673,18 +2673,6 @@ (gen_select_reg (IntCC.SignedGreaterThan) x y x y)) -(decl lower_iabs (Type XReg) XReg) - -; I64 and lower -; Generate the following code: -; sext.{b,h,w} a0, a0 -; neg a1, a0 -; max a0, a0, a1 -(rule (lower_iabs (fits_in_64 ty) val) - (let ((extended XReg (sext val ty $I64)) - (negated XReg (rv_neg extended))) - (max $I64 extended negated))) - (decl gen_trapif (XReg TrapCode) InstOutput) (rule (gen_trapif test trap_code) diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index baef0578a1a1..3ec6c58f976a 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -1006,14 +1006,17 @@ ;; Build a vector mask from a u64 -;; TODO: We should merge this with the `vconst` rules, and take advantage of -;; the other existing `vconst` rules. One example is using `vmv.v.i` which -;; can represent some of these masks. 
+;; TODO(#6571): We should merge this with the `vconst` rules, and take advantage of +;; the other existing `vconst` rules. (decl gen_vec_mask (u64) VReg) +;; When the immediate fits in a 5-bit immediate, we can use `vmv.v.i` directly. +(rule 1 (gen_vec_mask (imm5_from_u64 imm)) + (rv_vmv_vi imm (vstate_from_type $I64X2))) + ;; Materialize the mask into an X register, and move it into the bottom of ;; the vector register. -(rule (gen_vec_mask mask) +(rule 0 (gen_vec_mask mask) (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2))) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 93e0bda7fd92..acd5d8db65f3 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1556,9 +1556,23 @@ (load_ra)) ;;; Rules for `iabs` ;;;;;;;;;;;;; -(rule - (lower (has_type (fits_in_64 ty) (iabs x))) - (lower_iabs ty x)) + +;; I64 and lower +;; Generate the following code: +;; sext.{b,h,w} a0, a0 +;; neg a1, a0 +;; max a0, a0, a1 +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (iabs x))) + (let ((extended XReg (sext x ty $I64)) + (negated XReg (rv_neg extended))) + (max $I64 extended negated))) + +;; For vectors we generate the same code, but with vector instructions +;; we can skip the sign extension, since the vector unit will only process +;; Element Sized chunks. +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (iabs x))) + (let ((negated VReg (rv_vneg_v x (unmasked) ty))) + (rv_vmax_vv x negated (unmasked) ty))) ;;;; Rules for calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1842,3 +1856,18 @@ (xor VReg (rv_vxor_vv x y (unmasked) ty)) (rhs VReg (rv_vssrl_vi xor one (unmasked) ty))) (rv_vadd_vv lhs rhs (unmasked) ty))) + +;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x))) + (if (ty_vector_not_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vxm zero x mask ty))) + +(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x))) + (if (ty_vector_float ty)) + (let ((zero VReg (rv_vmv_vx (zero_reg) ty)) + (elem VReg (rv_vfmv_sf x ty)) + (mask VReg (gen_vec_mask 1))) + (rv_vmerge_vvm zero elem mask ty))) diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index c51d294fc55d..c03a93640131 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -224,7 +224,7 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend> } else { vec![self.temp_writable_reg(I64), self.temp_writable_reg(I64)] } - } else if ty.is_float() { + } else if ty.is_float() || ty.is_vector() { vec![self.temp_writable_reg(ty)] } else { unimplemented!("ty:{:?}", ty) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif new file mode 100644 index 000000000000..b14ebfa34f23 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif @@ -0,0 +1,166 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %iabs_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; 
vmax.vv v6,v1,v4 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iabs_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iabs v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmax.vv v6,v1,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x03, 0x12, 0x1e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif index 0f8c14dfb6f2..975dbbf2def2 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-insertlane.clif @@ -102,10 +102,9 @@ block0(v0: i32x4, v1: i32): ; mv fp,sp ; block0: ; 
vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,4 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vxm v9,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,4 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v1,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -121,13 +120,12 @@ block0(v0: i32x4, v1: i32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 4 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x30, 0x02, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0xd7, 0x43, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x84, 0x05, 0x02 +; .byte 0xa7, 0x83, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -146,10 +144,9 @@ block0(v0: i64x2, v1: i64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vxm v9,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v1,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -165,12 +162,11 @@ block0(v0: i64x2, v1: i64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 0xd7, 0x44, 0x15, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x43, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x84, 0x05, 0x02 +; .byte 0xa7, 0x83, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -189,10 +185,9 @@ block0(v0: f64x2, v1: f64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -208,12 +203,11 @@ block0(v0: f64x2, v1: f64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -232,10 +226,9 @@ block0(v0: f64x2, v1: f64): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,2 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -251,12 +244,11 @@ block0(v0: f64x2, v1: f64): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 2 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 -; .byte 
0xd7, 0x54, 0x15, 0x5c +; .byte 0x57, 0x30, 0x01, 0x5e +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -275,10 +267,9 @@ block0(v0: f32x4, v1: f32): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,1 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -294,13 +285,12 @@ block0(v0: f32x4, v1: f32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0xb0, 0x00, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -319,10 +309,9 @@ block0(v0: f32x4, v1: f32): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a2,2 -; vmv.s.x v0,a2 #avl=2, #vtype=(e64, m1, ta, ma) -; vfmerge.vfm v9,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,2 #avl=2, #vtype=(e64, m1, ta, ma) +; vfmerge.vfm v7,v1,fa0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -338,13 +327,12 @@ block0(v0: f32x4, v1: f32): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a2, zero, 2 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x60, 0x06, 0x42 +; .byte 0x57, 0x30, 0x01, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0xd7, 0x54, 0x15, 0x5c +; .byte 0xd7, 0x53, 0x15, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xa7, 0x04, 0x05, 0x02 +; .byte 0xa7, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -453,10 +441,9 @@ block0(v0: i32x4): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a1,4 -; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vim v8,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,4 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v6,v1,15,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -472,13 +459,12 @@ block0(v0: i32x4): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a1, zero, 4 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xe0, 0x05, 0x42 +; .byte 0x57, 0x30, 0x02, 0x5e ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xb4, 0x17, 0x5c +; .byte 0x57, 0xb3, 0x17, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 @@ -498,10 +484,9 @@ block0(v0: i64x2): ; mv fp,sp ; block0: ; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; li a1,1 -; vmv.s.x v0,a1 #avl=2, #vtype=(e64, m1, ta, ma) -; vmerge.vim v8,v1,-9,v0.t #avl=2, #vtype=(e64, m1, ta, ma) -; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vim v6,v1,-9,v0.t #avl=2, 
#vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -517,12 +502,11 @@ block0(v0: i64x2): ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 ; .byte 0x87, 0x80, 0x0f, 0x02 -; addi a1, zero, 1 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xe0, 0x05, 0x42 -; .byte 0x57, 0xb4, 0x1b, 0x5c +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0xb3, 0x1b, 0x5c ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x04, 0x05, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif b/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif new file mode 100644 index 000000000000..b588dd312189 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-scalartovector.clif @@ -0,0 +1,159 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %scalartovector_i8(i8) -> i8x16 { +block0(v0: i8): + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=16, #vtype=(e8, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i16(i16) -> i16x8 { +block0(v0: i16): + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=8, #vtype=(e16, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i32(i32) -> i32x4 { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=4, #vtype=(e32, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_i64(i64) -> i64x2 { +block0(v0: i64): + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=2, #vtype=(e64, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vxm v7,v3,a0,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x43, 0x35, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ret + +function %scalartovector_f32(f32) -> f32x4 
{ +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=4, #vtype=(e32, m1, ta, ma) +; vfmv.s.f v5,fa0 #avl=4, #vtype=(e32, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v9,v3,v5,v0.t #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0xd7, 0x52, 0x05, 0x42 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x84, 0x32, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ret + +function %scalartovector_f64(f64) -> f64x2 { +block0(v0: f64): + v1 = scalar_to_vector.f64x2 v0 + return v1 +} + +; VCode: +; block0: +; vmv.v.x v3,zero #avl=2, #vtype=(e64, m1, ta, ma) +; vfmv.s.f v5,fa0 #avl=2, #vtype=(e64, m1, ta, ma) +; vmv.v.i v0,1 #avl=2, #vtype=(e64, m1, ta, ma) +; vmerge.vvm v9,v3,v5,v0.t #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v9,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x41, 0x00, 0x5e +; .byte 0xd7, 0x52, 0x05, 0x42 +; .byte 0x57, 0xb0, 0x00, 0x5e +; .byte 0xd7, 0x84, 0x32, 0x5c +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x04, 0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-select.clif b/cranelift/filetests/filetests/isa/riscv64/simd-select.clif new file mode 100644 index 000000000000..c2596fd50d7b --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-select.clif @@ -0,0 +1,272 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %select_i64x2(i64, i64x2, i64x2) -> i64x2 { +block0(v0: i64, v1: i64x2, v2: i64x2): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_i64x2 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i32x4(i32, i32x4, i32x4) -> i32x4 { +block0(v0: i32, v1: i32x4, v2: i32x4): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; slli a4,a0,32 +; srli a6,a4,32 +; select_i32x4 v11,v2,v4##condition=a6 +; vse8.v v11,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; slli a4, a0, 0x20 +; srli a6, a4, 0x20 +; beqz a6, 0xc +; .byte 0xd7, 0x35, 0x20, 0x9e +; j 8 +; .byte 
0xd7, 0x35, 0x40, 0x9e +; .byte 0xa7, 0x85, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i16x8(i16, i16x8, i16x8) -> i16x8 { +block0(v0: i16, v1: i16x8, v2: i16x8): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; slli a4,a0,48 +; srli a6,a4,48 +; select_i16x8 v11,v2,v4##condition=a6 +; vse8.v v11,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; slli a4, a0, 0x30 +; srli a6, a4, 0x30 +; beqz a6, 0xc +; .byte 0xd7, 0x35, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x35, 0x40, 0x9e +; .byte 0xa7, 0x85, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_i8x16(i8, i8x16, i8x16) -> i8x16 { +block0(v0: i8, v1: i8x16, v2: i8x16): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; andi a4,a0,255 +; select_i8x16 v9,v2,v4##condition=a4 +; vse8.v v9,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; andi a4, a0, 0xff +; beqz a4, 0xc +; .byte 0xd7, 0x34, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x34, 0x40, 0x9e +; .byte 0xa7, 0x84, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_f64x2(i64, f64x2, f64x2) -> f64x2 { +block0(v0: i64, v1: f64x2, v2: f64x2): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_f64x2 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %select_f32x4(i64, f32x4, f32x4) -> f32x4 { +block0(v0: i64, v1: f32x4, v2: f32x4): + v3 = select v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; select_f32x4 v7,v2,v4##condition=a0 +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; 
ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x07, 0x82, 0x0f, 0x02 +; beqz a0, 0xc +; .byte 0xd7, 0x33, 0x20, 0x9e +; j 8 +; .byte 0xd7, 0x33, 0x40, 0x9e +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-iabs.clif b/cranelift/filetests/filetests/runtests/simd-iabs.clif index 2fa0ee71c348..63242f1920da 100644 --- a/cranelift/filetests/filetests/runtests/simd-iabs.clif +++ b/cranelift/filetests/filetests/runtests/simd-iabs.clif @@ -8,6 +8,7 @@ target x86_64 target x86_64 sse41 target x86_64 sse42 target x86_64 sse42 has_avx +target riscv64 has_v function %iabs_i8x16(i8x16) -> i8x16 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif index 0721599a2112..f6aaf71efbfe 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif @@ -2,6 +2,7 @@ test run test interpret target aarch64 target s390x +target riscv64 has_v ; i8 and i16 are invalid source sizes for x86_64 function %scalartovector_i8(i8) -> i8x16 { diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif index 7891693f3f51..2ff203d644d2 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif @@ -5,6 +5,7 @@ target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v function %scalartovector_i32(i32) -> i32x4 { block0(v0: i32): diff --git a/cranelift/filetests/filetests/runtests/simd-select.clif b/cranelift/filetests/filetests/runtests/simd-select.clif new file mode 100644 index 000000000000..63112860a77a --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-select.clif @@ -0,0 +1,65 @@ +test interpret +test run +target aarch64 +target s390x +set enable_simd +target x86_64 +target x86_64 sse41 +target x86_64 sse42 +target x86_64 sse42 has_avx +target riscv64 has_v + +function %select_i64x2(i64, i64x2, i64x2) -> i64x2 { +block0(v0: i64, v1: i64x2, v2: i64x2): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i64x2(0, [1 2], [3 4]) == [3 4] +; run: %select_i64x2(1, [1 2], [3 4]) == [1 2] +; run: %select_i64x2(-1, [1 2], [3 4]) == [1 2] + +function %select_i32x4(i64, i32x4, i32x4) -> i32x4 { +block0(v0: i64, v1: i32x4, v2: i32x4): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i32x4(0, [1 2 3 4], [5 6 7 8]) == [5 6 7 8] +; run: %select_i32x4(1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4] +; run: %select_i32x4(-1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4] + +function %select_i16x8(i64, i16x8, i16x8) -> i16x8 { +block0(v0: i64, v1: i16x8, v2: i16x8): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i16x8(0, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 13 14 15 16] +; run: %select_i16x8(1, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] +; run: %select_i16x8(-1, [1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] + +function %select_i8x16(i64, i8x16, i8x16) -> i8x16 { +block0(v0: i64, v1: i8x16, v2: i8x16): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_i8x16(0, [1 2 3 4 5 6 7 8 9 10 
11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32] +; run: %select_i8x16(1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] +; run: %select_i8x16(-1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + +function %select_f64x2(i64, f64x2, f64x2) -> f64x2 { +block0(v0: i64, v1: f64x2, v2: f64x2): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_f64x2(0, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x3.0 0x4.0] +; run: %select_f64x2(1, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x1.0 0x2.0] +; run: %select_f64x2(-1, [0x1.0 0x2.0], [0x3.0 0x4.0]) == [0x1.0 0x2.0] + +function %select_f32x4(i64, f32x4, f32x4) -> f32x4 { +block0(v0: i64, v1: f32x4, v2: f32x4): + v3 = select v0, v1, v2 + return v3 +} +; run: %select_f32x4(0, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x5.0 0x6.0 0x7.0 0x8.0] +; run: %select_f32x4(1, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x1.0 0x2.0 0x3.0 0x4.0] +; run: %select_f32x4(-1, [0x1.0 0x2.0 0x3.0 0x4.0], [0x5.0 0x6.0 0x7.0 0x8.0]) == [0x1.0 0x2.0 0x3.0 0x4.0] +
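
The lowerings added in this patch are covered by the new filetests above. As a quick standalone illustration (a minimal sketch: the function names and lane values below are illustrative and not part of the patch), the vector `iabs` and vector `select` paths can be exercised with a CLIF run test along these lines:

test interpret
test run
target riscv64 has_v

; Illustrative only: absolute value is applied lane-wise, so no scalar
; sign-extension is needed before the `vneg.v`/`vmax.vv` pair.
function %iabs_i32x4_sketch(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = iabs v0
    return v1
}
; run: %iabs_i32x4_sketch([-1 2 -3 4]) == [1 2 3 4]

; Illustrative only: a zero condition selects the second vector operand,
; any non-zero condition selects the first.
function %select_i32x4_sketch(i64, i32x4, i32x4) -> i32x4 {
block0(v0: i64, v1: i32x4, v2: i32x4):
    v3 = select v0, v1, v2
    return v3
}
; run: %select_i32x4_sketch(0, [1 2 3 4], [5 6 7 8]) == [5 6 7 8]
; run: %select_i32x4_sketch(1, [1 2 3 4], [5 6 7 8]) == [1 2 3 4]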