riscv64: Implement a few misc SIMD instructions (bytecodealliance#6598)

* riscv64: Add immediate rule to `gen_vec_mask`
* riscv64: Implement `scalar_to_vector`
* riscv64: Implement vector `select`
* riscv64: Implement SIMD `iabs`
* wasmtime: Enable SIMD memory64 tests for riscv64
* cranelift: Update targets for `simd-select` tests
gurry · Jun 17, 2023 · 4756114 · 4756114
1 parent 728d0f5
commit 4756114
Show file tree

Hide file tree

Showing 13 changed files with 753 additions and 93 deletions.
diff --git a/build.rs b/build.rs
@@ -228,11 +228,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                 return true;
             }
 
-            // The memory64 testsuite has a single SIMD test that we don't pass yet.
-            if testname == "simd" && testsuite == "memory64" {
-                return true;
-            }
-
             let known_failure = [
                 "canonicalize_nan",
                 "cvt_from_uint",
@@ -249,18 +244,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                 "simd_i16x8_arith2",
                 "simd_i16x8_cmp",
                 "simd_i16x8_q15mulr_sat_s",
-                "simd_i32x4_arith2",
                 "simd_i32x4_cmp",
                 "simd_i32x4_trunc_sat_f32x4",
                 "simd_i32x4_trunc_sat_f64x2",
-                "simd_i64x2_arith2",
                 "simd_i64x2_cmp",
                 "simd_i8x16_arith2",
                 "simd_i8x16_cmp",
                 "simd_load",
-                "simd_load_zero",
                 "simd_splat",
-                "v128_select",
             ]
             .contains(&testname);
 

diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -2673,18 +2673,6 @@
   (gen_select_reg (IntCC.SignedGreaterThan) x y x y))
 
 
-(decl lower_iabs (Type XReg) XReg)
-
-; I64 and lower
-; Generate the following code:
-;   sext.{b,h,w} a0, a0
-;   neg a1, a0
-;   max a0, a0, a1
-(rule (lower_iabs (fits_in_64 ty) val)
-  (let ((extended XReg (sext val ty $I64))
-        (negated XReg (rv_neg extended)))
-    (max $I64 extended negated)))
-
 (decl gen_trapif (XReg TrapCode) InstOutput)
 (rule
   (gen_trapif test trap_code)

diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -1006,14 +1006,17 @@
 
 
 ;; Build a vector mask from a u64
-;; TODO: We should merge this with the `vconst` rules, and take advantage of
-;; the other existing `vconst` rules. One example is using `vmv.v.i` which
-;; can represent some of these masks.
+;; TODO(#6571): We should merge this with the `vconst` rules, and take advantage of
+;; the other existing `vconst` rules.
 (decl gen_vec_mask (u64) VReg)
 
+;; When the immediate fits in a 5-bit immediate, we can use `vmv.v.i` directly.
+(rule 1 (gen_vec_mask (imm5_from_u64 imm))
+  (rv_vmv_vi imm (vstate_from_type $I64X2)))
+
 ;; Materialize the mask into an X register, and move it into the bottom of
 ;; the vector register.
-(rule (gen_vec_mask mask)
+(rule 0 (gen_vec_mask mask)
   (rv_vmv_sx (imm $I64 mask) (vstate_from_type $I64X2)))
 
 

diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1556,9 +1556,23 @@
   (load_ra))
 
 ;;; Rules for `iabs` ;;;;;;;;;;;;;
-(rule
-  (lower (has_type (fits_in_64 ty) (iabs x)))
-  (lower_iabs ty x))
+
+;; I64 and lower
+;; Generate the following code:
+;;   sext.{b,h,w} a0, a0
+;;   neg a1, a0
+;;   max a0, a0, a1
+(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (iabs x)))
+  (let ((extended XReg (sext x ty $I64))
+        (negated XReg (rv_neg extended)))
+    (max $I64 extended negated)))
+
+;; For vectors we generate the same code, but with vector instructions
+;; we can skip the sign extension, since the vector unit will only process
+;; Element Sized chunks.
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (iabs x)))
+  (let ((negated VReg (rv_vneg_v x (unmasked) ty)))
+    (rv_vmax_vv x negated (unmasked) ty)))
 
 ;;;; Rules for calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1842,3 +1856,18 @@
         (xor  VReg (rv_vxor_vv x y (unmasked) ty))
         (rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
     (rv_vadd_vv lhs rhs (unmasked) ty)))
+
+;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x)))
+  (if (ty_vector_not_float ty))
+  (let ((zero VReg (rv_vmv_vx (zero_reg) ty))
+        (mask VReg (gen_vec_mask 1)))
+    (rv_vmerge_vxm zero x mask ty)))
+
+(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (scalar_to_vector x)))
+  (if (ty_vector_float ty))
+  (let ((zero VReg (rv_vmv_vx (zero_reg) ty))
+        (elem VReg (rv_vfmv_sf x ty))
+        (mask VReg (gen_vec_mask 1)))
+    (rv_vmerge_vvm zero elem mask ty)))
diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
@@ -224,7 +224,7 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
             } else {
                 vec![self.temp_writable_reg(I64), self.temp_writable_reg(I64)]
             }
-        } else if ty.is_float() {
+        } else if ty.is_float() || ty.is_vector() {
             vec![self.temp_writable_reg(ty)]
         } else {
             unimplemented!("ty:{:?}", ty)

diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iabs.clif
@@ -0,0 +1,166 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %iabs_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iabs v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmax.vv v6,v1,v4 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x03, 0x12, 0x1e
+;   .byte 0x27, 0x03, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iabs_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iabs v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma)
+;   vmax.vv v6,v1,v4 #avl=8, #vtype=(e16, m1, ta, ma)
+;   vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x03, 0x12, 0x1e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x03, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iabs_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iabs v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+;   vmax.vv v6,v1,v4 #avl=4, #vtype=(e32, m1, ta, ma)
+;   vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x03, 0x12, 0x1e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x03, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iabs_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iabs v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmax.vv v6,v1,v4 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x03, 0x12, 0x1e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x03, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+