Commit 1d0565b
riscv64: Implement {u,s}widen_{low,high} and load+extend instructions (bytecodealliance#6534)

* riscv64: Add SIMD Load+Extends

* riscv64: Add SIMD `{u,s}widen_{low,high}`

* riscv64: Add `gen_slidedown_half`

This isn't really necessary yet, but we are going to make a lot of use of it
in the widening arithmetic instructions, so we might as well add it now.

* riscv64: Add multi widen SIMD instructions

* riscv64: Typo Fix
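
As a scalar sketch of the lane semantics the widen lowerings above implement (illustration only, not code from this commit — the backend emits `vzext.vf2`/`vsext.vf2`, preceded by a `vslidedown` for the `high` variants; the function names here are hypothetical):

```rust
/// `uwiden_low`/`uwiden_high` on i8x16 -> i16x8, modeled on scalars.
fn uwiden_low(x: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| x[i] as u16) // zero-extend the low 8 lanes
}

fn uwiden_high(x: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| x[i + 8] as u16) // slide down 8 lanes, then zero-extend
}
```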
afonso360 authored Jun 8, 2023
1 parent f5fafba commit 1d0565b
Showing 18 changed files with 1,636 additions and 17 deletions.
7 changes: 0 additions & 7 deletions build.rs
@@ -234,11 +234,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
}

let known_failure = [
"almost_extmul",
"canonicalize_nan",
"cvt_from_uint",
"issue_3327_bnot_lowering",
"simd_align",
"simd_conversions",
"simd_f32x4",
"simd_f32x4_cmp",
@@ -251,23 +249,18 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"simd_i16x8_arith2",
"simd_i16x8_cmp",
"simd_i16x8_extadd_pairwise_i8x16",
"simd_i16x8_extmul_i8x16",
"simd_i16x8_q15mulr_sat_s",
"simd_i32x4_arith2",
"simd_i32x4_cmp",
"simd_i32x4_dot_i16x8",
"simd_i32x4_extadd_pairwise_i16x8",
"simd_i32x4_extmul_i16x8",
"simd_i32x4_trunc_sat_f32x4",
"simd_i32x4_trunc_sat_f64x2",
"simd_i64x2_arith2",
"simd_i64x2_cmp",
"simd_i64x2_extmul_i32x4",
"simd_i8x16_arith2",
"simd_i8x16_cmp",
"simd_int_to_int_extend",
"simd_load",
"simd_load_extend",
"simd_load_zero",
"simd_splat",
"v128_select",
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst.isle
@@ -1592,6 +1592,10 @@
(decl pure partial uimm5_from_u64 (UImm5) u64)
(extern extractor uimm5_from_u64 uimm5_from_u64)

;; Convert a `u64` into a `UImm5`
(decl pure partial u64_to_uimm5 (u64) UImm5)
(rule (u64_to_uimm5 (uimm5_from_u64 n)) n)

(decl uimm5_bitcast_to_imm5 (UImm5) Imm5)
(extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5)

11 changes: 10 additions & 1 deletion cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -700,7 +700,16 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
debug_assert_eq!(vs.class(), op.src_regclass());

collector.reg_use(vs);
collector.reg_def(vd);

// If the operation forbids source/destination overlap, then we must
// register it as an early_def. This encodes the constraint that
// these must not overlap.
if op.forbids_src_dst_overlaps() {
collector.reg_early_def(vd);
} else {
collector.reg_def(vd);
}

vec_mask_operands(mask, collector);
}
&Inst::VecAluRImm5 { vd, ref mask, .. } => {
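
A hedged aside on why the overlap constraint exists (this reasoning is not spelled out in the diff): a widening op reads SEW/2-wide source elements while writing SEW-wide destination elements, so an overlapping `vd` could clobber source lanes before they are read — RVV reserves such encodings, and `reg_early_def` is how that constraint reaches the register allocator. A scalar sketch of the hazard:

```rust
// Sketch: widening u8 lanes to u16 in the same buffer, in lane order,
// destroys source lanes before they are read.
fn main() {
    let mut buf = [1u8, 2, 3, 4, 0, 0, 0, 0];
    for i in 0..4 {
        let wide = (buf[i] as u16).to_le_bytes();
        buf[2 * i] = wide[0];     // lane 0's write lands on...
        buf[2 * i + 1] = wide[1]; // ...lane 1's source byte
    }
    // Expected [1, 0, 2, 0, 3, 0, 4, 0]; the in-place version corrupts it.
    assert_ne!(buf, [1, 0, 2, 0, 3, 0, 4, 0]);
}
```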
71 changes: 65 additions & 6 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -498,6 +498,12 @@ impl VecAluOpRR {
VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => {
0b010000
}
VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => 0b010010,
VecAluOpRR::VfsqrtV => 0b010011,
VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111,
}
@@ -506,7 +512,13 @@
pub fn category(&self) -> VecOpCategory {
match self {
VecAluOpRR::VmvSX => VecOpCategory::OPMVX,
VecAluOpRR::VmvXS => VecOpCategory::OPMVV,
VecAluOpRR::VmvXS
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV,
VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF,
VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV,
VecAluOpRR::VmvVV => VecOpCategory::OPIVV,
@@ -527,6 +539,13 @@
VecAluOpRR::VfmvFS => 0b00000,
// VFUNARY1
VecAluOpRR::VfsqrtV => 0b00000,
// VXUNARY0
VecAluOpRR::VzextVF8 => 0b00010,
VecAluOpRR::VsextVF8 => 0b00011,
VecAluOpRR::VzextVF4 => 0b00100,
VecAluOpRR::VsextVF4 => 0b00101,
VecAluOpRR::VzextVF2 => 0b00110,
VecAluOpRR::VsextVF2 => 0b00111,
// These don't have an explicit encoding table, but Section 11.16 Vector Integer Move Instruction states:
// > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved.
VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0,
@@ -538,7 +557,15 @@
/// other way around. As far as I can tell only vmv.v.* are backwards.
pub fn vs_is_vs2_encoded(&self) -> bool {
match self {
VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true,
VecAluOpRR::VmvXS
| VecAluOpRR::VfmvFS
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => true,
VecAluOpRR::VmvSX
| VecAluOpRR::VfmvSF
| VecAluOpRR::VmvVV
@@ -554,21 +581,47 @@
| VecAluOpRR::VmvVV
| VecAluOpRR::VmvVX
| VecAluOpRR::VfmvVF
| VecAluOpRR::VfsqrtV => RegClass::Vector,
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => RegClass::Vector,
VecAluOpRR::VmvXS => RegClass::Int,
VecAluOpRR::VfmvFS => RegClass::Float,
}
}

pub fn src_regclass(&self) -> RegClass {
match self {
VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV | VecAluOpRR::VfsqrtV => {
RegClass::Vector
}
VecAluOpRR::VmvXS
| VecAluOpRR::VfmvFS
| VecAluOpRR::VmvVV
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => RegClass::Vector,
VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float,
VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int,
}
}

/// Some instructions do not allow the source and destination registers to overlap.
pub fn forbids_src_dst_overlaps(&self) -> bool {
match self {
VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => true,
_ => false,
}
}
}

impl fmt::Display for VecAluOpRR {
@@ -579,6 +632,12 @@
VecAluOpRR::VfmvSF => "vfmv.s.f",
VecAluOpRR::VfmvFS => "vfmv.f.s",
VecAluOpRR::VfsqrtV => "vfsqrt.v",
VecAluOpRR::VzextVF2 => "vzext.vf2",
VecAluOpRR::VzextVF4 => "vzext.vf4",
VecAluOpRR::VzextVF8 => "vzext.vf8",
VecAluOpRR::VsextVF2 => "vsext.vf2",
VecAluOpRR::VsextVF4 => "vsext.vf4",
VecAluOpRR::VsextVF8 => "vsext.vf8",
VecAluOpRR::VmvVV => "vmv.v.v",
VecAluOpRR::VmvVX => "vmv.v.x",
VecAluOpRR::VfmvVF => "vfmv.v.f",
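
Putting the encoding pieces above together (funct6 `0b010010`, the sub-opcode carried in the `vs1` field, category OPMVV), here is a hypothetical encoder sketch for one of these instructions — not a function this commit adds — assuming the standard RVV OP-V word layout:

```rust
/// Pack `vzext.vf2 vd, vs2` (unmasked) into a 32-bit OP-V word, assuming
/// the layout funct6[31:26] | vm[25] | vs2[24:20] | vs1[19:15] | funct3[14:12] | vd[11:7] | opcode[6:0].
fn encode_vzext_vf2(vd: u32, vs2: u32) -> u32 {
    let funct6 = 0b010010u32;  // VXUNARY0 group, per `funct6` above
    let vm = 1u32;             // unmasked
    let vs1 = 0b00110u32;      // VzextVF2 sub-opcode, per the table above
    let funct3 = 0b010u32;     // OPMVV
    let opcode = 0b1010111u32; // OP-V major opcode
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (vs1 << 15) | (funct3 << 12) | (vd << 7) | opcode
}
```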
56 changes: 56 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -186,6 +186,12 @@
(VmvVX)
(VfmvVF)
(VfsqrtV)
(VsextVF2)
(VsextVF4)
(VsextVF8)
(VzextVF2)
(VzextVF4)
(VzextVF8)
))

;; Returns the canonical destination type for a VecAluOpRRImm5.
@@ -745,6 +751,42 @@
(rule (rv_vmslt_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vzext.vf2` instruction.
;; Zero-extend SEW/2 source to SEW destination
(decl rv_vzext_vf2 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf2 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF2) vs mask vstate))

;; Helper for emitting the `vzext.vf4` instruction.
;; Zero-extend SEW/4 source to SEW destination
(decl rv_vzext_vf4 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf4 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF4) vs mask vstate))

;; Helper for emitting the `vzext.vf8` instruction.
;; Zero-extend SEW/8 source to SEW destination
(decl rv_vzext_vf8 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf8 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF8) vs mask vstate))

;; Helper for emitting the `vsext.vf2` instruction.
;; Sign-extend SEW/2 source to SEW destination
(decl rv_vsext_vf2 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf2 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF2) vs mask vstate))

;; Helper for emitting the `vsext.vf4` instruction.
;; Sign-extend SEW/4 source to SEW destination
(decl rv_vsext_vf4 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf4 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF4) vs mask vstate))

;; Helper for emitting the `vsext.vf8` instruction.
;; Sign-extend SEW/8 source to SEW destination
(decl rv_vsext_vf8 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf8 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF8) vs mask vstate))

;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl gen_extractlane (Type VReg u8) Reg)
Expand Down Expand Up @@ -796,3 +838,17 @@
(mem_flags_trusted)
(unmasked)
ty))


;; Emits a vslidedown instruction that moves half the lanes down.
(decl gen_slidedown_half (Type VReg) VReg)

;; If the lane count can fit in a 5-bit immediate, we can use `vslidedown.vi`.
(rule 1 (gen_slidedown_half (ty_vec_fits_in_register ty) src)
(if-let (uimm5_from_u64 amt) (u64_udiv (ty_lane_count ty) 2))
(rv_vslidedown_vi src amt (unmasked) ty))

;; Otherwise, materialize the slide amount in an X register and use `vslidedown.vx`.
(rule 0 (gen_slidedown_half (ty_vec_fits_in_register ty) src)
(if-let amt (u64_udiv (ty_lane_count ty) 2))
(rv_vslidedown_vx src (imm $I64 amt) (unmasked) ty))
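
A scalar model of what `gen_slidedown_half` computes on an i8x16 (a sketch under the assumption that lanes slid in from past the end — which `vslidedown` leaves unspecified here — are modeled as zero):

```rust
fn slidedown_half(x: [i8; 16]) -> [i8; 16] {
    // Lane i of the result is lane i + 8 of the source (amt = lane_count / 2).
    core::array::from_fn(|i| if i + 8 < 16 { x[i + 8] } else { 0 })
}
```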
95 changes: 95 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1087,6 +1087,51 @@
(let ((eew VecElementWidth (element_width_from_type ty)))
(vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty)))

;;;;; Rules for Load + Extend Combos ;;;;;;;;;

;; These rules cover the special loads that load a 64-bit value and perform some sort of extension.
;; We don't have any special instructions for this, so we just load the 64 bits as a vector and
;; do a SEW/2 extension. This reads only the half-width elements from the source vector register,
;; extends them, and writes back the full register.

(decl gen_load64_extend (Type ExtendOp MemFlags XReg Offset32) VReg)

(rule (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)
(let ((eew VecElementWidth (element_width_from_type $I64))
(load_state VState (vstate_from_type $I64))
(loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state)))
(rv_vsext_vf2 loaded (unmasked) ty)))

(rule (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)
(let ((eew VecElementWidth (element_width_from_type $I64))
(load_state VState (vstate_from_type $I64))
(loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state)))
(rv_vzext_vf2 loaded (unmasked) ty)))
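
The semantics these helpers implement, as a scalar sketch (illustration only; the lowering realizes this as one 64-bit unit-stride vector load followed by `vzext.vf2`/`vsext.vf2`):

```rust
fn uload8x8(mem: &[u8; 8]) -> [u16; 8] {
    core::array::from_fn(|i| mem[i] as u16) // zero-extend each loaded byte
}

fn sload8x8(mem: &[u8; 8]) -> [i16; 8] {
    core::array::from_fn(|i| mem[i] as i8 as i16) // sign-extend each loaded byte
}
```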

;;;;; Rules for `uload8x8`;;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (uload8x8 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `uload16x4`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (uload16x4 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `uload32x2`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (uload32x2 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `sload8x8`;;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (sload8x8 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `sload16x4`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (sload16x4 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `sload32x2`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (sload32x2 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `istore8`;;;;;;;;;
(rule
(lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset))
Expand Down Expand Up @@ -1428,3 +1473,53 @@
(y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty))
(y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty)))
(rv_vor_vv x_lanes y_lanes (unmasked) ty)))

;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Slide down half the vector, and do a signed extension.
(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high x @ (value_type in_ty))))
(rv_vsext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high x @ (value_type in_ty)))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vsext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high (swiden_high x @ (value_type in_ty))))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vsext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Slide down half the vector, and do a zero extension.
(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high x @ (value_type in_ty))))
(rv_vzext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high x @ (value_type in_ty)))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vzext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high (uwiden_high x @ (value_type in_ty))))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vzext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low x)))
(rv_vsext_vf2 x (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low x))))
(rv_vsext_vf4 x (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low (swiden_low x)))))
(rv_vsext_vf8 x (unmasked) out_ty))

;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low x)))
(rv_vzext_vf2 x (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low x))))
(rv_vzext_vf4 x (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low (uwiden_low x)))))
(rv_vzext_vf8 x (unmasked) out_ty))
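
The higher-priority rules above fuse nested widens: `swiden_low(swiden_low(x))` on an i8x16 becomes a single `vsext.vf4` rather than two `vsext.vf2`s, and the `high` variants first slide down by `lane_count(in_ty) - lane_count(out_ty)` lanes. A scalar sketch of the fused low case (illustration only):

```rust
/// `swiden_low(swiden_low(x))` on i8x16: one 4x sign-extension of the
/// low four lanes, i.e. exactly what a single `vsext.vf4` computes.
fn swiden_low_twice(x: [i8; 16]) -> [i32; 4] {
    core::array::from_fn(|i| x[i] as i32)
}
```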
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isle_prelude.rs
@@ -237,6 +237,11 @@ macro_rules! isle_common_prelude_methods {
u64::MAX >> shift
}

#[inline]
fn ty_lane_count(&mut self, ty: Type) -> u64 {
ty.lane_count() as u64
}

#[inline]
fn ty_umin(&mut self, _ty: Type) -> u64 {
0
4 changes: 4 additions & 0 deletions cranelift/codegen/src/prelude.isle
@@ -265,6 +265,10 @@
(decl pure ty_lane_mask (Type) u64)
(extern constructor ty_lane_mask ty_lane_mask)

;; Get the number of lanes for a given type.
(decl pure ty_lane_count (Type) u64)
(extern constructor ty_lane_count ty_lane_count)

;; Get the byte width of a given type.
(decl pure ty_bytes (Type) u16)
(extern constructor ty_bytes ty_bytes)
