Commit 1d0565b
riscv64: Implement {u,s}widen_{low,high} and load+extend instructions (bytecodealliance#6534)

* riscv64: Add SIMD Load+Extends

* riscv64: Add SIMD `{u,s}widen_{low,high}`

* riscv64: Add `gen_slidedown_half`

This isn't really necessary yet, but we are going to make a lot of use of it
in the widening arithmetic instructions, so we might as well add it now.

* riscv64: Add multi widen SIMD instructions

* riscv64: Typo Fix
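
As a scalar sketch of the lane semantics the widen lowerings above implement (illustration only, not code from this commit — the backend emits `vzext.vf2`/`vsext.vf2`, preceded by a `vslidedown` for the `high` variants; the function names here are hypothetical):

```rust
/// `uwiden_low`/`uwiden_high` on i8x16 -> i16x8, modeled on scalars.
fn uwiden_low(x: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| x[i] as u16) // zero-extend the low 8 lanes
}

fn uwiden_high(x: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| x[i + 8] as u16) // slide down 8 lanes, then zero-extend
}
```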
afonso360 authored Jun 8, 2023
1 parent f5fafba commit 1d0565b
Showing 18 changed files with 1,636 additions and 17 deletions.
7 changes: 0 additions & 7 deletions build.rs
@@ -234,11 +234,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
}

let known_failure = [
"almost_extmul",
"canonicalize_nan",
"cvt_from_uint",
"issue_3327_bnot_lowering",
"simd_align",
"simd_conversions",
"simd_f32x4",
"simd_f32x4_cmp",
@@ -251,23 +249,18 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"simd_i16x8_arith2",
"simd_i16x8_cmp",
"simd_i16x8_extadd_pairwise_i8x16",
"simd_i16x8_extmul_i8x16",
"simd_i16x8_q15mulr_sat_s",
"simd_i32x4_arith2",
"simd_i32x4_cmp",
"simd_i32x4_dot_i16x8",
"simd_i32x4_extadd_pairwise_i16x8",
"simd_i32x4_extmul_i16x8",
"simd_i32x4_trunc_sat_f32x4",
"simd_i32x4_trunc_sat_f64x2",
"simd_i64x2_arith2",
"simd_i64x2_cmp",
"simd_i64x2_extmul_i32x4",
"simd_i8x16_arith2",
"simd_i8x16_cmp",
"simd_int_to_int_extend",
"simd_load",
"simd_load_extend",
"simd_load_zero",
"simd_splat",
"v128_select",
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst.isle
@@ -1592,6 +1592,10 @@
(decl pure partial uimm5_from_u64 (UImm5) u64)
(extern extractor uimm5_from_u64 uimm5_from_u64)

;; Convert a `u64` into a `UImm5`
(decl pure partial u64_to_uimm5 (u64) UImm5)
(rule (u64_to_uimm5 (uimm5_from_u64 n)) n)

(decl uimm5_bitcast_to_imm5 (UImm5) Imm5)
(extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5)

11 changes: 10 additions & 1 deletion cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -700,7 +700,16 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
debug_assert_eq!(vs.class(), op.src_regclass());

collector.reg_use(vs);
collector.reg_def(vd);

// If the operation forbids source/destination overlap, then we must
// register it as an early_def. This encodes the constraint that
// these must not overlap.
if op.forbids_src_dst_overlaps() {
collector.reg_early_def(vd);
} else {
collector.reg_def(vd);
}

vec_mask_operands(mask, collector);
}
&Inst::VecAluRImm5 { vd, ref mask, .. } => {
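
A hedged aside on why the overlap constraint exists (this reasoning is not spelled out in the diff): a widening op reads SEW/2-wide source elements while writing SEW-wide destination elements, so an overlapping `vd` could clobber source lanes before they are read — RVV reserves such encodings, and `reg_early_def` is how that constraint reaches the register allocator. A scalar sketch of the hazard:

```rust
// Sketch: widening u8 lanes to u16 in the same buffer, in lane order,
// destroys source lanes before they are read.
fn main() {
    let mut buf = [1u8, 2, 3, 4, 0, 0, 0, 0];
    for i in 0..4 {
        let wide = (buf[i] as u16).to_le_bytes();
        buf[2 * i] = wide[0];     // lane 0's write lands on...
        buf[2 * i + 1] = wide[1]; // ...lane 1's source byte
    }
    // Expected [1, 0, 2, 0, 3, 0, 4, 0]; the in-place version corrupts it.
    assert_ne!(buf, [1, 0, 2, 0, 3, 0, 4, 0]);
}
```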
71 changes: 65 additions & 6 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -498,6 +498,12 @@ impl VecAluOpRR {
VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => {
0b010000
}
VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => 0b010010,
VecAluOpRR::VfsqrtV => 0b010011,
VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111,
}
@@ -506,7 +512,13 @@
pub fn category(&self) -> VecOpCategory {
match self {
VecAluOpRR::VmvSX => VecOpCategory::OPMVX,
VecAluOpRR::VmvXS => VecOpCategory::OPMVV,
VecAluOpRR::VmvXS
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV,
VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF,
VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV,
VecAluOpRR::VmvVV => VecOpCategory::OPIVV,
@@ -527,6 +539,13 @@
VecAluOpRR::VfmvFS => 0b00000,
// VFUNARY1
VecAluOpRR::VfsqrtV => 0b00000,
// VXUNARY0
VecAluOpRR::VzextVF8 => 0b00010,
VecAluOpRR::VsextVF8 => 0b00011,
VecAluOpRR::VzextVF4 => 0b00100,
VecAluOpRR::VsextVF4 => 0b00101,
VecAluOpRR::VzextVF2 => 0b00110,
VecAluOpRR::VsextVF2 => 0b00111,
// These don't have an explicit encoding table, but Section 11.16 Vector Integer Move Instruction states:
// > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved.
VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0,
@@ -538,7 +557,15 @@
/// other way around. As far as I can tell only vmv.v.* are backwards.
pub fn vs_is_vs2_encoded(&self) -> bool {
match self {
VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true,
VecAluOpRR::VmvXS
| VecAluOpRR::VfmvFS
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => true,
VecAluOpRR::VmvSX
| VecAluOpRR::VfmvSF
| VecAluOpRR::VmvVV
@@ -554,21 +581,47 @@
| VecAluOpRR::VmvVV
| VecAluOpRR::VmvVX
| VecAluOpRR::VfmvVF
| VecAluOpRR::VfsqrtV => RegClass::Vector,
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => RegClass::Vector,
VecAluOpRR::VmvXS => RegClass::Int,
VecAluOpRR::VfmvFS => RegClass::Float,
}
}

pub fn src_regclass(&self) -> RegClass {
match self {
VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV | VecAluOpRR::VfsqrtV => {
RegClass::Vector
}
VecAluOpRR::VmvXS
| VecAluOpRR::VfmvFS
| VecAluOpRR::VmvVV
| VecAluOpRR::VfsqrtV
| VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => RegClass::Vector,
VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float,
VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int,
}
}

/// Some instructions do not allow the source and destination registers to overlap.
pub fn forbids_src_dst_overlaps(&self) -> bool {
match self {
VecAluOpRR::VzextVF2
| VecAluOpRR::VzextVF4
| VecAluOpRR::VzextVF8
| VecAluOpRR::VsextVF2
| VecAluOpRR::VsextVF4
| VecAluOpRR::VsextVF8 => true,
_ => false,
}
}
}

impl fmt::Display for VecAluOpRR {
@@ -579,6 +632,12 @@
VecAluOpRR::VfmvSF => "vfmv.s.f",
VecAluOpRR::VfmvFS => "vfmv.f.s",
VecAluOpRR::VfsqrtV => "vfsqrt.v",
VecAluOpRR::VzextVF2 => "vzext.vf2",
VecAluOpRR::VzextVF4 => "vzext.vf4",
VecAluOpRR::VzextVF8 => "vzext.vf8",
VecAluOpRR::VsextVF2 => "vsext.vf2",
VecAluOpRR::VsextVF4 => "vsext.vf4",
VecAluOpRR::VsextVF8 => "vsext.vf8",
VecAluOpRR::VmvVV => "vmv.v.v",
VecAluOpRR::VmvVX => "vmv.v.x",
VecAluOpRR::VfmvVF => "vfmv.v.f",
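
Putting the encoding pieces above together (funct6 `0b010010`, the sub-opcode carried in the `vs1` field, category OPMVV), here is a hypothetical encoder sketch for one of these instructions — not a function this commit adds — assuming the standard RVV OP-V word layout:

```rust
/// Pack `vzext.vf2 vd, vs2` (unmasked) into a 32-bit OP-V word, assuming
/// the layout funct6[31:26] | vm[25] | vs2[24:20] | vs1[19:15] | funct3[14:12] | vd[11:7] | opcode[6:0].
fn encode_vzext_vf2(vd: u32, vs2: u32) -> u32 {
    let funct6 = 0b010010u32;  // VXUNARY0 group, per `funct6` above
    let vm = 1u32;             // unmasked
    let vs1 = 0b00110u32;      // VzextVF2 sub-opcode, per the table above
    let funct3 = 0b010u32;     // OPMVV
    let opcode = 0b1010111u32; // OP-V major opcode
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (vs1 << 15) | (funct3 << 12) | (vd << 7) | opcode
}
```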
56 changes: 56 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -186,6 +186,12 @@
(VmvVX)
(VfmvVF)
(VfsqrtV)
(VsextVF2)
(VsextVF4)
(VsextVF8)
(VzextVF2)
(VzextVF4)
(VzextVF8)
))

;; Returns the canonical destination type for a VecAluOpRRImm5.
@@ -745,6 +751,42 @@
(rule (rv_vmslt_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vzext.vf2` instruction.
;; Zero-extend SEW/2 source to SEW destination
(decl rv_vzext_vf2 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf2 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF2) vs mask vstate))

;; Helper for emitting the `vzext.vf4` instruction.
;; Zero-extend SEW/4 source to SEW destination
(decl rv_vzext_vf4 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf4 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF4) vs mask vstate))

;; Helper for emitting the `vzext.vf8` instruction.
;; Zero-extend SEW/8 source to SEW destination
(decl rv_vzext_vf8 (VReg VecOpMasking VState) VReg)
(rule (rv_vzext_vf8 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VzextVF8) vs mask vstate))

;; Helper for emitting the `vsext.vf2` instruction.
;; Sign-extend SEW/2 source to SEW destination
(decl rv_vsext_vf2 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf2 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF2) vs mask vstate))

;; Helper for emitting the `vsext.vf4` instruction.
;; Sign-extend SEW/4 source to SEW destination
(decl rv_vsext_vf4 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf4 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF4) vs mask vstate))

;; Helper for emitting the `vsext.vf8` instruction.
;; Sign-extend SEW/8 source to SEW destination
(decl rv_vsext_vf8 (VReg VecOpMasking VState) VReg)
(rule (rv_vsext_vf8 vs mask vstate)
(vec_alu_rr (VecAluOpRR.VsextVF8) vs mask vstate))

;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl gen_extractlane (Type VReg u8) Reg)
Expand Down Expand Up @@ -796,3 +838,17 @@
(mem_flags_trusted)
(unmasked)
ty))


;; Emits a vslidedown instruction that moves half the lanes down.
(decl gen_slidedown_half (Type VReg) VReg)

;; If the lane count can fit in a 5-bit immediate, we can use `vslidedown.vi`.
(rule 1 (gen_slidedown_half (ty_vec_fits_in_register ty) src)
(if-let (uimm5_from_u64 amt) (u64_udiv (ty_lane_count ty) 2))
(rv_vslidedown_vi src amt (unmasked) ty))

;; Otherwise, materialize the slide amount in an X register and use `vslidedown.vx`.
(rule 0 (gen_slidedown_half (ty_vec_fits_in_register ty) src)
(if-let amt (u64_udiv (ty_lane_count ty) 2))
(rv_vslidedown_vx src (imm $I64 amt) (unmasked) ty))
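
A scalar model of what `gen_slidedown_half` computes on an i8x16 (a sketch under the assumption that lanes slid in from past the end — which `vslidedown` leaves unspecified here — are modeled as zero):

```rust
fn slidedown_half(x: [i8; 16]) -> [i8; 16] {
    // Lane i of the result is lane i + 8 of the source (amt = lane_count / 2).
    core::array::from_fn(|i| if i + 8 < 16 { x[i + 8] } else { 0 })
}
```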
95 changes: 95 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1087,6 +1087,51 @@
(let ((eew VecElementWidth (element_width_from_type ty)))
(vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty)))

;;;;; Rules for Load + Extend Combos ;;;;;;;;;

;; These rules cover the special loads that load a 64-bit value and perform some sort of extension.
;; We don't have any special instructions for this, so we just load the 64 bits as a vector and
;; do a SEW/2 extension. This reads only the half-width elements from the source vector register,
;; extends them, and writes back the full register.

(decl gen_load64_extend (Type ExtendOp MemFlags XReg Offset32) VReg)

(rule (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)
(let ((eew VecElementWidth (element_width_from_type $I64))
(load_state VState (vstate_from_type $I64))
(loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state)))
(rv_vsext_vf2 loaded (unmasked) ty)))

(rule (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)
(let ((eew VecElementWidth (element_width_from_type $I64))
(load_state VState (vstate_from_type $I64))
(loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state)))
(rv_vzext_vf2 loaded (unmasked) ty)))
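
The semantics these helpers implement, as a scalar sketch (illustration only; the lowering realizes this as one 64-bit unit-stride vector load followed by `vzext.vf2`/`vsext.vf2`):

```rust
fn uload8x8(mem: &[u8; 8]) -> [u16; 8] {
    core::array::from_fn(|i| mem[i] as u16) // zero-extend each loaded byte
}

fn sload8x8(mem: &[u8; 8]) -> [i16; 8] {
    core::array::from_fn(|i| mem[i] as i8 as i16) // sign-extend each loaded byte
}
```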

;;;;; Rules for `uload8x8`;;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (uload8x8 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `uload16x4`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (uload16x4 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `uload32x2`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (uload32x2 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Zero) flags addr offset))

;;;;; Rules for `sload8x8`;;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (sload8x8 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `sload16x4`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (sload16x4 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `sload32x2`;;;;;;;;;
(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (sload32x2 flags addr @ (value_type (ty_addr64 _)) offset)))
(gen_load64_extend ty (ExtendOp.Signed) flags addr offset))

;;;;; Rules for `istore8`;;;;;;;;;
(rule
(lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset))
Expand Down Expand Up @@ -1428,3 +1473,53 @@
(y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty))
(y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty)))
(rv_vor_vv x_lanes y_lanes (unmasked) ty)))

;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Slide down half the vector, and do a signed extension.
(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high x @ (value_type in_ty))))
(rv_vsext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high x @ (value_type in_ty)))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vsext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high (swiden_high x @ (value_type in_ty))))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vsext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Slide down half the vector, and do a zero extension.
(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high x @ (value_type in_ty))))
(rv_vzext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high x @ (value_type in_ty)))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vzext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high (uwiden_high x @ (value_type in_ty))))))
(if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty)))
(rv_vzext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty))

;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low x)))
(rv_vsext_vf2 x (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low x))))
(rv_vsext_vf4 x (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low (swiden_low x)))))
(rv_vsext_vf8 x (unmasked) out_ty))

;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low x)))
(rv_vzext_vf2 x (unmasked) out_ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low x))))
(rv_vzext_vf4 x (unmasked) out_ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low (uwiden_low x)))))
(rv_vzext_vf8 x (unmasked) out_ty))
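
The higher-priority rules above fuse nested widens: `swiden_low(swiden_low(x))` on an i8x16 becomes a single `vsext.vf4` rather than two `vsext.vf2`s, and the `high` variants first slide down by `lane_count(in_ty) - lane_count(out_ty)` lanes. A scalar sketch of the fused low case (illustration only):

```rust
/// `swiden_low(swiden_low(x))` on i8x16: one 4x sign-extension of the
/// low four lanes, i.e. exactly what a single `vsext.vf4` computes.
fn swiden_low_twice(x: [i8; 16]) -> [i32; 4] {
    core::array::from_fn(|i| x[i] as i32)
}
```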
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isle_prelude.rs
@@ -237,6 +237,11 @@ macro_rules! isle_common_prelude_methods {
u64::MAX >> shift
}

#[inline]
fn ty_lane_count(&mut self, ty: Type) -> u64 {
ty.lane_count() as u64
}

#[inline]
fn ty_umin(&mut self, _ty: Type) -> u64 {
0
4 changes: 4 additions & 0 deletions cranelift/codegen/src/prelude.isle
@@ -265,6 +265,10 @@
(decl pure ty_lane_mask (Type) u64)
(extern constructor ty_lane_mask ty_lane_mask)

;; Get the number of lanes for a given type.
(decl pure ty_lane_count (Type) u64)
(extern constructor ty_lane_count ty_lane_count)

;; Get the byte width of a given type.
(decl pure ty_bytes (Type) u16)
(extern constructor ty_bytes ty_bytes)
