From 1d0565ba878928359ce9eb441d4e6a676b15834d Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Thu, 8 Jun 2023 10:39:12 +0100 Subject: [PATCH] riscv64: Implement `{u,s}widen_{low,high}` and `load+extend` instructions (#6534) * riscv64: Add SIMD Load+Extends * riscv64: Add SIMD `{u,s}widen_{low,high}` * riscv64: Add `gen_slidedown_half` This isn't really necessary yet, but we are going to make a lot of use for it in the widening arithmetic instructions, so might as well add it now. * riscv64: Add multi widen SIMD instructions * riscv64: Typo Fix --- build.rs | 7 - cranelift/codegen/src/isa/riscv64/inst.isle | 4 + cranelift/codegen/src/isa/riscv64/inst/mod.rs | 11 +- .../codegen/src/isa/riscv64/inst/vector.rs | 71 ++++- .../codegen/src/isa/riscv64/inst_vector.isle | 56 ++++ cranelift/codegen/src/isa/riscv64/lower.isle | 95 +++++++ cranelift/codegen/src/isle_prelude.rs | 5 + cranelift/codegen/src/prelude.isle | 4 + .../isa/riscv64/simd-load-extend.clif | 142 ++++++++++ .../isa/riscv64/simd-load-splat.clif | 141 ++++++++++ .../isa/riscv64/simd-swiden_high.clif | 257 +++++++++++++++++ .../isa/riscv64/simd-swiden_low.clif | 242 ++++++++++++++++ .../isa/riscv64/simd-uwiden_high.clif | 258 ++++++++++++++++++ .../isa/riscv64/simd-uwiden_low.clif | 243 +++++++++++++++++ .../filetests/runtests/simd-swidenhigh.clif | 27 ++ .../filetests/runtests/simd-swidenlow.clif | 28 ++ .../filetests/runtests/simd-uwidenhigh.clif | 28 ++ .../filetests/runtests/simd-uwidenlow.clif | 34 ++- 18 files changed, 1636 insertions(+), 17 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif diff --git a/build.rs b/build.rs index e887ae0a0eee..9cb05298cd64 100644 --- a/build.rs +++ b/build.rs @@ -234,11 +234,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { } let known_failure = [ - "almost_extmul", "canonicalize_nan", "cvt_from_uint", "issue_3327_bnot_lowering", - "simd_align", "simd_conversions", "simd_f32x4", "simd_f32x4_cmp", @@ -251,23 +249,18 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { "simd_i16x8_arith2", "simd_i16x8_cmp", "simd_i16x8_extadd_pairwise_i8x16", - "simd_i16x8_extmul_i8x16", "simd_i16x8_q15mulr_sat_s", "simd_i32x4_arith2", "simd_i32x4_cmp", "simd_i32x4_dot_i16x8", "simd_i32x4_extadd_pairwise_i16x8", - "simd_i32x4_extmul_i16x8", "simd_i32x4_trunc_sat_f32x4", "simd_i32x4_trunc_sat_f64x2", "simd_i64x2_arith2", "simd_i64x2_cmp", - "simd_i64x2_extmul_i32x4", "simd_i8x16_arith2", "simd_i8x16_cmp", - "simd_int_to_int_extend", "simd_load", - "simd_load_extend", "simd_load_zero", "simd_splat", "v128_select", diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index a414ad826ceb..f0a79b9d5dbb 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -1592,6 +1592,10 @@ (decl pure partial uimm5_from_u64 (UImm5) u64) (extern extractor uimm5_from_u64 uimm5_from_u64) +;; Convert a `u64` into an `UImm5` +(decl pure partial u64_to_uimm5 (u64) UImm5) +(rule (u64_to_uimm5 (uimm5_from_u64 n)) n) + (decl uimm5_bitcast_to_imm5 
(UImm5) Imm5) (extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5) diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 5e3cc27b90c2..79c1fbfd9fd3 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -700,7 +700,16 @@ fn riscv64_get_operands VReg>(inst: &Inst, collector: &mut Operan debug_assert_eq!(vs.class(), op.src_regclass()); collector.reg_use(vs); - collector.reg_def(vd); + + // If the operation forbids source/destination overlap, then we must + // register it as an early_def. This encodes the constraint that + // these must not overlap. + if op.forbids_src_dst_overlaps() { + collector.reg_early_def(vd); + } else { + collector.reg_def(vd); + } + vec_mask_operands(mask, collector); } &Inst::VecAluRImm5 { vd, ref mask, .. } => { diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index f68e6b558b0b..7136ccd2d6a8 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -498,6 +498,12 @@ impl VecAluOpRR { VecAluOpRR::VmvSX | VecAluOpRR::VmvXS | VecAluOpRR::VfmvSF | VecAluOpRR::VfmvFS => { 0b010000 } + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => 0b010010, VecAluOpRR::VfsqrtV => 0b010011, VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0b010111, } @@ -506,7 +512,13 @@ impl VecAluOpRR { pub fn category(&self) -> VecOpCategory { match self { VecAluOpRR::VmvSX => VecOpCategory::OPMVX, - VecAluOpRR::VmvXS => VecOpCategory::OPMVV, + VecAluOpRR::VmvXS + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => VecOpCategory::OPMVV, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => VecOpCategory::OPFVF, VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => VecOpCategory::OPFVV, VecAluOpRR::VmvVV => VecOpCategory::OPIVV, @@ -527,6 +539,13 @@ impl VecAluOpRR { VecAluOpRR::VfmvFS => 0b00000, // VFUNARY1 VecAluOpRR::VfsqrtV => 0b00000, + // VXUNARY0 + VecAluOpRR::VzextVF8 => 0b00010, + VecAluOpRR::VsextVF8 => 0b00011, + VecAluOpRR::VzextVF4 => 0b00100, + VecAluOpRR::VsextVF4 => 0b00101, + VecAluOpRR::VzextVF2 => 0b00110, + VecAluOpRR::VsextVF2 => 0b00111, // These don't have a explicit encoding table, but Section 11.16 Vector Integer Move Instruction states: // > The first operand specifier (vs2) must contain v0, and any other vector register number in vs2 is reserved. VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF => 0, @@ -538,7 +557,15 @@ impl VecAluOpRR { /// other way around. As far as I can tell only vmv.v.* are backwards. 
pub fn vs_is_vs2_encoded(&self) -> bool { match self { - VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VfsqrtV => true, + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => true, VecAluOpRR::VmvSX | VecAluOpRR::VfmvSF | VecAluOpRR::VmvVV @@ -554,7 +581,13 @@ impl VecAluOpRR { | VecAluOpRR::VmvVV | VecAluOpRR::VmvVX | VecAluOpRR::VfmvVF - | VecAluOpRR::VfsqrtV => RegClass::Vector, + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => RegClass::Vector, VecAluOpRR::VmvXS => RegClass::Int, VecAluOpRR::VfmvFS => RegClass::Float, } @@ -562,13 +595,33 @@ impl VecAluOpRR { pub fn src_regclass(&self) -> RegClass { match self { - VecAluOpRR::VmvXS | VecAluOpRR::VfmvFS | VecAluOpRR::VmvVV | VecAluOpRR::VfsqrtV => { - RegClass::Vector - } + VecAluOpRR::VmvXS + | VecAluOpRR::VfmvFS + | VecAluOpRR::VmvVV + | VecAluOpRR::VfsqrtV + | VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => RegClass::Vector, VecAluOpRR::VfmvSF | VecAluOpRR::VfmvVF => RegClass::Float, VecAluOpRR::VmvSX | VecAluOpRR::VmvVX => RegClass::Int, } } + + /// Some instructions do not allow the source and destination registers to overlap. + pub fn forbids_src_dst_overlaps(&self) -> bool { + match self { + VecAluOpRR::VzextVF2 + | VecAluOpRR::VzextVF4 + | VecAluOpRR::VzextVF8 + | VecAluOpRR::VsextVF2 + | VecAluOpRR::VsextVF4 + | VecAluOpRR::VsextVF8 => true, + _ => false, + } + } } impl fmt::Display for VecAluOpRR { @@ -579,6 +632,12 @@ impl fmt::Display for VecAluOpRR { VecAluOpRR::VfmvSF => "vfmv.s.f", VecAluOpRR::VfmvFS => "vfmv.f.s", VecAluOpRR::VfsqrtV => "vfsqrt.v", + VecAluOpRR::VzextVF2 => "vzext.vf2", + VecAluOpRR::VzextVF4 => "vzext.vf4", + VecAluOpRR::VzextVF8 => "vzext.vf8", + VecAluOpRR::VsextVF2 => "vsext.vf2", + VecAluOpRR::VsextVF4 => "vsext.vf4", + VecAluOpRR::VsextVF8 => "vsext.vf8", VecAluOpRR::VmvVV => "vmv.v.v", VecAluOpRR::VmvVX => "vmv.v.x", VecAluOpRR::VfmvVF => "vfmv.v.f", diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 126551791e83..bd021c7cf458 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -186,6 +186,12 @@ (VmvVX) (VfmvVF) (VfsqrtV) + (VsextVF2) + (VsextVF4) + (VsextVF8) + (VzextVF2) + (VzextVF4) + (VzextVF8) )) ;; Returns the canonical destination type for a VecAluOpRRImm5. @@ -745,6 +751,42 @@ (rule (rv_vmslt_vx vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate)) +;; Helper for emitting the `vzext.vf2` instruction. +;; Zero-extend SEW/2 source to SEW destination +(decl rv_vzext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF2) vs mask vstate)) + +;; Helper for emitting the `vzext.vf4` instruction. +;; Zero-extend SEW/4 source to SEW destination +(decl rv_vzext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF4) vs mask vstate)) + +;; Helper for emitting the `vzext.vf8` instruction. 
+;; Zero-extend SEW/8 source to SEW destination +(decl rv_vzext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vzext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VzextVF8) vs mask vstate)) + +;; Helper for emitting the `vsext.vf2` instruction. +;; Sign-extend SEW/2 source to SEW destination +(decl rv_vsext_vf2 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf2 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF2) vs mask vstate)) + +;; Helper for emitting the `vsext.vf4` instruction. +;; Sign-extend SEW/4 source to SEW destination +(decl rv_vsext_vf4 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf4 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF4) vs mask vstate)) + +;; Helper for emitting the `vsext.vf8` instruction. +;; Sign-extend SEW/8 source to SEW destination +(decl rv_vsext_vf8 (VReg VecOpMasking VState) VReg) +(rule (rv_vsext_vf8 vs mask vstate) + (vec_alu_rr (VecAluOpRR.VsextVF8) vs mask vstate)) + ;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl gen_extractlane (Type VReg u8) Reg) @@ -796,3 +838,17 @@ (mem_flags_trusted) (unmasked) ty)) + + +;; Emits a vslidedown instruction that moves half the lanes down. +(decl gen_slidedown_half (Type VReg) VReg) + +;; If the lane count can fit in a 5-bit immediate, we can use `vslidedown.vi`. +(rule 1 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let (uimm5_from_u64 amt) (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vi src amt (unmasked) ty)) + +;; Otherwise lower it into an X register. +(rule 0 (gen_slidedown_half (ty_vec_fits_in_register ty) src) + (if-let amt (u64_udiv (ty_lane_count ty) 2)) + (rv_vslidedown_vx src (imm $I64 amt) (unmasked) ty)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index de6f1d3a793f..4a2d14503751 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1087,6 +1087,51 @@ (let ((eew VecElementWidth (element_width_from_type ty))) (vec_load eew (VecAMode.UnitStride (gen_amode p offset $I64)) flags (unmasked) ty))) +;;;;; Rules for Load + Extend Combos ;;;;;;;;; + +;; These rules cover the special loads that load a 64bit value and do some sort of extension. +;; We don't have any special instructions to do this, so just load the 64 bits as a vector, and +;; do a SEW/2 extension. This only reads half width elements from the source vector register +;; extends it, and writes the back the full register. 
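+;;
+;; As a rough sketch, `uload16x4` is expected to lower to the following pair of
+;; instructions (register numbers as in the `simd-load-extend.clif` filetest
+;; added by this patch):
+;;
+;;   vle64.v   v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma)
+;;   vzext.vf2 v5,v3    #avl=4, #vtype=(e32, m1, ta, ma)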
+ +(decl gen_load64_extend (Type ExtendOp MemFlags XReg Offset32) VReg) + +(rule (gen_load64_extend ty (ExtendOp.Signed) flags addr offset) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state))) + (rv_vsext_vf2 loaded (unmasked) ty))) + +(rule (gen_load64_extend ty (ExtendOp.Zero) flags addr offset) + (let ((eew VecElementWidth (element_width_from_type $I64)) + (load_state VState (vstate_from_type $I64)) + (loaded VReg (vec_load eew (VecAMode.UnitStride (gen_amode addr offset $I64)) flags (unmasked) load_state))) + (rv_vzext_vf2 loaded (unmasked) ty))) + +;;;;; Rules for `uload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (uload8x8 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `uload16x4`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (uload16x4 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `uload32x2`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (uload32x2 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Zero) flags addr offset)) + +;;;;; Rules for `sload8x8`;;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I16X8) (sload8x8 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + +;;;;; Rules for `sload16x4`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I32X4) (sload16x4 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + +;;;;; Rules for `sload32x2`;;;;;;;;; +(rule (lower (has_type (ty_vec_fits_in_register ty @ $I64X2) (sload32x2 flags addr @ (value_type (ty_addr64 _)) offset))) + (gen_load64_extend ty (ExtendOp.Signed) flags addr offset)) + ;;;;; Rules for `istore8`;;;;;;;;; (rule (lower (istore8 flags x p @ (value_type (ty_addr64 _)) offset)) @@ -1428,3 +1473,53 @@ (y_mask VReg (rv_vadd_vi x_mask neg16 (unmasked) ty)) (y_lanes VReg (rv_vrgather_vv y y_mask (unmasked) ty))) (rv_vor_vv x_lanes y_lanes (unmasked) ty))) + +;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a signed extension. +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high x @ (value_type in_ty)))) + (rv_vsext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_high (swiden_high (swiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vsext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Slide down half the vector, and do a zero extension. 
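+;;
+;; For example, `uwiden_high` on an `i16x8` slides the upper four lanes down and
+;; then zero-extends them (register numbers as in the `simd-uwiden_high.clif`
+;; filetest added by this patch):
+;;
+;;   vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma)
+;;   vzext.vf2     v6,v4   #avl=4, #vtype=(e32, m1, ta, ma)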
+(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high x @ (value_type in_ty)))) + (rv_vzext_vf2 (gen_slidedown_half in_ty x) (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high x @ (value_type in_ty))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf4 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_high (uwiden_high (uwiden_high x @ (value_type in_ty)))))) + (if-let (uimm5_from_u64 amt) (u64_sub (ty_lane_count in_ty) (ty_lane_count out_ty))) + (rv_vzext_vf8 (rv_vslidedown_vi x amt (unmasked) in_ty) (unmasked) out_ty)) + +;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low x))) + (rv_vsext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low x)))) + (rv_vsext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (swiden_low (swiden_low (swiden_low x))))) + (rv_vsext_vf8 x (unmasked) out_ty)) + +;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 0 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low x))) + (rv_vzext_vf2 x (unmasked) out_ty)) + +(rule 1 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low x)))) + (rv_vzext_vf4 x (unmasked) out_ty)) + +(rule 2 (lower (has_type (ty_vec_fits_in_register out_ty) (uwiden_low (uwiden_low (uwiden_low x))))) + (rv_vzext_vf8 x (unmasked) out_ty)) diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 2c252ddb54d4..0a21c3750f2e 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -237,6 +237,11 @@ macro_rules! isle_common_prelude_methods { u64::MAX >> shift } + #[inline] + fn ty_lane_count(&mut self, ty: Type) -> u64 { + ty.lane_count() as u64 + } + #[inline] fn ty_umin(&mut self, _ty: Type) -> u64 { 0 diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 2f910f399c23..8757978827f2 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -265,6 +265,10 @@ (decl pure ty_lane_mask (Type) u64) (extern constructor ty_lane_mask ty_lane_mask) +;; Get the number of lanes for a given type. +(decl pure ty_lane_count (Type) u64) +(extern constructor ty_lane_count ty_lane_count) + ;; Get the byte width of a given type. 
(decl pure ty_bytes (Type) u16) (extern constructor ty_bytes ty_bytes) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif b/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif new file mode 100644 index 000000000000..9342ca8d3e5a --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-load-extend.clif @@ -0,0 +1,142 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %uload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %uload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %uload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vzext.vf2 v5,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x22, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ret + +function %sload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 v0 + return v1 +} + +; VCode: +; block0: +; vle64.v v3,0(a0) #avl=1, #vtype=(e64, m1, ta, ma) +; vsext.vf2 v5,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x57, 0xf0, 0x80, 0xcd +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xa2, 0x33, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 
0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif b/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif new file mode 100644 index 000000000000..70b8072cf7a0 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-load-splat.clif @@ -0,0 +1,141 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %load_splat_i8x16(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; VCode: +; block0: +; lb a2,0(a0) +; vmv.v.x v4,a2 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lb a2, 0(a0) +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i16x8(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; VCode: +; block0: +; lh a2,0(a0) +; vmv.v.x v4,a2 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lh a2, 0(a0) +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i32x4(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; VCode: +; block0: +; lw a2,0(a0) +; vmv.v.x v4,a2 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; lw a2, 0(a0) +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; VCode: +; block0: +; ld a2,0(a0) +; vmv.v.x v4,a2 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ld a2, 0(a0) +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x42, 0x06, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_f32x4(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; VCode: +; block0: +; flw ft4,0(a0) +; vfmv.v.f v4,ft4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; flw ft4, 0(a0) +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x52, 0x02, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + +function %load_splat_f64x2(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; VCode: +; block0: +; fld ft4,0(a0) +; vfmv.v.f v4,ft4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; fld ft4, 0(a0) +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x52, 0x02, 0x5e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x82, 0x05, 0x02 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif new file mode 100644 index 000000000000..8c2e09f04d57 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_high.clif @@ -0,0 +1,257 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %swidenhigh_i8x16(i8x16) 
-> i16x8 { +block0(v0: i8x16): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x14, 0x3e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x12, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vsext.vf2 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x32, 0x11, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,12 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x16, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa3, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 
0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,6 #avl=8, #vtype=(e16, m1, ta, ma) +; vsext.vf4 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x13, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + v3 = swiden_high v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,14 #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf8 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x17, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa3, 0x41, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif new file mode 100644 index 000000000000..4885cf96e09f --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-swiden_low.clif @@ -0,0 +1,242 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + +function %swidenlow_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; 
+; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf2 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xa2, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf4 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %swidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + v3 = swiden_low v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsext.vf8 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 
0x70, 0x81, 0xcd +; .byte 0x57, 0xa2, 0x11, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif new file mode 100644 index 000000000000..63849e265c36 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_high.clif @@ -0,0 +1,258 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %uwidenhigh_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,8 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x14, 0x3e +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,4 #avl=8, #vtype=(e16, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x12, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,2 #avl=4, #vtype=(e32, m1, ta, ma) +; vzext.vf2 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x32, 0x11, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x43, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, 
m1, ta, ma) +; vslidedown.vi v4,v1,12 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v6,v4 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x16, 0x3e +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x23, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,6 #avl=8, #vtype=(e16, m1, ta, ma) +; vzext.vf4 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x13, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x42, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + v3 = uwiden_high v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vslidedown.vi v4,v1,14 #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf8 v6,v4 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x32, 0x17, 0x3e +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x23, 0x41, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif new file mode 100644 index 000000000000..5c273814c539 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-uwiden_low.clif @@ -0,0 +1,243 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %uwidenlow_i8x16(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 
0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_i16x8(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_i32x4(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_low v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf2 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x13, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x22, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf4 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x12, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %uwidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = 
uwiden_low v0 + v2 = uwiden_low v1 + v3 = uwiden_low v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vzext.vf8 v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x22, 0x11, 0x4a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif index 813826e817c7..3a9ca809a520 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %swidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -27,3 +28,29 @@ block0(v0: i32x4): return v1 } ; run: %swidenhigh_i32x4([1 -2 3 -4]) == [3 -4] + +function %swidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} +; run: %swidenhigh_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [13 -14 15 -16] + +function %swidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_high v0 + v2 = swiden_high v1 + return v2 +} +; run: %swidenhigh_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [7 -8] + + +function %swidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_high v1 + v3 = swiden_high v2 + return v3 +} +; run: %swidenhigh_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [15 -16] diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif index c671f3781ebe..e1d54353dee6 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %swidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -27,3 +28,30 @@ block0(v0: i32x4): return v1 } ; run: %swidenlow_i32x4([1 -2 3 -4]) == [1 -2] + + +function %swidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} +; run: %swidenlow_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 -2 3 -4] + +function %swidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = swiden_low v0 + v2 = swiden_low v1 + return v2 +} +; run: %swidenlow_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [1 -2] + + +function %swidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = swiden_low v0 + v2 = swiden_low v1 + v3 = swiden_low v2 + return v3 +} +; run: %swidenlow_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 -2] diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif index 56b68a3830c8..f636b3cb07e7 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif @@ -6,6 +6,7 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %uwidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -30,3 +31,30 @@ block0(v0: i32x4): } ; run: %uwidenhigh_i32x4([1 2 3 4]) == [3 4] ; run: %uwidenhigh_i32x4([4 5 6 -1]) == [6 0xffffffff] + + +function %uwidenhigh_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} +; run: %uwidenhigh_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [13 0xF2 15 0xF0] + +function %uwidenhigh_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + return v2 +} +; run: %uwidenhigh_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [7 0xFFF8] + + +function %uwidenhigh_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_high v0 + v2 = uwiden_high v1 + v3 = uwiden_high v2 + return v3 +} +; run: %uwidenhigh_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [15 0xF0] diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif index e5b2801b8410..5b2a1963e971 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif @@ -6,24 +6,52 @@ set enable_simd target x86_64 target x86_64 sse41 target x86_64 sse41 has_avx +target riscv64gc has_v function %uwidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [1 2 3 4 5 6 7 8] +; run: %uwidenlow_i8x16([1 -2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [1 0xFE 3 4 5 6 7 8] function %uwidenlow_i16x8(i16x8) -> i32x4 { block0(v0: i16x8): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i16x8([1 2 3 4 5 6 7 8]) == [1 2 3 4] +; run: %uwidenlow_i16x8([1 -2 3 4 5 6 7 8]) == [1 0xFFFE 3 4] function %uwidenlow_i32x4(i32x4) -> i64x2 { block0(v0: i32x4): v1 = uwiden_low v0 return v1 } -; run: %uwidenlow_i32x4([1 2 3 4]) == [1 2] +; run: %uwidenlow_i32x4([1 -2 3 4]) == [1 0xFFFFFFFE] + + +function %uwidenlow_twice_i8x16(i8x16) -> i32x4 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} +; run: %uwidenlow_twice_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 0xFE 3 0xFC] + +function %uwidenlow_twice_i16x8(i16x8) -> i64x2 { +block0(v0: i16x8): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + return v2 +} +; run: %uwidenlow_twice_i16x8([1 -2 3 -4 5 -6 7 -8]) == [1 0xFFFE] + + +function %uwidenlow_triple_i8x16(i8x16) -> i64x2 { +block0(v0: i8x16): + v1 = uwiden_low v0 + v2 = uwiden_low v1 + v3 = uwiden_low v2 + return v3 +} +; run: %uwidenlow_triple_i8x16([1 -2 3 -4 5 -6 7 -8 9 -10 11 -12 13 -14 15 -16]) == [1 0xFE]