diff --git a/build.rs b/build.rs index 374bb83fa76c..b735f076b095 100644 --- a/build.rs +++ b/build.rs @@ -30,6 +30,12 @@ fn main() -> anyhow::Result<()> { test_directory_module(out, "tests/misc_testsuite/threads", strategy)?; test_directory_module(out, "tests/misc_testsuite/memory64", strategy)?; test_directory_module(out, "tests/misc_testsuite/component-model", strategy)?; + + // NB: these are copied from upstream and updated to wasmtime's + // current version of `wast`. This local copy should go away when + // all of Wasmtime's tooling is updated and the upstream + // `testsuite` module is additionally updated. + test_directory_module(out, "tests/misc_testsuite/relaxed-simd", strategy)?; Ok(()) })?; @@ -64,6 +70,7 @@ fn main() -> anyhow::Result<()> { drop(Command::new("rustfmt").arg(&output).status()); Ok(()) } + fn test_directory_module( out: &mut String, path: impl AsRef, @@ -182,7 +189,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { // Currently the simd wasm proposal is not implemented in the riscv64 // backend so skip all tests which could use simd. "riscv64" => { - testsuite == "simd" || testname.contains("simd") || testname.contains("memory_multi") + testsuite.contains("simd") + || testname.contains("simd") + || testname.contains("memory_multi") } _ => false, diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 46fcff76f3b5..8692207c5481 100755 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -386,6 +386,27 @@ fn define_simd_lane_access( .operands_out(vec![a]), ); + ig.push( + Inst::new( + "x86_pshufb", + r#" + A vector swizzle lookalike which has the semantics of `pshufb` on x64. + + This instruction will permute the 8-bit lanes of `x` with the indices + specified in `y`. 
Each lane in the mask, `y`, uses the bottom four + bits for selecting the lane from `x` unless the most significant bit + is set, in which case the lane is zeroed. The output vector will have + the following contents when the element of `y` is in these ranges: + + * `[0, 127]` -> `x[y[i] % 16]` + * `[128, 255]` -> 0 + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let x = &Operand::new("x", TxN).with_doc("The vector to modify"); let y = &Operand::new("y", &TxN.lane_of()).with_doc("New lane value"); let Idx = &Operand::new("Idx", &imm.uimm8).with_doc("Lane index"); @@ -1436,7 +1457,7 @@ pub(crate) fn define( Conditional select of bits. For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit - in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also: + in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also: `select`, `vselect`. "#, &formats.ternary, @@ -1445,6 +1466,24 @@ pub(crate) fn define( .operands_out(vec![a]), ); + ig.push( + Inst::new( + "x86_blendv", + r#" + A bitselect-lookalike instruction except with the semantics of + `blendv`-related instructions on x86. + + This instruction will use the top bit of each lane in `c`, the condition + mask. If the bit is 1 then the corresponding lane from `x` is chosen. + Otherwise the corresponding lane from `y` is chosen. + + "#, + &formats.ternary, + ) + .operands_in(vec![c, x, y]) + .operands_out(vec![a]), + ); + let c = &Operand::new("c", &TxN.as_bool()).with_doc("Controlling vector"); let x = &Operand::new("x", TxN).with_doc("Value to use where `c` is true"); let y = &Operand::new("y", TxN).with_doc("Value to use where `c` is false"); @@ -1698,6 +1737,22 @@ pub(crate) fn define( .operands_out(vec![qa]), ); + ig.push( + Inst::new( + "x86_pmulhrsw", + r#" + A similar instruction to `sqmul_round_sat` except with the semantics + of x86's `pmulhrsw` instruction. 
+ + This is the same as `sqmul_round_sat` except when both input lanes are + `i16::MIN`. + "#, + &formats.binary, + ) + .operands_in(vec![qx, qy]) + .operands_out(vec![qa]), + ); + { // Integer division and remainder are scalar-only; most // hardware does not directly support vector integer division. @@ -3135,6 +3190,36 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let I8x16 = &TypeVar::new( + "I8x16", + "A SIMD vector type consisting of 16 lanes of 8-bit integers", + TypeSetBuilder::new() + .ints(8..8) + .simd_lanes(16..16) + .includes_scalars(false) + .build(), + ); + let x = &Operand::new("x", I8x16); + let y = &Operand::new("y", I8x16); + let a = &Operand::new("a", I16x8); + + ig.push( + Inst::new( + "x86_pmaddubsw", + r#" + An instruction with equivalent semantics to `pmaddubsw` on x86. + + This instruction will take signed bytes from the first argument and + multiply them against unsigned bytes in the second argument. Adjacent + pairs are then added, with saturation, into a 16-bit value and are packed + into the result. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let IntTo = &TypeVar::new( "IntTo", "A larger integer type with the same number of lanes", @@ -3378,6 +3463,20 @@ pub(crate) fn define( .operands_out(vec![a]), ); + ig.push( + Inst::new( + "x86_cvtt2dq", + r#" + A float-to-integer conversion instruction for vectors-of-floats which + has the same semantics as `cvttp{s,d}2dq` on x86. This specifically + returns `INT_MIN` for NaN or out-of-bounds lanes. 
+ "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + let Int = &TypeVar::new( "Int", "A scalar or vector integer type", diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index afd32dd40af6..65c628b430f1 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -214,6 +214,10 @@ impl TargetIsa for AArch64Backend { cs.set_skipdata(true)?; Ok(cs) } + + fn has_native_fma(&self) -> bool { + true + } } impl fmt::Display for AArch64Backend { diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index eef8233b3603..539bdb431723 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -315,6 +315,13 @@ pub trait TargetIsa: fmt::Display + Send + Sync { fn to_capstone(&self) -> Result { Err(capstone::Error::UnsupportedArch) } + + /// Returns whether this ISA has a native fused-multiply-and-add instruction + /// for floats. + /// + /// Currently this only returns false on x86 when some native features are + /// not detected. + fn has_native_fma(&self) -> bool; } /// Methods implemented for free for target ISA! 
diff --git a/cranelift/codegen/src/isa/riscv64/mod.rs b/cranelift/codegen/src/isa/riscv64/mod.rs index 53d90348172d..69711e0c8228 100644 --- a/cranelift/codegen/src/isa/riscv64/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/mod.rs @@ -186,6 +186,10 @@ impl TargetIsa for Riscv64Backend { cs.set_skipdata(true)?; Ok(cs) } + + fn has_native_fma(&self) -> bool { + true + } } impl fmt::Display for Riscv64Backend { diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 6a6dad94c3ab..9ba81d14ac39 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -186,6 +186,10 @@ impl TargetIsa for S390xBackend { Ok(cs) } + + fn has_native_fma(&self) -> bool { + true + } } impl fmt::Display for S390xBackend { diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index ec21968b381b..f6abbec48df2 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1212,6 +1212,20 @@ (decl pure vconst_all_ones_or_all_zeros () Constant) (extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) +;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I8X16 + (x86_blendv condition if_true if_false))) + (x64_pblendvb if_false if_true condition)) + +(rule (lower (has_type $I32X4 + (x86_blendv condition if_true if_false))) + (x64_blendvps if_false if_true condition)) + +(rule (lower (has_type $I64X2 + (x86_blendv condition if_true if_false))) + (x64_blendvpd if_false if_true condition)) + ;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty @ (multi_lane _bits _lanes) @@ -2145,6 +2159,11 @@ (rule (lower (debugtrap)) (side_effect (x64_hlt))) +;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I16X8 (x86_pmaddubsw x y))) + (x64_pmaddubsw y x)) + ;; 
Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fadd x y))) @@ -3169,6 +3188,11 @@ ;; values greater than max signed int. (x64_paddd tmp1 dst))) +;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I32X4 (x86_cvtt2dq val @ (value_type $F32X4)))) + (x64_cvttps2dq val)) + ;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I16X8 (iadd_pairwise x y))) @@ -3304,6 +3328,12 @@ (dst Xmm (x64_minpd a tmp1))) (x64_cvttpd2dq dst))) +;; This rule is a special case for handling the translation of the wasm op +;; `i32x4.relaxed_trunc_f64x2_s_zero`. +(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (x86_cvtt2dq val)) + (vconst (u128_from_constant 0))))) + (x64_cvttpd2dq val)) + ;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 (unarrow a @ (value_type $I16X8) b))) @@ -3559,6 +3589,11 @@ (let ((mask Xmm (x64_paddusb mask (swizzle_zero_mask)))) (x64_pshufb src mask))) +;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (x86_pshufb src mask)) + (x64_pshufb src mask)) + ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Remove the extractlane instruction, leaving the float where it is. The upper @@ -3736,7 +3771,12 @@ (cmp Xmm (x64_pcmpeqw dst mask))) (x64_pxor dst cmp))) -;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (x86_pmulhrsw qx @ (value_type $I16X8) qy)) + (x64_pmulhrsw qx qy)) + +;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; TODO: currently we only lower a special case of `uunarrow` needed to support ;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation. 
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs index 7c085ad583b7..68f218fae44f 100644 --- a/cranelift/codegen/src/isa/x64/mod.rs +++ b/cranelift/codegen/src/isa/x64/mod.rs @@ -184,6 +184,10 @@ impl TargetIsa for X64Backend { .syntax(arch::x86::ArchSyntax::Att) .build() } + + fn has_native_fma(&self) -> bool { + self.x64_flags.use_fma() + } } impl fmt::Display for X64Backend { diff --git a/cranelift/filetests/filetests/wasm/aarch64-relaxed-simd.wat b/cranelift/filetests/filetests/wasm/aarch64-relaxed-simd.wat new file mode 100644 index 000000000000..b1a0bcd4cb87 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/aarch64-relaxed-simd.wat @@ -0,0 +1,87 @@ +;;! target = "aarch64" +;;! compile = true + +(module + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_s + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_u + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_s_zero + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_u_zero + ) + + (func (param v128 v128) (result v128) + local.get 0 + local.get 1 + i16x8.relaxed_dot_i8x16_i7x16_s + ) + + (func (param v128 v128 v128) (result v128) + local.get 0 + local.get 1 + local.get 2 + i32x4.relaxed_dot_i8x16_i7x16_add_s + ) +) + +;; function u0:0: +;; block0: +;; fcvtzs v0.4s, v0.4s +;; b label1 +;; block1: +;; ret +;; +;; function u0:1: +;; block0: +;; fcvtzu v0.4s, v0.4s +;; b label1 +;; block1: +;; ret +;; +;; function u0:2: +;; block0: +;; fcvtzs v4.2d, v0.2d +;; sqxtn v0.2s, v4.2d +;; b label1 +;; block1: +;; ret +;; +;; function u0:3: +;; block0: +;; fcvtzu v4.2d, v0.2d +;; uqxtn v0.2s, v4.2d +;; b label1 +;; block1: +;; ret +;; +;; function u0:4: +;; block0: +;; smull v6.8h, v0.8b, v1.8b +;; smull2 v7.8h, v0.16b, v1.16b +;; addp v0.8h, v6.8h, v7.8h +;; b label1 +;; block1: +;; ret +;; +;; function u0:5: +;; block0: +;; smull v17.8h, v0.8b, v1.8b +;; 
smull2 v18.8h, v0.16b, v1.16b +;; addp v17.8h, v17.8h, v18.8h +;; saddlp v17.4s, v17.8h +;; add v0.4s, v17.4s, v2.4s +;; b label1 +;; block1: +;; ret diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat new file mode 100644 index 000000000000..f3ae7c7358d5 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat @@ -0,0 +1,161 @@ +;;! target = "x86_64" +;;! compile = true +;;! relaxed_simd_deterministic = true +;;! settings = ["has_avx=true"] + +(module + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_s + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_u + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_s_zero + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_u_zero + ) + + (func (param v128 v128) (result v128) + local.get 0 + local.get 1 + i16x8.relaxed_dot_i8x16_i7x16_s + ) + + (func (param v128 v128 v128) (result v128) + local.get 0 + local.get 1 + local.get 2 + i32x4.relaxed_dot_i8x16_i7x16_add_s + ) +) + +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; vcmpps $0 %xmm0, %xmm0, %xmm3 +;; vandps %xmm0, %xmm3, %xmm5 +;; vpxor %xmm3, %xmm5, %xmm7 +;; vcvttps2dq %xmm5, %xmm9 +;; vpand %xmm9, %xmm7, %xmm11 +;; vpsrad %xmm11, $31, %xmm13 +;; vpxor %xmm13, %xmm9, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; xorps %xmm3, %xmm3, %xmm3 +;; vmaxps %xmm0, %xmm3, %xmm5 +;; vpcmpeqd %xmm3, %xmm3, 
%xmm7 +;; vpsrld %xmm7, $1, %xmm9 +;; vcvtdq2ps %xmm9, %xmm11 +;; vcvttps2dq %xmm5, %xmm13 +;; vsubps %xmm5, %xmm11, %xmm15 +;; vcmpps $2 %xmm11, %xmm15, %xmm1 +;; vcvttps2dq %xmm15, %xmm3 +;; vpxor %xmm3, %xmm1, %xmm5 +;; pxor %xmm7, %xmm7, %xmm7 +;; vpmaxsd %xmm5, %xmm7, %xmm9 +;; vpaddd %xmm9, %xmm13, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; vcmppd $0 %xmm0, %xmm0, %xmm3 +;; vandps %xmm3, const(0), %xmm5 +;; vminpd %xmm0, %xmm5, %xmm7 +;; vcvttpd2dq %xmm7, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; xorpd %xmm3, %xmm3, %xmm3 +;; vmaxpd %xmm0, %xmm3, %xmm5 +;; vminpd %xmm5, const(0), %xmm7 +;; vroundpd $3, %xmm7, %xmm9 +;; vaddpd %xmm9, const(1), %xmm11 +;; vshufps $136 %xmm11, %xmm3, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; vpmovsxbw %xmm0, %xmm10 +;; vpmovsxbw %xmm1, %xmm12 +;; vpmullw %xmm10, %xmm12, %xmm14 +;; vpalignr $8 %xmm0, %xmm0, %xmm8 +;; vpmovsxbw %xmm8, %xmm10 +;; vpalignr $8 %xmm1, %xmm1, %xmm12 +;; vpmovsxbw %xmm12, %xmm15 +;; vpmullw %xmm10, %xmm15, %xmm0 +;; vphaddw %xmm14, %xmm0, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind 
DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; vpmovsxbw %xmm0, %xmm13 +;; vpmovsxbw %xmm1, %xmm15 +;; vpmullw %xmm13, %xmm15, %xmm3 +;; vpalignr $8 %xmm0, %xmm0, %xmm11 +;; vpmovsxbw %xmm11, %xmm13 +;; vpalignr $8 %xmm1, %xmm1, %xmm15 +;; vpmovsxbw %xmm15, %xmm1 +;; vpmullw %xmm13, %xmm1, %xmm4 +;; vphaddw %xmm3, %xmm4, %xmm15 +;; vpmaddwd %xmm15, const(0), %xmm15 +;; vpaddd %xmm15, %xmm2, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat new file mode 100644 index 000000000000..43586fcb2c47 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat @@ -0,0 +1,140 @@ +;;! target = "x86_64" +;;! compile = true + +(module + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_s + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f32x4_u + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_s_zero + ) + + (func (param v128) (result v128) + local.get 0 + i32x4.relaxed_trunc_f64x2_u_zero + ) + + (func (param v128 v128) (result v128) + local.get 0 + local.get 1 + i16x8.relaxed_dot_i8x16_i7x16_s + ) + + (func (param v128 v128 v128) (result v128) + local.get 0 + local.get 1 + local.get 2 + i32x4.relaxed_dot_i8x16_i7x16_add_s + ) +) + +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; cvttps2dq %xmm0, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; xorps %xmm6, 
%xmm6, %xmm6 +;; movdqa %xmm0, %xmm10 +;; maxps %xmm10, %xmm6, %xmm10 +;; pcmpeqd %xmm6, %xmm6, %xmm6 +;; psrld %xmm6, $1, %xmm6 +;; cvtdq2ps %xmm6, %xmm14 +;; cvttps2dq %xmm10, %xmm13 +;; subps %xmm10, %xmm14, %xmm10 +;; cmpps $2, %xmm14, %xmm10, %xmm14 +;; cvttps2dq %xmm10, %xmm0 +;; pxor %xmm0, %xmm14, %xmm0 +;; pxor %xmm7, %xmm7, %xmm7 +;; pmaxsd %xmm0, %xmm7, %xmm0 +;; paddd %xmm0, %xmm13, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; cvttpd2dq %xmm0, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; xorpd %xmm3, %xmm3, %xmm3 +;; movdqa %xmm0, %xmm6 +;; maxpd %xmm6, %xmm3, %xmm6 +;; minpd %xmm6, const(0), %xmm6 +;; roundpd $3, %xmm6, %xmm0 +;; addpd %xmm0, const(1), %xmm0 +;; shufps $136, %xmm0, %xmm3, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; movdqa %xmm1, %xmm4 +;; pmaddubsw %xmm4, %xmm0, %xmm4 +;; movdqa %xmm4, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; movdqa %xmm0, %xmm8 +;; movdqa %xmm1, %xmm0 +;; pmaddubsw %xmm0, %xmm8, %xmm0 +;; pmaddwd %xmm0, 
const(0), %xmm0 +;; paddd %xmm0, %xmm2, %xmm0 +;; jmp label1 +;; block1: +;; movq %rbp, %rsp +;; popq %rbp +;; ret diff --git a/cranelift/filetests/src/test_wasm/config.rs b/cranelift/filetests/src/test_wasm/config.rs index 4b8ad4ad5236..6ba4f3c66a30 100644 --- a/cranelift/filetests/src/test_wasm/config.rs +++ b/cranelift/filetests/src/test_wasm/config.rs @@ -29,6 +29,9 @@ pub struct TestConfig { #[serde(default)] pub heaps: Vec, + + #[serde(default)] + pub relaxed_simd_deterministic: bool, } impl TestConfig { diff --git a/cranelift/filetests/src/test_wasm/env.rs b/cranelift/filetests/src/test_wasm/env.rs index 5d363aa412ab..8a0ea656b9a4 100644 --- a/cranelift/filetests/src/test_wasm/env.rs +++ b/cranelift/filetests/src/test_wasm/env.rs @@ -82,6 +82,7 @@ impl<'data> ModuleEnvironment<'data> for ModuleEnv { wasmparser::WasmFeatures { memory64: true, multi_memory: true, + relaxed_simd: true, ..self.inner.wasm_features() } } @@ -613,4 +614,12 @@ impl<'a> FuncEnvironment for FuncEnv<'a> { { self.inner.heaps() } + + fn relaxed_simd_deterministic(&self) -> bool { + self.config.relaxed_simd_deterministic + } + + fn is_x86(&self) -> bool { + self.config.target.contains("x86_64") + } } diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 155e5dd7d1d9..56a894fd6275 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -1358,6 +1358,11 @@ where Opcode::GetFramePointer => unimplemented!("GetFramePointer"), Opcode::GetStackPointer => unimplemented!("GetStackPointer"), Opcode::GetReturnAddress => unimplemented!("GetReturnAddress"), + Opcode::X86Pshufb => unimplemented!("X86Pshufb"), + Opcode::X86Blendv => unimplemented!("X86Blendv"), + Opcode::X86Pmulhrsw => unimplemented!("X86Pmulhrsw"), + Opcode::X86Pmaddubsw => unimplemented!("X86Pmaddubsw"), + Opcode::X86Cvtt2dq => unimplemented!("X86Cvtt2dq"), }) } diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 
cb7b29fffb41..3f506374b622 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1778,13 +1778,10 @@ pub fn translate_operator( state.push1(builder.ins().sshr(bitcast_a, b)) } Operator::V128Bitselect => { - let (a, b, c) = state.pop3(); - let bitcast_a = optionally_bitcast_vector(a, I8X16, builder); - let bitcast_b = optionally_bitcast_vector(b, I8X16, builder); - let bitcast_c = optionally_bitcast_vector(c, I8X16, builder); + let (a, b, c) = pop3_with_bitcast(state, I8X16, builder); // The CLIF operand ordering is slightly different and the types of all three // operands must match (hence the bitcast). - state.push1(builder.ins().bitselect(bitcast_c, bitcast_a, bitcast_b)) + state.push1(builder.ins().bitselect(c, a, b)) } Operator::V128AnyTrue => { let a = pop1_with_bitcast(state, type_of(op), builder); @@ -1938,11 +1935,23 @@ pub fn translate_operator( state.push1(builder.ins().snarrow(converted_a, zero)); } - Operator::I32x4TruncSatF32x4U => { + + // FIXME(#5913): the relaxed instructions here are translated the same + // as the saturating instructions, even when the code generator + // configuration allows for different semantics across hosts. On x86, + // however, it's theoretically possible to have a slightly more optimal + // lowering which accounts for NaN differently, although the lowering is + // still not trivial (e.g. one instruction). At this time the + // more-optimal-but-still-large lowering for x86 is not implemented so + // the relaxed instructions are listed here instead of down below with + // the other relaxed instructions. An x86-specific implementation (or + // perhaps for other backends too) should be added and the codegen for + // the relaxed instruction should conditionally be different. 
+ Operator::I32x4RelaxedTruncF32x4U | Operator::I32x4TruncSatF32x4U => { let a = pop1_with_bitcast(state, F32X4, builder); state.push1(builder.ins().fcvt_to_uint_sat(I32X4, a)) } - Operator::I32x4TruncSatF64x2UZero => { + Operator::I32x4RelaxedTruncF64x2UZero | Operator::I32x4TruncSatF64x2UZero => { let a = pop1_with_bitcast(state, F64X2, builder); let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a); let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); @@ -1950,6 +1959,7 @@ pub fn translate_operator( state.push1(builder.ins().uunarrow(converted_a, zero)); } + Operator::I8x16NarrowI16x8S => { let (a, b) = pop2_with_bitcast(state, I16X8, builder); state.push1(builder.ins().snarrow(a, b)) @@ -2156,27 +2166,175 @@ pub fn translate_operator( op )); } - Operator::I8x16RelaxedSwizzle - | Operator::I32x4RelaxedTruncF32x4S - | Operator::I32x4RelaxedTruncF32x4U - | Operator::I32x4RelaxedTruncF64x2SZero - | Operator::I32x4RelaxedTruncF64x2UZero - | Operator::F32x4RelaxedMadd - | Operator::F32x4RelaxedNmadd - | Operator::F64x2RelaxedMadd - | Operator::F64x2RelaxedNmadd - | Operator::I8x16RelaxedLaneselect + + Operator::F32x4RelaxedMax | Operator::F64x2RelaxedMax => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics match the `fmax` instruction, or + // the `fAAxBB.max` wasm instruction. + builder.ins().fmax(a, b) + } else { + builder.ins().fmax_pseudo(a, b) + }, + ) + } + + Operator::F32x4RelaxedMin | Operator::F64x2RelaxedMin => { + let (a, b) = pop2_with_bitcast(state, type_of(op), builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics match the `fmin` instruction, or + // the `fAAxBB.min` wasm instruction. 
+ builder.ins().fmin(a, b) + } else { + builder.ins().fmin_pseudo(a, b) + }, + ); + } + + Operator::I8x16RelaxedSwizzle => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics match the `i8x16.swizzle` + // instruction which is the CLIF `swizzle`. + builder.ins().swizzle(a, b) + } else { + builder.ins().x86_pshufb(a, b) + }, + ); + } + + Operator::F32x4RelaxedMadd | Operator::F64x2RelaxedMadd => { + let (a, b, c) = pop3_with_bitcast(state, type_of(op), builder); + state.push1( + if environ.relaxed_simd_deterministic() || environ.has_native_fma() { + // Deterministic semantics are "fused multiply and add" + // which the CLIF `fma` guarantees. + builder.ins().fma(a, b, c) + } else { + let mul = builder.ins().fmul(a, b); + builder.ins().fadd(mul, c) + }, + ); + } + Operator::F32x4RelaxedNmadd | Operator::F64x2RelaxedNmadd => { + let (a, b, c) = pop3_with_bitcast(state, type_of(op), builder); + let a = builder.ins().fneg(a); + state.push1( + if environ.relaxed_simd_deterministic() || environ.has_native_fma() { + // Deterministic semantics are "fused multiply and add" + // which the CLIF `fma` guarantees. 
+ builder.ins().fma(a, b, c) + } else { + let mul = builder.ins().fmul(a, b); + builder.ins().fadd(mul, c) + }, + ); + } + + Operator::I8x16RelaxedLaneselect | Operator::I16x8RelaxedLaneselect | Operator::I32x4RelaxedLaneselect - | Operator::I64x2RelaxedLaneselect - | Operator::F32x4RelaxedMin - | Operator::F32x4RelaxedMax - | Operator::F64x2RelaxedMin - | Operator::F64x2RelaxedMax - | Operator::I16x8RelaxedQ15mulrS - | Operator::I16x8RelaxedDotI8x16I7x16S - | Operator::I32x4RelaxedDotI8x16I7x16AddS => { - return Err(wasm_unsupported!("proposed relaxed-simd operator {:?}", op)); + | Operator::I64x2RelaxedLaneselect => { + let ty = type_of(op); + let (a, b, c) = pop3_with_bitcast(state, ty, builder); + // Note that the variable swaps here are intentional due to + // the difference of the order of the wasm op and the clif + // op. + // + // Additionally note that even on x86 the I16X8 type uses the + // `bitselect` instruction since x86 has no corresponding + // `blendv`-style instruction for 16-bit operands. + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() || ty == I16X8 { + // Deterministic semantics are a `bitselect` along the lines + // of the wasm `v128.bitselect` instruction. + builder.ins().bitselect(c, a, b) + } else { + builder.ins().x86_blendv(c, a, b) + }, + ); + } + + Operator::I32x4RelaxedTruncF32x4S => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics are to match the + // `i32x4.trunc_sat_f32x4_s` instruction. + builder.ins().fcvt_to_sint_sat(I32X4, a) + } else { + builder.ins().x86_cvtt2dq(I32X4, a) + }, + ) + } + Operator::I32x4RelaxedTruncF64x2SZero => { + let a = pop1_with_bitcast(state, F64X2, builder); + let converted_a = if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics are to match the + // `i32x4.trunc_sat_f64x2_s_zero` instruction. 
+ builder.ins().fcvt_to_sint_sat(I64X2, a) + } else { + builder.ins().x86_cvtt2dq(I64X2, a) + }; + let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into()); + let zero = builder.ins().vconst(I64X2, handle); + + state.push1(builder.ins().snarrow(converted_a, zero)); + } + Operator::I16x8RelaxedQ15mulrS => { + let (a, b) = pop2_with_bitcast(state, I16X8, builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics are to match the + // `i16x8.q15mulr_sat_s` instruction. + builder.ins().sqmul_round_sat(a, b) + } else { + builder.ins().x86_pmulhrsw(a, b) + }, + ); + } + Operator::I16x8RelaxedDotI8x16I7x16S => { + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + state.push1( + if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics are to treat both operands as + // signed integers and perform the dot product. + let alo = builder.ins().swiden_low(a); + let blo = builder.ins().swiden_low(b); + let lo = builder.ins().imul(alo, blo); + let ahi = builder.ins().swiden_high(a); + let bhi = builder.ins().swiden_high(b); + let hi = builder.ins().imul(ahi, bhi); + builder.ins().iadd_pairwise(lo, hi) + } else { + builder.ins().x86_pmaddubsw(a, b) + }, + ); + } + + Operator::I32x4RelaxedDotI8x16I7x16AddS => { + let c = pop1_with_bitcast(state, I32X4, builder); + let (a, b) = pop2_with_bitcast(state, I8X16, builder); + let dot = if environ.relaxed_simd_deterministic() || !environ.is_x86() { + // Deterministic semantics are to treat both operands as + // signed integers and perform the dot product. 
+ let alo = builder.ins().swiden_low(a); + let blo = builder.ins().swiden_low(b); + let lo = builder.ins().imul(alo, blo); + let ahi = builder.ins().swiden_high(a); + let bhi = builder.ins().swiden_high(b); + let hi = builder.ins().imul(ahi, bhi); + builder.ins().iadd_pairwise(lo, hi) + } else { + builder.ins().x86_pmaddubsw(a, b) + }; + let dotlo = builder.ins().swiden_low(dot); + let dothi = builder.ins().swiden_high(dot); + let dot32 = builder.ins().iadd_pairwise(dotlo, dothi); + state.push1(builder.ins().iadd(dot32, c)); } Operator::CallRef { .. } @@ -2945,7 +3103,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I8x16MaxU | Operator::I8x16AvgrU | Operator::I8x16Bitmask - | Operator::I8x16Popcnt => I8X16, + | Operator::I8x16Popcnt + | Operator::I8x16RelaxedLaneselect => I8X16, Operator::I16x8Splat | Operator::V128Load16Splat { .. } @@ -2982,7 +3141,8 @@ fn type_of(operator: &Operator) -> Type { | Operator::I16x8MaxU | Operator::I16x8AvgrU | Operator::I16x8Mul - | Operator::I16x8Bitmask => I16X8, + | Operator::I16x8Bitmask + | Operator::I16x8RelaxedLaneselect => I16X8, Operator::I32x4Splat | Operator::V128Load32Splat { .. } @@ -3016,6 +3176,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::I32x4Bitmask | Operator::I32x4TruncSatF32x4S | Operator::I32x4TruncSatF32x4U + | Operator::I32x4RelaxedLaneselect | Operator::V128Load32Zero { .. } => I32X4, Operator::I64x2Splat @@ -3040,6 +3201,7 @@ fn type_of(operator: &Operator) -> Type { | Operator::I64x2Sub | Operator::I64x2Mul | Operator::I64x2Bitmask + | Operator::I64x2RelaxedLaneselect | Operator::V128Load64Zero { .. 
} => I64X2, Operator::F32x4Splat @@ -3067,7 +3229,11 @@ fn type_of(operator: &Operator) -> Type { | Operator::F32x4Ceil | Operator::F32x4Floor | Operator::F32x4Trunc - | Operator::F32x4Nearest => F32X4, + | Operator::F32x4Nearest + | Operator::F32x4RelaxedMax + | Operator::F32x4RelaxedMin + | Operator::F32x4RelaxedMadd + | Operator::F32x4RelaxedNmadd => F32X4, Operator::F64x2Splat | Operator::F64x2ExtractLane { .. } @@ -3092,7 +3258,11 @@ fn type_of(operator: &Operator) -> Type { | Operator::F64x2Ceil | Operator::F64x2Floor | Operator::F64x2Trunc - | Operator::F64x2Nearest => F64X2, + | Operator::F64x2Nearest + | Operator::F64x2RelaxedMax + | Operator::F64x2RelaxedMin + | Operator::F64x2RelaxedMadd + | Operator::F64x2RelaxedNmadd => F64X2, _ => unimplemented!( "Currently only SIMD instructions are mapped to their return type; the \ @@ -3219,6 +3389,18 @@ fn pop2_with_bitcast( (bitcast_a, bitcast_b) } +fn pop3_with_bitcast( + state: &mut FuncTranslationState, + needed_type: Type, + builder: &mut FunctionBuilder, +) -> (Value, Value, Value) { + let (a, b, c) = state.pop3(); + let bitcast_a = optionally_bitcast_vector(a, needed_type, builder); + let bitcast_b = optionally_bitcast_vector(b, needed_type, builder); + let bitcast_c = optionally_bitcast_vector(c, needed_type, builder); + (bitcast_a, bitcast_b, bitcast_c) +} + fn bitcast_arguments<'a>( builder: &FunctionBuilder, arguments: &'a mut [Value], diff --git a/cranelift/wasm/src/environ/spec.rs b/cranelift/wasm/src/environ/spec.rs index 1b64ec811a62..03121c8c9cfc 100644 --- a/cranelift/wasm/src/environ/spec.rs +++ b/cranelift/wasm/src/environ/spec.rs @@ -525,6 +525,27 @@ pub trait FuncEnvironment: TargetEnvironment { /// Returns the target ISA's condition to check for unsigned addition /// overflowing. 
fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC; + + /// Whether or not to force relaxed simd instructions to have deterministic + /// lowerings meaning they will produce the same results across all hosts, + /// regardless of the cost to performance. + fn relaxed_simd_deterministic(&self) -> bool { + true + } + + /// Whether or not the target being translated for has a native fma + /// instruction. If it does not then when relaxed simd isn't deterministic + /// the translation of the `f32x4.relaxed_madd` instruction, for example, + /// will do a multiplication and then an add instead of the fused version. + fn has_native_fma(&self) -> bool { + false + } + + /// Returns whether this is an x86 target, which may alter lowerings of + /// relaxed simd instructions. + fn is_x86(&self) -> bool { + false + } } /// An object satisfying the `ModuleEnvironment` trait can be passed as argument to the diff --git a/crates/cli-flags/src/lib.rs b/crates/cli-flags/src/lib.rs index 7e40bff3aa8d..3488096acefd 100644 --- a/crates/cli-flags/src/lib.rs +++ b/crates/cli-flags/src/lib.rs @@ -35,6 +35,10 @@ pub const SUPPORTED_WASM_FEATURES: &[(&str, &str)] = &[ ("multi-value", "enables support for multi-value functions"), ("reference-types", "enables support for reference types"), ("simd", "enables support for proposed SIMD instructions"), + ( + "relaxed-simd", + "enables support for the relaxed simd proposal", + ), ("threads", "enables support for WebAssembly threads"), ("memory64", "enables support for 64-bit memories"), #[cfg(feature = "component-model")] @@ -235,6 +239,17 @@ pub struct CommonOptions { /// stack overflow is reported. #[clap(long)] pub max_wasm_stack: Option<usize>, + + /// Whether or not to force deterministic and host-independent behavior of + /// the relaxed-simd instructions.
+ /// + /// By default these instructions may have architecture-specific behavior as + /// allowed by the specification, but this can be used to force the behavior + /// of these instructions to match the deterministic behavior classified in + /// the specification. Note that enabling this option may come at a + /// performance cost. + #[clap(long)] + pub relaxed_simd_deterministic: bool, } impl CommonOptions { @@ -329,12 +344,15 @@ impl CommonOptions { config.max_wasm_stack(max); } + config.relaxed_simd_deterministic(self.relaxed_simd_deterministic); + Ok(config) } pub fn enable_wasm_features(&self, config: &mut Config) { let WasmFeatures { simd, + relaxed_simd, bulk_memory, reference_types, multi_value, @@ -348,6 +366,9 @@ impl CommonOptions { if let Some(enable) = simd { config.wasm_simd(enable); } + if let Some(enable) = relaxed_simd { + config.wasm_relaxed_simd(enable); + } if let Some(enable) = bulk_memory { config.wasm_bulk_memory(enable); } @@ -400,6 +421,7 @@ pub struct WasmFeatures { pub multi_value: Option<bool>, pub bulk_memory: Option<bool>, pub simd: Option<bool>, + pub relaxed_simd: Option<bool>, pub threads: Option<bool>, pub multi_memory: Option<bool>, pub memory64: Option<bool>, @@ -450,6 +472,7 @@ fn parse_wasm_features(features: &str) -> Result<WasmFeatures> { multi_value: all.or(values["multi-value"]), bulk_memory: all.or(values["bulk-memory"]), simd: all.or(values["simd"]), + relaxed_simd: all.or(values["relaxed-simd"]), threads: all.or(values["threads"]), multi_memory: all.or(values["multi-memory"]), memory64: all.or(values["memory64"]), @@ -560,6 +583,7 @@ mod test { multi_value, bulk_memory, simd, + relaxed_simd, threads, multi_memory, memory64, @@ -572,6 +596,7 @@ mod test { assert_eq!(threads, Some(true)); assert_eq!(multi_memory, Some(true)); assert_eq!(memory64, Some(true)); + assert_eq!(relaxed_simd, Some(true)); Ok(()) } @@ -585,6 +610,7 @@ mod test { multi_value, bulk_memory, simd, + relaxed_simd, threads, multi_memory, memory64, @@ -597,6 +623,7 @@ mod test { assert_eq!(threads,
Some(false)); assert_eq!(multi_memory, Some(false)); assert_eq!(memory64, Some(false)); + assert_eq!(relaxed_simd, Some(false)); Ok(()) } @@ -613,6 +640,7 @@ mod test { multi_value, bulk_memory, simd, + relaxed_simd, threads, multi_memory, memory64, @@ -625,6 +653,7 @@ mod test { assert_eq!(threads, None); assert_eq!(multi_memory, Some(true)); assert_eq!(memory64, Some(true)); + assert_eq!(relaxed_simd, None); Ok(()) } @@ -662,6 +691,7 @@ mod test { feature_test!(test_multi_value_feature, multi_value, "multi-value"); feature_test!(test_bulk_memory_feature, bulk_memory, "bulk-memory"); feature_test!(test_simd_feature, simd, "simd"); + feature_test!(test_relaxed_simd_feature, relaxed_simd, "relaxed-simd"); feature_test!(test_threads_feature, threads, "threads"); feature_test!(test_multi_memory_feature, multi_memory, "multi-memory"); feature_test!(test_memory64_feature, memory64, "memory64"); diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 205c43b2cd05..55272b1b9804 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2153,4 +2153,16 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { self.isa.unsigned_add_overflow_condition() } + + fn relaxed_simd_deterministic(&self) -> bool { + self.tunables.relaxed_simd_deterministic + } + + fn has_native_fma(&self) -> bool { + self.isa.has_native_fma() + } + + fn is_x86(&self) -> bool { + self.isa.triple().architecture == target_lexicon::Architecture::X86_64 + } } diff --git a/crates/cranelift/src/obj.rs b/crates/cranelift/src/obj.rs index a596a1a75ceb..d5983598aa56 100644 --- a/crates/cranelift/src/obj.rs +++ b/crates/cranelift/src/obj.rs @@ -545,6 +545,8 @@ fn libcall_name(call: LibCall) -> &'static str { LibCall::CeilF64 => LC::CeilF64, LibCall::TruncF32 => LC::TruncF32, LibCall::TruncF64 => LC::TruncF64, + LibCall::FmaF32 => LC::FmaF32, + 
LibCall::FmaF64 => LC::FmaF64, _ => panic!("unknown libcall to give a name to: {call:?}"), }; other.symbol() diff --git a/crates/environ/src/obj.rs b/crates/environ/src/obj.rs index efd48f0e2f89..6e39cc319f9f 100644 --- a/crates/environ/src/obj.rs +++ b/crates/environ/src/obj.rs @@ -166,4 +166,6 @@ libcalls! { CeilF64 = "libcall_ceilf64" TruncF32 = "libcall_truncf32" TruncF64 = "libcall_truncf64" + FmaF32 = "libcall_fmaf32" + FmaF64 = "libcall_fmaf64" } diff --git a/crates/environ/src/tunables.rs b/crates/environ/src/tunables.rs index 59992f60d6ed..4b37cd08b935 100644 --- a/crates/environ/src/tunables.rs +++ b/crates/environ/src/tunables.rs @@ -45,6 +45,10 @@ pub struct Tunables { /// Flag for the component module whether adapter modules have debug /// assertions baked into them. pub debug_adapter_modules: bool, + + /// Whether or not lowerings for relaxed simd instructions are forced to + /// be deterministic. + pub relaxed_simd_deterministic: bool, } impl Default for Tunables { @@ -91,6 +95,7 @@ impl Default for Tunables { guard_before_linear_memory: true, generate_address_map: true, debug_adapter_modules: false, + relaxed_simd_deterministic: false, } } } diff --git a/crates/jit/src/code_memory.rs b/crates/jit/src/code_memory.rs index f4bd18a2d6a7..128c3acec3e5 100644 --- a/crates/jit/src/code_memory.rs +++ b/crates/jit/src/code_memory.rs @@ -296,6 +296,8 @@ impl CodeMemory { obj::LibCall::CeilF64 => libcalls::relocs::ceilf64 as usize, obj::LibCall::TruncF32 => libcalls::relocs::truncf32 as usize, obj::LibCall::TruncF64 => libcalls::relocs::truncf64 as usize, + obj::LibCall::FmaF32 => libcalls::relocs::fmaf32 as usize, + obj::LibCall::FmaF64 => libcalls::relocs::fmaf64 as usize, }; *self.mmap.as_mut_ptr().add(offset).cast::<usize>() = libcall; } diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs index 2ce3bfc7d3aa..ee04b146b44f 100644 --- a/crates/runtime/src/libcalls.rs +++ b/crates/runtime/src/libcalls.rs @@ -584,4 +584,12 @@ pub mod relocs {
(x.abs() + TOINT_64 - TOINT_64).copysign(x) } } + + pub extern "C" fn fmaf32(a: f32, b: f32, c: f32) -> f32 { + a.mul_add(b, c) + } + + pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 { + a.mul_add(b, c) + } } diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs index eae23a75733b..44c3367fa931 100644 --- a/crates/wasmtime/src/config.rs +++ b/crates/wasmtime/src/config.rs @@ -682,6 +682,56 @@ impl Config { self } + /// Configures whether the WebAssembly Relaxed SIMD proposal will be + /// enabled for compilation. + /// + /// The [WebAssembly Relaxed SIMD proposal][proposal] is not, at the time of + /// this writing, at stage 4. The relaxed SIMD proposal adds new + /// instructions to WebAssembly which, for some specific inputs, are allowed + /// to produce different results on different hosts. More-or-less this + /// proposal enables exposing platform-specific semantics of SIMD + /// instructions in a controlled fashion to a WebAssembly program. From an + /// embedder's perspective this means that WebAssembly programs may execute + /// differently depending on whether the host is x86_64 or AArch64, for + /// example. + /// + /// By default Wasmtime lowers relaxed SIMD instructions to the fastest + /// lowering for the platform it's running on. This means that, by default, + /// some relaxed SIMD instructions may have different results for the same + /// inputs across x86_64 and AArch64. This behavior can be disabled through + /// the [`Config::relaxed_simd_deterministic`] option which will force + /// deterministic behavior across all platforms, as classified by the + /// specification, at the cost of performance. + /// + /// This is `false` by default. 
+ /// + /// [proposal]: https://github.com/webassembly/relaxed-simd + pub fn wasm_relaxed_simd(&mut self, enable: bool) -> &mut Self { + self.features.relaxed_simd = enable; + self + } + + /// This option can be used to control the behavior of the [relaxed SIMD + /// proposal's][proposal] instructions. + /// + /// The relaxed SIMD proposal introduces instructions that are allowed to + /// have different behavior on different architectures, primarily to afford + /// an efficient implementation on all architectures. This means, however, + /// that the same module may execute differently on one host than another, + /// which typically is not otherwise the case. This option is provided to + /// force Wasmtime to generate deterministic code for all relaxed simd + /// instructions, at the cost of performance, for all architectures. When + /// this option is enabled then the deterministic behavior of all + /// instructions in the relaxed SIMD proposal is selected. + /// + /// This is `false` by default. + /// + /// [proposal]: https://github.com/webassembly/relaxed-simd + pub fn relaxed_simd_deterministic(&mut self, enable: bool) -> &mut Self { + self.tunables.relaxed_simd_deterministic = enable; + self + } + /// Configures whether the [WebAssembly bulk memory operations /// proposal][proposal] will be enabled for compilation. 
/// @@ -1560,6 +1610,10 @@ impl Config { } } + if self.features.relaxed_simd && !self.features.simd { + bail!("cannot disable the simd proposal but enable the relaxed simd proposal"); + } + // Apply compiler settings and flags for (k, v) in self.compiler_config.settings.iter() { compiler.set(k, v)?; @@ -1608,6 +1662,7 @@ impl fmt::Debug for Config { .field("wasm_reference_types", &self.features.reference_types) .field("wasm_bulk_memory", &self.features.bulk_memory) .field("wasm_simd", &self.features.simd) + .field("wasm_relaxed_simd", &self.features.relaxed_simd) .field("wasm_multi_value", &self.features.multi_value) .field( "static_memory_maximum_size", diff --git a/crates/wasmtime/src/engine/serialization.rs b/crates/wasmtime/src/engine/serialization.rs index 198273f5e79d..36063b49509f 100644 --- a/crates/wasmtime/src/engine/serialization.rs +++ b/crates/wasmtime/src/engine/serialization.rs @@ -309,6 +309,7 @@ impl Metadata { epoch_interruption, static_memory_bound_is_maximum, guard_before_linear_memory, + relaxed_simd_deterministic, // This doesn't affect compilation, it's just a runtime setting. 
dynamic_memory_growth_reserve: _, @@ -364,6 +365,11 @@ impl Metadata { other.guard_before_linear_memory, "guard before linear memory", )?; + Self::check_bool( + relaxed_simd_deterministic, + other.relaxed_simd_deterministic, + "relaxed simd deterministic semantics", + )?; Ok(()) } diff --git a/crates/wast/src/core.rs b/crates/wast/src/core.rs index e87d0b1f7e15..46c1f8c95759 100644 --- a/crates/wast/src/core.rs +++ b/crates/wast/src/core.rs @@ -39,6 +39,14 @@ fn extract_lane_as_i64(bytes: u128, lane: usize) -> i64 { pub fn match_val(actual: &Val, expected: &WastRetCore) -> Result<()> { match (actual, expected) { + (_, WastRetCore::Either(expected)) => { + for expected in expected { + if match_val(actual, expected).is_ok() { + return Ok(()); + } + } + match_val(actual, &expected[0]) + } (Val::I32(a), WastRetCore::I32(b)) => match_int(a, b), (Val::I64(a), WastRetCore::I64(b)) => match_int(a, b), // Note that these float comparisons are comparing bits, not float diff --git a/tests/all/wast.rs b/tests/all/wast.rs index 5f274b5be689..5da3e4cbdde1 100644 --- a/tests/all/wast.rs +++ b/tests/all/wast.rs @@ -30,6 +30,7 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()> let multi_memory = feature_found(wast, "multi-memory"); let threads = feature_found(wast, "threads"); let reference_types = !(threads && feature_found(wast, "proposals")); + let relaxed_simd = feature_found(wast, "relaxed-simd"); let use_shared_memory = feature_found_src(&wast_bytes, "shared_memory") || feature_found_src(&wast_bytes, "shared)"); @@ -43,6 +44,7 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()> .wasm_threads(threads) .wasm_memory64(memory64) .wasm_reference_types(reference_types) + .wasm_relaxed_simd(relaxed_simd) .cranelift_debug_verifier(true); cfg.wasm_component_model(feature_found(wast, "component-model")); @@ -108,11 +110,26 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()> None }; - let 
store = Store::new(&Engine::new(&cfg)?, ()); - let mut wast_context = WastContext::new(store); + let mut engines = vec![(Engine::new(&cfg)?, "default")]; - wast_context.register_spectest(use_shared_memory)?; - wast_context.run_buffer(wast.to_str().unwrap(), &wast_bytes)?; + // For tests that use relaxed-simd test both the default engine and the + // guaranteed-deterministic engine to ensure that both the 'native' + // semantics of the instructions plus the canonical semantics work. + if relaxed_simd { + engines.push(( + Engine::new(cfg.relaxed_simd_deterministic(true))?, + "deterministic", + )); + } + + for (engine, desc) in engines { + let store = Store::new(&engine, ()); + let mut wast_context = WastContext::new(store); + wast_context.register_spectest(use_shared_memory)?; + wast_context + .run_buffer(wast.to_str().unwrap(), &wast_bytes) + .with_context(|| format!("failed to run spec test with {desc} engine"))?; + } Ok(()) } diff --git a/tests/misc_testsuite/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/tests/misc_testsuite/relaxed-simd/i16x8_relaxed_q15mulr_s.wast new file mode 100644 index 000000000000..265d99160da6 --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/i16x8_relaxed_q15mulr_s.wast @@ -0,0 +1,26 @@ +;; Tests for i16x8.relaxed_q15mulr_s. 
+(module + (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))) + + (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)) + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))) +) + +;; INT16_MIN = -32768 +(assert_return (invoke "i16x8.relaxed_q15mulr_s" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0) + (v128.const i16x8 32767 32767 32766 0 0 0 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + diff --git a/tests/misc_testsuite/relaxed-simd/i32x4_relaxed_trunc.wast b/tests/misc_testsuite/relaxed-simd/i32x4_relaxed_trunc.wast new file mode 100644 index 000000000000..889542c6ad0d --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/i32x4_relaxed_trunc.wast @@ -0,0 +1,123 @@ +;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero. 
+ +(module + (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0))) + (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))) + + (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_s (local.get 0)) + (i32x4.relaxed_trunc_f32x4_s (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_u (local.get 0)) + (i32x4.relaxed_trunc_f32x4_u (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))) +) + +;; Test some edge cases around min/max to ensure that the instruction either +;; saturates correctly or returns INT_MIN. +;; +;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is +;; 2147483647 but that is not representable in a `f32` since it requires 31 bits +;; when a f32 has only 24 bits available. This means that the closest integers +;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning +;; that the INT_MAX test case cannot be tested. 
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + ;; INT32_MIN <INT32_MIN >INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647) + (v128.const i32x4 -2147483648 -2147483648 2 -2147483648))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + ;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1 + (v128.const f32x4 0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0 4294967040 0xffffffff) + (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 2147483647 0 0) + (v128.const i32x4 -2147483648 -2147483648 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0xffffffff 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0 0 0xffffffff 0xffffffff))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+ +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + ;; INT32_MIN <INT32_MIN INT32_MAX >INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + ;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1 + (v128.const f32x4 0 -1.0 4294967040.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/misc_testsuite/relaxed-simd/i8x16_relaxed_swizzle.wast b/tests/misc_testsuite/relaxed-simd/i8x16_relaxed_swizzle.wast new file mode 100644 index 000000000000..1b20668d3d93 --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/i8x16_relaxed_swizzle.wast @@ -0,0 +1,44 @@ +;; Tests for relaxed i8x16 swizzle.
+ +(module + (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1))) + + (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)) + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 or modulo 16 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+ +;; out of range, returns 0 or modulo 16 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) diff --git a/tests/misc_testsuite/relaxed-simd/relaxed_dot_product.wast b/tests/misc_testsuite/relaxed-simd/relaxed_dot_product.wast new file mode 100644 index 000000000000..41dee0afcfa4 --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/relaxed_dot_product.wast @@ -0,0 +1,106 @@ +;; Tests for relaxed dot products. + +(module + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)) + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))) +) + +;; Simple values to ensure things are functional.
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (v128.const i16x8 1 13 41 85 145 221 313 421)) + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -32512 32258 0 0 0 0 0 0)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (either + (v128.const i16x8 -32768 0 0 0 0 0 0 0) + (v128.const i16x8 32512 0 0 0 0 0 0 0) + (v128.const i16x8 33024 0 0 0 0 0 0 0))) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i32x4 0 1 2 3)) + ;; intermediate result is [14, 126, 366, 734] + (v128.const i32x4 14 127 368 737)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -65023 64518 3 4)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (either + (v128.const i32x4 -66047 2 3 4) + (v128.const i32x4 -65535 2 3 4) + (v128.const i32x4 65025 2 3 4) + (v128.const i32x4 66049 2 3 4))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+ +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/tests/misc_testsuite/relaxed-simd/relaxed_laneselect.wast b/tests/misc_testsuite/relaxed-simd/relaxed_laneselect.wast new file mode 100644 index 000000000000..4ea6eb818462 --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/relaxed_laneselect.wast @@ -0,0 +1,92 @@ +;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect,
i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect. + +(module + (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i64x2.eq + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) +) + +(assert_return (invoke "i8x16.relaxed_laneselect" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (either (v128.const i8x16 0 17 0x14 0x32 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0 17 
0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31))) + +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15))) + +(assert_return (invoke "i32x4.relaxed_laneselect" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (either (v128.const i32x4 0 5 0x12345678 0x56781234) + (v128.const i32x4 0 5 0x12341234 0x56785678))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (either (v128.const i64x2 0 3) + (v128.const i64x2 0 3))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (either (v128.const i64x2 0x1234123456785678 0x5678567812341234) + (v128.const i64x2 0x1234123412341234 0x5678567856785678))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +(assert_return (invoke "i8x16.relaxed_laneselect_cmp" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i16x8.relaxed_laneselect_cmp" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_laneselect_cmp" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (v128.const i64x2 -1 -1)) diff --git a/tests/misc_testsuite/relaxed-simd/relaxed_madd_nmadd.wast b/tests/misc_testsuite/relaxed-simd/relaxed_madd_nmadd.wast new file mode 100644 index 000000000000..0e0e0c2bfc4f --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/relaxed_madd_nmadd.wast @@ -0,0 +1,190 @@ +;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd. Each `either` accepts the fused (one rounding) or the unfused (two roundings) result. 
+ +(module + (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + + (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) +) + + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; inf (if no fma; FLT_MAX * 2 overflows first) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127) + (v128.const 
f32x4 inf inf inf inf))) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-37 (2^-37) +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; fnma tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; fnma tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) + +;; DBL_MAX = 0x1.fffffffffffffp+1023 +;; DBL_MAX * 2 - DBL_MAX == +;; DBL_MAX (if fma) +;; inf (if no fma; DBL_MAX * 2 overflows first) +;; (double-precision counterpart of the FLT_MAX case) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 2.0 2.0) + (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) + (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 inf inf))) + +;; Special values for double: +;; x = 0x1.00000004p+0 (1 + 2^-30) +;; y = 0x1.000002p+0 (1 + 2^-23) +;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0) +;; = -0x1.00000204p+0 +;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-53 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; fnma tests with negated x, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; fnma tests with negated y, same answers are expected. 
+(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) + +;; Check that multiple calls to the relaxed instruction with the same inputs return the same results. + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; inf (if no fma; FLT_MAX * 2 overflows first) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-37 (2^-37) +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; fnma tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; fnma tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; DBL_MAX = 0x1.fffffffffffffp+1023 +;; DBL_MAX * 2 - DBL_MAX == +;; DBL_MAX (if fma) +;; inf (if no fma; DBL_MAX * 2 overflows first) +;; (double-precision counterpart of the FLT_MAX case) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f64x2.relaxed_madd_cmp" + (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 2.0 2.0) + (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) + (v128.const i64x2 -1 -1)) + +;; Special values for double: +;; x = 0x1.00000004p+0 (1 + 2^-30) +;; y = 0x1.000002p+0 (1 + 2^-23) +;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0) +;; = -0x1.00000204p+0 +;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-53 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f64x2.relaxed_madd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) +;; fnma tests with negated x, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) +;; fnma tests with negated y, same answers are expected. 
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) diff --git a/tests/misc_testsuite/relaxed-simd/relaxed_min_max.wast b/tests/misc_testsuite/relaxed-simd/relaxed_min_max.wast new file mode 100644 index 000000000000..d8a04ba4df93 --- /dev/null +++ b/tests/misc_testsuite/relaxed-simd/relaxed_min_max.wast @@ -0,0 +1,183 @@ +;; Tests for f32x4.relaxed_min, f32x4.relaxed_max, f64x2.relaxed_min, and f64x2.relaxed_max. + +(module + (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1))) + (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1))) + + (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_min (local.get 0) (local.get 1)) + (f32x4.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_max (local.get 0) (local.get 1)) + (f32x4.relaxed_max (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_min (local.get 0) (local.get 1)) + (f64x2.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_max (local.get 0) (local.get 1)) + (f64x2.relaxed_max (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical 
nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 -0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 
-0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 +0.0 +0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +;; Check that multiple calls to the relaxed instruction with the same inputs return the same results. 
+ +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1))