diff --git a/build.rs b/build.rs index 4918c3466e43..18e4c80e7ca4 100644 --- a/build.rs +++ b/build.rs @@ -171,9 +171,9 @@ fn write_testsuite_tests( fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { match strategy { "Cranelift" => match (testsuite, testname) { - // No simd support yet for s390x. - ("simd", _) if platform_is_s390x() => return true, - _ if platform_is_s390x() && testname.starts_with("simd") => return true, + // FIXME: These tests fail under qemu due to a qemu bug. + (_, "simd_f32x4_pmin_pmax") if platform_is_s390x() => return true, + (_, "simd_f64x2_pmin_pmax") if platform_is_s390x() => return true, _ => {} }, _ => panic!("unrecognized strategy"), diff --git a/cranelift/codegen/src/data_value.rs b/cranelift/codegen/src/data_value.rs index 13aa23767499..e2b6d5aba395 100644 --- a/cranelift/codegen/src/data_value.rs +++ b/cranelift/codegen/src/data_value.rs @@ -89,7 +89,7 @@ impl DataValue { DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]), DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]), DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]), - DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]), + DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()), _ => unimplemented!(), }; } @@ -120,7 +120,7 @@ impl DataValue { DataValue::B(src[..size].iter().any(|&i| i != 0)) } _ if ty.is_vector() && ty.bytes() == 16 => { - DataValue::V128(src[..16].try_into().unwrap()) + DataValue::V128(u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes()) } _ => unimplemented!(), } diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs index 77dcc87e9421..52db56de435c 100644 --- a/cranelift/codegen/src/isa/s390x/abi.rs +++ b/cranelift/codegen/src/isa/s390x/abi.rs @@ -97,6 +97,10 @@ fn in_flt_reg(ty: Type) -> bool { } } +fn in_vec_reg(ty: Type) -> bool { + ty.is_vector() && ty.bits() == 128 +} + fn get_intreg_for_arg(idx: usize) -> Option { match idx { 0 => Some(regs::gpr(2)), @@ -118,6 +122,20 @@ fn get_fltreg_for_arg(idx: usize) -> Option { } } +fn get_vecreg_for_arg(idx: usize) -> Option { + match idx { + 0 => Some(regs::vr(24)), + 1 => Some(regs::vr(25)), + 2 => Some(regs::vr(26)), + 3 => Some(regs::vr(27)), + 4 => Some(regs::vr(28)), + 5 => Some(regs::vr(29)), + 6 => Some(regs::vr(30)), + 7 => Some(regs::vr(31)), + _ => None, + } +} + fn get_intreg_for_ret(idx: usize) -> Option { match idx { 0 => Some(regs::gpr(2)), @@ -140,6 +158,21 @@ fn get_fltreg_for_ret(idx: usize) -> Option { } } +fn get_vecreg_for_ret(idx: usize) -> Option { + match idx { + 0 => Some(regs::vr(24)), + // ABI extension to support multi-value returns: + 1 => Some(regs::vr(25)), + 2 => Some(regs::vr(26)), + 3 => Some(regs::vr(27)), + 4 => Some(regs::vr(28)), + 5 => Some(regs::vr(29)), + 6 => Some(regs::vr(30)), + 7 => Some(regs::vr(31)), + _ => None, + } +} + /// This is the limit for the size of argument and return-value areas on the /// stack. We place a reasonable limit here to avoid integer overflow issues /// with 32-bit arithmetic: for now, 128 MB. 
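The data_value.rs hunks above keep DataValue::V128 in little-endian byte order and round-trip through u128 when copying to or from a native-order slice, which byte-swaps the lane bytes on big-endian hosts such as s390x. A minimal standalone Rust sketch of that conversion (illustrative only, not taken from the patch; the helper names are made up):

fn v128_to_native(le: [u8; 16]) -> [u8; 16] {
    // Identity on little-endian hosts; reverses the byte order on big-endian hosts.
    u128::from_le_bytes(le).to_ne_bytes()
}

fn v128_from_native(ne: [u8; 16]) -> [u8; 16] {
    u128::from_ne_bytes(ne).to_le_bytes()
}

fn main() {
    let lanes: [u8; 16] = core::array::from_fn(|i| i as u8);
    // The round trip restores the original little-endian lane bytes on any host.
    assert_eq!(v128_from_native(v128_to_native(lanes)), lanes);
}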
@@ -182,6 +215,7 @@ impl ABIMachineSpec for S390xMachineDeps { ) -> CodegenResult<(Vec, i64, Option)> { let mut next_gpr = 0; let mut next_fpr = 0; + let mut next_vr = 0; let mut next_stack: u64 = 0; let mut ret = vec![]; @@ -206,8 +240,8 @@ impl ABIMachineSpec for S390xMachineDeps { let intreg = in_int_reg(param.value_type); let fltreg = in_flt_reg(param.value_type); - debug_assert!(intreg || fltreg); - debug_assert!(!(intreg && fltreg)); + let vecreg = in_vec_reg(param.value_type); + debug_assert!(intreg as i32 + fltreg as i32 + vecreg as i32 == 1); let (next_reg, candidate) = if intreg { let candidate = match args_or_rets { @@ -215,12 +249,18 @@ impl ABIMachineSpec for S390xMachineDeps { ArgsOrRets::Rets => get_intreg_for_ret(next_gpr), }; (&mut next_gpr, candidate) - } else { + } else if fltreg { let candidate = match args_or_rets { ArgsOrRets::Args => get_fltreg_for_arg(next_fpr), ArgsOrRets::Rets => get_fltreg_for_ret(next_fpr), }; (&mut next_fpr, candidate) + } else { + let candidate = match args_or_rets { + ArgsOrRets::Args => get_vecreg_for_arg(next_vr), + ArgsOrRets::Rets => get_vecreg_for_ret(next_vr), + }; + (&mut next_vr, candidate) }; // In the Wasmtime ABI only the first return value can be in a register. @@ -252,7 +292,8 @@ impl ABIMachineSpec for S390xMachineDeps { // Align the stack slot. debug_assert!(slot_size.is_power_of_two()); - next_stack = align_to(next_stack, slot_size); + let slot_align = std::cmp::min(slot_size, 8); + next_stack = align_to(next_stack, slot_align); // If the type is actually of smaller size (and the argument // was not extended), it is passed right-aligned. @@ -477,6 +518,13 @@ impl ABIMachineSpec for S390xMachineDeps { RegClass::Float => clobbered_fpr.push(reg), } } + // We need to save the link register in non-leaf functions. + // FIXME: This should be included in the clobber list to begin with, + // but isn't because we have excluded call instructions via the + // is_included_in_clobbers callback. + if outgoing_args_size > 0 { + clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14)))); + } let mut first_clobbered_gpr = 16; for reg in clobbered_gpr { @@ -534,13 +582,15 @@ impl ABIMachineSpec for S390xMachineDeps { // Save FPRs. for (i, reg) in clobbered_fpr.iter().enumerate() { - insts.push(Inst::FpuStore64 { + insts.push(Inst::VecStoreLane { + size: 64, rd: reg.to_reg().into(), mem: MemArg::reg_plus_off( stack_reg(), (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64, MemFlags::trusted(), ), + lane_imm: 0, }); if flags.unwind_info() { insts.push(Inst::Unwind { @@ -566,7 +616,14 @@ impl ABIMachineSpec for S390xMachineDeps { let mut insts = SmallVec::new(); // Collect clobbered registers. - let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers); + let (mut clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers); + // We need to restore the link register in non-leaf functions. + // FIXME: This should be included in the clobber list to begin with, + // but isn't because we have excluded call instructions via the + // is_included_in_clobbers callback. + if outgoing_args_size > 0 { + clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14)))); + } let mut first_clobbered_gpr = 16; for reg in clobbered_gpr { let enc = reg.to_reg().hw_enc(); @@ -578,13 +635,15 @@ impl ABIMachineSpec for S390xMachineDeps { // Restore FPRs. 
for (i, reg) in clobbered_fpr.iter().enumerate() { - insts.push(Inst::FpuLoad64 { + insts.push(Inst::VecLoadLaneUndef { + size: 64, rd: Writable::from_reg(reg.to_reg().into()), mem: MemArg::reg_plus_off( stack_reg(), (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64, MemFlags::trusted(), ), + lane_imm: 0, }); } @@ -639,7 +698,7 @@ impl ABIMachineSpec for S390xMachineDeps { // We allocate in terms of 8-byte slots. match rc { RegClass::Int => 1, - RegClass::Float => 1, + RegClass::Float => 2, } } @@ -739,6 +798,21 @@ const fn clobbers() -> PRegSet { .with(gpr_preg(3)) .with(gpr_preg(4)) .with(gpr_preg(5)) + // v0 - v7 inclusive and v16 - v31 inclusive are + // caller-saves. The upper 64 bits of v8 - v15 inclusive are + // also caller-saves. However, because we cannot currently + // represent partial registers to regalloc2, we indicate here + // that every vector register is caller-save. Because this + // function is used at *callsites*, approximating in this + // direction (save more than necessary) is conservative and + // thus safe. + // + // Note that we exclude clobbers from a call instruction when + // a call instruction's callee has the same ABI as the caller + // (the current function body); this is safe (anything + // clobbered by callee can be clobbered by caller as well) and + // avoids unnecessary saves of v8-v15 in the prologue even + // though we include them as defs here. .with(vr_preg(0)) .with(vr_preg(1)) .with(vr_preg(2)) @@ -747,6 +821,14 @@ const fn clobbers() -> PRegSet { .with(vr_preg(5)) .with(vr_preg(6)) .with(vr_preg(7)) + .with(vr_preg(8)) + .with(vr_preg(9)) + .with(vr_preg(10)) + .with(vr_preg(11)) + .with(vr_preg(12)) + .with(vr_preg(13)) + .with(vr_preg(14)) + .with(vr_preg(15)) .with(vr_preg(16)) .with(vr_preg(17)) .with(vr_preg(18)) diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 00868224c544..38c90e4711da 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -467,26 +467,6 @@ (cond Cond) (rm Reg)) - ;; A 32-bit move instruction from GPR to FPR or vector element. - (MovToFpr32 - (rd WritableReg) - (rn Reg)) - - ;; A 64-bit move instruction from GPR to FPR or vector element. - (MovToFpr64 - (rd WritableReg) - (rn Reg)) - - ;; A 32-bit move instruction from FPR or vector element to GPR. - (MovFromFpr32 - (rd WritableReg) - (rn Reg)) - - ;; A 64-bit move instruction from FPR or vector element to GPR. - (MovFromFpr64 - (rd WritableReg) - (rn Reg)) - ;; 1-op FPU instruction implemented as vector instruction with the W bit. (FpuRR (fpu_op FPUOp1) @@ -508,6 +488,13 @@ (rm Reg) (ra Reg)) + ;; 1-op FPU instruction with rounding mode. + (FpuRound + (op FpuRoundOp) + (mode FpuRoundMode) + (rd WritableReg) + (rn Reg)) + ;; FPU comparison, single-precision (32 bit). (FpuCmp32 (rn Reg) @@ -518,69 +505,255 @@ (rn Reg) (rm Reg)) - ;; Floating-point load, single-precision (32 bit). - (FpuLoad32 + ;; Load floating-point constant, single-precision (32 bit). + (LoadFpuConst32 (rd WritableReg) - (mem MemArg)) + (const_data u32)) - ;; Floating-point store, single-precision (32 bit). - (FpuStore32 - (rd Reg) - (mem MemArg)) + ;; Load floating-point constant, double-precision (64 bit). + (LoadFpuConst64 + (rd WritableReg) + (const_data u64)) - ;; Floating-point load, double-precision (64 bit). - (FpuLoad64 + ;; A binary vector operation with two vector register sources. 
+ (VecRRR + (op VecBinaryOp) (rd WritableReg) - (mem MemArg)) + (rn Reg) + (rm Reg)) - ;; Floating-point store, double-precision (64 bit). - (FpuStore64 - (rd Reg) + ;; A unary vector operation with a vector register source. + (VecRR + (op VecUnaryOp) + (rd WritableReg) + (rn Reg)) + + ;; Vector shift instruction with a register source, a register destination, + ;; and an immediate plus an optional register as shift count. + (VecShiftRR + (shift_op VecShiftOp) + (rd WritableReg) + (rn Reg) + (shift_imm u8) + (shift_reg Reg)) + + ;; Vector select instruction. + (VecSelect + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + + ;; Vector permute instruction. + (VecPermute + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + + ;; Vector permute doubleword immediate instruction. + (VecPermuteDWImm + (rd WritableReg) + (rn Reg) + (rm Reg) + (idx1 u8) + (idx2 u8)) + + ;; Vector integer comparison with two register sources and a register + ;; destination. + (VecIntCmp + (op VecIntCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Same, but also set the condition code. + (VecIntCmpS + (op VecIntCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Vector floating-point comparison with two register sources and a register + ;; destination. + (VecFloatCmp + (op VecFloatCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Same, but also set the condition code. + (VecFloatCmpS + (op VecFloatCmpOp) + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; 128-bit vector load instruction. + (VecLoad + (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed load, single-precision (32 bit). - (FpuLoadRev32 + ;; 128-bit byte-reversed vector load instruction. + (VecLoadRev (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed store, single-precision (32 bit). - (FpuStoreRev32 + ;; 128-bit vector store instruction. + (VecStore + (rd Reg) + (mem MemArg)) + + ;; 128-bit byte-reversed vector store instruction. + (VecStoreRev (rd Reg) (mem MemArg)) - ;; Floating-point byte-reversed load, double-precision (64 bit). - (FpuLoadRev64 + ;; 128-bit vector load replicated element instruction. + (VecLoadReplicate + (size u32) (rd WritableReg) (mem MemArg)) - ;; Floating-point byte-reversed store, double-precision (64 bit). - (FpuStoreRev64 - (rd Reg) + ;; 128-bit byte-reversed vector load replicated element instruction. + (VecLoadReplicateRev + (size u32) + (rd WritableReg) (mem MemArg)) - ;; Load floating-point constant, single-precision (32 bit). - (LoadFpuConst32 + ;; Vector move instruction. + (VecMov (rd WritableReg) - (const_data u32)) + (rn Reg)) - ;; Load floating-point constant, double-precision (64 bit). - (LoadFpuConst64 + ;; Conditional vector move instruction. + (VecCMov + (rd WritableReg) + (cond Cond) + (rm Reg)) + + ;; A 128-bit move instruction from two GPRs to a VR. + (MovToVec128 + (rd WritableReg) + (rn Reg) + (rm Reg)) + + ;; Load 128-bit (big-endian) vector constant. + (VecLoadConst + (rd WritableReg) + (const_data u128)) + + ;; Load 128-bit (big-endian) replicated vector constant. + (VecLoadConstReplicate + (size u32) (rd WritableReg) (const_data u64)) - ;; 1-op FPU instruction with rounding mode. - (FpuRound - (op FpuRoundOp) - (mode FpuRoundMode) + ;; Load vector immediate generated via byte mask. + (VecImmByteMask (rd WritableReg) - (rn Reg)) + (mask u16)) - ;; Vector select instruction. - (VecSelect + ;; Load vector replicated contiguous bit mask. + (VecImmBitMask + (size u32) + (rd WritableReg) + (start_bit u8) + (end_bit u8)) + + ;; Load vector replicated immediate. 
+ (VecImmReplicate + (size u32) + (rd WritableReg) + (imm i16)) + + ;; Vector lane insertion with an in/out VR, a memory source, + ;; and an immediate as lane index. + (VecLoadLane + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Same as VecLoadLane, but allow undefined input VR. + (VecLoadLaneUndef + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Byte-reversed vector lane insertion with an in/out VR, a memory source, + ;; and an immediate as lane index. + (VecLoadLaneRev + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Same as VecLoadLaneRev, but allow undefined input VR. + (VecLoadLaneRevUndef + (size u32) + (rd WritableReg) + (mem MemArg) + (lane_imm u8)) + + ;; Vector lane extraction with a memory destination, a VR source, + ;; and an immediate as lane index. + (VecStoreLane + (size u32) + (rd Reg) + (mem MemArg) + (lane_imm u8)) + + ;; Byte-reversed vector lane extraction with a memory destination, a VR source, + ;; and an immediate as lane index. + (VecStoreLaneRev + (size u32) + (rd Reg) + (mem MemArg) + (lane_imm u8)) + + ;; Vector lane insertion with an in/out VR, a GPR source, + ;; and an immediate plus an optional register as lane index. + (VecInsertLane + (size u32) (rd WritableReg) (rn Reg) - (rm Reg) - (ra Reg)) + (lane_imm u8) + (lane_reg Reg)) + + ;; Same as VecInsertLane, but allow undefined input VR. + (VecInsertLaneUndef + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8) + (lane_reg Reg)) + + ;; Vector lane extraction with a VR source, a GPR destination, + ;; and an immediate plus an optional register as lane index. + (VecExtractLane + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8) + (lane_reg Reg)) + + ;; Vector lane insertion with an in/out VR, an immediate source, + ;; and an immediate as lane index. + (VecInsertLaneImm + (size u32) + (rd WritableReg) + (imm i16) + (lane_imm u8)) + + ;; Vector lane replication with a VR source, a VR destination, + ;; and an immediate as lane index. + (VecReplicateLane + (size u32) + (rd WritableReg) + (rn Reg) + (lane_imm u8)) ;; A machine call instruction. (Call @@ -807,18 +980,208 @@ (CmpL64Ext32) )) +;; A binary vector operation. 
+(type VecBinaryOp + (enum + ;; Addition and subtraction + (Add8x16) + (Add16x8) + (Add32x4) + (Add64x2) + (Sub8x16) + (Sub16x8) + (Sub32x4) + (Sub64x2) + ;; Multiplication (64-bit not supported) + (Mul8x16) + (Mul16x8) + (Mul32x4) + (UMulHi8x16) + (UMulHi16x8) + (UMulHi32x4) + (SMulHi8x16) + (SMulHi16x8) + (SMulHi32x4) + (UMulEven8x16) + (UMulEven16x8) + (UMulEven32x4) + (SMulEven8x16) + (SMulEven16x8) + (SMulEven32x4) + (UMulOdd8x16) + (UMulOdd16x8) + (UMulOdd32x4) + (SMulOdd8x16) + (SMulOdd16x8) + (SMulOdd32x4) + ;; Minimum, maximum, and average + (UMax8x16) + (UMax16x8) + (UMax32x4) + (UMax64x2) + (SMax8x16) + (SMax16x8) + (SMax32x4) + (SMax64x2) + (UMin8x16) + (UMin16x8) + (UMin32x4) + (UMin64x2) + (SMin8x16) + (SMin16x8) + (SMin32x4) + (SMin64x2) + (UAvg8x16) + (UAvg16x8) + (UAvg32x4) + (UAvg64x2) + (SAvg8x16) + (SAvg16x8) + (SAvg32x4) + (SAvg64x2) + ;; Bitwise operations + (And128) + (Orr128) + (Xor128) + (NotAnd128) + (NotOrr128) + (NotXor128) + (AndNot128) + (OrrNot128) + ;; Bit permute + (BitPermute128) + ;; Full vector shift operations + (LShLByByte128) + (LShRByByte128) + (AShRByByte128) + (LShLByBit128) + (LShRByBit128) + (AShRByBit128) + ;; Pack + (Pack16x8) + (Pack32x4) + (Pack64x2) + ;; Pack saturate (unsigned) + (PackUSat16x8) + (PackUSat32x4) + (PackUSat64x2) + ;; Pack saturate (signed) + (PackSSat16x8) + (PackSSat32x4) + (PackSSat64x2) + ;; Merge + (MergeLow8x16) + (MergeLow16x8) + (MergeLow32x4) + (MergeLow64x2) + (MergeHigh8x16) + (MergeHigh16x8) + (MergeHigh32x4) + (MergeHigh64x2) +)) + +;; A vector unary operation. +(type VecUnaryOp + (enum + ;; Sign operations + (Abs8x16) + (Abs16x8) + (Abs32x4) + (Abs64x2) + (Neg8x16) + (Neg16x8) + (Neg32x4) + (Neg64x2) + ;; Population count + (Popcnt8x16) + (Popcnt16x8) + (Popcnt32x4) + (Popcnt64x2) + ;; Unpack + (UnpackULow8x16) + (UnpackULow16x8) + (UnpackULow32x4) + (UnpackUHigh8x16) + (UnpackUHigh16x8) + (UnpackUHigh32x4) + (UnpackSLow8x16) + (UnpackSLow16x8) + (UnpackSLow32x4) + (UnpackSHigh8x16) + (UnpackSHigh16x8) + (UnpackSHigh32x4) +)) + +;; A vector shift operation. +(type VecShiftOp + (enum + (RotL8x16) + (RotL16x8) + (RotL32x4) + (RotL64x2) + (LShL8x16) + (LShL16x8) + (LShL32x4) + (LShL64x2) + (LShR8x16) + (LShR16x8) + (LShR32x4) + (LShR64x2) + (AShR8x16) + (AShR16x8) + (AShR32x4) + (AShR64x2) +)) + +;; An integer vector comparison operation. +(type VecIntCmpOp + (enum + (CmpEq8x16) + (CmpEq16x8) + (CmpEq32x4) + (CmpEq64x2) + (SCmpHi8x16) + (SCmpHi16x8) + (SCmpHi32x4) + (SCmpHi64x2) + (UCmpHi8x16) + (UCmpHi16x8) + (UCmpHi32x4) + (UCmpHi64x2) +)) + +;; A floating-point vector comparison operation. +(type VecFloatCmpOp + (enum + (CmpEq32x4) + (CmpEq64x2) + (CmpHi32x4) + (CmpHi64x2) + (CmpHiEq32x4) + (CmpHiEq64x2) +)) + ;; A floating-point unit (FPU) operation with one arg. (type FPUOp1 (enum (Abs32) (Abs64) + (Abs32x4) + (Abs64x2) (Neg32) (Neg64) + (Neg32x4) + (Neg64x2) (NegAbs32) (NegAbs64) + (NegAbs32x4) + (NegAbs64x2) (Sqrt32) (Sqrt64) + (Sqrt32x4) + (Sqrt64x2) (Cvt32To64) + (Cvt32x4To64x2) )) ;; A floating-point unit (FPU) operation with two args. @@ -826,16 +1189,36 @@ (enum (Add32) (Add64) + (Add32x4) + (Add64x2) (Sub32) (Sub64) + (Sub32x4) + (Sub64x2) (Mul32) (Mul64) + (Mul32x4) + (Mul64x2) (Div32) (Div64) + (Div32x4) + (Div64x2) (Max32) (Max64) + (Max32x4) + (Max64x2) (Min32) (Min64) + (Min32x4) + (Min64x2) + (MaxPseudo32) + (MaxPseudo64) + (MaxPseudo32x4) + (MaxPseudo64x2) + (MinPseudo32) + (MinPseudo64) + (MinPseudo32x4) + (MinPseudo64x2) )) ;; A floating-point unit (FPU) operation with three args. 
@@ -843,24 +1226,39 @@ (enum (MAdd32) (MAdd64) + (MAdd32x4) + (MAdd64x2) (MSub32) (MSub64) + (MSub32x4) + (MSub64x2) )) ;; A floating-point unit (FPU) operation with one arg, and rounding mode. (type FpuRoundOp (enum (Cvt64To32) + (Cvt64x2To32x4) (Round32) (Round64) + (Round32x4) + (Round64x2) (ToSInt32) (ToSInt64) (ToUInt32) (ToUInt64) + (ToSInt32x4) + (ToSInt64x2) + (ToUInt32x4) + (ToUInt64x2) (FromSInt32) (FromSInt64) (FromUInt32) (FromUInt64) + (FromSInt32x4) + (FromSInt64x2) + (FromUInt32x4) + (FromUInt64x2) )) ;; Rounding modes for floating-point ops. @@ -949,6 +1347,55 @@ (decl u64_as_i16 (u64) i16) (extern constructor u64_as_i16 u64_as_i16) +;; Construct and extract immediate vector constants. + +(decl u64_pair (u64 u64) u128) +(extern constructor u64_pair u64_pair_concat) +(extern extractor infallible u64_pair u64_pair_split) + +(decl u32_pair (u32 u32) u64) +(extern constructor u32_pair u32_pair_concat) +(extern extractor infallible u32_pair u32_pair_split) + +(decl u16_pair (u16 u16) u32) +(extern constructor u16_pair u16_pair_concat) +(extern extractor infallible u16_pair u16_pair_split) + +(decl u8_pair (u8 u8) u16) +(extern constructor u8_pair u8_pair_concat) +(extern extractor infallible u8_pair u8_pair_split) + +(decl imm8x16 (u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8 u8) u128) +(extractor (imm8x16 a b c d e f g h i j k l m n o p) + (u64_pair (u32_pair (u16_pair (u8_pair a b) (u8_pair c d)) + (u16_pair (u8_pair e f) (u8_pair g h))) + (u32_pair (u16_pair (u8_pair i j) (u8_pair k l)) + (u16_pair (u8_pair m n) (u8_pair o p))))) +(rule (imm8x16 a b c d e f g h i j k l m n o p) + (u64_pair (u32_pair (u16_pair (u8_pair a b) (u8_pair c d)) + (u16_pair (u8_pair e f) (u8_pair g h))) + (u32_pair (u16_pair (u8_pair i j) (u8_pair k l)) + (u16_pair (u8_pair m n) (u8_pair o p))))) + +;; Convert a little-endian lane index to a big-endian lane index. + +(decl be_lane_idx (Type u8) u8) +(extern constructor be_lane_idx be_lane_idx) + +;; Construct a VGBM mask to set all bits in one lane of a vector. + +(decl lane_byte_mask (Type u8) u16) +(extern constructor lane_byte_mask lane_byte_mask) + +;; Extract "permute" and "and" masks from a shuffle constant + +(decl shuffle_mask_from_u128 (u128 u16) u128) +(extern extractor infallible shuffle_mask_from_u128 shuffle_mask_from_u128) + +(decl shuffle_mask (u128 u16) Immediate) +(extractor (shuffle_mask permute_mask and_mask) + (u128_from_immediate (shuffle_mask_from_u128 permute_mask and_mask))) + ;; Split an u64 into high and low parts. 
(decl u64_nonzero_hipart (u64) u64) @@ -965,6 +1412,9 @@ (decl i16_from_u64 (i16) u64) (extern extractor i16_from_u64 i16_from_u64) +(decl i16_from_u32 (i16) u32) +(extern extractor i16_from_u32 i16_from_u32) + (decl uimm32shifted_from_u64 (UImm32Shifted) u64) (extern extractor uimm32shifted_from_u64 uimm32shifted_from_u64) @@ -985,6 +1435,9 @@ (decl u64_from_signed_value (u64) Value) (extern extractor u64_from_signed_value u64_from_signed_value) +(decl u64_from_inverted_value (u64) Value) +(extern extractor u64_from_inverted_value u64_from_inverted_value) + (decl i64_from_value (i64) Value) (extern extractor i64_from_value i64_from_value) @@ -1097,10 +1550,10 @@ (type MemArg extern (enum)) -(decl memarg_reg_plus_reg (Reg Reg MemFlags) MemArg) +(decl memarg_reg_plus_reg (Reg Reg u8 MemFlags) MemArg) (extern constructor memarg_reg_plus_reg memarg_reg_plus_reg) -(decl memarg_reg_plus_off (Reg i64 MemFlags) MemArg) +(decl memarg_reg_plus_off (Reg i64 u8 MemFlags) MemArg) (extern constructor memarg_reg_plus_off memarg_reg_plus_off) (decl memarg_symbol (ExternalName i32 MemFlags) MemArg) @@ -1126,10 +1579,10 @@ (decl lower_address (MemFlags Value Offset32) MemArg) (rule (lower_address flags addr (i64_from_offset offset)) - (memarg_reg_plus_off addr offset flags)) + (memarg_reg_plus_off addr offset 0 flags)) (rule (lower_address flags (iadd x y) (i64_from_offset 0)) - (memarg_reg_plus_reg x y flags)) + (memarg_reg_plus_reg x y 0 flags)) (rule (lower_address flags (symbol_value (symbol_value_data name (reloc_distance_near) sym_offset)) @@ -1138,6 +1591,17 @@ (memarg_symbol name final_offset flags)) +;; Lower an address plus a small bias into a `MemArg`. + +(decl lower_address_bias (MemFlags Value Offset32 u8) MemArg) + +(rule (lower_address_bias flags addr (i64_from_offset offset) bias) + (memarg_reg_plus_off addr offset bias flags)) + +(rule (lower_address_bias flags (iadd x y) (i64_from_offset 0) bias) + (memarg_reg_plus_reg x y bias flags)) + + ;; Test whether a `load` address will be lowered to a `MemArg::Symbol`. (decl pure load_sym (Inst) Inst) @@ -1206,6 +1670,11 @@ (extractor (sinkable_load_16 inst) (and (value_type $I16) (sinkable_load inst))) +;; Sinkable little-endian load instruction. +(decl sinkable_load_little (Inst) Value) +(extractor (sinkable_load_little inst) + (sinkable_inst (and inst (load (littleendian) _addr _offset)))) + ;; Sinkable big-endian sload16 instruction. (decl sinkable_sload16 (Inst) Value) (extractor (sinkable_sload16 inst) @@ -1615,87 +2084,225 @@ (_ Unit (emit (MInst.FpuRound op mode dst src)))) dst)) -;; Helper for emitting `MInst.MovToFpr32` instructions. -(decl mov_to_fpr32 (Reg) Reg) -(rule (mov_to_fpr32 src) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.MovToFpr32 dst src)))) +;; Helper for emitting `MInst.VecRRR` instructions. +(decl vec_rrr (Type VecBinaryOp Reg Reg) Reg) +(rule (vec_rrr ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecRRR op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.MovToFpr64` instructions. -(decl mov_to_fpr64 (Reg) Reg) -(rule (mov_to_fpr64 src) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.MovToFpr64 dst src)))) +;; Helper for emitting `MInst.VecRR` instructions. +(decl vec_rr (Type VecUnaryOp Reg) Reg) +(rule (vec_rr ty op src) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecRR op dst src)))) dst)) -;; Helper for emitting `MInst.MovFromFpr32` instructions. 
-(decl mov_from_fpr32 (Reg) Reg) -(rule (mov_from_fpr32 src) - (let ((dst WritableReg (temp_writable_reg $I32)) - (_ Unit (emit (MInst.MovFromFpr32 dst src)))) +;; Helper for emitting `MInst.VecShiftRR` instructions. +(decl vec_shift_rr (Type VecShiftOp Reg u8 Reg) Reg) +(rule (vec_shift_rr ty op src shift_imm shift_reg) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecShiftRR op dst src shift_imm shift_reg)))) dst)) -;; Helper for emitting `MInst.MovFromFpr64` instructions. -(decl mov_from_fpr64 (Reg) Reg) -(rule (mov_from_fpr64 src) - (let ((dst WritableReg (temp_writable_reg $I64)) - (_ Unit (emit (MInst.MovFromFpr64 dst src)))) +;; Helper for emitting `MInst.VecSelect` instructions. +(decl vec_select (Type Reg Reg Reg) Reg) +(rule (vec_select ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecSelect dst src1 src2 src3)))) + dst)) + +;; Helper for emitting `MInst.VecPermute` instructions. +(decl vec_permute (Type Reg Reg Reg) Reg) +(rule (vec_permute ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) + dst)) + +;; Helper for emitting `MInst.VecPermuteDWImm` instructions. +(decl vec_permute_dw_imm (Type Reg u8 Reg u8) Reg) +(rule (vec_permute_dw_imm ty src1 idx1 src2 idx2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecPermuteDWImm dst src1 src2 idx1 idx2)))) dst)) -;; Helper for emitting `MInst.FpuLoad32` instructions. -(decl fpu_load32 (MemArg) Reg) -(rule (fpu_load32 addr) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.FpuLoad32 dst addr)))) +;; Helper for emitting `MInst.VecIntCmp` instructions. +(decl vec_int_cmp (Type VecIntCmpOp Reg Reg) Reg) +(rule (vec_int_cmp ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecIntCmp op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.FpuLoad64` instructions. -(decl fpu_load64 (MemArg) Reg) -(rule (fpu_load64 addr) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.FpuLoad64 dst addr)))) +;; Helper for emitting `MInst.VecIntCmpS` instructions. +(decl vec_int_cmps (Type VecIntCmpOp Reg Reg) ProducesFlags) +(rule (vec_int_cmps ty op src1 src2) + (let ((tmp WritableReg (temp_writable_reg ty))) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecIntCmpS op tmp src1 src2)))) + +;; Helper for emitting `MInst.VecFloatCmp` instructions. +(decl vec_float_cmp (Type VecFloatCmpOp Reg Reg) Reg) +(rule (vec_float_cmp ty op src1 src2) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecFloatCmp op dst src1 src2)))) dst)) -;; Helper for emitting `MInst.FpuLoadRev32` instructions. -(decl fpu_loadrev32 (MemArg) Reg) -(rule (fpu_loadrev32 addr) - (let ((dst WritableReg (temp_writable_reg $F32)) - (_ Unit (emit (MInst.FpuLoadRev32 dst addr)))) +;; Helper for emitting `MInst.VecFloatCmpS` instructions. +(decl vec_float_cmps (Type VecFloatCmpOp Reg Reg) ProducesFlags) +(rule (vec_float_cmps ty op src1 src2) + (let ((tmp WritableReg (temp_writable_reg ty))) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecFloatCmpS op tmp src1 src2)))) + +;; Helper for emitting `MInst.VecLoad` instructions. +(decl vec_load (Type MemArg) Reg) +(rule (vec_load ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoad dst addr)))) dst)) -;; Helper for emitting `MInst.FpuLoadRev64` instructions. 
-(decl fpu_loadrev64 (MemArg) Reg) -(rule (fpu_loadrev64 addr) - (let ((dst WritableReg (temp_writable_reg $F64)) - (_ Unit (emit (MInst.FpuLoadRev64 dst addr)))) +;; Helper for emitting `MInst.VecLoadRev` instructions. +(decl vec_loadrev (Type MemArg) Reg) +(rule (vec_loadrev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadRev dst addr)))) dst)) -;; Helper for emitting `MInst.FpuStore32` instructions. -(decl fpu_store32 (Reg MemArg) SideEffectNoResult) -(rule (fpu_store32 src addr) - (SideEffectNoResult.Inst (MInst.FpuStore32 src addr))) +;; Helper for emitting `MInst.VecStore` instructions. +(decl vec_store (Reg MemArg) SideEffectNoResult) +(rule (vec_store src addr) + (SideEffectNoResult.Inst (MInst.VecStore src addr))) -;; Helper for emitting `MInst.FpuStore64` instructions. -(decl fpu_store64 (Reg MemArg) SideEffectNoResult) -(rule (fpu_store64 src addr) - (SideEffectNoResult.Inst (MInst.FpuStore64 src addr))) +;; Helper for emitting `MInst.VecStoreRev` instructions. +(decl vec_storerev (Reg MemArg) SideEffectNoResult) +(rule (vec_storerev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreRev src addr))) -;; Helper for emitting `MInst.FpuStoreRev32` instructions. -(decl fpu_storerev32 (Reg MemArg) SideEffectNoResult) -(rule (fpu_storerev32 src addr) - (SideEffectNoResult.Inst (MInst.FpuStoreRev32 src addr))) +;; Helper for emitting `MInst.VecLoadReplicate` instructions. +(decl vec_load_replicate (Type MemArg) Reg) +(rule (vec_load_replicate (ty_vec128 ty @ (multi_lane size _)) addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadReplicate size dst addr)))) + dst)) -;; Helper for emitting `MInst.FpuStoreRev64` instructions. -(decl fpu_storerev64 (Reg MemArg) SideEffectNoResult) -(rule (fpu_storerev64 src addr) - (SideEffectNoResult.Inst (MInst.FpuStoreRev64 src addr))) +;; Helper for emitting `MInst.VecLoadReplicateRev` instructions. +(decl vec_load_replicate_rev (Type MemArg) Reg) +(rule (vec_load_replicate_rev (ty_vec128 ty @ (multi_lane size _)) addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadReplicateRev size dst addr)))) + dst)) -;; Helper for emitting `MInst.VecSelect` instructions. -(decl vec_select (Type Reg Reg Reg) Reg) -(rule (vec_select ty src1 src2 src3) +;; Helper for emitting `MInst.MovToVec128` instructions. +(decl mov_to_vec128 (Type Reg Reg) Reg) +(rule (mov_to_vec128 ty src1 src2) (let ((dst WritableReg (temp_writable_reg ty)) - (_ Unit (emit (MInst.VecSelect dst src1 src2 src3)))) + (_ Unit (emit (MInst.MovToVec128 dst src1 src2)))) + dst)) + +;; Helper for emitting `MInst.VecLoadConst` instructions. +(decl vec_load_const (Type u128) Reg) +(rule (vec_load_const (ty_vec128 ty) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadConst dst n)))) + dst)) + +;; Helper for emitting `MInst.VecLoadConstReplicate` instructions. +(decl vec_load_const_replicate (Type u64) Reg) +(rule (vec_load_const_replicate ty @ (multi_lane size _) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadConstReplicate size dst n)))) + dst)) + +;; Helper for emitting `MInst.VecImmByteMask` instructions. +(decl vec_imm_byte_mask (Type u16) Reg) +(rule (vec_imm_byte_mask (ty_vec128 ty) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmByteMask dst n)))) + dst)) + +;; Helper for emitting `MInst.VecImmBitMask` instructions. 
+(decl vec_imm_bit_mask (Type u8 u8) Reg) +(rule (vec_imm_bit_mask (ty_vec128 ty @ (multi_lane size _)) start_bit end_bit) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmBitMask size dst start_bit end_bit)))) + dst)) + +;; Helper for emitting `MInst.VecImmReplicate` instructions. +(decl vec_imm_replicate (Type i16) Reg) +(rule (vec_imm_replicate (ty_vec128 ty @ (multi_lane size _)) n) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecImmReplicate size dst n)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLane` instructions. +(decl vec_load_lane (Type Reg MemArg u8) Reg) +(rule (vec_load_lane ty @ (multi_lane size _) src addr lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecLoadLane size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneUndef` instructions. +(decl vec_load_lane_undef (Type MemArg u8) Reg) +(rule (vec_load_lane_undef ty @ (multi_lane size _) addr lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadLaneUndef size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneRev` instructions. +(decl vec_load_lane_rev (Type Reg MemArg u8) Reg) +(rule (vec_load_lane_rev ty @ (multi_lane size _) src addr lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecLoadLaneRev size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecLoadLaneRevUndef` instructions. +(decl vec_load_lane_rev_undef (Type MemArg u8) Reg) +(rule (vec_load_lane_rev_undef ty @ (multi_lane size _) addr lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadLaneRevUndef size dst addr lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecStoreLane` instructions. +(decl vec_store_lane (Type Reg MemArg u8) SideEffectNoResult) +(rule (vec_store_lane ty @ (multi_lane size _) src addr lane_imm) + (SideEffectNoResult.Inst (MInst.VecStoreLane size src addr lane_imm))) + +;; Helper for emitting `MInst.VecStoreLaneRev` instructions. +(decl vec_store_lane_rev (Type Reg MemArg u8) SideEffectNoResult) +(rule (vec_store_lane_rev ty @ (multi_lane size _) src addr lane_imm) + (SideEffectNoResult.Inst (MInst.VecStoreLaneRev size src addr lane_imm))) + +;; Helper for emitting `MInst.VecInsertLane` instructions. +(decl vec_insert_lane (Type Reg Reg u8 Reg) Reg) +(rule (vec_insert_lane ty @ (multi_lane size _) src1 src2 lane_imm lane_reg) + (let ((dst WritableReg (copy_writable_reg ty src1)) + (_ Unit (emit (MInst.VecInsertLane size dst src2 lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecInsertLaneUndef` instructions. +(decl vec_insert_lane_undef (Type Reg u8 Reg) Reg) +(rule (vec_insert_lane_undef ty @ (multi_lane size _) src lane_imm lane_reg) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecInsertLaneUndef size dst src lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecExtractLane` instructions. +(decl vec_extract_lane (Type Reg u8 Reg) Reg) +(rule (vec_extract_lane (multi_lane size _) src lane_imm lane_reg) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.VecExtractLane size dst src lane_imm lane_reg)))) + dst)) + +;; Helper for emitting `MInst.VecInsertLaneImm` instructions. 
+(decl vec_insert_lane_imm (Type Reg i16 u8) Reg) +(rule (vec_insert_lane_imm ty @ (multi_lane size _) src imm lane_imm) + (let ((dst WritableReg (copy_writable_reg ty src)) + (_ Unit (emit (MInst.VecInsertLaneImm size dst imm lane_imm)))) + dst)) + +;; Helper for emitting `MInst.VecReplicateLane` instructions. +(decl vec_replicate_lane (Type Reg u8) Reg) +(rule (vec_replicate_lane ty @ (multi_lane size _) src lane_imm) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecReplicateLane size dst src lane_imm)))) dst)) ;; Helper for emitting `MInst.LoadExtNameFar` instructions. @@ -1858,6 +2465,9 @@ (rule (emit_mov $F64 dst src) (emit (MInst.FpuMove64 dst src))) +(rule (emit_mov (ty_vec128 ty) dst src) + (emit (MInst.VecMov dst src))) + ;; Allocate a temporary (writable) register, initialized as a copy of the input. (decl copy_writable_reg (Type Reg) WritableReg) (rule (copy_writable_reg ty src) @@ -1888,8 +2498,12 @@ (rule (emit_arg_store $I32 reg mem) (emit_side_effect (store32 reg mem))) (rule (emit_arg_store $I64 reg mem) (emit_side_effect (store64 reg mem))) (rule (emit_arg_store $R64 reg mem) (emit_side_effect (store64 reg mem))) -(rule (emit_arg_store $F32 reg mem) (emit_side_effect (fpu_store32 reg mem))) -(rule (emit_arg_store $F64 reg mem) (emit_side_effect (fpu_store64 reg mem))) +(rule (emit_arg_store $F32 reg mem) + (emit_side_effect (vec_store_lane $F32X4 reg mem 0))) +(rule (emit_arg_store $F64 reg mem) + (emit_side_effect (vec_store_lane $F64X2 reg mem 0))) +(rule (emit_arg_store (ty_vec128 ty) reg mem) + (emit_side_effect (vec_store reg mem))) (decl emit_arg_load (Type MemArg) Reg) (rule (emit_arg_load $I8 mem) (zext32_mem $I8 mem)) @@ -1897,8 +2511,9 @@ (rule (emit_arg_load $I32 mem) (load32 mem)) (rule (emit_arg_load $I64 mem) (load64 mem)) (rule (emit_arg_load $R64 mem) (load64 mem)) -(rule (emit_arg_load $F32 mem) (fpu_load64 mem)) -(rule (emit_arg_load $F64 mem) (fpu_load64 mem)) +(rule (emit_arg_load $F32 mem) (vec_load_lane_undef $F32X4 mem 0)) +(rule (emit_arg_load $F64 mem) (vec_load_lane_undef $F64X2 mem 0)) +(rule (emit_arg_load (ty_vec128 ty) mem) (vec_load ty mem)) ;; Copy a single argument/return value to its slots. (decl copy_to_arg (i64 ABIArg Value) Unit) @@ -2026,6 +2641,36 @@ (_ Unit (emit (MInst.Mov64SImm32 dst n)))) (writable_reg_to_reg dst))) +;; Allocate a temporary register, initialized with a vector immediate. +(decl vec_imm (Type u128) Reg) +(rule (vec_imm (ty_vec128 ty) 0) + (vec_imm_byte_mask ty 0)) +(rule (vec_imm (ty_vec128 ty) (u64_pair n n)) + (vec_imm_splat $I64X2 n)) +(rule (vec_imm (ty_vec128 ty) n) + (vec_load_const ty n)) + +;; Variant with replicated immediate. 
+(decl vec_imm_splat (Type u64) Reg) +(rule (vec_imm_splat (ty_vec128 ty) 0) + (vec_imm_byte_mask ty 0)) +(rule (vec_imm_splat ty @ (multi_lane 8 _) n) + (vec_imm_replicate ty (u64_as_i16 n))) +(rule (vec_imm_splat ty @ (multi_lane 16 _) n) + (vec_imm_replicate ty (u64_as_i16 n))) +(rule (vec_imm_splat ty @ (multi_lane 32 _) (u32_pair _ (i16_from_u32 n))) + (vec_imm_replicate ty n)) +(rule (vec_imm_splat ty @ (multi_lane 64 _) (i16_from_u64 n)) + (vec_imm_replicate ty n)) +(rule (vec_imm_splat (multi_lane 16 _) (u32_pair _ (u16_pair _ (u8_pair n n)))) + (vec_imm_splat $I8X16 (u8_as_u64 n))) +(rule (vec_imm_splat (multi_lane 32 _) (u32_pair _ (u16_pair n n))) + (vec_imm_splat $I16X8 (u16_as_u64 n))) +(rule (vec_imm_splat (multi_lane 64 _) (u32_pair n n)) + (vec_imm_splat $I32X4 (u32_as_u64 n))) +(rule (vec_imm_splat (ty_vec128 ty) n) + (vec_load_const_replicate ty n)) + ;; Place an immediate into the low half of a register pair. ;; The high half is taken from the input. (decl imm_regpair_lo (Type u64 RegPair) RegPair) @@ -2337,6 +2982,10 @@ (rule (emit_cmov_reg $F64 dst cond src) (ConsumesFlags.ConsumesFlagsReturnsReg (MInst.FpuCMov64 dst cond src) dst)) +(rule (emit_cmov_reg (ty_vec128 ty) dst cond src) + (ConsumesFlags.ConsumesFlagsReturnsReg (MInst.VecCMov dst cond src) + dst)) + ;; Conditionally select between two source registers. (decl cmov_reg (Type Cond Reg Reg) ConsumesFlags) @@ -2488,7 +3137,7 @@ (decl casloop_emit (VecMInstBuilder Type MemFlags Reg Reg) Reg) (rule (casloop_emit ib ty flags aligned_addr val) (let (;; Construct a memory argument for the aligned word. - (aligned_mem MemArg (memarg_reg_plus_off aligned_addr 0 flags)) + (aligned_mem MemArg (memarg_reg_plus_off aligned_addr 0 0 flags)) ;; Add the compare-and-swap instruction to the builder. 
(result Reg (push_atomic_cas ib (ty_ext32 ty) (casloop_val_reg) val aligned_mem)) @@ -2607,6 +3256,91 @@ (extern constructor abi_accumulate_outgoing_args_size abi_accumulate_outgoing_args_size) +;; Helpers for generating vector pack and unpack instructions ;;;;;;;;;;;;;;;;;; + +(decl vec_widen_type (Type) Type) +(rule (vec_widen_type $I8X16) $I16X8) +(rule (vec_widen_type $I16X8) $I32X4) +(rule (vec_widen_type $I32X4) $I64X2) + +(decl vecop_pack (Type) VecBinaryOp) +(rule (vecop_pack $I16X8) (VecBinaryOp.Pack16x8)) +(rule (vecop_pack $I32X4) (VecBinaryOp.Pack32x4)) +(rule (vecop_pack $I64X2) (VecBinaryOp.Pack64x2)) + +(decl vec_pack (Type Reg Reg) Reg) +(rule (vec_pack ty x y) (vec_rrr ty (vecop_pack ty) x y)) + +(decl vecop_pack_ssat (Type) VecBinaryOp) +(rule (vecop_pack_ssat $I16X8) (VecBinaryOp.PackSSat16x8)) +(rule (vecop_pack_ssat $I32X4) (VecBinaryOp.PackSSat32x4)) +(rule (vecop_pack_ssat $I64X2) (VecBinaryOp.PackSSat64x2)) + +(decl vec_pack_ssat (Type Reg Reg) Reg) +(rule (vec_pack_ssat ty x y) (vec_rrr ty (vecop_pack_ssat ty) x y)) + +(decl vecop_pack_usat (Type) VecBinaryOp) +(rule (vecop_pack_usat $I16X8) (VecBinaryOp.PackUSat16x8)) +(rule (vecop_pack_usat $I32X4) (VecBinaryOp.PackUSat32x4)) +(rule (vecop_pack_usat $I64X2) (VecBinaryOp.PackUSat64x2)) + +(decl vec_pack_usat (Type Reg Reg) Reg) +(rule (vec_pack_usat ty x y) (vec_rrr ty (vecop_pack_usat ty) x y)) + +(decl vecop_unpacks_low (Type) VecUnaryOp) +(rule (vecop_unpacks_low $I8X16) (VecUnaryOp.UnpackSLow8x16)) +(rule (vecop_unpacks_low $I16X8) (VecUnaryOp.UnpackSLow16x8)) +(rule (vecop_unpacks_low $I32X4) (VecUnaryOp.UnpackSLow32x4)) + +(decl vec_unpacks_low (Type Reg) Reg) +(rule (vec_unpacks_low ty x) (vec_rr ty (vecop_unpacks_low ty) x)) + +(decl vecop_unpacks_high (Type) VecUnaryOp) +(rule (vecop_unpacks_high $I8X16) (VecUnaryOp.UnpackSHigh8x16)) +(rule (vecop_unpacks_high $I16X8) (VecUnaryOp.UnpackSHigh16x8)) +(rule (vecop_unpacks_high $I32X4) (VecUnaryOp.UnpackSHigh32x4)) + +(decl vec_unpacks_high (Type Reg) Reg) +(rule (vec_unpacks_high ty x) (vec_rr ty (vecop_unpacks_high ty) x)) + +(decl vecop_unpacku_low (Type) VecUnaryOp) +(rule (vecop_unpacku_low $I8X16) (VecUnaryOp.UnpackULow8x16)) +(rule (vecop_unpacku_low $I16X8) (VecUnaryOp.UnpackULow16x8)) +(rule (vecop_unpacku_low $I32X4) (VecUnaryOp.UnpackULow32x4)) + +(decl vec_unpacku_low (Type Reg) Reg) +(rule (vec_unpacku_low ty x) (vec_rr ty (vecop_unpacku_low ty) x)) + +(decl vecop_unpacku_high (Type) VecUnaryOp) +(rule (vecop_unpacku_high $I8X16) (VecUnaryOp.UnpackUHigh8x16)) +(rule (vecop_unpacku_high $I16X8) (VecUnaryOp.UnpackUHigh16x8)) +(rule (vecop_unpacku_high $I32X4) (VecUnaryOp.UnpackUHigh32x4)) + +(decl vec_unpacku_high (Type Reg) Reg) +(rule (vec_unpacku_high ty x) (vec_rr ty (vecop_unpacku_high ty) x)) + + +;; Helpers for generating vector merge instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_merge_low (Type) VecBinaryOp) +(rule (vecop_merge_low $I8X16) (VecBinaryOp.MergeLow8x16)) +(rule (vecop_merge_low $I16X8) (VecBinaryOp.MergeLow16x8)) +(rule (vecop_merge_low $I32X4) (VecBinaryOp.MergeLow32x4)) +(rule (vecop_merge_low $I64X2) (VecBinaryOp.MergeLow64x2)) + +(decl vec_merge_low (Type Reg Reg) Reg) +(rule (vec_merge_low ty x y) (vec_rrr ty (vecop_merge_low ty) x y)) + +(decl vecop_merge_high (Type) VecBinaryOp) +(rule (vecop_merge_high $I8X16) (VecBinaryOp.MergeHigh8x16)) +(rule (vecop_merge_high $I16X8) (VecBinaryOp.MergeHigh16x8)) +(rule (vecop_merge_high $I32X4) (VecBinaryOp.MergeHigh32x4)) +(rule (vecop_merge_high $I64X2) 
(VecBinaryOp.MergeHigh64x2)) + +(decl vec_merge_high (Type Reg Reg) Reg) +(rule (vec_merge_high ty x y) (vec_rrr ty (vecop_merge_high ty) x y)) + + ;; Helpers for generating `clz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Count leading zeroes. For a zero input, return the specified value. @@ -2711,6 +3445,15 @@ (decl add_mem_sext32 (Type Reg MemArg) Reg) (rule (add_mem_sext32 ty x y) (alu_rx ty (aluop_add_sext32 ty) x y)) +(decl vecop_add (Type) VecBinaryOp) +(rule (vecop_add $I8X16) (VecBinaryOp.Add8x16)) +(rule (vecop_add $I16X8) (VecBinaryOp.Add16x8)) +(rule (vecop_add $I32X4) (VecBinaryOp.Add32x4)) +(rule (vecop_add $I64X2) (VecBinaryOp.Add64x2)) + +(decl vec_add (Type Reg Reg) Reg) +(rule (vec_add ty x y) (vec_rrr ty (vecop_add ty) x y)) + ;; Helpers for generating `add_logical` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2768,6 +3511,15 @@ (decl sub_mem_sext32 (Type Reg MemArg) Reg) (rule (sub_mem_sext32 ty x y) (alu_rx ty (aluop_sub_sext32 ty) x y)) +(decl vecop_sub (Type) VecBinaryOp) +(rule (vecop_sub $I8X16) (VecBinaryOp.Sub8x16)) +(rule (vecop_sub $I16X8) (VecBinaryOp.Sub16x8)) +(rule (vecop_sub $I32X4) (VecBinaryOp.Sub32x4)) +(rule (vecop_sub $I64X2) (VecBinaryOp.Sub64x2)) + +(decl vec_sub (Type Reg Reg) Reg) +(rule (vec_sub ty x y) (vec_rrr ty (vecop_sub ty) x y)) + ;; Helpers for generating `sub_logical` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2831,6 +3583,69 @@ (decl mul_mem_sext32 (Type Reg MemArg) Reg) (rule (mul_mem_sext32 ty x y) (alu_rx ty (aluop_mul_sext32 ty) x y)) +(decl vecop_mul (Type) VecBinaryOp) +(rule (vecop_mul $I8X16) (VecBinaryOp.Mul8x16)) +(rule (vecop_mul $I16X8) (VecBinaryOp.Mul16x8)) +(rule (vecop_mul $I32X4) (VecBinaryOp.Mul32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_mul (Type Reg Reg) Reg) +(rule (vec_mul ty x y) (vec_rrr ty (vecop_mul ty) x y)) + +(decl vecop_umulhi (Type) VecBinaryOp) +(rule (vecop_umulhi $I8X16) (VecBinaryOp.UMulHi8x16)) +(rule (vecop_umulhi $I16X8) (VecBinaryOp.UMulHi16x8)) +(rule (vecop_umulhi $I32X4) (VecBinaryOp.UMulHi32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umulhi (Type Reg Reg) Reg) +(rule (vec_umulhi ty x y) (vec_rrr ty (vecop_umulhi ty) x y)) + +(decl vecop_smulhi (Type) VecBinaryOp) +(rule (vecop_smulhi $I8X16) (VecBinaryOp.SMulHi8x16)) +(rule (vecop_smulhi $I16X8) (VecBinaryOp.SMulHi16x8)) +(rule (vecop_smulhi $I32X4) (VecBinaryOp.SMulHi32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_smulhi (Type Reg Reg) Reg) +(rule (vec_smulhi ty x y) (vec_rrr ty (vecop_smulhi ty) x y)) + +(decl vecop_umul_even (Type) VecBinaryOp) +(rule (vecop_umul_even $I8X16) (VecBinaryOp.UMulEven8x16)) +(rule (vecop_umul_even $I16X8) (VecBinaryOp.UMulEven16x8)) +(rule (vecop_umul_even $I32X4) (VecBinaryOp.UMulEven32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umul_even (Type Reg Reg) Reg) +(rule (vec_umul_even ty x y) (vec_rrr ty (vecop_umul_even ty) x y)) + +(decl vecop_smul_even (Type) VecBinaryOp) +(rule (vecop_smul_even $I8X16) (VecBinaryOp.SMulEven8x16)) +(rule (vecop_smul_even $I16X8) (VecBinaryOp.SMulEven16x8)) +(rule (vecop_smul_even $I32X4) (VecBinaryOp.SMulEven32x4)) +;; No support for $I64X2 multiplication. 
+ +(decl vec_smul_even (Type Reg Reg) Reg) +(rule (vec_smul_even ty x y) (vec_rrr ty (vecop_smul_even ty) x y)) + +(decl vecop_umul_odd (Type) VecBinaryOp) +(rule (vecop_umul_odd $I8X16) (VecBinaryOp.UMulOdd8x16)) +(rule (vecop_umul_odd $I16X8) (VecBinaryOp.UMulOdd16x8)) +(rule (vecop_umul_odd $I32X4) (VecBinaryOp.UMulOdd32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_umul_odd (Type Reg Reg) Reg) +(rule (vec_umul_odd ty x y) (vec_rrr ty (vecop_umul_odd ty) x y)) + +(decl vecop_smul_odd (Type) VecBinaryOp) +(rule (vecop_smul_odd $I8X16) (VecBinaryOp.SMulOdd8x16)) +(rule (vecop_smul_odd $I16X8) (VecBinaryOp.SMulOdd16x8)) +(rule (vecop_smul_odd $I32X4) (VecBinaryOp.SMulOdd32x4)) +;; No support for $I64X2 multiplication. + +(decl vec_smul_odd (Type Reg Reg) Reg) +(rule (vec_smul_odd ty x y) (vec_rrr ty (vecop_smul_odd ty) x y)) + ;; Helpers for generating `udivmod` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2846,6 +3661,66 @@ (rule (sdivmod $I64 x y) (sdivmod64 x y)) +;; Helpers for generating `umax` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_umax (Type) VecBinaryOp) +(rule (vecop_umax $I8X16) (VecBinaryOp.UMax8x16)) +(rule (vecop_umax $I16X8) (VecBinaryOp.UMax16x8)) +(rule (vecop_umax $I32X4) (VecBinaryOp.UMax32x4)) +(rule (vecop_umax $I64X2) (VecBinaryOp.UMax64x2)) + +(decl vec_umax (Type Reg Reg) Reg) +(rule (vec_umax ty x y) (vec_rrr ty (vecop_umax ty) x y)) + + +;; Helpers for generating `imax` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_smax (Type) VecBinaryOp) +(rule (vecop_smax $I8X16) (VecBinaryOp.SMax8x16)) +(rule (vecop_smax $I16X8) (VecBinaryOp.SMax16x8)) +(rule (vecop_smax $I32X4) (VecBinaryOp.SMax32x4)) +(rule (vecop_smax $I64X2) (VecBinaryOp.SMax64x2)) + +(decl vec_smax (Type Reg Reg) Reg) +(rule (vec_smax ty x y) (vec_rrr ty (vecop_smax ty) x y)) + + +;; Helpers for generating `umin` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_umin (Type) VecBinaryOp) +(rule (vecop_umin $I8X16) (VecBinaryOp.UMin8x16)) +(rule (vecop_umin $I16X8) (VecBinaryOp.UMin16x8)) +(rule (vecop_umin $I32X4) (VecBinaryOp.UMin32x4)) +(rule (vecop_umin $I64X2) (VecBinaryOp.UMin64x2)) + +(decl vec_umin (Type Reg Reg) Reg) +(rule (vec_umin ty x y) (vec_rrr ty (vecop_umin ty) x y)) + + +;; Helpers for generating `imin` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_smin (Type) VecBinaryOp) +(rule (vecop_smin $I8X16) (VecBinaryOp.SMin8x16)) +(rule (vecop_smin $I16X8) (VecBinaryOp.SMin16x8)) +(rule (vecop_smin $I32X4) (VecBinaryOp.SMin32x4)) +(rule (vecop_smin $I64X2) (VecBinaryOp.SMin64x2)) + +(decl vec_smin (Type Reg Reg) Reg) +(rule (vec_smin ty x y) (vec_rrr ty (vecop_smin ty) x y)) + + +;; Helpers for generating `avg_round` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_uavg (Type) VecBinaryOp) +(rule (vecop_uavg $I8X16) (VecBinaryOp.UAvg8x16)) +(rule (vecop_uavg $I16X8) (VecBinaryOp.UAvg16x8)) +(rule (vecop_uavg $I32X4) (VecBinaryOp.UAvg32x4)) +(rule (vecop_uavg $I64X2) (VecBinaryOp.UAvg64x2)) + +(decl vec_uavg (Type Reg Reg) Reg) +(rule (vec_uavg ty x y) (vec_rrr ty (vecop_uavg ty) x y)) + + ;; Helpers for generating `and` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl aluop_and (Type) ALUOp) @@ -2864,6 +3739,9 @@ (decl and_mem (Type Reg MemArg) Reg) (rule (and_mem ty x y) (alu_rx ty (aluop_and ty) x y)) +(decl vec_and (Type Reg Reg) Reg) +(rule (vec_and (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.And128) x y)) + ;; Helpers for generating `or` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ 
-2883,6 +3761,9 @@ (decl or_mem (Type Reg MemArg) Reg) (rule (or_mem ty x y) (alu_rx ty (aluop_or ty) x y)) +(decl vec_or (Type Reg Reg) Reg) +(rule (vec_or (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.Orr128) x y)) + ;; Helpers for generating `xor` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2903,6 +3784,10 @@ (rule (push_xor_uimm32shifted ib ty dst src imm) (push_alu_uimm32shifted ib (aluop_xor ty) dst src imm)) +(decl vec_xor (Type Reg Reg) Reg) +(rule (vec_xor (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.Xor128) x y)) + + ;; Helpers for generating `not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl not_reg (Type Reg) Reg) @@ -2920,6 +3805,9 @@ (let ((val Reg (push_xor_uimm32shifted ib ty dst src (uimm32shifted 0xffffffff 0)))) (push_xor_uimm32shifted ib ty dst val (uimm32shifted 0xffffffff 32)))) +(decl vec_not (Type Reg) Reg) +(rule (vec_not ty x) (vec_not_or ty x x)) + ;; Helpers for generating `not_and` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2930,6 +3818,9 @@ (decl not_and_reg (Type Reg Reg) Reg) (rule (not_and_reg ty x y) (alu_rrr ty (aluop_not_and ty) x y)) +(decl vec_not_and (Type Reg Reg) Reg) +(rule (vec_not_and (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotAnd128) x y)) + ;; Helpers for generating `not_or` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2940,6 +3831,9 @@ (decl not_or_reg (Type Reg Reg) Reg) (rule (not_or_reg ty x y) (alu_rrr ty (aluop_not_or ty) x y)) +(decl vec_not_or (Type Reg Reg) Reg) +(rule (vec_not_or (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotOrr128) x y)) + ;; Helpers for generating `not_xor` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2950,6 +3844,9 @@ (decl not_xor_reg (Type Reg Reg) Reg) (rule (not_xor_reg ty x y) (alu_rrr ty (aluop_not_xor ty) x y)) +(decl vec_not_xor (Type Reg Reg) Reg) +(rule (vec_not_xor (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.NotXor128) x y)) + ;; Helpers for generating `and_not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2960,6 +3857,9 @@ (decl and_not_reg (Type Reg Reg) Reg) (rule (and_not_reg ty x y) (alu_rrr ty (aluop_and_not ty) x y)) +(decl vec_and_not (Type Reg Reg) Reg) +(rule (vec_and_not (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.AndNot128) x y)) + ;; Helpers for generating `or_not` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2970,6 +3870,15 @@ (decl or_not_reg (Type Reg Reg) Reg) (rule (or_not_reg ty x y) (alu_rrr ty (aluop_or_not ty) x y)) +(decl vec_or_not (Type Reg Reg) Reg) +(rule (vec_or_not (ty_vec128 ty) x y) (vec_rrr ty (VecBinaryOp.OrrNot128) x y)) + + +;; Helpers for generating `bitpermute` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vec_bitpermute (Reg Reg) Reg) +(rule (vec_bitpermute x y) (vec_rrr $I64X2 (VecBinaryOp.BitPermute128) x y)) + ;; Helpers for generating `abs` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2986,6 +3895,15 @@ (decl abs_reg_sext32 (Type Reg) Reg) (rule (abs_reg_sext32 ty x) (unary_rr ty (unaryop_abs_sext32 ty) x)) +(decl vecop_abs (Type) VecUnaryOp) +(rule (vecop_abs $I8X16) (VecUnaryOp.Abs8x16)) +(rule (vecop_abs $I16X8) (VecUnaryOp.Abs16x8)) +(rule (vecop_abs $I32X4) (VecUnaryOp.Abs32x4)) +(rule (vecop_abs $I64X2) (VecUnaryOp.Abs64x2)) + +(decl vec_abs (Type Reg) Reg) +(rule (vec_abs ty x) (vec_rr ty (vecop_abs ty) x)) + ;; Helpers for generating `neg` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3004,6 +3922,15 @@ (decl neg_reg_sext32 (Type Reg) Reg) (rule (neg_reg_sext32 ty x) (unary_rr ty (unaryop_neg_sext32 ty) x)) +(decl vecop_neg (Type) VecUnaryOp) +(rule (vecop_neg $I8X16) 
(VecUnaryOp.Neg8x16)) +(rule (vecop_neg $I16X8) (VecUnaryOp.Neg16x8)) +(rule (vecop_neg $I32X4) (VecUnaryOp.Neg32x4)) +(rule (vecop_neg $I64X2) (VecUnaryOp.Neg64x2)) + +(decl vec_neg (Type Reg) Reg) +(rule (vec_neg ty x) (vec_rr ty (vecop_neg ty) x)) + ;; Helpers for generating `bswap` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3040,6 +3967,20 @@ (rule (push_rot_imm_reg ib ty dst src shift_imm shift_reg) (push_shift ib (shiftop_rot ty) dst src shift_imm shift_reg)) +(decl vec_shiftop_rot (Type) VecShiftOp) +(rule (vec_shiftop_rot $I8X16) (VecShiftOp.RotL8x16)) +(rule (vec_shiftop_rot $I16X8) (VecShiftOp.RotL16x8)) +(rule (vec_shiftop_rot $I32X4) (VecShiftOp.RotL32x4)) +(rule (vec_shiftop_rot $I64X2) (VecShiftOp.RotL64x2)) + +(decl vec_rot_reg (Type Reg Reg) Reg) +(rule (vec_rot_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_rot ty) x 0 shift_reg)) + +(decl vec_rot_imm (Type Reg u8) Reg) +(rule (vec_rot_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_rot ty) x shift_imm (zero_reg))) + ;; Helpers for generating `lshl` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3057,6 +3998,23 @@ (rule (lshl_imm ty x shift_imm) (shift_rr ty (shiftop_lshl ty) x shift_imm (zero_reg))) +(decl vec_shiftop_lshl (Type) VecShiftOp) +(rule (vec_shiftop_lshl $I8X16) (VecShiftOp.LShL8x16)) +(rule (vec_shiftop_lshl $I16X8) (VecShiftOp.LShL16x8)) +(rule (vec_shiftop_lshl $I32X4) (VecShiftOp.LShL32x4)) +(rule (vec_shiftop_lshl $I64X2) (VecShiftOp.LShL64x2)) + +(decl vec_lshl_reg (Type Reg Reg) Reg) +(rule (vec_lshl_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_lshl ty) x 0 shift_reg)) + +(decl vec_lshl_imm (Type Reg u8) Reg) +(rule (vec_lshl_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_lshl ty) x shift_imm (zero_reg))) + +(decl vec_lshl_by_byte (Reg Reg) Reg) +(rule (vec_lshl_by_byte x y) (vec_rrr $I8X16 (VecBinaryOp.LShLByByte128) x y)) + ;; Helpers for generating `lshr` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3072,6 +4030,23 @@ (rule (lshr_imm ty x shift_imm) (shift_rr ty (shiftop_lshr ty) x shift_imm (zero_reg))) +(decl vec_shiftop_lshr (Type) VecShiftOp) +(rule (vec_shiftop_lshr $I8X16) (VecShiftOp.LShR8x16)) +(rule (vec_shiftop_lshr $I16X8) (VecShiftOp.LShR16x8)) +(rule (vec_shiftop_lshr $I32X4) (VecShiftOp.LShR32x4)) +(rule (vec_shiftop_lshr $I64X2) (VecShiftOp.LShR64x2)) + +(decl vec_lshr_reg (Type Reg Reg) Reg) +(rule (vec_lshr_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_lshr ty) x 0 shift_reg)) + +(decl vec_lshr_imm (Type Reg u8) Reg) +(rule (vec_lshr_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_lshr ty) x shift_imm (zero_reg))) + +(decl vec_lshr_by_byte (Reg Reg) Reg) +(rule (vec_lshr_by_byte x y) (vec_rrr $I8X16 (VecBinaryOp.LShRByByte128) x y)) + ;; Helpers for generating `ashr` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3087,6 +4062,23 @@ (rule (ashr_imm ty x shift_imm) (shift_rr ty (shiftop_ashr ty) x shift_imm (zero_reg))) +(decl vec_shiftop_ashr (Type) VecShiftOp) +(rule (vec_shiftop_ashr $I8X16) (VecShiftOp.AShR8x16)) +(rule (vec_shiftop_ashr $I16X8) (VecShiftOp.AShR16x8)) +(rule (vec_shiftop_ashr $I32X4) (VecShiftOp.AShR32x4)) +(rule (vec_shiftop_ashr $I64X2) (VecShiftOp.AShR64x2)) + +(decl vec_ashr_reg (Type Reg Reg) Reg) +(rule (vec_ashr_reg ty x shift_reg) + (vec_shift_rr ty (vec_shiftop_ashr ty) x 0 shift_reg)) + +(decl vec_ashr_imm (Type Reg u8) Reg) +(rule (vec_ashr_imm ty x shift_imm) + (vec_shift_rr ty (vec_shiftop_ashr ty) x shift_imm (zero_reg))) + +(decl vec_ashr_by_byte (Reg Reg) Reg) +(rule (vec_ashr_by_byte x y) 
(vec_rrr $I8X16 (VecBinaryOp.AShRByByte128) x y)) + ;; Helpers for generating `popcnt` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3096,6 +4088,15 @@ (decl popcnt_reg (Reg) Reg) (rule (popcnt_reg x) (unary_rr $I64 (UnaryOp.PopcntReg) x)) +(decl vecop_popcnt (Type) VecUnaryOp) +(rule (vecop_popcnt $I8X16) (VecUnaryOp.Popcnt8x16)) +(rule (vecop_popcnt $I16X8) (VecUnaryOp.Popcnt16x8)) +(rule (vecop_popcnt $I32X4) (VecUnaryOp.Popcnt32x4)) +(rule (vecop_popcnt $I64X2) (VecUnaryOp.Popcnt64x2)) + +(decl vec_popcnt (Type Reg) Reg) +(rule (vec_popcnt ty x) (vec_rr ty (vecop_popcnt ty) x)) + ;; Helpers for generating `atomic_rmw` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3132,6 +4133,8 @@ (decl fpuop2_add (Type) FPUOp2) (rule (fpuop2_add $F32) (FPUOp2.Add32)) (rule (fpuop2_add $F64) (FPUOp2.Add64)) +(rule (fpuop2_add $F32X4) (FPUOp2.Add32x4)) +(rule (fpuop2_add $F64X2) (FPUOp2.Add64x2)) (decl fadd_reg (Type Reg Reg) Reg) (rule (fadd_reg ty x y) (fpu_rrr ty (fpuop2_add ty) x y)) @@ -3142,6 +4145,8 @@ (decl fpuop2_sub (Type) FPUOp2) (rule (fpuop2_sub $F32) (FPUOp2.Sub32)) (rule (fpuop2_sub $F64) (FPUOp2.Sub64)) +(rule (fpuop2_sub $F32X4) (FPUOp2.Sub32x4)) +(rule (fpuop2_sub $F64X2) (FPUOp2.Sub64x2)) (decl fsub_reg (Type Reg Reg) Reg) (rule (fsub_reg ty x y) (fpu_rrr ty (fpuop2_sub ty) x y)) @@ -3152,6 +4157,8 @@ (decl fpuop2_mul (Type) FPUOp2) (rule (fpuop2_mul $F32) (FPUOp2.Mul32)) (rule (fpuop2_mul $F64) (FPUOp2.Mul64)) +(rule (fpuop2_mul $F32X4) (FPUOp2.Mul32x4)) +(rule (fpuop2_mul $F64X2) (FPUOp2.Mul64x2)) (decl fmul_reg (Type Reg Reg) Reg) (rule (fmul_reg ty x y) (fpu_rrr ty (fpuop2_mul ty) x y)) @@ -3162,6 +4169,8 @@ (decl fpuop2_div (Type) FPUOp2) (rule (fpuop2_div $F32) (FPUOp2.Div32)) (rule (fpuop2_div $F64) (FPUOp2.Div64)) +(rule (fpuop2_div $F32X4) (FPUOp2.Div32x4)) +(rule (fpuop2_div $F64X2) (FPUOp2.Div64x2)) (decl fdiv_reg (Type Reg Reg) Reg) (rule (fdiv_reg ty x y) (fpu_rrr ty (fpuop2_div ty) x y)) @@ -3172,6 +4181,8 @@ (decl fpuop2_min (Type) FPUOp2) (rule (fpuop2_min $F32) (FPUOp2.Min32)) (rule (fpuop2_min $F64) (FPUOp2.Min64)) +(rule (fpuop2_min $F32X4) (FPUOp2.Min32x4)) +(rule (fpuop2_min $F64X2) (FPUOp2.Min64x2)) (decl fmin_reg (Type Reg Reg) Reg) (rule (fmin_reg ty x y) (fpu_rrr ty (fpuop2_min ty) x y)) @@ -3182,16 +4193,44 @@ (decl fpuop2_max (Type) FPUOp2) (rule (fpuop2_max $F32) (FPUOp2.Max32)) (rule (fpuop2_max $F64) (FPUOp2.Max64)) +(rule (fpuop2_max $F32X4) (FPUOp2.Max32x4)) +(rule (fpuop2_max $F64X2) (FPUOp2.Max64x2)) (decl fmax_reg (Type Reg Reg) Reg) (rule (fmax_reg ty x y) (fpu_rrr ty (fpuop2_max ty) x y)) +;; Helpers for generating `fmin_pseudo` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl fpuop2_min_pseudo (Type) FPUOp2) +(rule (fpuop2_min_pseudo $F32) (FPUOp2.MinPseudo32)) +(rule (fpuop2_min_pseudo $F64) (FPUOp2.MinPseudo64)) +(rule (fpuop2_min_pseudo $F32X4) (FPUOp2.MinPseudo32x4)) +(rule (fpuop2_min_pseudo $F64X2) (FPUOp2.MinPseudo64x2)) + +(decl fmin_pseudo_reg (Type Reg Reg) Reg) +(rule (fmin_pseudo_reg ty x y) (fpu_rrr ty (fpuop2_min_pseudo ty) x y)) + + +;; Helpers for generating `fmax_pseudo` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl fpuop2_max_pseudo (Type) FPUOp2) +(rule (fpuop2_max_pseudo $F32) (FPUOp2.MaxPseudo32)) +(rule (fpuop2_max_pseudo $F64) (FPUOp2.MaxPseudo64)) +(rule (fpuop2_max_pseudo $F32X4) (FPUOp2.MaxPseudo32x4)) +(rule (fpuop2_max_pseudo $F64X2) (FPUOp2.MaxPseudo64x2)) + +(decl fmax_pseudo_reg (Type Reg Reg) Reg) +(rule (fmax_pseudo_reg ty x y) (fpu_rrr ty (fpuop2_max_pseudo ty) x y)) + + ;; Helpers for generating `fma` 
instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl fpuop3_fma (Type) FPUOp3) (rule (fpuop3_fma $F32) (FPUOp3.MAdd32)) (rule (fpuop3_fma $F64) (FPUOp3.MAdd64)) +(rule (fpuop3_fma $F32X4) (FPUOp3.MAdd32x4)) +(rule (fpuop3_fma $F64X2) (FPUOp3.MAdd64x2)) (decl fma_reg (Type Reg Reg Reg) Reg) (rule (fma_reg ty x y acc) (fpu_rrrr ty (fpuop3_fma ty) x y acc)) @@ -3202,6 +4241,8 @@ (decl fpuop1_sqrt (Type) FPUOp1) (rule (fpuop1_sqrt $F32) (FPUOp1.Sqrt32)) (rule (fpuop1_sqrt $F64) (FPUOp1.Sqrt64)) +(rule (fpuop1_sqrt $F32X4) (FPUOp1.Sqrt32x4)) +(rule (fpuop1_sqrt $F64X2) (FPUOp1.Sqrt64x2)) (decl sqrt_reg (Type Reg) Reg) (rule (sqrt_reg ty x) (fpu_rr ty (fpuop1_sqrt ty) x)) @@ -3212,6 +4253,8 @@ (decl fpuop1_neg (Type) FPUOp1) (rule (fpuop1_neg $F32) (FPUOp1.Neg32)) (rule (fpuop1_neg $F64) (FPUOp1.Neg64)) +(rule (fpuop1_neg $F32X4) (FPUOp1.Neg32x4)) +(rule (fpuop1_neg $F64X2) (FPUOp1.Neg64x2)) (decl fneg_reg (Type Reg) Reg) (rule (fneg_reg ty x) (fpu_rr ty (fpuop1_neg ty) x)) @@ -3222,6 +4265,8 @@ (decl fpuop1_abs (Type) FPUOp1) (rule (fpuop1_abs $F32) (FPUOp1.Abs32)) (rule (fpuop1_abs $F64) (FPUOp1.Abs64)) +(rule (fpuop1_abs $F32X4) (FPUOp1.Abs32x4)) +(rule (fpuop1_abs $F64X2) (FPUOp1.Abs64x2)) (decl fabs_reg (Type Reg) Reg) (rule (fabs_reg ty x) (fpu_rr ty (fpuop1_abs ty) x)) @@ -3232,6 +4277,8 @@ (decl fpuroundop_round (Type) FpuRoundOp) (rule (fpuroundop_round $F32) (FpuRoundOp.Round32)) (rule (fpuroundop_round $F64) (FpuRoundOp.Round64)) +(rule (fpuroundop_round $F32X4) (FpuRoundOp.Round32x4)) +(rule (fpuroundop_round $F64X2) (FpuRoundOp.Round64x2)) (decl ceil_reg (Type Reg) Reg) (rule (ceil_reg ty x) (fpu_round ty (fpuroundop_round ty) @@ -3256,6 +4303,8 @@ (rule (fpromote_reg ty ty x) x) (rule (fpromote_reg $F64 $F32 x) (fpu_rr $F64 (FPUOp1.Cvt32To64) x)) +(rule (fpromote_reg $F64X2 $F32X4 x) + (fpu_rr $F64 (FPUOp1.Cvt32x4To64x2) x)) ;; Helpers for generating `fdemote` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3264,28 +4313,34 @@ (rule (fdemote_reg ty ty mode x) x) (rule (fdemote_reg $F32 $F64 mode x) (fpu_round $F32 (FpuRoundOp.Cvt64To32) mode x)) +(rule (fdemote_reg $F32X4 $F64X2 mode x) + (fpu_round $F32X4 (FpuRoundOp.Cvt64x2To32x4) mode x)) ;; Helpers for generating `fcvt_from_uint` instructions ;;;;;;;;;;;;;;;;;;;;;;;; -(decl uint_to_fpu_op (Type) FpuRoundOp) -(rule (uint_to_fpu_op $F32) (FpuRoundOp.FromUInt32)) -(rule (uint_to_fpu_op $F64) (FpuRoundOp.FromUInt64)) - (decl fcvt_from_uint_reg (Type FpuRoundMode Reg) Reg) -(rule (fcvt_from_uint_reg ty mode x) - (fpu_round ty (uint_to_fpu_op ty) mode x)) +(rule (fcvt_from_uint_reg $F32 mode x) + (fpu_round $F32 (FpuRoundOp.FromUInt32) mode (vec_insert_lane_undef $I32X4 x 0 (zero_reg)))) +(rule (fcvt_from_uint_reg $F64 mode x) + (fpu_round $F64 (FpuRoundOp.FromUInt64) mode (vec_insert_lane_undef $I64X2 x 0 (zero_reg)))) +(rule (fcvt_from_uint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.FromUInt32x4) mode x)) +(rule (fcvt_from_uint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.FromUInt64x2) mode x)) ;; Helpers for generating `fcvt_from_sint` instructions ;;;;;;;;;;;;;;;;;;;;;;;; -(decl sint_to_fpu_op (Type) FpuRoundOp) -(rule (sint_to_fpu_op $F32) (FpuRoundOp.FromSInt32)) -(rule (sint_to_fpu_op $F64) (FpuRoundOp.FromSInt64)) - (decl fcvt_from_sint_reg (Type FpuRoundMode Reg) Reg) -(rule (fcvt_from_sint_reg ty mode x) - (fpu_round ty (sint_to_fpu_op ty) mode x)) +(rule (fcvt_from_sint_reg $F32 mode x) + (fpu_round $F32 (FpuRoundOp.FromSInt32) mode (vec_insert_lane_undef $I32X4 x 0 (zero_reg)))) +(rule (fcvt_from_sint_reg 
$F64 mode x) + (fpu_round $F64 (FpuRoundOp.FromSInt64) mode (vec_insert_lane_undef $I64X2 x 0 (zero_reg)))) +(rule (fcvt_from_sint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.FromSInt32x4) mode x)) +(rule (fcvt_from_sint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.FromSInt64x2) mode x)) ;; Helpers for generating `fcvt_to_[us]int` instructions ;;;;;;;;;;;;;;;;;;;;;;; @@ -3305,9 +4360,13 @@ (decl fcvt_to_uint_reg (Type FpuRoundMode Reg) Reg) (rule (fcvt_to_uint_reg $F32 mode x) - (mov_from_fpr32 (fpu_round $F32 (FpuRoundOp.ToUInt32) mode x))) + (vec_extract_lane $I32X4 (fpu_round $F32 (FpuRoundOp.ToUInt32) mode x) 0 (zero_reg))) (rule (fcvt_to_uint_reg $F64 mode x) - (mov_from_fpr64 (fpu_round $F64 (FpuRoundOp.ToUInt64) mode x))) + (vec_extract_lane $I64X2 (fpu_round $F64 (FpuRoundOp.ToUInt64) mode x) 0 (zero_reg))) +(rule (fcvt_to_uint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.ToUInt32x4) mode x)) +(rule (fcvt_to_uint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.ToUInt64x2) mode x)) (decl fcvt_to_uint_ub (Type Type) Reg) (rule (fcvt_to_uint_ub $F32 dst_ty) @@ -3333,9 +4392,13 @@ (decl fcvt_to_sint_reg (Type FpuRoundMode Reg) Reg) (rule (fcvt_to_sint_reg $F32 mode x) - (mov_from_fpr32 (fpu_round $F32 (FpuRoundOp.ToSInt32) mode x))) + (vec_extract_lane $F32X4 (fpu_round $F32 (FpuRoundOp.ToSInt32) mode x) 0 (zero_reg))) (rule (fcvt_to_sint_reg $F64 mode x) - (mov_from_fpr64 (fpu_round $F64 (FpuRoundOp.ToSInt64) mode x))) + (vec_extract_lane $F64X2 (fpu_round $F64 (FpuRoundOp.ToSInt64) mode x) 0 (zero_reg))) +(rule (fcvt_to_sint_reg $F32X4 mode x) + (fpu_round $F32X4 (FpuRoundOp.ToSInt32x4) mode x)) +(rule (fcvt_to_sint_reg $F64X2 mode x) + (fpu_round $F64X2 (FpuRoundOp.ToSInt64x2) mode x)) (decl fcvt_to_sint_ub (Type Type) Reg) (rule (fcvt_to_sint_ub $F32 dst_ty) @@ -3426,12 +4489,79 @@ (rule (icmpu_mem_zext32 ty src mem) (cmp_rx (cmpop_cmpu_zext32 ty) src mem)) +;; Helpers for generating vector `icmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_int_cmpeq (Type) VecIntCmpOp) +(rule (vecop_int_cmpeq (multi_lane 8 16)) (VecIntCmpOp.CmpEq8x16)) +(rule (vecop_int_cmpeq (multi_lane 16 8)) (VecIntCmpOp.CmpEq16x8)) +(rule (vecop_int_cmpeq (multi_lane 32 4)) (VecIntCmpOp.CmpEq32x4)) +(rule (vecop_int_cmpeq (multi_lane 64 2)) (VecIntCmpOp.CmpEq64x2)) + +(decl vec_cmpeq (Type Reg Reg) Reg) +(rule (vec_cmpeq (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) +(decl vec_cmpeqs (Type Reg Reg) ProducesFlags) +(rule (vec_cmpeqs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) + +(decl vecop_int_cmph (Type) VecIntCmpOp) +(rule (vecop_int_cmph (multi_lane 8 16)) (VecIntCmpOp.SCmpHi8x16)) +(rule (vecop_int_cmph (multi_lane 16 8)) (VecIntCmpOp.SCmpHi16x8)) +(rule (vecop_int_cmph (multi_lane 32 4)) (VecIntCmpOp.SCmpHi32x4)) +(rule (vecop_int_cmph (multi_lane 64 2)) (VecIntCmpOp.SCmpHi64x2)) + +(decl vec_cmph (Type Reg Reg) Reg) +(rule (vec_cmph (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) +(decl vec_cmphs (Type Reg Reg) ProducesFlags) +(rule (vec_cmphs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) + +(decl vecop_int_cmphl (Type) VecIntCmpOp) +(rule (vecop_int_cmphl (multi_lane 8 16)) (VecIntCmpOp.UCmpHi8x16)) +(rule (vecop_int_cmphl (multi_lane 16 8)) (VecIntCmpOp.UCmpHi16x8)) +(rule (vecop_int_cmphl (multi_lane 32 4)) (VecIntCmpOp.UCmpHi32x4)) +(rule (vecop_int_cmphl (multi_lane 64 2)) (VecIntCmpOp.UCmpHi64x2)) + +(decl vec_cmphl (Type Reg Reg) Reg) +(rule (vec_cmphl (ty_vec128 ty) x y) (vec_int_cmp ty 
(vecop_int_cmphl ty) x y)) +(decl vec_cmphls (Type Reg Reg) ProducesFlags) +(rule (vec_cmphls (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) + + ;; Helpers for generating `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl fcmp_reg (Type Reg Reg) ProducesFlags) (rule (fcmp_reg $F32 src1 src2) (fpu_cmp32 src1 src2)) (rule (fcmp_reg $F64 src1 src2) (fpu_cmp64 src1 src2)) + +;; Helpers for generating vector `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_float_cmpeq (Type) VecFloatCmpOp) +(rule (vecop_float_cmpeq (multi_lane 32 4)) (VecFloatCmpOp.CmpEq32x4)) +(rule (vecop_float_cmpeq (multi_lane 64 2)) (VecFloatCmpOp.CmpEq64x2)) + +(decl vec_fcmpeq (Type Reg Reg) Reg) +(rule (vec_fcmpeq (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmpeq ty) x y)) +(decl vec_fcmpeqs (Type Reg Reg) ProducesFlags) +(rule (vec_fcmpeqs (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmpeq ty) x y)) + +(decl vecop_float_cmph (Type) VecFloatCmpOp) +(rule (vecop_float_cmph (multi_lane 32 4)) (VecFloatCmpOp.CmpHi32x4)) +(rule (vecop_float_cmph (multi_lane 64 2)) (VecFloatCmpOp.CmpHi64x2)) + +(decl vec_fcmph (Type Reg Reg) Reg) +(rule (vec_fcmph (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmph ty) x y)) +(decl vec_fcmphs (Type Reg Reg) ProducesFlags) +(rule (vec_fcmphs (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmph ty) x y)) + +(decl vecop_float_cmphe (Type) VecFloatCmpOp) +(rule (vecop_float_cmphe (multi_lane 32 4)) (VecFloatCmpOp.CmpHiEq32x4)) +(rule (vecop_float_cmphe (multi_lane 64 2)) (VecFloatCmpOp.CmpHiEq64x2)) + +(decl vec_fcmphe (Type Reg Reg) Reg) +(rule (vec_fcmphe (ty_vec128 ty) x y) (vec_float_cmp ty (vecop_float_cmphe ty) x y)) +(decl vec_fcmphes (Type Reg Reg) ProducesFlags) +(rule (vec_fcmphes (ty_vec128 ty) x y) (vec_float_cmps ty (vecop_float_cmphe ty) x y)) + + ;; Implicit conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (convert WritableRegPair RegPair writable_regpair_to_regpair) diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index a75e6ffaf334..3b7832b7ea58 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -852,6 +852,74 @@ fn enc_siy(opcode: u16, b1: Reg, d1: u32, i2: u8) -> [u8; 6] { enc } +/// VRIa-type instructions. +/// +/// 47 39 35 31 15 11 7 +/// opcode1 v1 - i2 m3 rxb opcode2 +/// 40 36 32 16 12 8 0 +/// +fn enc_vri_a(opcode: u16, v1: Reg, i2: u16, m3: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let m3 = m3 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4; + enc[2..4].copy_from_slice(&i2.to_be_bytes()); + enc[4] = m3 << 4 | rxb; + enc[5] = opcode2; + enc +} + +/// VRIb-type instructions. +/// +/// 47 39 35 31 23 15 11 7 +/// opcode1 v1 - i2 i3 m4 rxb opcode2 +/// 40 36 32 24 16 12 8 0 +/// +fn enc_vri_b(opcode: u16, v1: Reg, i2: u8, i3: u8, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4; + enc[2] = i2; + enc[3] = i3; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + +/// VRIc-type instructions. 
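+/// Same 6-byte template as VRI-a, but the otherwise-unused nibble carries a
+/// second vector register v3; in this backend the format is used for VREP,
+/// where i2 is the lane index and m4 the element-size control.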
+/// +/// 47 39 35 31 15 11 7 +/// opcode1 v1 v3 i2 m4 rxb opcode2 +/// 40 36 32 16 12 8 0 +/// +fn enc_vri_c(opcode: u16, v1: Reg, i2: u16, v3: Reg, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v3), None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v3; + enc[2..4].copy_from_slice(&i2.to_be_bytes()); + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRa-type instructions. /// /// 47 39 35 31 23 19 15 11 7 @@ -878,6 +946,32 @@ fn enc_vrr_a(opcode: u16, v1: Reg, v2: Reg, m3: u8, m4: u8, m5: u8) -> [u8; 6] { enc } +/// VRRb-type instructions. +/// +/// 47 39 35 31 27 23 19 15 11 7 +/// opcode1 v1 v2 v3 - m5 - m4 rxb opcode2 +/// 40 36 32 28 24 20 16 12 8 0 +/// +fn enc_vrr_b(opcode: u16, v1: Reg, v2: Reg, v3: Reg, m4: u8, m5: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), None); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let m4 = m4 & 0x0f; + let m5 = m5 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4; + enc[3] = m5 << 4; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRc-type instructions. /// /// 47 39 35 31 27 23 19 15 11 7 @@ -932,6 +1026,56 @@ fn enc_vrr_e(opcode: u16, v1: Reg, v2: Reg, v3: Reg, v4: Reg, m5: u8, m6: u8) -> enc } +/// VRRf-type instructions. +/// +/// 47 39 35 31 27 11 7 +/// opcode1 v1 r2 r3 - rxb opcode2 +/// 40 36 32 28 12 8 0 +/// +fn enc_vrr_f(opcode: u16, v1: Reg, r2: Reg, r3: Reg) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), None, None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let r2 = machreg_to_gpr(r2) & 0x0f; + let r3 = machreg_to_gpr(r3) & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | r2; + enc[2] = r3 << 4; + enc[4] = rxb; + enc[5] = opcode2; + enc +} + +/// VRSa-type instructions. +/// +/// 47 39 35 31 27 15 11 7 +/// opcode1 v1 v3 b2 d2 m4 rxb opcode2 +/// 40 36 32 28 16 12 8 0 +/// +fn enc_vrs_a(opcode: u16, v1: Reg, b2: Reg, d2: u32, v3: Reg, m4: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v3), None, None); + let v1 = machreg_to_vr(v1) & 0x0f; + let b2 = machreg_to_gpr(b2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let d2_lo = (d2 & 0xff) as u8; + let d2_hi = ((d2 >> 8) & 0x0f) as u8; + let m4 = m4 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v3; + enc[2] = b2 << 4 | d2_hi; + enc[3] = d2_lo; + enc[4] = m4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRSb-type instructions. /// /// 47 39 35 31 27 15 11 7 @@ -1834,29 +1978,6 @@ impl MachInstEmit for Inst { rd, &mem, opcode_rx, opcode_rxy, opcode_ril, true, sink, emit_info, state, ); } - &Inst::FpuLoad32 { rd, ref mem } - | &Inst::FpuLoad64 { rd, ref mem } - | &Inst::FpuLoadRev32 { rd, ref mem } - | &Inst::FpuLoadRev64 { rd, ref mem } => { - let rd = allocs.next_writable(rd); - let mem = mem.with_allocs(&mut allocs); - - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuLoad32 { .. } => (Some(0x78), Some(0xed64), 0xe703), // LE(Y), VLEF - &Inst::FpuLoad64 { .. 
} => (Some(0x68), Some(0xed65), 0xe702), // LD(Y), VLEG - &Inst::FpuLoadRev32 { .. } => (None, None, 0xe603), // VLEBRF - &Inst::FpuLoadRev64 { .. } => (None, None, 0xe602), // VLEBRG - _ => unreachable!(), - }; - let rd = rd.to_reg(); - if is_fpr(rd) && opcode_rx.is_some() { - mem_emit( - rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, - ); - } else { - mem_vrx_emit(rd, &mem, opcode_vrx, 0, true, sink, emit_info, state); - } - } &Inst::Store8 { rd, ref mem } | &Inst::Store16 { rd, ref mem } @@ -1904,28 +2025,6 @@ impl MachInstEmit for Inst { }; mem_imm16_emit(imm, &mem, opcode, true, sink, emit_info, state); } - &Inst::FpuStore32 { rd, ref mem } - | &Inst::FpuStore64 { rd, ref mem } - | &Inst::FpuStoreRev32 { rd, ref mem } - | &Inst::FpuStoreRev64 { rd, ref mem } => { - let rd = allocs.next(rd); - let mem = mem.with_allocs(&mut allocs); - - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuStore32 { .. } => (Some(0x70), Some(0xed66), 0xe70b), // STE(Y), VSTEF - &Inst::FpuStore64 { .. } => (Some(0x60), Some(0xed67), 0xe70a), // STD(Y), VSTEG - &Inst::FpuStoreRev32 { .. } => (None, None, 0xe60b), // VSTEBRF - &Inst::FpuStoreRev64 { .. } => (None, None, 0xe60a), // VSTEBRG - _ => unreachable!(), - }; - if is_fpr(rd) && opcode_rx.is_some() { - mem_emit( - rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, - ); - } else { - mem_vrx_emit(rd, &mem, opcode_vrx, 0, true, sink, emit_info, state); - } - } &Inst::LoadMultiple64 { rt, rt2, ref mem } => { let mem = mem.with_allocs(&mut allocs); @@ -2168,44 +2267,6 @@ impl MachInstEmit for Inst { put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0)); } } - &Inst::MovToFpr32 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - let (opcode, m4) = (0xe722, 2); // VLVG - put(sink, &enc_vrs_b(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - &Inst::MovToFpr64 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - if is_fpr(rd.to_reg()) { - let opcode = 0xb3c1; // LDGR - put(sink, &enc_rre(opcode, rd.to_reg(), rn)); - } else { - let (opcode, m4) = (0xe722, 3); // VLVG - put(sink, &enc_vrs_b(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - } - &Inst::MovFromFpr32 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - let (opcode, m4) = (0xe721, 2); // VLGV - put(sink, &enc_vrs_c(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - &Inst::MovFromFpr64 { rd, rn } => { - let rd = allocs.next_writable(rd); - let rn = allocs.next(rn); - - if is_fpr(rn) { - let opcode = 0xb3cd; // LGDR - put(sink, &enc_rre(opcode, rd.to_reg(), rn)); - } else { - let (opcode, m4) = (0xe721, 3); // VLVG - put(sink, &enc_vrs_c(opcode, rd.to_reg(), zero_reg(), 0, rn, m4)); - } - } &Inst::LoadFpuConst32 { rd, const_data } => { let rd = allocs.next_writable(rd); @@ -2213,9 +2274,11 @@ impl MachInstEmit for Inst { let reg = writable_spilltmp_reg().to_reg(); put(sink, &enc_ri_b(opcode, reg, 8)); sink.put4(const_data.swap_bytes()); - let inst = Inst::FpuLoad32 { + let inst = Inst::VecLoadLaneUndef { + size: 32, rd, mem: MemArg::reg(reg, MemFlags::trusted()), + lane_imm: 0, }; inst.emit(&[], sink, emit_info, state); } @@ -2226,9 +2289,11 @@ impl MachInstEmit for Inst { let reg = writable_spilltmp_reg().to_reg(); put(sink, &enc_ri_b(opcode, reg, 12)); sink.put8(const_data.swap_bytes()); - let inst = Inst::FpuLoad64 { + let inst = Inst::VecLoadLaneUndef { + size: 64, rd, mem: MemArg::reg(reg, MemFlags::trusted()), + lane_imm: 0, }; 
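+                // The sink's put8 writes little-endian, so the swap_bytes above leaves
+                // the f64 literal in big-endian storage order; BRAS branches over it and
+                // leaves its address in %r1 (the spill temp). Loading lane 0 with
+                // VecLoadLaneUndef then materializes the constant: LD(Y) when rd is an
+                // FPR, VLEG when it is a full vector register, with the other lane left
+                // undefined.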
inst.emit(&[], sink, emit_info, state); } @@ -2236,21 +2301,30 @@ impl MachInstEmit for Inst { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); - let (opcode, m3, m5, opcode_fpr) = match fpu_op { - FPUOp1::Abs32 => (0xe7cc, 2, 2, 0xb300), // VFPSO, LPEBR - FPUOp1::Abs64 => (0xe7cc, 3, 2, 0xb310), // VFPSO, LPDBR - FPUOp1::Neg32 => (0xe7cc, 2, 0, 0xb303), // VFPSO, LCEBR - FPUOp1::Neg64 => (0xe7cc, 3, 0, 0xb313), // VFPSO, LCDBR - FPUOp1::NegAbs32 => (0xe7cc, 2, 1, 0xb301), // VFPSO, LNEBR - FPUOp1::NegAbs64 => (0xe7cc, 3, 1, 0xb311), // VFPSO, LNDBR - FPUOp1::Sqrt32 => (0xe7ce, 2, 0, 0xb314), // VFSQ, SQEBR - FPUOp1::Sqrt64 => (0xe7ce, 3, 0, 0xb315), // VFSQ, SQDBR - FPUOp1::Cvt32To64 => (0xe7c4, 2, 0, 0xb304), // VFLL, LDEBR + let (opcode, m3, m4, m5, opcode_fpr) = match fpu_op { + FPUOp1::Abs32 => (0xe7cc, 2, 8, 2, Some(0xb300)), // WFPSO, LPEBR + FPUOp1::Abs64 => (0xe7cc, 3, 8, 2, Some(0xb310)), // WFPSO, LPDBR + FPUOp1::Abs32x4 => (0xe7cc, 2, 0, 2, None), // VFPSO + FPUOp1::Abs64x2 => (0xe7cc, 3, 0, 2, None), // VFPSO + FPUOp1::Neg32 => (0xe7cc, 2, 8, 0, Some(0xb303)), // WFPSO, LCEBR + FPUOp1::Neg64 => (0xe7cc, 3, 8, 0, Some(0xb313)), // WFPSO, LCDBR + FPUOp1::Neg32x4 => (0xe7cc, 2, 0, 0, None), // VFPSO + FPUOp1::Neg64x2 => (0xe7cc, 3, 0, 0, None), // VFPSO + FPUOp1::NegAbs32 => (0xe7cc, 2, 8, 1, Some(0xb301)), // WFPSO, LNEBR + FPUOp1::NegAbs64 => (0xe7cc, 3, 8, 1, Some(0xb311)), // WFPSO, LNDBR + FPUOp1::NegAbs32x4 => (0xe7cc, 2, 0, 1, None), // VFPSO + FPUOp1::NegAbs64x2 => (0xe7cc, 3, 0, 1, None), // VFPSO + FPUOp1::Sqrt32 => (0xe7ce, 2, 8, 0, Some(0xb314)), // WFSQ, SQEBR + FPUOp1::Sqrt64 => (0xe7ce, 3, 8, 0, Some(0xb315)), // WFSQ, SQDBR + FPUOp1::Sqrt32x4 => (0xe7ce, 2, 0, 0, None), // VFSQ + FPUOp1::Sqrt64x2 => (0xe7ce, 3, 0, 0, None), // VFSQ + FPUOp1::Cvt32To64 => (0xe7c4, 2, 8, 0, Some(0xb304)), // WFLL, LDEBR + FPUOp1::Cvt32x4To64x2 => (0xe7c4, 2, 0, 0, None), // VFLL }; - if is_fpr(rd.to_reg()) && is_fpr(rn) { - put(sink, &enc_rre(opcode_fpr, rd.to_reg(), rn)); + if m4 == 8 && is_fpr(rd.to_reg()) && is_fpr(rn) { + put(sink, &enc_rre(opcode_fpr.unwrap(), rd.to_reg(), rn)); } else { - put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 8, m5)); + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, m4, m5)); } } &Inst::FpuRRR { fpu_op, rd, rn, rm } => { @@ -2258,24 +2332,45 @@ impl MachInstEmit for Inst { let rn = allocs.next(rn); let rm = allocs.next(rm); - let (opcode, m4, m6, opcode_fpr) = match fpu_op { - FPUOp2::Add32 => (0xe7e3, 2, 0, Some(0xb30a)), // VFA, AEBR - FPUOp2::Add64 => (0xe7e3, 3, 0, Some(0xb31a)), // VFA, ADBR - FPUOp2::Sub32 => (0xe7e2, 2, 0, Some(0xb30b)), // VFS, SEBR - FPUOp2::Sub64 => (0xe7e2, 3, 0, Some(0xb31b)), // VFS, SDBR - FPUOp2::Mul32 => (0xe7e7, 2, 0, Some(0xb317)), // VFM, MEEBR - FPUOp2::Mul64 => (0xe7e7, 3, 0, Some(0xb31c)), // VFM, MDBR - FPUOp2::Div32 => (0xe7e5, 2, 0, Some(0xb30d)), // VFD, DEBR - FPUOp2::Div64 => (0xe7e5, 3, 0, Some(0xb31d)), // VFD, DDBR - FPUOp2::Max32 => (0xe7ef, 2, 1, None), // VFMAX - FPUOp2::Max64 => (0xe7ef, 3, 1, None), // VFMAX - FPUOp2::Min32 => (0xe7ee, 2, 1, None), // VFMIN - FPUOp2::Min64 => (0xe7ee, 3, 1, None), // VFMIN + let (opcode, m4, m5, m6, opcode_fpr) = match fpu_op { + FPUOp2::Add32 => (0xe7e3, 2, 8, 0, Some(0xb30a)), // WFA, AEBR + FPUOp2::Add64 => (0xe7e3, 3, 8, 0, Some(0xb31a)), // WFA, ADBR + FPUOp2::Add32x4 => (0xe7e3, 2, 0, 0, None), // VFA + FPUOp2::Add64x2 => (0xe7e3, 3, 0, 0, None), // VFA + FPUOp2::Sub32 => (0xe7e2, 2, 8, 0, Some(0xb30b)), // WFS, SEBR + FPUOp2::Sub64 => 
(0xe7e2, 3, 8, 0, Some(0xb31b)), // WFS, SDBR + FPUOp2::Sub32x4 => (0xe7e2, 2, 0, 0, None), // VFS + FPUOp2::Sub64x2 => (0xe7e2, 3, 0, 0, None), // VFS + FPUOp2::Mul32 => (0xe7e7, 2, 8, 0, Some(0xb317)), // WFM, MEEBR + FPUOp2::Mul64 => (0xe7e7, 3, 8, 0, Some(0xb31c)), // WFM, MDBR + FPUOp2::Mul32x4 => (0xe7e7, 2, 0, 0, None), // VFM + FPUOp2::Mul64x2 => (0xe7e7, 3, 0, 0, None), // VFM + FPUOp2::Div32 => (0xe7e5, 2, 8, 0, Some(0xb30d)), // WFD, DEBR + FPUOp2::Div64 => (0xe7e5, 3, 8, 0, Some(0xb31d)), // WFD, DDBR + FPUOp2::Div32x4 => (0xe7e5, 2, 0, 0, None), // VFD + FPUOp2::Div64x2 => (0xe7e5, 3, 0, 0, None), // VFD + FPUOp2::Max32 => (0xe7ef, 2, 8, 1, None), // WFMAX + FPUOp2::Max64 => (0xe7ef, 3, 8, 1, None), // WFMAX + FPUOp2::Max32x4 => (0xe7ef, 2, 0, 1, None), // VFMAX + FPUOp2::Max64x2 => (0xe7ef, 3, 0, 1, None), // VFMAX + FPUOp2::Min32 => (0xe7ee, 2, 8, 1, None), // WFMIN + FPUOp2::Min64 => (0xe7ee, 3, 8, 1, None), // WFMIN + FPUOp2::Min32x4 => (0xe7ee, 2, 0, 1, None), // VFMIN + FPUOp2::Min64x2 => (0xe7ee, 3, 0, 1, None), // VFMIN + FPUOp2::MaxPseudo32 => (0xe7ef, 2, 8, 3, None), // WFMAX + FPUOp2::MaxPseudo64 => (0xe7ef, 3, 8, 3, None), // WFMAX + FPUOp2::MaxPseudo32x4 => (0xe7ef, 2, 0, 3, None), // VFMAX + FPUOp2::MaxPseudo64x2 => (0xe7ef, 3, 0, 3, None), // VFMAX + FPUOp2::MinPseudo32 => (0xe7ee, 2, 8, 3, None), // WFMIN + FPUOp2::MinPseudo64 => (0xe7ee, 3, 8, 3, None), // WFMIN + FPUOp2::MinPseudo32x4 => (0xe7ee, 2, 0, 3, None), // VFMIN + FPUOp2::MinPseudo64x2 => (0xe7ee, 3, 0, 3, None), // VFMIN }; - if opcode_fpr.is_some() && rd.to_reg() == rn && is_fpr(rn) && is_fpr(rm) { + if m5 == 8 && opcode_fpr.is_some() && rd.to_reg() == rn && is_fpr(rn) && is_fpr(rm) + { put(sink, &enc_rre(opcode_fpr.unwrap(), rd.to_reg(), rm)); } else { - put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 8, m6)); + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, m5, m6)); } } &Inst::FpuRRRR { @@ -2290,16 +2385,20 @@ impl MachInstEmit for Inst { let rm = allocs.next(rm); let ra = allocs.next(ra); - let (opcode, m6, opcode_fpr) = match fpu_op { - FPUOp3::MAdd32 => (0xe78f, 2, 0xb30e), // VFMA, MAEBR - FPUOp3::MAdd64 => (0xe78f, 3, 0xb31e), // VFMA, MADBR - FPUOp3::MSub32 => (0xe78e, 2, 0xb30f), // VFMS, MSEBR - FPUOp3::MSub64 => (0xe78e, 3, 0xb31f), // VFMS, MSDBR + let (opcode, m5, m6, opcode_fpr) = match fpu_op { + FPUOp3::MAdd32 => (0xe78f, 8, 2, Some(0xb30e)), // WFMA, MAEBR + FPUOp3::MAdd64 => (0xe78f, 8, 3, Some(0xb31e)), // WFMA, MADBR + FPUOp3::MAdd32x4 => (0xe78f, 0, 2, None), // VFMA + FPUOp3::MAdd64x2 => (0xe78f, 0, 3, None), // VFMA + FPUOp3::MSub32 => (0xe78e, 8, 2, Some(0xb30f)), // WFMS, MSEBR + FPUOp3::MSub64 => (0xe78e, 8, 3, Some(0xb31f)), // WFMS, MSDBR + FPUOp3::MSub32x4 => (0xe78e, 0, 2, None), // VFMS + FPUOp3::MSub64x2 => (0xe78e, 0, 3, None), // VFMS }; - if rd.to_reg() == ra && is_fpr(rn) && is_fpr(rm) && is_fpr(ra) { - put(sink, &enc_rrd(opcode_fpr, rd.to_reg(), rm, rn)); + if m5 == 8 && rd.to_reg() == ra && is_fpr(rn) && is_fpr(rm) && is_fpr(ra) { + put(sink, &enc_rrd(opcode_fpr.unwrap(), rd.to_reg(), rm, rn)); } else { - put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 8, m6)); + put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, m5, m6)); } } &Inst::FpuRound { op, mode, rd, rn } => { @@ -2315,26 +2414,37 @@ impl MachInstEmit for Inst { FpuRoundMode::ToPosInfinity => 6, FpuRoundMode::ToNegInfinity => 7, }; - let (opcode, m3, opcode_fpr) = match op { - FpuRoundOp::Cvt64To32 => (0xe7c5, 3, Some(0xb344)), // VFLR, LEDBR(A) - FpuRoundOp::Round32 => (0xe7c7, 
2, Some(0xb357)), // VFI, FIEBR - FpuRoundOp::Round64 => (0xe7c7, 3, Some(0xb35f)), // VFI, FIDBR - FpuRoundOp::ToSInt32 => (0xe7c2, 2, None), // VCSFP - FpuRoundOp::ToSInt64 => (0xe7c2, 3, None), // VCSFP - FpuRoundOp::ToUInt32 => (0xe7c0, 2, None), // VCLFP - FpuRoundOp::ToUInt64 => (0xe7c0, 3, None), // VCLFP - FpuRoundOp::FromSInt32 => (0xe7c3, 2, None), // VCFPS - FpuRoundOp::FromSInt64 => (0xe7c3, 3, None), // VCFPS - FpuRoundOp::FromUInt32 => (0xe7c1, 2, None), // VCFPL - FpuRoundOp::FromUInt64 => (0xe7c1, 3, None), // VCFPL + let (opcode, m3, m4, opcode_fpr) = match op { + FpuRoundOp::Cvt64To32 => (0xe7c5, 3, 8, Some(0xb344)), // WFLR, LEDBR(A) + FpuRoundOp::Cvt64x2To32x4 => (0xe7c5, 3, 0, None), // VFLR + FpuRoundOp::Round32 => (0xe7c7, 2, 8, Some(0xb357)), // WFI, FIEBR + FpuRoundOp::Round64 => (0xe7c7, 3, 8, Some(0xb35f)), // WFI, FIDBR + FpuRoundOp::Round32x4 => (0xe7c7, 2, 0, None), // VFI + FpuRoundOp::Round64x2 => (0xe7c7, 3, 0, None), // VFI + FpuRoundOp::ToSInt32 => (0xe7c2, 2, 8, None), // WCSFP + FpuRoundOp::ToSInt64 => (0xe7c2, 3, 8, None), // WCSFP + FpuRoundOp::ToUInt32 => (0xe7c0, 2, 8, None), // WCLFP + FpuRoundOp::ToUInt64 => (0xe7c0, 3, 8, None), // WCLFP + FpuRoundOp::ToSInt32x4 => (0xe7c2, 2, 0, None), // VCSFP + FpuRoundOp::ToSInt64x2 => (0xe7c2, 3, 0, None), // VCSFP + FpuRoundOp::ToUInt32x4 => (0xe7c0, 2, 0, None), // VCLFP + FpuRoundOp::ToUInt64x2 => (0xe7c0, 3, 0, None), // VCLFP + FpuRoundOp::FromSInt32 => (0xe7c3, 2, 8, None), // WCFPS + FpuRoundOp::FromSInt64 => (0xe7c3, 3, 8, None), // WCFPS + FpuRoundOp::FromUInt32 => (0xe7c1, 2, 8, None), // WCFPL + FpuRoundOp::FromUInt64 => (0xe7c1, 3, 8, None), // WCFPL + FpuRoundOp::FromSInt32x4 => (0xe7c3, 2, 0, None), // VCFPS + FpuRoundOp::FromSInt64x2 => (0xe7c3, 3, 0, None), // VCFPS + FpuRoundOp::FromUInt32x4 => (0xe7c1, 2, 0, None), // VCFPL + FpuRoundOp::FromUInt64x2 => (0xe7c1, 3, 0, None), // VCFPL }; - if opcode_fpr.is_some() && is_fpr(rd.to_reg()) && is_fpr(rn) { + if m4 == 8 && opcode_fpr.is_some() && is_fpr(rd.to_reg()) && is_fpr(rn) { put( sink, &enc_rrf_cde(opcode_fpr.unwrap(), rd.to_reg(), rn, mode, 0), ); } else { - put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 8, mode)); + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, m4, mode)); } } &Inst::FpuCmp32 { rn, rm } => { @@ -2361,6 +2471,169 @@ impl MachInstEmit for Inst { put(sink, &enc_vrr_a(opcode, rn, rm, 3, 0, 0)); } } + + &Inst::VecRRR { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecBinaryOp::Add8x16 => (0xe7f3, 0), // VAB + VecBinaryOp::Add16x8 => (0xe7f3, 1), // VAH + VecBinaryOp::Add32x4 => (0xe7f3, 2), // VAF + VecBinaryOp::Add64x2 => (0xe7f3, 3), // VAG + VecBinaryOp::Sub8x16 => (0xe7f7, 0), // VSB + VecBinaryOp::Sub16x8 => (0xe7f7, 1), // VSH + VecBinaryOp::Sub32x4 => (0xe7f7, 2), // VSF + VecBinaryOp::Sub64x2 => (0xe7f7, 3), // VSG + VecBinaryOp::Mul8x16 => (0xe7a2, 0), // VMLB + VecBinaryOp::Mul16x8 => (0xe7a2, 1), // VMLHW + VecBinaryOp::Mul32x4 => (0xe7a2, 2), // VMLF + VecBinaryOp::UMulHi8x16 => (0xe7a1, 0), // VMLHB + VecBinaryOp::UMulHi16x8 => (0xe7a1, 1), // VMLHH + VecBinaryOp::UMulHi32x4 => (0xe7a1, 2), // VMLHF + VecBinaryOp::SMulHi8x16 => (0xe7a3, 0), // VMHB + VecBinaryOp::SMulHi16x8 => (0xe7a3, 1), // VMHH + VecBinaryOp::SMulHi32x4 => (0xe7a3, 2), // VMHF + VecBinaryOp::UMulEven8x16 => (0xe7a4, 0), // VMLEB + VecBinaryOp::UMulEven16x8 => (0xe7a4, 1), // VMLEH + VecBinaryOp::UMulEven32x4 => (0xe7a4, 2), // VMLEF + 
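+                    // VME/VMO and their logical (unsigned) variants VMLE/VMLO multiply
+                    // the even- resp. odd-numbered lanes and produce double-width
+                    // products. For the element-wise ops in this table the second tuple
+                    // element is the m4 element-size field (0 = byte, 1 = halfword,
+                    // 2 = word, 3 = doubleword); the bitwise 128-bit ops pass 0.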
VecBinaryOp::SMulEven8x16 => (0xe7a6, 0), // VMEB + VecBinaryOp::SMulEven16x8 => (0xe7a6, 1), // VMEH + VecBinaryOp::SMulEven32x4 => (0xe7a6, 2), // VMEF + VecBinaryOp::UMulOdd8x16 => (0xe7a5, 0), // VMLOB + VecBinaryOp::UMulOdd16x8 => (0xe7a5, 1), // VMLOH + VecBinaryOp::UMulOdd32x4 => (0xe7a5, 2), // VMLOF + VecBinaryOp::SMulOdd8x16 => (0xe7a7, 0), // VMOB + VecBinaryOp::SMulOdd16x8 => (0xe7a7, 1), // VMOH + VecBinaryOp::SMulOdd32x4 => (0xe7a7, 2), // VMOF + VecBinaryOp::UMax8x16 => (0xe7fd, 0), // VMXLB + VecBinaryOp::UMax16x8 => (0xe7fd, 1), // VMXLH + VecBinaryOp::UMax32x4 => (0xe7fd, 2), // VMXLF + VecBinaryOp::UMax64x2 => (0xe7fd, 3), // VMXLG + VecBinaryOp::SMax8x16 => (0xe7ff, 0), // VMXB + VecBinaryOp::SMax16x8 => (0xe7ff, 1), // VMXH + VecBinaryOp::SMax32x4 => (0xe7ff, 2), // VMXF + VecBinaryOp::SMax64x2 => (0xe7ff, 3), // VMXG + VecBinaryOp::UMin8x16 => (0xe7fc, 0), // VMNLB + VecBinaryOp::UMin16x8 => (0xe7fc, 1), // VMNLH + VecBinaryOp::UMin32x4 => (0xe7fc, 2), // VMNLF + VecBinaryOp::UMin64x2 => (0xe7fc, 3), // VMNLG + VecBinaryOp::SMin8x16 => (0xe7fe, 0), // VMNB + VecBinaryOp::SMin16x8 => (0xe7fe, 1), // VMNH + VecBinaryOp::SMin32x4 => (0xe7fe, 2), // VMNF + VecBinaryOp::SMin64x2 => (0xe7fe, 3), // VMNG + VecBinaryOp::UAvg8x16 => (0xe7f0, 0), // VAVGLB + VecBinaryOp::UAvg16x8 => (0xe7f0, 1), // VAVGLH + VecBinaryOp::UAvg32x4 => (0xe7f0, 2), // VAVGLF + VecBinaryOp::UAvg64x2 => (0xe7f0, 3), // VAVGLG + VecBinaryOp::SAvg8x16 => (0xe7f2, 0), // VAVGB + VecBinaryOp::SAvg16x8 => (0xe7f2, 1), // VAVGH + VecBinaryOp::SAvg32x4 => (0xe7f2, 2), // VAVGF + VecBinaryOp::SAvg64x2 => (0xe7f2, 3), // VAVGG + VecBinaryOp::And128 => (0xe768, 0), // VN + VecBinaryOp::Orr128 => (0xe76a, 0), // VO + VecBinaryOp::Xor128 => (0xe76d, 0), // VX + VecBinaryOp::NotAnd128 => (0xe76e, 0), // VNN + VecBinaryOp::NotOrr128 => (0xe76b, 0), // VNO + VecBinaryOp::NotXor128 => (0xe76c, 0), // VNX + VecBinaryOp::AndNot128 => (0xe769, 0), // VNC + VecBinaryOp::OrrNot128 => (0xe76f, 0), // VOC + VecBinaryOp::BitPermute128 => (0xe785, 0), // VBPERM + VecBinaryOp::LShLByByte128 => (0xe775, 0), // VSLB + VecBinaryOp::LShRByByte128 => (0xe77d, 0), // VSRLB + VecBinaryOp::AShRByByte128 => (0xe77f, 0), // VSRAB + VecBinaryOp::LShLByBit128 => (0xe774, 0), // VSL + VecBinaryOp::LShRByBit128 => (0xe77c, 0), // VSRL + VecBinaryOp::AShRByBit128 => (0xe77e, 0), // VSRA + VecBinaryOp::Pack16x8 => (0xe794, 1), // VPKH + VecBinaryOp::Pack32x4 => (0xe794, 2), // VPKF + VecBinaryOp::Pack64x2 => (0xe794, 3), // VPKG + VecBinaryOp::PackUSat16x8 => (0xe795, 1), // VPKLSH + VecBinaryOp::PackUSat32x4 => (0xe795, 2), // VPKLSF + VecBinaryOp::PackUSat64x2 => (0xe795, 3), // VPKLSG + VecBinaryOp::PackSSat16x8 => (0xe797, 1), // VPKSH + VecBinaryOp::PackSSat32x4 => (0xe797, 2), // VPKSF + VecBinaryOp::PackSSat64x2 => (0xe797, 3), // VPKSG + VecBinaryOp::MergeLow8x16 => (0xe760, 0), // VMRLB + VecBinaryOp::MergeLow16x8 => (0xe760, 1), // VMRLH + VecBinaryOp::MergeLow32x4 => (0xe760, 2), // VMRLF + VecBinaryOp::MergeLow64x2 => (0xe760, 3), // VMRLG + VecBinaryOp::MergeHigh8x16 => (0xe761, 0), // VMRHB + VecBinaryOp::MergeHigh16x8 => (0xe761, 1), // VMRHH + VecBinaryOp::MergeHigh32x4 => (0xe761, 2), // VMRHF + VecBinaryOp::MergeHigh64x2 => (0xe761, 3), // VMRHG + }; + + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + } + &Inst::VecRR { op, rd, rn } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let (opcode, m3) = match op { + VecUnaryOp::Abs8x16 => (0xe7df, 0), // VLPB + VecUnaryOp::Abs16x8 => 
(0xe7df, 1), // VLPH + VecUnaryOp::Abs32x4 => (0xe7df, 2), // VLPF + VecUnaryOp::Abs64x2 => (0xe7df, 3), // VLPG + VecUnaryOp::Neg8x16 => (0xe7de, 0), // VLCB + VecUnaryOp::Neg16x8 => (0xe7de, 1), // VLCH + VecUnaryOp::Neg32x4 => (0xe7de, 2), // VLCF + VecUnaryOp::Neg64x2 => (0xe7de, 3), // VLCG + VecUnaryOp::Popcnt8x16 => (0xe750, 0), // VPOPCTB + VecUnaryOp::Popcnt16x8 => (0xe750, 1), // VPOPCTH + VecUnaryOp::Popcnt32x4 => (0xe750, 2), // VPOPCTF + VecUnaryOp::Popcnt64x2 => (0xe750, 3), // VPOPCTG + VecUnaryOp::UnpackULow8x16 => (0xe7d4, 0), // VUPLLB + VecUnaryOp::UnpackULow16x8 => (0xe7d4, 1), // VUPLLH + VecUnaryOp::UnpackULow32x4 => (0xe7d4, 2), // VUPLLF + VecUnaryOp::UnpackUHigh8x16 => (0xe7d5, 0), // VUPLHB + VecUnaryOp::UnpackUHigh16x8 => (0xe7d5, 1), // VUPLHH + VecUnaryOp::UnpackUHigh32x4 => (0xe7d5, 2), // VUPLHF + VecUnaryOp::UnpackSLow8x16 => (0xe7d6, 0), // VUPLB + VecUnaryOp::UnpackSLow16x8 => (0xe7d6, 1), // VUPLH + VecUnaryOp::UnpackSLow32x4 => (0xe7d6, 2), // VUPLF + VecUnaryOp::UnpackSHigh8x16 => (0xe7d7, 0), // VUPHB + VecUnaryOp::UnpackSHigh16x8 => (0xe7d7, 1), // VUPHH + VecUnaryOp::UnpackSHigh32x4 => (0xe7d7, 2), // VUPHF + }; + + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 0, 0)); + } + &Inst::VecShiftRR { + shift_op, + rd, + rn, + shift_imm, + shift_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let shift_reg = allocs.next(shift_reg); + + let (opcode, m4) = match shift_op { + VecShiftOp::RotL8x16 => (0xe733, 0), // VERLLB + VecShiftOp::RotL16x8 => (0xe733, 1), // VERLLH + VecShiftOp::RotL32x4 => (0xe733, 2), // VERLLF + VecShiftOp::RotL64x2 => (0xe733, 3), // VERLLG + VecShiftOp::LShL8x16 => (0xe730, 0), // VESLB + VecShiftOp::LShL16x8 => (0xe730, 1), // VESLH + VecShiftOp::LShL32x4 => (0xe730, 2), // VESLF + VecShiftOp::LShL64x2 => (0xe730, 3), // VESLG + VecShiftOp::LShR8x16 => (0xe738, 0), // VESRLB + VecShiftOp::LShR16x8 => (0xe738, 1), // VESRLH + VecShiftOp::LShR32x4 => (0xe738, 2), // VESRLF + VecShiftOp::LShR64x2 => (0xe738, 3), // VESRLG + VecShiftOp::AShR8x16 => (0xe73a, 0), // VESRAB + VecShiftOp::AShR16x8 => (0xe73a, 1), // VESRAH + VecShiftOp::AShR32x4 => (0xe73a, 2), // VESRAF + VecShiftOp::AShR64x2 => (0xe73a, 3), // VESRAG + }; + put( + sink, + &enc_vrs_a(opcode, rd.to_reg(), shift_reg, shift_imm.into(), rn, m4), + ); + } &Inst::VecSelect { rd, rn, rm, ra } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); @@ -2370,6 +2643,442 @@ impl MachInstEmit for Inst { let opcode = 0xe78d; // VSEL put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecPermute { rd, rn, rm, ra } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let ra = allocs.next(ra); + + let opcode = 0xe78c; // VPERM + put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); + } + &Inst::VecPermuteDWImm { + rd, + rn, + rm, + idx1, + idx2, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + let m4 = (idx1 & 1) * 4 + (idx2 & 1); + + let opcode = 0xe784; // VPDI + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + } + &Inst::VecIntCmp { op, rd, rn, rm } | &Inst::VecIntCmpS { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecIntCmpOp::CmpEq8x16 => (0xe7f8, 0), // VCEQB + VecIntCmpOp::CmpEq16x8 => (0xe7f8, 1), // VCEQH + VecIntCmpOp::CmpEq32x4 => (0xe7f8, 2), // VCEQF + VecIntCmpOp::CmpEq64x2 => 
(0xe7f8, 3), // VCEQG + VecIntCmpOp::SCmpHi8x16 => (0xe7fb, 0), // VCHB + VecIntCmpOp::SCmpHi16x8 => (0xe7fb, 1), // VCHH + VecIntCmpOp::SCmpHi32x4 => (0xe7fb, 2), // VCHG + VecIntCmpOp::SCmpHi64x2 => (0xe7fb, 3), // VCHG + VecIntCmpOp::UCmpHi8x16 => (0xe7f9, 0), // VCHLB + VecIntCmpOp::UCmpHi16x8 => (0xe7f9, 1), // VCHLH + VecIntCmpOp::UCmpHi32x4 => (0xe7f9, 2), // VCHLG + VecIntCmpOp::UCmpHi64x2 => (0xe7f9, 3), // VCHLG + }; + let m5 = match self { + &Inst::VecIntCmp { .. } => 0, + &Inst::VecIntCmpS { .. } => 1, + _ => unreachable!(), + }; + + put(sink, &enc_vrr_b(opcode, rd.to_reg(), rn, rm, m4, m5)); + } + &Inst::VecFloatCmp { op, rd, rn, rm } | &Inst::VecFloatCmpS { op, rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let (opcode, m4) = match op { + VecFloatCmpOp::CmpEq32x4 => (0xe7e8, 2), // VFCESB + VecFloatCmpOp::CmpEq64x2 => (0xe7e8, 3), // VFCEDB + VecFloatCmpOp::CmpHi32x4 => (0xe7eb, 2), // VFCHSB + VecFloatCmpOp::CmpHi64x2 => (0xe7eb, 3), // VFCHDB + VecFloatCmpOp::CmpHiEq32x4 => (0xe7ea, 2), // VFCHESB + VecFloatCmpOp::CmpHiEq64x2 => (0xe7ea, 3), // VFCHEDB + }; + let m6 = match self { + &Inst::VecFloatCmp { .. } => 0, + &Inst::VecFloatCmpS { .. } => 1, + _ => unreachable!(), + }; + + put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, m6)); + } + + &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match self { + &Inst::VecLoad { .. } => (0xe706, 0), // VL + &Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ + _ => unreachable!(), + }; + mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state); + } + &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + let rd = allocs.next(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match self { + &Inst::VecStore { .. } => (0xe70e, 0), // VST + &Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ + _ => unreachable!(), + }; + mem_vrx_emit(rd, &mem, opcode, m3, true, sink, emit_info, state); + } + &Inst::VecLoadReplicate { size, rd, ref mem } + | &Inst::VecLoadReplicateRev { size, rd, ref mem } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode, m3) = match (self, size) { + (&Inst::VecLoadReplicate { .. }, 8) => (0xe705, 0), // VLREPB + (&Inst::VecLoadReplicate { .. }, 16) => (0xe705, 1), // VLREPH + (&Inst::VecLoadReplicate { .. }, 32) => (0xe705, 2), // VLREPF + (&Inst::VecLoadReplicate { .. }, 64) => (0xe705, 3), // VLREPG + (&Inst::VecLoadReplicateRev { .. }, 16) => (0xe605, 1), // VLREPBRH + (&Inst::VecLoadReplicateRev { .. }, 32) => (0xe605, 2), // VLREPBRF + (&Inst::VecLoadReplicateRev { .. 
}, 64) => (0xe605, 3), // VLREPBRG + _ => unreachable!(), + }; + mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state); + } + + &Inst::VecMov { rd, rn } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let opcode = 0xe756; // VLR + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0)); + } + &Inst::VecCMov { rd, cond, rm } => { + let rd = allocs.next_writable(rd); + let rm = allocs.next(rm); + + let opcode = 0xa74; // BCR + put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6)); + let opcode = 0xe756; // VLR + put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0)); + } + &Inst::MovToVec128 { rd, rn, rm } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let rm = allocs.next(rm); + + let opcode = 0xe762; // VLVGP + put(sink, &enc_vrr_f(opcode, rd.to_reg(), rn, rm)); + } + &Inst::VecLoadConst { rd, const_data } => { + let rd = allocs.next_writable(rd); + + let opcode = 0xa75; // BRAS + let reg = writable_spilltmp_reg().to_reg(); + put(sink, &enc_ri_b(opcode, reg, 20)); + for i in const_data.to_be_bytes().iter() { + sink.put1(*i); + } + let inst = Inst::VecLoad { + rd, + mem: MemArg::reg(reg, MemFlags::trusted()), + }; + inst.emit(&[], sink, emit_info, state); + } + &Inst::VecLoadConstReplicate { + size, + rd, + const_data, + } => { + let rd = allocs.next_writable(rd); + + let opcode = 0xa75; // BRAS + let reg = writable_spilltmp_reg().to_reg(); + put(sink, &enc_ri_b(opcode, reg, (4 + size / 8) as i32)); + for i in 0..size / 8 { + sink.put1((const_data >> (size - 8 - 8 * i)) as u8); + } + let inst = Inst::VecLoadReplicate { + size, + rd, + mem: MemArg::reg(reg, MemFlags::trusted()), + }; + inst.emit(&[], sink, emit_info, state); + } + &Inst::VecImmByteMask { rd, mask } => { + let rd = allocs.next_writable(rd); + let opcode = 0xe744; // VGBM + put(sink, &enc_vri_a(opcode, rd.to_reg(), mask, 0)); + } + &Inst::VecImmBitMask { + size, + rd, + start_bit, + end_bit, + } => { + let rd = allocs.next_writable(rd); + let (opcode, m4) = match size { + 8 => (0xe746, 0), // VGMB + 16 => (0xe746, 1), // VGMH + 32 => (0xe746, 2), // VGMF + 64 => (0xe746, 3), // VGMG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_b(opcode, rd.to_reg(), start_bit, end_bit, m4), + ); + } + &Inst::VecImmReplicate { size, rd, imm } => { + let rd = allocs.next_writable(rd); + let (opcode, m3) = match size { + 8 => (0xe745, 0), // VREPIB + 16 => (0xe745, 1), // VREPIH + 32 => (0xe745, 2), // VREPIF + 64 => (0xe745, 3), // VREPIG + _ => unreachable!(), + }; + put(sink, &enc_vri_a(opcode, rd.to_reg(), imm as u16, m3)); + } + + &Inst::VecLoadLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneUndef { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRev { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRevUndef { + size, + rd, + ref mem, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecLoadLane { .. }, 8) => (0xe700, None, None), // VLEB + (&Inst::VecLoadLane { .. }, 16) => (0xe701, None, None), // VLEH + (&Inst::VecLoadLane { .. }, 32) => (0xe703, None, None), // VLEF + (&Inst::VecLoadLane { .. }, 64) => (0xe702, None, None), // VLEG + (&Inst::VecLoadLaneUndef { .. }, 8) => (0xe700, None, None), // VLEB + (&Inst::VecLoadLaneUndef { .. }, 16) => (0xe701, None, None), // VLEH + (&Inst::VecLoadLaneUndef { .. 
}, 32) => (0xe703, Some(0x78), Some(0xed64)), // VLEF, LE(Y) + (&Inst::VecLoadLaneUndef { .. }, 64) => (0xe702, Some(0x68), Some(0xed65)), // VLEG, LD(Y) + (&Inst::VecLoadLaneRev { .. }, 16) => (0xe601, None, None), // VLEBRH + (&Inst::VecLoadLaneRev { .. }, 32) => (0xe603, None, None), // VLEBRF + (&Inst::VecLoadLaneRev { .. }, 64) => (0xe602, None, None), // VLEBRG + (&Inst::VecLoadLaneRevUndef { .. }, 16) => (0xe601, None, None), // VLEBRH + (&Inst::VecLoadLaneRevUndef { .. }, 32) => (0xe603, None, None), // VLEBRF + (&Inst::VecLoadLaneRevUndef { .. }, 64) => (0xe602, None, None), // VLEBRG + _ => unreachable!(), + }; + + let rd = rd.to_reg(); + if lane_imm == 0 && is_fpr(rd) && opcode_rx.is_some() { + mem_emit( + rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, + ); + } else { + mem_vrx_emit( + rd, + &mem, + opcode_vrx, + lane_imm.into(), + true, + sink, + emit_info, + state, + ); + } + } + &Inst::VecStoreLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecStoreLaneRev { + size, + rd, + ref mem, + lane_imm, + } => { + let rd = allocs.next(rd); + let mem = mem.with_allocs(&mut allocs); + + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecStoreLane { .. }, 8) => (0xe708, None, None), // VSTEB + (&Inst::VecStoreLane { .. }, 16) => (0xe709, None, None), // VSTEH + (&Inst::VecStoreLane { .. }, 32) => (0xe70b, Some(0x70), Some(0xed66)), // VSTEF, STE(Y) + (&Inst::VecStoreLane { .. }, 64) => (0xe70a, Some(0x60), Some(0xed67)), // VSTEG, STD(Y) + (&Inst::VecStoreLaneRev { .. }, 16) => (0xe609, None, None), // VSTEBRH + (&Inst::VecStoreLaneRev { .. }, 32) => (0xe60b, None, None), // VSTEBRF + (&Inst::VecStoreLaneRev { .. }, 64) => (0xe60a, None, None), // VSTEBRG + _ => unreachable!(), + }; + + if lane_imm == 0 && is_fpr(rd) && opcode_rx.is_some() { + mem_emit( + rd, &mem, opcode_rx, opcode_rxy, None, true, sink, emit_info, state, + ); + } else { + mem_vrx_emit( + rd, + &mem, + opcode_vrx, + lane_imm.into(), + true, + sink, + emit_info, + state, + ); + } + } + &Inst::VecInsertLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4) = match size { + 8 => (0xe722, 0), // VLVGB + 16 => (0xe722, 1), // VLVGH + 32 => (0xe722, 2), // VLVGF + 64 => (0xe722, 3), // VLVGG + _ => unreachable!(), + }; + put( + sink, + &enc_vrs_b(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + &Inst::VecInsertLaneUndef { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4, opcode_rre) = match size { + 8 => (0xe722, 0, None), // VLVGB + 16 => (0xe722, 1, None), // VLVGH + 32 => (0xe722, 2, None), // VLVGF + 64 => (0xe722, 3, Some(0xb3c1)), // VLVGG, LDGR + _ => unreachable!(), + }; + if opcode_rre.is_some() + && lane_imm == 0 + && lane_reg == zero_reg() + && is_fpr(rd.to_reg()) + { + put(sink, &enc_rre(opcode_rre.unwrap(), rd.to_reg(), rn)); + } else { + put( + sink, + &enc_vrs_b(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + } + &Inst::VecExtractLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let lane_reg = allocs.next(lane_reg); + + let (opcode_vrs, m4, opcode_rre) = match size { + 8 => (0xe721, 0, None), // VLGVB + 16 => (0xe721, 1, None), // VLGVH + 32 => (0xe721, 2, None), // 
VLGVF + 64 => (0xe721, 3, Some(0xb3cd)), // VLGVG, LGDR + _ => unreachable!(), + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg == zero_reg() && is_fpr(rn) { + put(sink, &enc_rre(opcode_rre.unwrap(), rd.to_reg(), rn)); + } else { + put( + sink, + &enc_vrs_c(opcode_vrs, rd.to_reg(), lane_reg, lane_imm.into(), rn, m4), + ); + } + } + &Inst::VecInsertLaneImm { + size, + rd, + imm, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + + let opcode = match size { + 8 => 0xe740, // VLEIB + 16 => 0xe741, // LEIVH + 32 => 0xe743, // VLEIF + 64 => 0xe742, // VLEIG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_a(opcode, rd.to_reg(), imm as u16, lane_imm.into()), + ); + } + &Inst::VecReplicateLane { + size, + rd, + rn, + lane_imm, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + + let (opcode, m4) = match size { + 8 => (0xe74d, 0), // VREPB + 16 => (0xe74d, 1), // VREPH + 32 => (0xe74d, 2), // VREPF + 64 => (0xe74d, 3), // VREPG + _ => unreachable!(), + }; + put( + sink, + &enc_vri_c(opcode, rd.to_reg(), lane_imm.into(), rn, m4), + ); + } &Inst::Call { link, ref info } => { let link = allocs.next_writable(link); diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index a398c798a958..5dd423801fad 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -6808,6 +6808,8 @@ fn test_s390x_binemit() { defs: smallvec![], clobbers: PRegSet::empty(), opcode: Opcode::Call, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, }), }, "C0E500000000", @@ -6823,6 +6825,8 @@ fn test_s390x_binemit() { defs: smallvec![], clobbers: PRegSet::empty(), opcode: Opcode::CallIndirect, + caller_callconv: CallConv::SystemV, + callee_callconv: CallConv::SystemV, }), }, "0DE1", @@ -6953,71 +6957,6 @@ fn test_s390x_binemit() { "jno 10 ; vlr %v8, %v20", )); - insns.push(( - Inst::MovToFpr64 { - rd: writable_vr(8), - rn: gpr(4), - }, - "B3C10084", - "ldgr %f8, %r4", - )); - insns.push(( - Inst::MovToFpr64 { - rd: writable_vr(24), - rn: gpr(4), - }, - "E78400003822", - "vlvgg %v24, %r4, 0", - )); - insns.push(( - Inst::MovToFpr32 { - rd: writable_vr(8), - rn: gpr(4), - }, - "E78400002022", - "vlvgf %v8, %r4, 0", - )); - insns.push(( - Inst::MovToFpr32 { - rd: writable_vr(24), - rn: gpr(4), - }, - "E78400002822", - "vlvgf %v24, %r4, 0", - )); - insns.push(( - Inst::MovFromFpr64 { - rd: writable_gpr(8), - rn: vr(4), - }, - "B3CD0084", - "lgdr %r8, %f4", - )); - insns.push(( - Inst::MovFromFpr64 { - rd: writable_gpr(8), - rn: vr(20), - }, - "E78400003421", - "vlgvg %r8, %v20, 0", - )); - insns.push(( - Inst::MovFromFpr32 { - rd: writable_gpr(8), - rn: vr(4), - }, - "E78400002021", - "vlgvf %r8, %v4, 0", - )); - insns.push(( - Inst::MovFromFpr32 { - rd: writable_gpr(8), - rn: vr(20), - }, - "E78400002421", - "vlgvf %r8, %v20, 0", - )); - insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Abs32, @@ -7036,6 +6975,15 @@ fn test_s390x_binemit() { "E78C002828CC", "wflpsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C002028CC", + "vflpsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Abs64, @@ -7054,6 +7002,15 @@ fn test_s390x_binemit() { "E78C002838CC", "wflpdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C002038CC", + "vflpdb %v24, %v12", + )); insns.push(( Inst::FpuRR { 
fpu_op: FPUOp1::Neg32, @@ -7072,6 +7029,15 @@ fn test_s390x_binemit() { "E78C000828CC", "wflcsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028CC", + "vflcsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Neg64, @@ -7090,6 +7056,15 @@ fn test_s390x_binemit() { "E78C000838CC", "wflcdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000038CC", + "vflcdb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::NegAbs32, @@ -7108,6 +7083,15 @@ fn test_s390x_binemit() { "E78C001828CC", "wflnsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::NegAbs32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028CC", + "vflnsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::NegAbs64, @@ -7126,6 +7110,15 @@ fn test_s390x_binemit() { "E78C001838CC", "wflndb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::NegAbs64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038CC", + "vflndb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Sqrt32, @@ -7144,6 +7137,15 @@ fn test_s390x_binemit() { "E78C000828CE", "wfsqsb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt32x4, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028CE", + "vfsqsb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Sqrt64, @@ -7162,6 +7164,15 @@ fn test_s390x_binemit() { "E78C000838CE", "wfsqdb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000038CE", + "vfsqdb %v24, %v12", + )); insns.push(( Inst::FpuRR { fpu_op: FPUOp1::Cvt32To64, @@ -7180,6 +7191,15 @@ fn test_s390x_binemit() { "E78C000828C4", "wldeb %v24, %f12", )); + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt32x4To64x2, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C000028C4", + "vldeb %v24, %v12", + )); insns.push(( Inst::FpuRRR { @@ -7201,6 +7221,16 @@ fn test_s390x_binemit() { "E748C00828E3", "wfasb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E3", + "vfasb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Add64, @@ -7221,6 +7251,16 @@ fn test_s390x_binemit() { "E748C00838E3", "wfadb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E3", + "vfadb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Sub32, @@ -7241,6 +7281,16 @@ fn test_s390x_binemit() { "E748C00828E2", "wfssb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E2", + "vfssb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Sub64, @@ -7261,6 +7311,16 @@ fn test_s390x_binemit() { "E748C00838E2", "wfsdb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E2", + "vfsdb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Mul32, @@ -7281,6 +7341,16 @@ fn test_s390x_binemit() { "E748C00828E7", "wfmsb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E7", + "vfmsb %v20, %v8, %v12", + )); insns.push(( 
Inst::FpuRRR { fpu_op: FPUOp2::Mul64, @@ -7301,6 +7371,16 @@ fn test_s390x_binemit() { "E748C00838E7", "wfmdb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E7", + "vfmdb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Div32, @@ -7321,6 +7401,16 @@ fn test_s390x_binemit() { "E748C00828E5", "wfdsb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E5", + "vfdsb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Div64, @@ -7341,6 +7431,16 @@ fn test_s390x_binemit() { "E748C00838E5", "wfddb %v20, %f8, %f12", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E5", + "vfddb %v20, %v8, %v12", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Max32, @@ -7351,6 +7451,16 @@ fn test_s390x_binemit() { "E746801820EF", "wfmaxsb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801020EF", + "vfmaxsb %v4, %v6, %v8, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Max64, @@ -7361,6 +7471,16 @@ fn test_s390x_binemit() { "E746801832EF", "wfmaxdb %f4, %f6, %v24, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746801032EF", + "vfmaxdb %v4, %v6, %v24, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Min32, @@ -7371,6 +7491,16 @@ fn test_s390x_binemit() { "E746801820EE", "wfminsb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801020EE", + "vfminsb %v4, %v6, %v8, 1", + )); insns.push(( Inst::FpuRRR { fpu_op: FPUOp2::Min64, @@ -7381,6 +7511,96 @@ fn test_s390x_binemit() { "E746801830EE", "wfmindb %f4, %f6, %f8, 1", )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746801030EE", + "vfmindb %v4, %v6, %v8, 1", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo32, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803820EF", + "wfmaxsb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803020EF", + "vfmaxsb %v4, %v6, %v8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo64, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746803832EF", + "wfmaxdb %f4, %f6, %v24, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MaxPseudo64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + }, + "E746803032EF", + "vfmaxdb %v4, %v6, %v24, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo32, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803820EE", + "wfminsb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo32x4, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803020EE", + "vfminsb %v4, %v6, %v8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo64, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803830EE", + "wfmindb %f4, %f6, %f8, 3", + )); + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::MinPseudo64x2, + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + }, + "E746803030EE", + "vfmindb %v4, %v6, %v8, 3", + )); 
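+    // The W-prefixed mnemonics above (wfasb, wfmaxdb, ...) are the single-element
+    // forms selected by the element-control bit (m5 = 8 in the FpuRRR encoding) and
+    // are printed with scalar %f names where the register has one; the V-prefixed
+    // forms operate on all lanes. The *Pseudo min/max entries differ from Min/Max
+    // only in m6 (3 rather than 1). Worked VRR-c layout for the Add32x4 case above:
+    //   vfasb %v20, %v8, %v12  =>  E7 48 C0 00 28 E3
+    //   = opcode1 | v1<<4|v2 | v3<<4 | m6<<4|m5 | m4<<4|rxb | opcode2,
+    //   with rxb = 8 because v20 needs the high register-number bit.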
insns.push(( Inst::FpuRRRR { @@ -7404,6 +7624,17 @@ fn test_s390x_binemit() { "E78CD208418F", "wfmasb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd32x4, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD200418F", + "vfmasb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuRRRR { fpu_op: FPUOp3::MAdd64, @@ -7428,14 +7659,25 @@ fn test_s390x_binemit() { )); insns.push(( Inst::FpuRRRR { - fpu_op: FPUOp3::MSub32, + fpu_op: FPUOp3::MAdd64x2, rd: writable_vr(8), rn: vr(12), rm: vr(13), - ra: vr(8), + ra: vr(20), }, - "B30F80CD", - "msebr %f8, %f12, %f13", + "E78CD300418F", + "vfmadb %v8, %v12, %v13, %v20", + )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub32, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(8), + }, + "B30F80CD", + "msebr %f8, %f12, %f13", )); insns.push(( Inst::FpuRRRR { @@ -7448,6 +7690,17 @@ fn test_s390x_binemit() { "E78CD208418E", "wfmssb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub32x4, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD200418E", + "vfmssb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuRRRR { fpu_op: FPUOp3::MSub64, @@ -7470,6 +7723,17 @@ fn test_s390x_binemit() { "E78CD308418E", "wfmsdb %f8, %f12, %f13, %v20", )); + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MSub64x2, + rd: writable_vr(8), + rn: vr(12), + rm: vr(13), + ra: vr(20), + }, + "E78CD300418E", + "vfmsdb %v8, %v12, %v13, %v20", + )); insns.push(( Inst::FpuCmp32 { @@ -7505,202 +7769,2982 @@ fn test_s390x_binemit() { )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst32 { + rd: writable_vr(8), + const_data: 1.0_f32.to_bits(), }, - "78102000", - "le %f1, 0(%r2)", + "A71500043F80000078801000", + "bras %r1, 8 ; data.f32 1 ; le %f8, 0(%r1)", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst32 { + rd: writable_vr(24), + const_data: 1.0_f32.to_bits(), }, - "78102FFF", - "le %f1, 4095(%r2)", + "A71500043F800000E78010000803", + "bras %r1, 8 ; data.f32 1 ; vlef %v24, 0(%r1), 0", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst64 { + rd: writable_vr(8), + const_data: 1.0_f64.to_bits(), }, - "ED1020008064", - "ley %f1, -524288(%r2)", + "A71500063FF000000000000068801000", + "bras %r1, 12 ; data.f64 1 ; ld %f8, 0(%r1)", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(524287).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::LoadFpuConst64 { + rd: writable_vr(24), + const_data: 1.0_f64.to_bits(), }, - "ED102FFF7F64", - "ley %f1, 524287(%r2)", + "A71500063FF0000000000000E78010000802", + "bras %r1, 12 ; data.f64 1 ; vleg %v24, 0(%r1), 0", )); + insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64To32, + mode: FpuRoundMode::Current, + rd: writable_vr(8), + rn: vr(12), }, - "E71020000803", - "vlef %v17, 
0(%r2), 0", + "B344008C", + "ledbra %f8, %f12, 0", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64To32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), }, - "E7102FFF0803", - "vlef %v17, 4095(%r2), 0", + "E78C001838C5", + "wledb %v24, %f12, 0, 1", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Cvt64x2To32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), }, - "78123000", - "le %f1, 0(%r2,%r3)", + "E78C001038C5", + "vledb %v24, %v12, 0, 1", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNegInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "78123FFF", - "le %f1, 4095(%r2,%r3)", + "B357708C", + "fiebr %f8, %f12, 7", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNegInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "ED1230008064", - "ley %f1, -524288(%r2,%r3)", + "B35F708C", + "fidbr %f8, %f12, 7", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(524287).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToPosInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "ED123FFF7F64", - "ley %f1, 524287(%r2,%r3)", + "B357608C", + "fiebr %f8, %f12, 6", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToPosInfinity, + rd: writable_vr(8), + rn: vr(12), }, - "E71230000803", - "vlef %v17, 0(%r2,%r3), 0", + "B35F608C", + "fidbr %f8, %f12, 6", )); insns.push(( - Inst::FpuLoad32 { - rd: writable_vr(17), - mem: MemArg::BXD12 { - base: gpr(3), - index: gpr(2), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToZero, + rd: writable_vr(8), + rn: vr(12), }, - "E7123FFF0803", - "vlef %v17, 4095(%r2,%r3), 0", + "B357508C", + "fiebr %f8, %f12, 5", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::zero(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToZero, + rd: writable_vr(8), + rn: vr(12), }, - "68102000", - "ld %f1, 0(%r2)", + "B35F508C", + "fidbr %f8, %f12, 5", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD12 { - base: gpr(2), - index: zero_reg(), - disp: UImm12::maybe_from_u64(4095).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNearestTiesToEven, + rd: writable_vr(8), + rn: vr(12), }, - 
"68102FFF", - "ld %f1, 4095(%r2)", + "B357408C", + "fiebr %f8, %f12, 4", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(2), - index: zero_reg(), - disp: SImm20::maybe_from_i64(-524288).unwrap(), - flags: MemFlags::trusted(), - }, + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNearestTiesToEven, + rd: writable_vr(8), + rn: vr(12), }, - "ED1020008065", - "ldy %f1, -524288(%r2)", - )); + "B35F408C", + "fidbr %f8, %f12, 4", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C7", + "wfisb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C7", + "vfisb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C7", + "wfidb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::Round64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C7", + "vfidb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C2", + "wcfeb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C2", + "vcfeb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C2", + "wcgdb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToSInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C2", + "vcgdb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C0", + "wclfeb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C0", + "vclfeb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C0", + "wclgdb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::ToUInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C0", + "vclgdb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C3", + "wcefb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C3", + "vcefb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C3", + "wcdgb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromSInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C3", + "vcdgb %v24, %v12, 0, 1", + )); + 
insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt32, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001828C1", + "wcelfb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt32x4, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001028C1", + "vcelfb %v24, %v12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt64, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001838C1", + "wcdlgb %v24, %f12, 0, 1", + )); + insns.push(( + Inst::FpuRound { + op: FpuRoundOp::FromUInt64x2, + mode: FpuRoundMode::ToNearest, + rd: writable_vr(24), + rn: vr(12), + }, + "E78C001038C1", + "vcdlgb %v24, %v12, 0, 1", + )); + + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F3", + "vab %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F3", + "vah %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F3", + "vaf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Add64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F3", + "vag %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F7", + "vsb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F7", + "vsh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F7", + "vsf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Sub64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F7", + "vsg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A2", + "vmlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A2", + "vmlhw %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A2", + "vmlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A1", + "vmlhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A1", + "vmlhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A1", + "vmlhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A3", + "vmhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A3", + "vmhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + 
"E748C00028A3", + "vmhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A4", + "vmleb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A4", + "vmleh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A4", + "vmlef %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A6", + "vmeb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A6", + "vmeh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A6", + "vmef %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A5", + "vmlob %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A5", + "vmloh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A5", + "vmlof %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008A7", + "vmob %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018A7", + "vmoh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028A7", + "vmof %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FD", + "vmxlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FD", + "vmxlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FD", + "vmxlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FD", + "vmxlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FF", + "vmxb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FF", + "vmxh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FF", + "vmxf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FF", + "vmxg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FC", 
+ "vmnlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FC", + "vmnlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FC", + "vmnlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FC", + "vmnlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FE", + "vmnb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FE", + "vmnh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FE", + "vmnf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FE", + "vmng %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F0", + "vavglb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F0", + "vavglh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F0", + "vavglf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F0", + "vavglg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F2", + "vavgb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F2", + "vavgh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F2", + "vavgf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SAvg64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F2", + "vavgg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::And128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000868", + "vn %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Orr128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086A", + "vo %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Xor128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086D", + "vx %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotAnd128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086E", + "vnn %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotOrr128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086B", + "vno %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::NotXor128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086C", + "vnx %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { 
+ op: VecBinaryOp::AndNot128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000869", + "vnc %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::OrrNot128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000086F", + "voc %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::BitPermute128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000885", + "vbperm %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShLByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000875", + "vslb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShRByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087D", + "vsrlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::AShRByByte128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087F", + "vsrab %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShLByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000874", + "vsl %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::LShRByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087C", + "vsrl %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::AShRByBit128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C000087E", + "vsra %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001894", + "vpkh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002894", + "vpkf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Pack64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003894", + "vpkg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001895", + "vpklsh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002895", + "vpklsf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackUSat64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003895", + "vpklsg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001897", + "vpksh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002897", + "vpksf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::PackSSat64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003897", + "vpksg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000860", + "vmrlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001860", + "vmrlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeLow32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002860", + "vmrlf %v20, %v8, %v12", + )); + insns.push(( + 
Inst::VecRRR { + op: VecBinaryOp::MergeLow64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003860", + "vmrlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0000861", + "vmrhb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0001861", + "vmrhh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0002861", + "vmrhf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::MergeHigh64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C0003861", + "vmrhg %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008DF", + "vlpb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018DF", + "vlph %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028DF", + "vlpf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038DF", + "vlpg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008DE", + "vlcb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018DE", + "vlch %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028DE", + "vlcf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038DE", + "vlcg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800000850", + "vpopctb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800001850", + "vpopcth %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800002850", + "vpopctf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Popcnt64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800003850", + "vpopctg %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D4", + "vupllb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D4", + "vupllh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D4", + "vupllf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D5", + "vuplhb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D5", + "vuplhh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D5", + "vuplhf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: 
VecUnaryOp::UnpackSLow8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D6", + "vuplb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D6", + "vuplh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D6", + "vuplf %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh8x16, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000008D7", + "vuphb %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh16x8, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000018D7", + "vuphh %v20, %v8", + )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh32x4, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000028D7", + "vuphf %v20, %v8", + )); + + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030833", + "verllb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031833", + "verllh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032833", + "verllf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::RotL64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033833", + "verllg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030830", + "veslb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031830", + "veslh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032830", + "veslf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShL64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033830", + "veslg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560030838", + "vesrlb %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560031838", + "vesrlh %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560032838", + "vesrlf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::LShR64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E74560033838", + "vesrlg %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR8x16, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003083A", + "vesrab %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: 
VecShiftOp::AShR16x8, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003183A", + "vesrah %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR32x4, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003283A", + "vesraf %v20, %v5, 3(%r6)", + )); + insns.push(( + Inst::VecShiftRR { + shift_op: VecShiftOp::AShR64x2, + rd: writable_vr(20), + rn: vr(5), + shift_imm: 3, + shift_reg: gpr(6), + }, + "E7456003383A", + "vesrag %v20, %v5, 3(%r6)", + )); + + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A08D", + "vsel %v4, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A88D", + "vsel %v20, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(22), + rm: vr(8), + ra: vr(10), + }, + "E7468000A48D", + "vsel %v4, %v22, %v8, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + ra: vr(10), + }, + "E7468000A28D", + "vsel %v4, %v6, %v24, %v10", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(26), + }, + "E7468000A18D", + "vsel %v4, %v6, %v8, %v26", + )); + insns.push(( + Inst::VecSelect { + rd: writable_vr(20), + rn: vr(22), + rm: vr(24), + ra: vr(26), + }, + "E7468000AF8D", + "vsel %v20, %v22, %v24, %v26", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A08C", + "vperm %v4, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + ra: vr(10), + }, + "E7468000A88C", + "vperm %v20, %v6, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(22), + rm: vr(8), + ra: vr(10), + }, + "E7468000A48C", + "vperm %v4, %v22, %v8, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(24), + ra: vr(10), + }, + "E7468000A28C", + "vperm %v4, %v6, %v24, %v10", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(4), + rn: vr(6), + rm: vr(8), + ra: vr(26), + }, + "E7468000A18C", + "vperm %v4, %v6, %v8, %v26", + )); + insns.push(( + Inst::VecPermute { + rd: writable_vr(20), + rn: vr(22), + rm: vr(24), + ra: vr(26), + }, + "E7468000AF8C", + "vperm %v20, %v22, %v24, %v26", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 0, + idx2: 0, + }, + "E74680000884", + "vpdi %v20, %v6, %v8, 0", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 0, + idx2: 1, + }, + "E74680001884", + "vpdi %v20, %v6, %v8, 1", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 1, + idx2: 0, + }, + "E74680004884", + "vpdi %v20, %v6, %v8, 4", + )); + insns.push(( + Inst::VecPermuteDWImm { + rd: writable_vr(20), + rn: vr(6), + rm: vr(8), + idx1: 1, + idx2: 1, + }, + "E74680005884", + "vpdi %v20, %v6, %v8, 5", + )); + + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F8", + "vceqb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F8", + "vceqh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq32x4, + 
rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F8", + "vceqf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F8", + "vceqg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008FB", + "vchb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018FB", + "vchh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028FB", + "vchf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038FB", + "vchg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00008F9", + "vchlb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00018F9", + "vchlh %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028F9", + "vchlf %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038F9", + "vchlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008F8", + "vceqbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018F8", + "vceqhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028F8", + "vceqfs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038F8", + "vceqgs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008FB", + "vchbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018FB", + "vchhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028FB", + "vchfs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038FB", + "vchgs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi8x16, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01008F9", + "vchlbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi16x8, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01018F9", + "vchlhs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028F9", + "vchlfs %v20, %v8, %v12", + )); + 
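// Editorial sketch (not part of the patch): the VecIntCmpS entries reuse the VecIntCmp
// encodings and only set the condition-code-setting flag (bit 0x10 of byte 3), e.g. for
// the 8x16 equality compare listed above:
fn cc_setting_bit_demo() {
    let vceqb:  [u8; 6] = [0xE7, 0x48, 0xC0, 0x00, 0x08, 0xF8]; // "vceqb %v20, %v8, %v12"
    let vceqbs: [u8; 6] = [0xE7, 0x48, 0xC0, 0x10, 0x08, 0xF8]; // "vceqbs %v20, %v8, %v12"
    assert_eq!(vceqb[3] | 0x10, vceqbs[3]);
}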
insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038F9", + "vchlgs %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028E8", + "vfcesb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038E8", + "vfcedb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028EB", + "vfchsb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038EB", + "vfchdb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHiEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028EA", + "vfchesb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmp { + op: VecFloatCmpOp::CmpHiEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038EA", + "vfchedb %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028E8", + "vfcesbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038E8", + "vfcedbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHi32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028EB", + "vfchsbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038EB", + "vfchdbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHiEq32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01028EA", + "vfchesbs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecFloatCmpS { + op: VecFloatCmpOp::CmpHiEq64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01038EA", + "vfchedbs %v20, %v8, %v12", + )); + + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E71020000806", + "vl %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E7102FFF0806", + "vl %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoad { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E71230000806", + "vl %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadRev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020004806", + "vlbrq %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadRev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF4806", + "vlbrq %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadRev { + rd: 
writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230004806", + "vlbrq %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E7102000080E", + "vst %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E7102FFF080E", + "vst %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStore { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E7123000080E", + "vst %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000480E", + "vstbrq %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF480E", + "vstbrq %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreRev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000480E", + "vstbrq %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020800805", + "vlrepb %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020801805", + "vlreph %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020802805", + "vlrepf %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicate { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E71020803805", + "vlrepg %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020801805", + "vlbrreph %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020802805", + "vlbrrepf %v17, 128(%r2)", + )); + insns.push(( + Inst::VecLoadReplicateRev { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(128).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E61020803805", + "vlbrrepg %v17, 128(%r2)", + )); + + insns.push(( + Inst::VecMov { + rd: writable_vr(8), 
+ rn: vr(20), + }, + "E78400000456", + "vlr %v8, %v20", + )); + insns.push(( + Inst::VecCMov { + rd: writable_vr(8), + rm: vr(20), + cond: Cond::from_mask(1), + }, + "A7E40005E78400000456", + "jno 10 ; vlr %v8, %v20", + )); + insns.push(( + Inst::MovToVec128 { + rd: writable_vr(20), + rn: gpr(5), + rm: gpr(6), + }, + "E74560000862", + "vlvgp %v20, %r5, %r6", + )); + insns.push(( + Inst::VecLoadConst { + rd: writable_vr(24), + const_data: 0x0102030405060708090a0b0c0d0e0fu128, + }, + "A715000A000102030405060708090A0B0C0D0E0FE78010000806", + "bras %r1, 20 ; data.u128 0x000102030405060708090a0b0c0d0e0f ; vl %v24, 0(%r1)", + )); + insns.push(( + Inst::VecLoadConstReplicate { + size: 64, + rd: writable_vr(24), + const_data: 0x01020304050607u64, + }, + "A71500060001020304050607E78010003805", + "bras %r1, 12 ; data.u64 0x0001020304050607 ; vlrepg %v24, 0(%r1)", + )); + insns.push(( + Inst::VecLoadConstReplicate { + size: 32, + rd: writable_vr(24), + const_data: 0x010203u64, + }, + "A715000400010203E78010002805", + "bras %r1, 8 ; data.u32 0x00010203 ; vlrepf %v24, 0(%r1)", + )); + + insns.push(( + Inst::VecImmByteMask { + rd: writable_vr(20), + mask: 0x1234, + }, + "E74012340844", + "vgbm %v20, 4660", + )); + insns.push(( + Inst::VecImmBitMask { + size: 8, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001070846", + "vgmb %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 16, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001071846", + "vgmh %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 32, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001072846", + "vgmf %v20, 1, 7", + )); + insns.push(( + Inst::VecImmBitMask { + size: 64, + rd: writable_vr(20), + start_bit: 1, + end_bit: 7, + }, + "E74001073846", + "vgmg %v20, 1, 7", + )); + insns.push(( + Inst::VecImmReplicate { + size: 8, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012340845", + "vrepib %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 16, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012341845", + "vrepih %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 32, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012342845", + "vrepif %v20, 4660", + )); + insns.push(( + Inst::VecImmReplicate { + size: 64, + rd: writable_vr(20), + imm: 0x1234, + }, + "E74012343845", + "vrepig %v20, 4660", + )); + + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7102000F800", + "vleb %v17, 0(%r2), 15", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0800", + "vleb %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7123000F800", + "vleb %v17, 0(%r2,%r3), 15", + )); + insns.push(( + Inst::VecLoadLane { + size: 8, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0800", + "vleb %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + 
rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E71020007801", + "vleh %v17, 0(%r2), 7", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0801", + "vleh %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E71230007801", + "vleh %v17, 0(%r2,%r3), 7", + )); + insns.push(( + Inst::VecLoadLane { + size: 16, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0801", + "vleh %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 3, + }, + "E71020003803", + "vlef %v17, 0(%r2), 3", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0803", + "vlef %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 3, + }, + "E71230003803", + "vlef %v17, 0(%r2,%r3), 3", + )); + insns.push(( + Inst::VecLoadLane { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0803", + "vlef %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 1, + }, + "E71020001802", + "vleg %v17, 0(%r2), 1", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0802", + "vleg %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 1, + }, + "E71230001802", + "vleg %v17, 0(%r2,%r3), 1", + )); + insns.push(( + Inst::VecLoadLane { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0802", + "vleg %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78102000", + "le %f1, 0(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: 
writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78102FFF", + "le %f1, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1020008064", + "ley %f1, -524288(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED102FFF7F64", + "ley %f1, 524287(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71020000803", + "vlef %v17, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0803", + "vlef %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78123000", + "le %f1, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "78123FFF", + "le %f1, 4095(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1230008064", + "ley %f1, -524288(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED123FFF7F64", + "ley %f1, 524287(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71230000803", + "vlef %v17, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 32, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0803", + "vlef %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68102000", + "ld %f1, 0(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68102FFF", + "ld %f1, 
4095(%r2)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(2), + index: zero_reg(), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1020008065", + "ldy %f1, -524288(%r2)", + )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7708,12 +10752,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F65", "ldy %f1, 524287(%r2)", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7721,12 +10767,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E71020000802", "vleg %v17, 0(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7734,12 +10782,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF0802", "vleg %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7747,77 +10797,209 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "68123000", "ld %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuLoad64 { + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "68123FFF", + "ld %f1, 4095(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(-524288).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED1230008065", + "ldy %f1, -524288(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, rd: writable_vr(1), + mem: MemArg::BXD20 { + base: gpr(3), + index: gpr(2), + disp: SImm20::maybe_from_i64(524287).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "ED123FFF7F65", + "ldy %f1, 524287(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E71230000802", + "vleg %v17, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneUndef { + size: 64, + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7123FFF0802", + "vleg %v17, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7102000F808", + "vsteb %v17, 0(%r2), 15", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E7102FFF0808", + "vsteb %v17, 4095(%r2), 0", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + 
rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 15, + }, + "E7123000F808", + "vsteb %v17, 0(%r2,%r3), 15", + )); + insns.push(( + Inst::VecStoreLane { + size: 8, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "68123FFF", - "ld %f1, 4095(%r2,%r3)", + "E7123FFF0808", + "vsteb %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(-524288).unwrap(), + Inst::VecStoreLane { + size: 16, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 7, }, - "ED1230008065", - "ldy %f1, -524288(%r2,%r3)", + "E71020007809", + "vsteh %v17, 0(%r2), 7", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(1), - mem: MemArg::BXD20 { - base: gpr(3), - index: gpr(2), - disp: SImm20::maybe_from_i64(524287).unwrap(), + Inst::VecStoreLane { + size: 16, + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "ED123FFF7F65", - "ldy %f1, 524287(%r2,%r3)", + "E7102FFF0809", + "vsteh %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(17), + Inst::VecStoreLane { + size: 16, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 7, }, - "E71230000802", - "vleg %v17, 0(%r2,%r3), 0", + "E71230007809", + "vsteh %v17, 0(%r2,%r3), 7", )); insns.push(( - Inst::FpuLoad64 { - rd: writable_vr(17), + Inst::VecStoreLane { + size: 16, + rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), index: gpr(2), disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, - "E7123FFF0802", - "vleg %v17, 4095(%r2,%r3), 0", + "E7123FFF0809", + "vsteh %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7825,12 +11007,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70102000", "ste %f1, 0(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7838,12 +11022,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70102FFF", "ste %f1, 4095(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7851,12 +11037,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1020008066", "stey %f1, -524288(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -7864,12 +11052,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F66", "stey %f1, 524287(%r2)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7877,12 +11067,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102000080B", "vstef %v17, 
0(%r2), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -7890,12 +11082,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF080B", "vstef %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7903,12 +11097,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70123000", "ste %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -7916,12 +11112,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "70123FFF", "ste %f1, 4095(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -7929,12 +11127,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1230008066", "stey %f1, -524288(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -7942,12 +11142,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED123FFF7F66", "stey %f1, 524287(%r2,%r3)", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -7955,12 +11157,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123000080B", "vstef %v17, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore32 { + Inst::VecStoreLane { + size: 32, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -7968,12 +11172,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123FFF080B", "vstef %v17, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7981,12 +11187,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60102000", "std %f1, 0(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -7994,12 +11202,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60102FFF", "std %f1, 4095(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8007,12 +11217,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1020008067", "stdy %f1, -524288(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8020,12 +11232,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED102FFF7F67", "stdy %f1, 524287(%r2)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -8033,12 +11247,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, 
"E7102000080A", "vsteg %v17, 0(%r2), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(2), @@ -8046,12 +11262,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7102FFF080A", "vsteg %v17, 4095(%r2), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8059,12 +11277,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60123000", "std %f1, 0(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8072,12 +11292,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "60123FFF", "std %f1, 4095(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8085,12 +11307,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED1230008067", "stdy %f1, -524288(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8098,12 +11322,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "ED123FFF7F67", "stdy %f1, 524287(%r2,%r3)", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -8111,12 +11337,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123000080A", "vsteg %v17, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStore64 { + Inst::VecStoreLane { + size: 64, rd: vr(17), mem: MemArg::BXD12 { base: gpr(3), @@ -8124,13 +11352,194 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E7123FFF080A", "vsteg %v17, 4095(%r2,%r3), 0", )); - insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000001", + "vlebrh %v1, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0001", + "vlebrh %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000001", + "vlebrh %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 16, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0001", + "vlebrh %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000003", + "vlebrf %v1, 0(%r2), 0", + )); + 
insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0003", + "vlebrf %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000003", + "vlebrf %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 32, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0003", + "vlebrf %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61020000002", + "vlebrg %v1, 0(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0002", + "vlebrg %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E61230000002", + "vlebrg %v1, 0(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRev { + size: 64, + rd: writable_vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0002", + "vlebrg %v1, 4095(%r2,%r3), 0", + )); + insns.push(( + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8138,12 +11547,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61020000003", - "vlebrf %f1, 0(%r2), 0", + "vlebrf %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8151,12 +11562,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF0003", - "vlebrf %f1, 4095(%r2), 0", + "vlebrf %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8164,12 +11577,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E61010000003", - "lay %r1, -524288(%r2) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8177,12 +11592,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E61010000003", - "lay %r1, 524287(%r2) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: 
MemArg::BXD12 { base: gpr(3), @@ -8190,12 +11607,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61230000003", - "vlebrf %f1, 0(%r2,%r3), 0", + "vlebrf %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8203,12 +11622,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF0003", - "vlebrf %f1, 4095(%r2,%r3), 0", + "vlebrf %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8216,12 +11637,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E61010000003", - "lay %r1, -524288(%r2,%r3) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev32 { + Inst::VecLoadLaneRevUndef { + size: 32, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8229,12 +11652,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E61010000003", - "lay %r1, 524287(%r2,%r3) ; vlebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vlebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8242,12 +11667,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61020000002", - "vlebrg %f1, 0(%r2), 0", + "vlebrg %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8255,12 +11682,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF0002", - "vlebrg %f1, 4095(%r2), 0", + "vlebrg %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8268,12 +11697,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E61010000002", - "lay %r1, -524288(%r2) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8281,12 +11712,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E61010000002", - "lay %r1, 524287(%r2) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8294,12 +11727,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E61230000002", - "vlebrg %f1, 0(%r2,%r3), 0", + "vlebrg %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8307,12 +11742,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, 
+ lane_imm: 0, }, "E6123FFF0002", - "vlebrg %f1, 4095(%r2,%r3), 0", + "vlebrg %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8320,12 +11757,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E61010000002", - "lay %r1, -524288(%r2,%r3) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vlebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuLoadRev64 { + Inst::VecLoadLaneRevUndef { + size: 64, rd: writable_vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8333,12 +11772,74 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E61010000002", - "lay %r1, 524287(%r2,%r3) ; vlebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vlebrg %v1, 0(%r1), 0", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E61020007009", + "vstebrh %v1, 0(%r2), 7", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6102FFF0009", + "vstebrh %v1, 4095(%r2), 0", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + lane_imm: 7, + }, + "E61230007009", + "vstebrh %v1, 0(%r2,%r3), 7", + )); + insns.push(( + Inst::VecStoreLaneRev { + size: 16, + rd: vr(1), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + lane_imm: 0, + }, + "E6123FFF0009", + "vstebrh %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8346,12 +11847,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102000000B", - "vstebrf %f1, 0(%r2), 0", + "vstebrf %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8359,12 +11862,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF000B", - "vstebrf %f1, 4095(%r2), 0", + "vstebrf %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8372,12 +11877,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E6101000000B", - "lay %r1, -524288(%r2) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8385,12 +11892,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E6101000000B", - "lay %r1, 524287(%r2) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + 
Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8398,12 +11907,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123000000B", - "vstebrf %f1, 0(%r2,%r3), 0", + "vstebrf %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8411,12 +11922,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF000B", - "vstebrf %f1, 4095(%r2,%r3), 0", + "vstebrf %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8424,12 +11937,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E6101000000B", - "lay %r1, -524288(%r2,%r3) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev32 { + Inst::VecStoreLaneRev { + size: 32, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8437,12 +11952,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E6101000000B", - "lay %r1, 524287(%r2,%r3) ; vstebrf %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vstebrf %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8450,12 +11967,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102000000A", - "vstebrg %f1, 0(%r2), 0", + "vstebrg %v1, 0(%r2), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(2), @@ -8463,12 +11982,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6102FFF000A", - "vstebrg %f1, 4095(%r2), 0", + "vstebrg %v1, 4095(%r2), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8476,12 +11997,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31020008071E6101000000A", - "lay %r1, -524288(%r2) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(2), @@ -8489,12 +12012,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3102FFF7F71E6101000000A", - "lay %r1, 524287(%r2) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8502,12 +12027,14 @@ fn test_s390x_binemit() { disp: UImm12::zero(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123000000A", - "vstebrg %f1, 0(%r2,%r3), 0", + "vstebrg %v1, 0(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD12 { base: gpr(3), @@ -8515,12 +12042,14 @@ fn test_s390x_binemit() { disp: UImm12::maybe_from_u64(4095).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E6123FFF000A", - 
"vstebrg %f1, 4095(%r2,%r3), 0", + "vstebrg %v1, 4095(%r2,%r3), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8528,12 +12057,14 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(-524288).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E31230008071E6101000000A", - "lay %r1, -524288(%r2,%r3) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, -524288(%r2,%r3) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::FpuStoreRev64 { + Inst::VecStoreLaneRev { + size: 64, rd: vr(1), mem: MemArg::BXD20 { base: gpr(3), @@ -8541,304 +12072,476 @@ fn test_s390x_binemit() { disp: SImm20::maybe_from_i64(524287).unwrap(), flags: MemFlags::trusted(), }, + lane_imm: 0, }, "E3123FFF7F71E6101000000A", - "lay %r1, 524287(%r2,%r3) ; vstebrg %f1, 0(%r1), 0", + "lay %r1, 524287(%r2,%r3) ; vstebrg %v1, 0(%r1), 0", )); insns.push(( - Inst::LoadFpuConst32 { + Inst::VecInsertLane { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400000022", + "vlvgb %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF0022", + "vlvgb %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 8, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430000822", + "vlvgb %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400001022", + "vlvgh %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF1022", + "vlvgh %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 16, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430001822", + "vlvgh %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400002022", + "vlvgf %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF2022", + "vlvgf %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLane { + size: 32, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), + }, + "E78430002822", + "vlvgf %v24, %r4, 0(%r3)", + )); + insns.push(( + Inst::VecInsertLane { + size: 64, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), + }, + "E78400003022", + "vlvgg %v8, %r4, 0", + )); + insns.push(( + Inst::VecInsertLane { + size: 64, rd: writable_vr(8), - const_data: 1.0_f32.to_bits(), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "A71500043F80000078801000", - "bras %r1, 8 ; data.f32 1 ; le %f8, 0(%r1)", + "E78400FF3022", + "vlvgg %v8, %r4, 255", )); insns.push(( - Inst::LoadFpuConst32 { + Inst::VecInsertLane { + size: 64, rd: writable_vr(24), - const_data: 1.0_f32.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "A71500043F800000E78010000803", - "bras %r1, 8 ; data.f32 1 ; vlef %v24, 0(%r1), 0", + "E78430003822", + "vlvgg %v24, %r4, 0(%r3)", )); insns.push(( - Inst::LoadFpuConst64 { + Inst::VecInsertLaneUndef { + size: 8, rd: writable_vr(8), - const_data: 1.0_f64.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "A71500063FF000000000000068801000", - "bras 
%r1, 12 ; data.f64 1 ; ld %f8, 0(%r1)", + "E78400000022", + "vlvgb %v8, %r4, 0", )); insns.push(( - Inst::LoadFpuConst64 { + Inst::VecInsertLaneUndef { + size: 8, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF0022", + "vlvgb %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLaneUndef { + size: 8, rd: writable_vr(24), - const_data: 1.0_f64.to_bits(), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "A71500063FF0000000000000E78010000802", - "bras %r1, 12 ; data.f64 1 ; vleg %v24, 0(%r1), 0", + "E78430000822", + "vlvgb %v24, %r4, 0(%r3)", )); - insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Cvt64To32, - mode: FpuRoundMode::Current, + Inst::VecInsertLaneUndef { + size: 16, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B344008C", - "ledbra %f8, %f12, 0", + "E78400001022", + "vlvgh %v8, %r4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Cvt64To32, - mode: FpuRoundMode::ToNearest, + Inst::VecInsertLaneUndef { + size: 16, + rd: writable_vr(8), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), + }, + "E78400FF1022", + "vlvgh %v8, %r4, 255", + )); + insns.push(( + Inst::VecInsertLaneUndef { + size: 16, rd: writable_vr(24), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001838C5", - "wledb %v24, %f12, 0, 1", + "E78430001822", + "vlvgh %v24, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNegInfinity, + Inst::VecInsertLaneUndef { + size: 32, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B357708C", - "fiebr %f8, %f12, 7", + "E78400002022", + "vlvgf %v8, %r4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNegInfinity, + Inst::VecInsertLaneUndef { + size: 32, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "B35F708C", - "fidbr %f8, %f12, 7", + "E78400FF2022", + "vlvgf %v8, %r4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToPosInfinity, - rd: writable_vr(8), - rn: vr(12), + Inst::VecInsertLaneUndef { + size: 32, + rd: writable_vr(24), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "B357608C", - "fiebr %f8, %f12, 6", + "E78430002822", + "vlvgf %v24, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToPosInfinity, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "B35F608C", - "fidbr %f8, %f12, 6", + "B3C10084", + "ldgr %f8, %r4", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToZero, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "B357508C", - "fiebr %f8, %f12, 5", + "E78400FF3022", + "vlvgg %v8, %r4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToZero, + Inst::VecInsertLaneUndef { + size: 64, rd: writable_vr(8), - rn: vr(12), + rn: gpr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "B35F508C", - "fidbr %f8, %f12, 5", + "E78430003022", + "vlvgg %v8, %r4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNearestTiesToEven, - rd: writable_vr(8), - rn: vr(12), + Inst::VecExtractLane { + size: 8, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - 
"B357408C", - "fiebr %f8, %f12, 4", + "E78400FF0021", + "vlgvb %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNearestTiesToEven, - rd: writable_vr(8), - rn: vr(12), + Inst::VecExtractLane { + size: 8, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "B35F408C", - "fidbr %f8, %f12, 4", + "E78430000421", + "vlgvb %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001828C7", - "wfisb %v24, %f12, 0, 1", + "E78400001021", + "vlgvh %r8, %v4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::Round64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001838C7", - "wfidb %v24, %f12, 0, 1", + "E78400FF1021", + "vlgvh %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToSInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 16, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001828C2", - "wcfeb %v24, %f12, 0, 1", + "E78430001421", + "vlgvh %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToSInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001838C2", - "wcgdb %v24, %f12, 0, 1", + "E78400002021", + "vlgvf %r8, %v4, 0", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToUInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001828C0", - "wclfeb %v24, %f12, 0, 1", + "E78400FF2021", + "vlgvf %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::ToUInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 32, + rd: writable_gpr(8), + rn: vr(20), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001838C0", - "wclgdb %v24, %f12, 0, 1", + "E78430002421", + "vlgvf %r8, %v20, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromSInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: zero_reg(), }, - "E78C001828C3", - "wcefb %v24, %f12, 0, 1", + "B3CD0084", + "lgdr %r8, %f4", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromSInt64, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 255, + lane_reg: zero_reg(), }, - "E78C001838C3", - "wcdgb %v24, %f12, 0, 1", + "E78400FF3021", + "vlgvg %r8, %v4, 255", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromUInt32, - mode: FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecExtractLane { + size: 64, + rd: writable_gpr(8), + rn: vr(4), + lane_imm: 0, + lane_reg: gpr(3), }, - "E78C001828C1", - "wcelfb %v24, %f12, 0, 1", + "E78430003021", + "vlgvg %r8, %v4, 0(%r3)", )); insns.push(( - Inst::FpuRound { - op: FpuRoundOp::FromUInt64, - mode: 
FpuRoundMode::ToNearest, - rd: writable_vr(24), - rn: vr(12), + Inst::VecInsertLaneImm { + size: 8, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 15, }, - "E78C001838C1", - "wcdlgb %v24, %f12, 0, 1", + "E7401234F840", + "vleib %v20, 4660, 15", )); - insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(8), - ra: vr(10), + Inst::VecInsertLaneImm { + size: 16, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 7, }, - "E7468000A08D", - "vsel %v4, %v6, %v8, %v10", + "E74012347841", + "vleih %v20, 4660, 7", )); insns.push(( - Inst::VecSelect { + Inst::VecInsertLaneImm { + size: 32, rd: writable_vr(20), - rn: vr(6), - rm: vr(8), - ra: vr(10), + imm: 0x1234, + lane_imm: 3, }, - "E7468000A88D", - "vsel %v20, %v6, %v8, %v10", + "E74012343843", + "vleif %v20, 4660, 3", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(22), - rm: vr(8), - ra: vr(10), + Inst::VecInsertLaneImm { + size: 64, + rd: writable_vr(20), + imm: 0x1234, + lane_imm: 1, }, - "E7468000A48D", - "vsel %v4, %v22, %v8, %v10", + "E74012341842", + "vleig %v20, 4660, 1", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(24), - ra: vr(10), + Inst::VecReplicateLane { + size: 8, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 15, }, - "E7468000A28D", - "vsel %v4, %v6, %v24, %v10", + "E748000F084D", + "vrepb %v20, %v8, 15", )); insns.push(( - Inst::VecSelect { - rd: writable_vr(4), - rn: vr(6), - rm: vr(8), - ra: vr(26), + Inst::VecReplicateLane { + size: 16, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 7, }, - "E7468000A18D", - "vsel %v4, %v6, %v8, %v26", + "E7480007184D", + "vreph %v20, %v8, 7", )); insns.push(( - Inst::VecSelect { + Inst::VecReplicateLane { + size: 32, rd: writable_vr(20), - rn: vr(22), - rm: vr(24), - ra: vr(26), + rn: vr(8), + lane_imm: 3, }, - "E7468000AF8D", - "vsel %v20, %v22, %v24, %v26", + "E7480003284D", + "vrepf %v20, %v8, 3", + )); + insns.push(( + Inst::VecReplicateLane { + size: 64, + rd: writable_vr(20), + rn: vr(8), + lane_imm: 1, + }, + "E7480001384D", + "vrepg %v20, %v8, 1", )); let flags = settings::Flags::new(settings::builder()); diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 676e0d4794f4..ad5af092bc6b 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -2,6 +2,7 @@ use crate::binemit::{Addend, CodeOffset, Reloc}; use crate::ir::{types, ExternalName, Opcode, Type}; +use crate::isa::CallConv; use crate::machinst::*; use crate::{settings, CodegenError, CodegenResult}; use alloc::boxed::Box; @@ -28,7 +29,7 @@ mod emit_tests; pub use crate::isa::s390x::lower::isle::generated_code::{ ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp, - ShiftOp, UnaryOp, + ShiftOp, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp, }; /// Additional information for (direct) Call instructions, left out of line to lower the size of @@ -40,6 +41,8 @@ pub struct CallInfo { pub defs: SmallVec<[Writable; 8]>, pub clobbers: PRegSet, pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, } /// Additional information for CallInd instructions, left out of line to lower the size of the Inst @@ -51,6 +54,8 @@ pub struct CallIndInfo { pub defs: SmallVec<[Writable; 8]>, pub clobbers: PRegSet, pub opcode: Opcode, + pub caller_callconv: CallConv, + pub callee_callconv: CallConv, } #[test] @@ -156,22 +161,42 @@ impl Inst { | Inst::FpuMove64 { .. 
} | Inst::FpuCMov32 { .. } | Inst::FpuCMov64 { .. } - | Inst::MovToFpr32 { .. } - | Inst::MovToFpr64 { .. } - | Inst::MovFromFpr32 { .. } - | Inst::MovFromFpr64 { .. } | Inst::FpuRR { .. } | Inst::FpuRRR { .. } | Inst::FpuRRRR { .. } | Inst::FpuCmp32 { .. } | Inst::FpuCmp64 { .. } - | Inst::FpuLoad32 { .. } - | Inst::FpuStore32 { .. } - | Inst::FpuLoad64 { .. } - | Inst::FpuStore64 { .. } | Inst::LoadFpuConst32 { .. } | Inst::LoadFpuConst64 { .. } + | Inst::VecRRR { .. } + | Inst::VecRR { .. } + | Inst::VecShiftRR { .. } | Inst::VecSelect { .. } + | Inst::VecPermute { .. } + | Inst::VecPermuteDWImm { .. } + | Inst::VecIntCmp { .. } + | Inst::VecIntCmpS { .. } + | Inst::VecFloatCmp { .. } + | Inst::VecFloatCmpS { .. } + | Inst::VecLoad { .. } + | Inst::VecStore { .. } + | Inst::VecLoadReplicate { .. } + | Inst::VecMov { .. } + | Inst::VecCMov { .. } + | Inst::MovToVec128 { .. } + | Inst::VecLoadConst { .. } + | Inst::VecLoadConstReplicate { .. } + | Inst::VecImmByteMask { .. } + | Inst::VecImmBitMask { .. } + | Inst::VecImmReplicate { .. } + | Inst::VecLoadLane { .. } + | Inst::VecLoadLaneUndef { .. } + | Inst::VecStoreLane { .. } + | Inst::VecInsertLane { .. } + | Inst::VecInsertLaneUndef { .. } + | Inst::VecExtractLane { .. } + | Inst::VecInsertLaneImm { .. } + | Inst::VecReplicateLane { .. } | Inst::Call { .. } | Inst::CallInd { .. } | Inst::Ret { .. } @@ -207,19 +232,33 @@ impl Inst { Inst::FpuRound { op, .. } => match op { FpuRoundOp::ToSInt32 | FpuRoundOp::FromSInt32 => InstructionSet::MIE2, FpuRoundOp::ToUInt32 | FpuRoundOp::FromUInt32 => InstructionSet::MIE2, + FpuRoundOp::ToSInt32x4 | FpuRoundOp::FromSInt32x4 => InstructionSet::MIE2, + FpuRoundOp::ToUInt32x4 | FpuRoundOp::FromUInt32x4 => InstructionSet::MIE2, _ => InstructionSet::Base, }, // These are all part of VXRS_EXT2 - Inst::FpuLoadRev32 { .. } - | Inst::FpuStoreRev32 { .. } - | Inst::FpuLoadRev64 { .. } - | Inst::FpuStoreRev64 { .. } => InstructionSet::VXRS_EXT2, + Inst::VecLoadRev { .. } + | Inst::VecStoreRev { .. } + | Inst::VecLoadReplicateRev { .. } + | Inst::VecLoadLaneRev { .. } + | Inst::VecLoadLaneRevUndef { .. } + | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, Inst::DummyUse { .. } => InstructionSet::Base, } } + /// Create a 128-bit move instruction. + pub fn mov128(to_reg: Writable, from_reg: Reg) -> Inst { + assert!(to_reg.to_reg().class() == RegClass::Float); + assert!(from_reg.class() == RegClass::Float); + Inst::VecMov { + rd: to_reg, + rn: from_reg, + } + } + /// Create a 64-bit move instruction. pub fn mov64(to_reg: Writable, from_reg: Reg) -> Inst { assert!(to_reg.to_reg().class() == from_reg.class()); @@ -323,6 +362,17 @@ impl Inst { } } + /// Create an instruction that loads a 128-bit floating-point constant. + pub fn load_vec_constant(rd: Writable, value: u128) -> Inst { + // FIXME: This doesn't special-case constants that can be loaded + // without a constant pool, like the ISLE lowering does. Ideally, + // we should not have to duplicate the logic here. + Inst::VecLoadConst { + rd, + const_data: value, + } + } + /// Generic constructor for a load (zero-extending where appropriate). 
pub fn gen_load(into_reg: Writable, mem: MemArg, ty: Type) -> Inst { match ty { @@ -330,8 +380,19 @@ impl Inst { types::B16 | types::I16 => Inst::Load64ZExt16 { rd: into_reg, mem }, types::B32 | types::I32 => Inst::Load64ZExt32 { rd: into_reg, mem }, types::B64 | types::I64 | types::R64 => Inst::Load64 { rd: into_reg, mem }, - types::F32 => Inst::FpuLoad32 { rd: into_reg, mem }, - types::F64 => Inst::FpuLoad64 { rd: into_reg, mem }, + types::F32 => Inst::VecLoadLaneUndef { + size: 32, + rd: into_reg, + mem, + lane_imm: 0, + }, + types::F64 => Inst::VecLoadLaneUndef { + size: 64, + rd: into_reg, + mem, + lane_imm: 0, + }, + _ if ty.is_vector() && ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem }, _ => unimplemented!("gen_load({})", ty), } } @@ -343,8 +404,19 @@ impl Inst { types::B16 | types::I16 => Inst::Store16 { rd: from_reg, mem }, types::B32 | types::I32 => Inst::Store32 { rd: from_reg, mem }, types::B64 | types::I64 | types::R64 => Inst::Store64 { rd: from_reg, mem }, - types::F32 => Inst::FpuStore32 { rd: from_reg, mem }, - types::F64 => Inst::FpuStore64 { rd: from_reg, mem }, + types::F32 => Inst::VecStoreLane { + size: 32, + rd: from_reg, + mem, + lane_imm: 0, + }, + types::F64 => Inst::VecStoreLane { + size: 64, + rd: from_reg, + mem, + lane_imm: 0, + }, + _ if ty.is_vector() && ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem }, _ => unimplemented!("gen_store({})", ty), } } @@ -365,6 +437,9 @@ fn memarg_operands VReg>(memarg: &MemArg, collector: &mut Operand } &MemArg::InitialSPOffset { .. } | &MemArg::NominalSPOffset { .. } => {} } + // mem_finalize might require %r1 to hold (part of) the address. + // Conservatively assume this will always be necessary here. + collector.reg_early_def(writable_gpr(1)); } fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) { @@ -579,13 +654,6 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_mod(rd); collector.reg_use(rm); } - &Inst::MovToFpr32 { rd, rn } - | &Inst::MovToFpr64 { rd, rn } - | &Inst::MovFromFpr32 { rd, rn } - | &Inst::MovFromFpr64 { rd, rn } => { - collector.reg_def(rd); - collector.reg_use(rn); - } &Inst::FpuRR { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); @@ -605,50 +673,158 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_use(rn); collector.reg_use(rm); } - &Inst::FpuLoad32 { rd, ref mem, .. } => { + &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); + } + &Inst::FpuRound { rd, rn, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecRRR { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecRR { rd, rn, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecShiftRR { + rd, rn, shift_reg, .. + } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(shift_reg); + } + &Inst::VecSelect { rd, rn, rm, ra, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + collector.reg_use(ra); + } + &Inst::VecPermute { rd, rn, rm, ra, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + collector.reg_use(ra); + } + &Inst::VecPermuteDWImm { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecIntCmp { rd, rn, rm, .. } | &Inst::VecIntCmpS { rd, rn, rm, .. 
} => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecFloatCmp { rd, rn, rm, .. } | &Inst::VecFloatCmpS { rd, rn, rm, .. } => { + collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecLoad { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuLoad64 { rd, ref mem, .. } => { + &Inst::VecLoadRev { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuStore32 { rd, ref mem, .. } => { + &Inst::VecStore { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuStore64 { rd, ref mem, .. } => { + &Inst::VecStoreRev { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuLoadRev32 { rd, ref mem, .. } => { + &Inst::VecLoadReplicate { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadReplicateRev { rd, ref mem, .. } => { collector.reg_def(rd); memarg_operands(mem, collector); } - &Inst::FpuLoadRev64 { rd, ref mem, .. } => { + &Inst::VecMov { rd, rn } => { + collector.reg_def(rd); + collector.reg_use(rn); + } + &Inst::VecCMov { rd, rm, .. } => { + collector.reg_mod(rd); + collector.reg_use(rm); + } + &Inst::MovToVec128 { rd, rn, rm } => { collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(rm); + } + &Inst::VecLoadConst { rd, .. } | &Inst::VecLoadConstReplicate { rd, .. } => { + collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); + } + &Inst::VecImmByteMask { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecImmBitMask { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecImmReplicate { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecLoadLane { rd, ref mem, .. } => { + collector.reg_mod(rd); memarg_operands(mem, collector); } - &Inst::FpuStoreRev32 { rd, ref mem, .. } => { + &Inst::VecLoadLaneUndef { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreLaneRev { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::FpuStoreRev64 { rd, ref mem, .. } => { + &Inst::VecLoadLaneRevUndef { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreLane { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); } - &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + &Inst::VecLoadLaneRev { rd, ref mem, .. } => { + collector.reg_mod(rd); + memarg_operands(mem, collector); + } + &Inst::VecInsertLane { + rd, rn, lane_reg, .. + } => { + collector.reg_mod(rd); + collector.reg_use(rn); + collector.reg_use(lane_reg); + } + &Inst::VecInsertLaneUndef { + rd, rn, lane_reg, .. + } => { collector.reg_def(rd); + collector.reg_use(rn); + collector.reg_use(lane_reg); } - &Inst::FpuRound { rd, rn, .. } => { + &Inst::VecExtractLane { + rd, rn, lane_reg, .. + } => { collector.reg_def(rd); collector.reg_use(rn); + collector.reg_use(lane_reg); } - &Inst::VecSelect { rd, rn, rm, ra, .. } => { + &Inst::VecInsertLaneImm { rd, .. } => { + collector.reg_def(rd); + } + &Inst::VecReplicateLane { rd, rn, .. } => { collector.reg_def(rd); collector.reg_use(rn); - collector.reg_use(rm); - collector.reg_use(ra); } &Inst::Extend { rd, rn, .. } => { collector.reg_def(rd); @@ -682,9 +858,11 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC &Inst::TrapIf { .. } => {} &Inst::JTSequence { ridx, .. 
} => { collector.reg_use(ridx); + collector.reg_early_def(writable_gpr(1)); } &Inst::LoadExtNameFar { rd, .. } => { collector.reg_def(rd); + collector.reg_def(writable_gpr(1)); } &Inst::LoadAddr { rd, ref mem } => { collector.reg_def(rd); @@ -720,6 +898,7 @@ impl MachInst for Inst { &Inst::Mov64 { rd, rm } => Some((rd, rm)), &Inst::FpuMove32 { rd, rn } => Some((rd, rn)), &Inst::FpuMove64 { rd, rn } => Some((rd, rn)), + &Inst::VecMov { rd, rn } => Some((rd, rn)), _ => None, } } @@ -732,6 +911,21 @@ impl MachInst for Inst { } } + fn is_included_in_clobbers(&self) -> bool { + // We exclude call instructions from the clobber-set when they are calls + // from caller to callee with the same ABI. Such calls cannot possibly + // force any new registers to be saved in the prologue, because anything + // that the callee clobbers, the caller is also allowed to clobber. This + // both saves work and enables us to more precisely follow the + // half-caller-save, half-callee-save SysV ABI for some vector + // registers. + match self { + &Inst::Call { ref info, .. } => info.caller_callconv != info.callee_callconv, + &Inst::CallInd { ref info, .. } => info.caller_callconv != info.callee_callconv, + _ => true, + } + } + fn is_term(&self) -> MachTerminator { match self { &Inst::Ret { .. } | &Inst::EpiloguePlaceholder => MachTerminator::Ret, @@ -761,11 +955,13 @@ impl MachInst for Inst { } fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { - assert!(ty.bits() <= 64); + assert!(ty.bits() <= 128); if ty.bits() <= 32 { Inst::mov32(to_reg, from_reg) - } else { + } else if ty.bits() <= 64 { Inst::mov64(to_reg, from_reg) + } else { + Inst::mov128(to_reg, from_reg) } } @@ -778,11 +974,18 @@ impl MachInst for Inst { let to_reg = to_regs .only_reg() .expect("multi-reg values not supported yet"); - let value = value as u64; match ty { + _ if ty.is_vector() && ty.bits() == 128 => { + let mut ret = SmallVec::new(); + ret.push(Inst::load_vec_constant(to_reg, value)); + ret + } types::F64 => { let mut ret = SmallVec::new(); - ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value))); + ret.push(Inst::load_fp_constant64( + to_reg, + f64::from_bits(value as u64), + )); ret } types::F32 => { @@ -793,7 +996,7 @@ impl MachInst for Inst { )); ret } - types::I64 | types::B64 | types::R64 => Inst::load_constant64(to_reg, value), + types::I64 | types::B64 | types::R64 => Inst::load_constant64(to_reg, value as u64), types::B1 | types::I8 | types::B8 @@ -832,6 +1035,7 @@ impl MachInst for Inst { types::F64 => Ok((&[RegClass::Float], &[types::F64])), types::I128 => Ok((&[RegClass::Int, RegClass::Int], &[types::I64, types::I64])), types::B128 => Ok((&[RegClass::Int, RegClass::Int], &[types::B64, types::B64])), + _ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])), // FIXME: We don't really have IFLAGS, but need to allow it here // for now to support the SelectifSpectreGuard instruction. 
types::IFLAGS => Ok((&[RegClass::Int], &[types::I64])), @@ -845,7 +1049,7 @@ impl MachInst for Inst { fn canonical_type_for_rc(rc: RegClass) -> Type { match rc { RegClass::Int => types::I64, - RegClass::Float => types::F64, + RegClass::Float => types::I8X16, } } @@ -1497,43 +1701,6 @@ impl Inst { let mem = mem.pretty_print_default(); format!("{}{} {}, {}", mem_str, op.unwrap(), rd, mem) } - &Inst::FpuLoad32 { rd, ref mem } - | &Inst::FpuLoad64 { rd, ref mem } - | &Inst::FpuLoadRev32 { rd, ref mem } - | &Inst::FpuLoadRev64 { rd, ref mem } => { - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuLoad32 { .. } => (Some("le"), Some("ley"), "vlef"), - &Inst::FpuLoad64 { .. } => (Some("ld"), Some("ldy"), "vleg"), - &Inst::FpuLoadRev32 { .. } => (None, None, "vlebrf"), - &Inst::FpuLoadRev64 { .. } => (None, None, "vlebrg"), - _ => unreachable!(), - }; - - let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); - let mem = mem.with_allocs(allocs); - if rd_fpr.is_some() && opcode_rx.is_some() { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, true, false, true); - let op = match &mem { - &MemArg::BXD12 { .. } => opcode_rx, - &MemArg::BXD20 { .. } => opcode_rxy, - _ => unreachable!(), - }; - let mem = mem.pretty_print_default(); - format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) - } else { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, false, false, true); - let mem = mem.pretty_print_default(); - format!( - "{}{} {}, {}, 0", - mem_str, - opcode_vrx, - rd_fpr.unwrap_or(rd), - mem - ) - } - } &Inst::Store8 { rd, ref mem } | &Inst::Store16 { rd, ref mem } | &Inst::Store32 { rd, ref mem } @@ -1599,43 +1766,6 @@ impl Inst { format!("{}{} {}, {}", mem_str, op, mem, imm) } - &Inst::FpuStore32 { rd, ref mem } - | &Inst::FpuStore64 { rd, ref mem } - | &Inst::FpuStoreRev32 { rd, ref mem } - | &Inst::FpuStoreRev64 { rd, ref mem } => { - let (opcode_rx, opcode_rxy, opcode_vrx) = match self { - &Inst::FpuStore32 { .. } => (Some("ste"), Some("stey"), "vstef"), - &Inst::FpuStore64 { .. } => (Some("std"), Some("stdy"), "vsteg"), - &Inst::FpuStoreRev32 { .. } => (None, None, "vstebrf"), - &Inst::FpuStoreRev64 { .. } => (None, None, "vstebrg"), - _ => unreachable!(), - }; - - let (rd, rd_fpr) = pretty_print_fpr(rd, allocs); - let mem = mem.with_allocs(allocs); - if rd_fpr.is_some() && opcode_rx.is_some() { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, true, false, true); - let op = match &mem { - &MemArg::BXD12 { .. } => opcode_rx, - &MemArg::BXD20 { .. 
} => opcode_rxy, - _ => unreachable!(), - }; - let mem = mem.pretty_print_default(); - format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) - } else { - let (mem_str, mem) = - mem_finalize_for_show(&mem, state, true, false, false, true); - let mem = mem.pretty_print_default(); - format!( - "{}{} {}, {}, 0", - mem_str, - opcode_vrx, - rd_fpr.unwrap_or(rd), - mem - ) - } - } &Inst::LoadMultiple64 { rt, rt2, ref mem } => { let mem = mem.with_allocs(allocs); let (mem_str, mem) = mem_finalize_for_show(&mem, state, false, true, false, false); @@ -1780,69 +1910,77 @@ impl Inst { format!("j{} 10 ; vlr {}, {}", cond, rd, rm) } } - &Inst::MovToFpr32 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - format!("vlvgf {}, {}, 0", rd, rn) - } - &Inst::MovToFpr64 { rd, rn } => { - let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - if rd_fpr.is_some() { - format!("ldgr {}, {}", rd_fpr.unwrap(), rn) - } else { - format!("vlvgg {}, {}, 0", rd, rn) - } - } - &Inst::MovFromFpr32 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let rn = pretty_print_reg(rn, allocs); - format!("vlgvf {}, {}, 0", rd, rn) - } - &Inst::MovFromFpr64 { rd, rn } => { - let rd = pretty_print_reg(rd.to_reg(), allocs); - let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); - if rn_fpr.is_some() { - format!("lgdr {}, {}", rd, rn_fpr.unwrap()) - } else { - format!("vlgvg {}, {}, 0", rd, rn) - } - } &Inst::FpuRR { fpu_op, rd, rn } => { let (op, op_fpr) = match fpu_op { - FPUOp1::Abs32 => ("wflpsb", "lpebr"), - FPUOp1::Abs64 => ("wflpdb", "lpdbr"), - FPUOp1::Neg32 => ("wflcsb", "lcebr"), - FPUOp1::Neg64 => ("wflcdb", "lcdbr"), - FPUOp1::NegAbs32 => ("wflnsb", "lnebr"), - FPUOp1::NegAbs64 => ("wflndb", "lndbr"), - FPUOp1::Sqrt32 => ("wfsqsb", "sqebr"), - FPUOp1::Sqrt64 => ("wfsqdb", "sqdbr"), - FPUOp1::Cvt32To64 => ("wldeb", "ldebr"), + FPUOp1::Abs32 => ("wflpsb", Some("lpebr")), + FPUOp1::Abs64 => ("wflpdb", Some("lpdbr")), + FPUOp1::Abs32x4 => ("vflpsb", None), + FPUOp1::Abs64x2 => ("vflpdb", None), + FPUOp1::Neg32 => ("wflcsb", Some("lcebr")), + FPUOp1::Neg64 => ("wflcdb", Some("lcdbr")), + FPUOp1::Neg32x4 => ("vflcsb", None), + FPUOp1::Neg64x2 => ("vflcdb", None), + FPUOp1::NegAbs32 => ("wflnsb", Some("lnebr")), + FPUOp1::NegAbs64 => ("wflndb", Some("lndbr")), + FPUOp1::NegAbs32x4 => ("vflnsb", None), + FPUOp1::NegAbs64x2 => ("vflndb", None), + FPUOp1::Sqrt32 => ("wfsqsb", Some("sqebr")), + FPUOp1::Sqrt64 => ("wfsqdb", Some("sqdbr")), + FPUOp1::Sqrt32x4 => ("vfsqsb", None), + FPUOp1::Sqrt64x2 => ("vfsqdb", None), + FPUOp1::Cvt32To64 => ("wldeb", Some("ldebr")), + FPUOp1::Cvt32x4To64x2 => ("vldeb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); - if rd_fpr.is_some() && rn_fpr.is_some() { - format!("{} {}, {}", op_fpr, rd_fpr.unwrap(), rn_fpr.unwrap()) - } else { + if op_fpr.is_some() && rd_fpr.is_some() && rn_fpr.is_some() { + format!( + "{} {}, {}", + op_fpr.unwrap(), + rd_fpr.unwrap(), + rn_fpr.unwrap() + ) + } else if op.starts_with('w') { format!("{} {}, {}", op, rd_fpr.unwrap_or(rd), rn_fpr.unwrap_or(rn)) + } else { + format!("{} {}, {}", op, rd, rn) } } &Inst::FpuRRR { fpu_op, rd, rn, rm } => { let (op, opt_m6, op_fpr) = match fpu_op { FPUOp2::Add32 => ("wfasb", "", Some("aebr")), FPUOp2::Add64 => ("wfadb", "", Some("adbr")), + FPUOp2::Add32x4 => ("vfasb", "", None), + FPUOp2::Add64x2 => ("vfadb", "", None), FPUOp2::Sub32 => 
("wfssb", "", Some("sebr")), FPUOp2::Sub64 => ("wfsdb", "", Some("sdbr")), + FPUOp2::Sub32x4 => ("vfssb", "", None), + FPUOp2::Sub64x2 => ("vfsdb", "", None), FPUOp2::Mul32 => ("wfmsb", "", Some("meebr")), FPUOp2::Mul64 => ("wfmdb", "", Some("mdbr")), + FPUOp2::Mul32x4 => ("vfmsb", "", None), + FPUOp2::Mul64x2 => ("vfmdb", "", None), FPUOp2::Div32 => ("wfdsb", "", Some("debr")), FPUOp2::Div64 => ("wfddb", "", Some("ddbr")), + FPUOp2::Div32x4 => ("vfdsb", "", None), + FPUOp2::Div64x2 => ("vfddb", "", None), FPUOp2::Max32 => ("wfmaxsb", ", 1", None), FPUOp2::Max64 => ("wfmaxdb", ", 1", None), + FPUOp2::Max32x4 => ("vfmaxsb", ", 1", None), + FPUOp2::Max64x2 => ("vfmaxdb", ", 1", None), FPUOp2::Min32 => ("wfminsb", ", 1", None), FPUOp2::Min64 => ("wfmindb", ", 1", None), + FPUOp2::Min32x4 => ("vfminsb", ", 1", None), + FPUOp2::Min64x2 => ("vfmindb", ", 1", None), + FPUOp2::MaxPseudo32 => ("wfmaxsb", ", 3", None), + FPUOp2::MaxPseudo64 => ("wfmaxdb", ", 3", None), + FPUOp2::MaxPseudo32x4 => ("vfmaxsb", ", 3", None), + FPUOp2::MaxPseudo64x2 => ("vfmaxdb", ", 3", None), + FPUOp2::MinPseudo32 => ("wfminsb", ", 3", None), + FPUOp2::MinPseudo64 => ("wfmindb", ", 3", None), + FPUOp2::MinPseudo32x4 => ("vfminsb", ", 3", None), + FPUOp2::MinPseudo64x2 => ("vfmindb", ", 3", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); @@ -1855,7 +1993,7 @@ impl Inst { rd_fpr.unwrap(), rm_fpr.unwrap() ) - } else { + } else if op.starts_with('w') { format!( "{} {}, {}, {}{}", op, @@ -1864,6 +2002,8 @@ impl Inst { rm_fpr.unwrap_or(rm), opt_m6 ) + } else { + format!("{} {}, {}, {}{}", op, rd, rn, rm, opt_m6) } } &Inst::FpuRRRR { @@ -1874,25 +2014,34 @@ impl Inst { ra, } => { let (op, op_fpr) = match fpu_op { - FPUOp3::MAdd32 => ("wfmasb", "maebr"), - FPUOp3::MAdd64 => ("wfmadb", "madbr"), - FPUOp3::MSub32 => ("wfmssb", "msebr"), - FPUOp3::MSub64 => ("wfmsdb", "msdbr"), + FPUOp3::MAdd32 => ("wfmasb", Some("maebr")), + FPUOp3::MAdd64 => ("wfmadb", Some("madbr")), + FPUOp3::MAdd32x4 => ("vfmasb", None), + FPUOp3::MAdd64x2 => ("vfmadb", None), + FPUOp3::MSub32 => ("wfmssb", Some("msebr")), + FPUOp3::MSub64 => ("wfmsdb", Some("msdbr")), + FPUOp3::MSub32x4 => ("vfmssb", None), + FPUOp3::MSub64x2 => ("vfmsdb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); let (rm, rm_fpr) = pretty_print_fpr(rm, allocs); let (ra, ra_fpr) = pretty_print_fpr(ra, allocs); - if rd == ra && rd_fpr.is_some() && rn_fpr.is_some() && rm_fpr.is_some() { + if op_fpr.is_some() + && rd == ra + && rd_fpr.is_some() + && rn_fpr.is_some() + && rm_fpr.is_some() + { format!( "{} {}, {}, {}", - op_fpr, + op_fpr.unwrap(), rd_fpr.unwrap(), rn_fpr.unwrap(), rm_fpr.unwrap() ) - } else { + } else if op.starts_with('w') { format!( "{} {}, {}, {}, {}", op, @@ -1901,6 +2050,8 @@ impl Inst { rm_fpr.unwrap_or(rm), ra_fpr.unwrap_or(ra) ) + } else { + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) } } &Inst::FpuCmp32 { rn, rm } => { @@ -1975,16 +2126,27 @@ impl Inst { }; let (opcode, opcode_fpr) = match op { FpuRoundOp::Cvt64To32 => ("wledb", Some("ledbra")), + FpuRoundOp::Cvt64x2To32x4 => ("vledb", None), FpuRoundOp::Round32 => ("wfisb", Some("fiebr")), FpuRoundOp::Round64 => ("wfidb", Some("fidbr")), + FpuRoundOp::Round32x4 => ("vfisb", None), + FpuRoundOp::Round64x2 => ("vfidb", None), FpuRoundOp::ToSInt32 => ("wcfeb", None), FpuRoundOp::ToSInt64 => ("wcgdb", None), FpuRoundOp::ToUInt32 => ("wclfeb", None), FpuRoundOp::ToUInt64 => ("wclgdb", None), + FpuRoundOp::ToSInt32x4 => 
("vcfeb", None), + FpuRoundOp::ToSInt64x2 => ("vcgdb", None), + FpuRoundOp::ToUInt32x4 => ("vclfeb", None), + FpuRoundOp::ToUInt64x2 => ("vclgdb", None), FpuRoundOp::FromSInt32 => ("wcefb", None), FpuRoundOp::FromSInt64 => ("wcdgb", None), FpuRoundOp::FromUInt32 => ("wcelfb", None), FpuRoundOp::FromUInt64 => ("wcdlgb", None), + FpuRoundOp::FromSInt32x4 => ("vcefb", None), + FpuRoundOp::FromSInt64x2 => ("vcdgb", None), + FpuRoundOp::FromUInt32x4 => ("vcelfb", None), + FpuRoundOp::FromUInt64x2 => ("vcdlgb", None), }; let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); @@ -1997,7 +2159,7 @@ impl Inst { rn_fpr.unwrap(), mode ) - } else { + } else if opcode.starts_with('w') { format!( "{} {}, {}, 0, {}", opcode, @@ -2005,8 +2167,168 @@ impl Inst { rn_fpr.unwrap_or(rn), mode ) + } else { + format!("{} {}, {}, 0, {}", opcode, rd, rn, mode) } } + &Inst::VecRRR { op, rd, rn, rm } => { + let op = match op { + VecBinaryOp::Add8x16 => "vab", + VecBinaryOp::Add16x8 => "vah", + VecBinaryOp::Add32x4 => "vaf", + VecBinaryOp::Add64x2 => "vag", + VecBinaryOp::Sub8x16 => "vsb", + VecBinaryOp::Sub16x8 => "vsh", + VecBinaryOp::Sub32x4 => "vsf", + VecBinaryOp::Sub64x2 => "vsg", + VecBinaryOp::Mul8x16 => "vmlb", + VecBinaryOp::Mul16x8 => "vmlhw", + VecBinaryOp::Mul32x4 => "vmlf", + VecBinaryOp::UMulHi8x16 => "vmlhb", + VecBinaryOp::UMulHi16x8 => "vmlhh", + VecBinaryOp::UMulHi32x4 => "vmlhf", + VecBinaryOp::SMulHi8x16 => "vmhb", + VecBinaryOp::SMulHi16x8 => "vmhh", + VecBinaryOp::SMulHi32x4 => "vmhf", + VecBinaryOp::UMulEven8x16 => "vmleb", + VecBinaryOp::UMulEven16x8 => "vmleh", + VecBinaryOp::UMulEven32x4 => "vmlef", + VecBinaryOp::SMulEven8x16 => "vmeb", + VecBinaryOp::SMulEven16x8 => "vmeh", + VecBinaryOp::SMulEven32x4 => "vmef", + VecBinaryOp::UMulOdd8x16 => "vmlob", + VecBinaryOp::UMulOdd16x8 => "vmloh", + VecBinaryOp::UMulOdd32x4 => "vmlof", + VecBinaryOp::SMulOdd8x16 => "vmob", + VecBinaryOp::SMulOdd16x8 => "vmoh", + VecBinaryOp::SMulOdd32x4 => "vmof", + VecBinaryOp::UMax8x16 => "vmxlb", + VecBinaryOp::UMax16x8 => "vmxlh", + VecBinaryOp::UMax32x4 => "vmxlf", + VecBinaryOp::UMax64x2 => "vmxlg", + VecBinaryOp::SMax8x16 => "vmxb", + VecBinaryOp::SMax16x8 => "vmxh", + VecBinaryOp::SMax32x4 => "vmxf", + VecBinaryOp::SMax64x2 => "vmxg", + VecBinaryOp::UMin8x16 => "vmnlb", + VecBinaryOp::UMin16x8 => "vmnlh", + VecBinaryOp::UMin32x4 => "vmnlf", + VecBinaryOp::UMin64x2 => "vmnlg", + VecBinaryOp::SMin8x16 => "vmnb", + VecBinaryOp::SMin16x8 => "vmnh", + VecBinaryOp::SMin32x4 => "vmnf", + VecBinaryOp::SMin64x2 => "vmng", + VecBinaryOp::UAvg8x16 => "vavglb", + VecBinaryOp::UAvg16x8 => "vavglh", + VecBinaryOp::UAvg32x4 => "vavglf", + VecBinaryOp::UAvg64x2 => "vavglg", + VecBinaryOp::SAvg8x16 => "vavgb", + VecBinaryOp::SAvg16x8 => "vavgh", + VecBinaryOp::SAvg32x4 => "vavgf", + VecBinaryOp::SAvg64x2 => "vavgg", + VecBinaryOp::And128 => "vn", + VecBinaryOp::Orr128 => "vo", + VecBinaryOp::Xor128 => "vx", + VecBinaryOp::NotAnd128 => "vnn", + VecBinaryOp::NotOrr128 => "vno", + VecBinaryOp::NotXor128 => "vnx", + VecBinaryOp::AndNot128 => "vnc", + VecBinaryOp::OrrNot128 => "voc", + VecBinaryOp::BitPermute128 => "vbperm", + VecBinaryOp::LShLByByte128 => "vslb", + VecBinaryOp::LShRByByte128 => "vsrlb", + VecBinaryOp::AShRByByte128 => "vsrab", + VecBinaryOp::LShLByBit128 => "vsl", + VecBinaryOp::LShRByBit128 => "vsrl", + VecBinaryOp::AShRByBit128 => "vsra", + VecBinaryOp::Pack16x8 => "vpkh", + VecBinaryOp::Pack32x4 => "vpkf", + VecBinaryOp::Pack64x2 => "vpkg", + VecBinaryOp::PackUSat16x8 => "vpklsh", + 
VecBinaryOp::PackUSat32x4 => "vpklsf", + VecBinaryOp::PackUSat64x2 => "vpklsg", + VecBinaryOp::PackSSat16x8 => "vpksh", + VecBinaryOp::PackSSat32x4 => "vpksf", + VecBinaryOp::PackSSat64x2 => "vpksg", + VecBinaryOp::MergeLow8x16 => "vmrlb", + VecBinaryOp::MergeLow16x8 => "vmrlh", + VecBinaryOp::MergeLow32x4 => "vmrlf", + VecBinaryOp::MergeLow64x2 => "vmrlg", + VecBinaryOp::MergeHigh8x16 => "vmrhb", + VecBinaryOp::MergeHigh16x8 => "vmrhh", + VecBinaryOp::MergeHigh32x4 => "vmrhf", + VecBinaryOp::MergeHigh64x2 => "vmrhg", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::VecRR { op, rd, rn } => { + let op = match op { + VecUnaryOp::Abs8x16 => "vlpb", + VecUnaryOp::Abs16x8 => "vlph", + VecUnaryOp::Abs32x4 => "vlpf", + VecUnaryOp::Abs64x2 => "vlpg", + VecUnaryOp::Neg8x16 => "vlcb", + VecUnaryOp::Neg16x8 => "vlch", + VecUnaryOp::Neg32x4 => "vlcf", + VecUnaryOp::Neg64x2 => "vlcg", + VecUnaryOp::Popcnt8x16 => "vpopctb", + VecUnaryOp::Popcnt16x8 => "vpopcth", + VecUnaryOp::Popcnt32x4 => "vpopctf", + VecUnaryOp::Popcnt64x2 => "vpopctg", + VecUnaryOp::UnpackULow8x16 => "vupllb", + VecUnaryOp::UnpackULow16x8 => "vupllh", + VecUnaryOp::UnpackULow32x4 => "vupllf", + VecUnaryOp::UnpackUHigh8x16 => "vuplhb", + VecUnaryOp::UnpackUHigh16x8 => "vuplhh", + VecUnaryOp::UnpackUHigh32x4 => "vuplhf", + VecUnaryOp::UnpackSLow8x16 => "vuplb", + VecUnaryOp::UnpackSLow16x8 => "vuplh", + VecUnaryOp::UnpackSLow32x4 => "vuplf", + VecUnaryOp::UnpackSHigh8x16 => "vuphb", + VecUnaryOp::UnpackSHigh16x8 => "vuphh", + VecUnaryOp::UnpackSHigh32x4 => "vuphf", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("{} {}, {}", op, rd, rn) + } + &Inst::VecShiftRR { + shift_op, + rd, + rn, + shift_imm, + shift_reg, + } => { + let op = match shift_op { + VecShiftOp::RotL8x16 => "verllb", + VecShiftOp::RotL16x8 => "verllh", + VecShiftOp::RotL32x4 => "verllf", + VecShiftOp::RotL64x2 => "verllg", + VecShiftOp::LShL8x16 => "veslb", + VecShiftOp::LShL16x8 => "veslh", + VecShiftOp::LShL32x4 => "veslf", + VecShiftOp::LShL64x2 => "veslg", + VecShiftOp::LShR8x16 => "vesrlb", + VecShiftOp::LShR16x8 => "vesrlh", + VecShiftOp::LShR32x4 => "vesrlf", + VecShiftOp::LShR64x2 => "vesrlg", + VecShiftOp::AShR8x16 => "vesrab", + VecShiftOp::AShR16x8 => "vesrah", + VecShiftOp::AShR32x4 => "vesraf", + VecShiftOp::AShR64x2 => "vesrag", + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let shift_reg = if shift_reg != zero_reg() { + format!("({})", pretty_print_reg(shift_reg, allocs)) + } else { + "".to_string() + }; + format!("{} {}, {}, {}{}", op, rd, rn, shift_imm, shift_reg) + } &Inst::VecSelect { rd, rn, rm, ra } => { let rd = pretty_print_reg(rd.to_reg(), allocs); let rn = pretty_print_reg(rn, allocs); @@ -2014,6 +2336,409 @@ impl Inst { let ra = pretty_print_reg(ra, allocs); format!("vsel {}, {}, {}, {}", rd, rn, rm, ra) } + &Inst::VecPermute { rd, rn, rm, ra } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + let ra = pretty_print_reg(ra, allocs); + format!("vperm {}, {}, {}, {}", rd, rn, rm, ra) + } + &Inst::VecPermuteDWImm { + rd, + rn, + rm, + idx1, + idx2, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + let m4 = (idx1 & 1) * 4 
+ (idx2 & 1); + format!("vpdi {}, {}, {}, {}", rd, rn, rm, m4) + } + &Inst::VecIntCmp { op, rd, rn, rm } | &Inst::VecIntCmpS { op, rd, rn, rm } => { + let op = match op { + VecIntCmpOp::CmpEq8x16 => "vceqb", + VecIntCmpOp::CmpEq16x8 => "vceqh", + VecIntCmpOp::CmpEq32x4 => "vceqf", + VecIntCmpOp::CmpEq64x2 => "vceqg", + VecIntCmpOp::SCmpHi8x16 => "vchb", + VecIntCmpOp::SCmpHi16x8 => "vchh", + VecIntCmpOp::SCmpHi32x4 => "vchf", + VecIntCmpOp::SCmpHi64x2 => "vchg", + VecIntCmpOp::UCmpHi8x16 => "vchlb", + VecIntCmpOp::UCmpHi16x8 => "vchlh", + VecIntCmpOp::UCmpHi32x4 => "vchlf", + VecIntCmpOp::UCmpHi64x2 => "vchlg", + }; + let s = match self { + &Inst::VecIntCmp { .. } => "", + &Inst::VecIntCmpS { .. } => "s", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{}{} {}, {}, {}", op, s, rd, rn, rm) + } + &Inst::VecFloatCmp { op, rd, rn, rm } | &Inst::VecFloatCmpS { op, rd, rn, rm } => { + let op = match op { + VecFloatCmpOp::CmpEq32x4 => "vfcesb", + VecFloatCmpOp::CmpEq64x2 => "vfcedb", + VecFloatCmpOp::CmpHi32x4 => "vfchsb", + VecFloatCmpOp::CmpHi64x2 => "vfchdb", + VecFloatCmpOp::CmpHiEq32x4 => "vfchesb", + VecFloatCmpOp::CmpHiEq64x2 => "vfchedb", + }; + let s = match self { + &Inst::VecFloatCmp { .. } => "", + &Inst::VecFloatCmpS { .. } => "s", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("{}{} {}, {}, {}", op, s, rd, rn, rm) + } + &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + let opcode = match self { + &Inst::VecLoad { .. } => "vl", + &Inst::VecLoadRev { .. } => "vlbrq", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + let opcode = match self { + &Inst::VecStore { .. } => "vst", + &Inst::VecStoreRev { .. } => "vstbrq", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd, allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecLoadReplicate { size, rd, ref mem } + | &Inst::VecLoadReplicateRev { size, rd, ref mem } => { + let opcode = match (self, size) { + (&Inst::VecLoadReplicate { .. }, 8) => "vlrepb", + (&Inst::VecLoadReplicate { .. }, 16) => "vlreph", + (&Inst::VecLoadReplicate { .. }, 32) => "vlrepf", + (&Inst::VecLoadReplicate { .. }, 64) => "vlrepg", + (&Inst::VecLoadReplicateRev { .. }, 16) => "vlbrreph", + (&Inst::VecLoadReplicateRev { .. }, 32) => "vlbrrepf", + (&Inst::VecLoadReplicateRev { .. 
}, 64) => "vlbrrepg", + _ => unreachable!(), + }; + + let rd = pretty_print_reg(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + let (mem_str, mem) = mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, opcode, rd, mem) + } + &Inst::VecMov { rd, rn } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("vlr {}, {}", rd, rn) + } + &Inst::VecCMov { rd, cond, rm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rm = pretty_print_reg(rm, allocs); + let cond = cond.invert().pretty_print_default(); + format!("j{} 10 ; vlr {}, {}", cond, rd, rm) + } + &Inst::MovToVec128 { rd, rn, rm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let rm = pretty_print_reg(rm, allocs); + format!("vlvgp {}, {}, {}", rd, rn, rm) + } + &Inst::VecLoadConst { rd, const_data } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg(), &mut empty_allocs); + format!( + "bras {}, 20 ; data.u128 0x{:032x} ; vl {}, 0({})", + tmp, const_data, rd, tmp + ) + } + &Inst::VecLoadConstReplicate { + size, + rd, + const_data, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg(), &mut empty_allocs); + let (opcode, data) = match size { + 32 => ("vlrepf", format!("0x{:08x}", const_data as u32)), + 64 => ("vlrepg", format!("0x{:016x}", const_data)), + _ => unreachable!(), + }; + format!( + "bras {}, {} ; data.u{} {} ; {} {}, 0({})", + tmp, + 4 + size / 8, + size, + data, + opcode, + rd, + tmp + ) + } + &Inst::VecImmByteMask { rd, mask } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + format!("vgbm {}, {}", rd, mask) + } + &Inst::VecImmBitMask { + size, + rd, + start_bit, + end_bit, + } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let op = match size { + 8 => "vgmb", + 16 => "vgmh", + 32 => "vgmf", + 64 => "vgmg", + _ => unreachable!(), + }; + format!("{} {}, {}, {}", op, rd, start_bit, end_bit) + } + &Inst::VecImmReplicate { size, rd, imm } => { + let rd = pretty_print_reg(rd.to_reg(), allocs); + let op = match size { + 8 => "vrepib", + 16 => "vrepih", + 32 => "vrepif", + 64 => "vrepig", + _ => unreachable!(), + }; + format!("{} {}, {}", op, rd, imm) + } + &Inst::VecLoadLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRev { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneUndef { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecLoadLaneRevUndef { + size, + rd, + ref mem, + lane_imm, + } => { + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecLoadLane { .. }, 8) => ("vleb", None, None), + (&Inst::VecLoadLane { .. }, 16) => ("vleh", None, None), + (&Inst::VecLoadLane { .. }, 32) => ("vlef", None, None), + (&Inst::VecLoadLane { .. }, 64) => ("vleg", None, None), + (&Inst::VecLoadLaneRev { .. }, 16) => ("vlebrh", None, None), + (&Inst::VecLoadLaneRev { .. }, 32) => ("vlebrf", None, None), + (&Inst::VecLoadLaneRev { .. }, 64) => ("vlebrg", None, None), + (&Inst::VecLoadLaneUndef { .. }, 8) => ("vleb", None, None), + (&Inst::VecLoadLaneUndef { .. }, 16) => ("vleh", None, None), + (&Inst::VecLoadLaneUndef { .. }, 32) => ("vlef", Some("le"), Some("ley")), + (&Inst::VecLoadLaneUndef { .. }, 64) => ("vleg", Some("ld"), Some("ldy")), + (&Inst::VecLoadLaneRevUndef { .. 
}, 16) => ("vlebrh", None, None), + (&Inst::VecLoadLaneRevUndef { .. }, 32) => ("vlebrf", None, None), + (&Inst::VecLoadLaneRevUndef { .. }, 64) => ("vlebrg", None, None), + _ => unreachable!(), + }; + + let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); + let mem = mem.with_allocs(allocs); + if lane_imm == 0 && rd_fpr.is_some() && opcode_rx.is_some() { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, true, false, true); + let op = match &mem { + &MemArg::BXD12 { .. } => opcode_rx, + &MemArg::BXD20 { .. } => opcode_rxy, + _ => unreachable!(), + }; + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) + } else { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}, {}", mem_str, opcode_vrx, rd, mem, lane_imm) + } + } + &Inst::VecStoreLane { + size, + rd, + ref mem, + lane_imm, + } + | &Inst::VecStoreLaneRev { + size, + rd, + ref mem, + lane_imm, + } => { + let (opcode_vrx, opcode_rx, opcode_rxy) = match (self, size) { + (&Inst::VecStoreLane { .. }, 8) => ("vsteb", None, None), + (&Inst::VecStoreLane { .. }, 16) => ("vsteh", None, None), + (&Inst::VecStoreLane { .. }, 32) => ("vstef", Some("ste"), Some("stey")), + (&Inst::VecStoreLane { .. }, 64) => ("vsteg", Some("std"), Some("stdy")), + (&Inst::VecStoreLaneRev { .. }, 16) => ("vstebrh", None, None), + (&Inst::VecStoreLaneRev { .. }, 32) => ("vstebrf", None, None), + (&Inst::VecStoreLaneRev { .. }, 64) => ("vstebrg", None, None), + _ => unreachable!(), + }; + + let (rd, rd_fpr) = pretty_print_fpr(rd, allocs); + let mem = mem.with_allocs(allocs); + if lane_imm == 0 && rd_fpr.is_some() && opcode_rx.is_some() { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, true, false, true); + let op = match &mem { + &MemArg::BXD12 { .. } => opcode_rx, + &MemArg::BXD20 { .. 
} => opcode_rxy, + _ => unreachable!(), + }; + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}", mem_str, op.unwrap(), rd_fpr.unwrap(), mem) + } else { + let (mem_str, mem) = + mem_finalize_for_show(&mem, state, true, false, false, true); + let mem = mem.pretty_print_default(); + format!("{}{} {}, {}, {}", mem_str, opcode_vrx, rd, mem, lane_imm,) + } + } + &Inst::VecInsertLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let op = match size { + 8 => "vlvgb", + 16 => "vlvgh", + 32 => "vlvgf", + 64 => "vlvgg", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + format!("{} {}, {}, {}{}", op, rd, rn, lane_imm, lane_reg) + } + &Inst::VecInsertLaneUndef { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let (opcode_vrs, opcode_rre) = match size { + 8 => ("vlvgb", None), + 16 => ("vlvgh", None), + 32 => ("vlvgf", None), + 64 => ("vlvgg", Some("ldgr")), + _ => unreachable!(), + }; + let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg.is_empty() && rd_fpr.is_some() + { + format!("{} {}, {}", opcode_rre.unwrap(), rd_fpr.unwrap(), rn) + } else { + format!("{} {}, {}, {}{}", opcode_vrs, rd, rn, lane_imm, lane_reg) + } + } + &Inst::VecExtractLane { + size, + rd, + rn, + lane_imm, + lane_reg, + } => { + let (opcode_vrs, opcode_rre) = match size { + 8 => ("vlgvb", None), + 16 => ("vlgvh", None), + 32 => ("vlgvf", None), + 64 => ("vlgvg", Some("lgdr")), + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let (rn, rn_fpr) = pretty_print_fpr(rn, allocs); + let lane_reg = if lane_reg != zero_reg() { + format!("({})", pretty_print_reg(lane_reg, allocs)) + } else { + "".to_string() + }; + if opcode_rre.is_some() && lane_imm == 0 && lane_reg.is_empty() && rn_fpr.is_some() + { + format!("{} {}, {}", opcode_rre.unwrap(), rd, rn_fpr.unwrap()) + } else { + format!("{} {}, {}, {}{}", opcode_vrs, rd, rn, lane_imm, lane_reg) + } + } + &Inst::VecInsertLaneImm { + size, + rd, + imm, + lane_imm, + } => { + let op = match size { + 8 => "vleib", + 16 => "vleih", + 32 => "vleif", + 64 => "vleig", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + format!("{} {}, {}, {}", op, rd, imm, lane_imm) + } + &Inst::VecReplicateLane { + size, + rd, + rn, + lane_imm, + } => { + let op = match size { + 8 => "vrepb", + 16 => "vreph", + 32 => "vrepf", + 64 => "vrepg", + _ => unreachable!(), + }; + let rd = pretty_print_reg(rd.to_reg(), allocs); + let rn = pretty_print_reg(rn, allocs); + format!("{} {}, {}, {}", op, rd, rn, lane_imm) + } &Inst::Extend { rd, rn, diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 5dfc2ec3eca7..0685d7e653f3 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -36,6 +36,12 @@ (imm $F64 x)) +;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty (vconst (u128_from_constant x)))) + (vec_imm ty x)) + + ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (null))) @@ -98,6 +104,39 @@ 
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_sload32 x) y))) (add_mem_sext32 ty y (sink_sload32 x))) +;; Add two vector registers. +(rule (lower (has_type (ty_vec128 ty) (iadd x y))) + (vec_add ty x y)) + + +;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Add (saturate unsigned) two vector registers. +(rule (lower (has_type (ty_vec128 ty) (uadd_sat x y))) + (let ((sum Reg (vec_add ty x y))) + (vec_or ty sum (vec_cmphl ty x sum)))) + + +;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Add (saturate signed) two vector registers. $I64X2 not supported. +(rule (lower (has_type (ty_vec128 ty) (sadd_sat x y))) + (vec_pack_ssat (vec_widen_type ty) + (vec_add (vec_widen_type ty) (vec_unpacks_high ty x) + (vec_unpacks_high ty y)) + (vec_add (vec_widen_type ty) (vec_unpacks_low ty x) + (vec_unpacks_low ty y)))) + + +;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Lane-wise integer pairwise addition for 8-/16/32-bit vector registers. +(rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y))) + (let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits)))) + (vec_pack (vec_widen_type ty) + (vec_add ty y (vec_lshr_by_byte y size)) + (vec_add ty x (vec_lshr_by_byte x size))))) + ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -129,6 +168,28 @@ (rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_sload32 y)))) (sub_mem_sext32 ty x (sink_sload32 y))) +;; Sub two vector registers. +(rule (lower (has_type (ty_vec128 ty) (isub x y))) + (vec_sub ty x y)) + + +;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Add (saturate unsigned) two vector registers. +(rule (lower (has_type (ty_vec128 ty) (usub_sat x y))) + (vec_and ty (vec_sub ty x y) (vec_cmphl ty x y))) + + +;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Add (saturate signed) two vector registers. $I64X2 not supported. +(rule (lower (has_type (ty_vec128 ty) (ssub_sat x y))) + (vec_pack_ssat (vec_widen_type ty) + (vec_sub (vec_widen_type ty) (vec_unpacks_high ty x) + (vec_unpacks_high ty y)) + (vec_sub (vec_widen_type ty) (vec_unpacks_low ty x) + (vec_unpacks_low ty y)))) + ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -141,6 +202,10 @@ (rule (lower (has_type (fits_in_64 ty) (iabs (sext32_value x)))) (abs_reg_sext32 ty x)) +;; Absolute value of a vector register. +(rule (lower (has_type (ty_vec128 ty) (iabs x))) + (vec_abs ty x)) + ;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -193,6 +258,45 @@ (rule (lower (has_type (fits_in_64 ty) (ineg (sext32_value x)))) (neg_reg_sext32 ty x)) +;; Negate a vector register. +(rule (lower (has_type (ty_vec128 ty) (ineg x))) + (vec_neg ty x)) + + +;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned maximum of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (umax x y))) + (vec_umax ty x y)) + + +;;;; Rules for `umin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned minimum of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (umin x y))) + (vec_umin ty x y)) + + +;;;; Rules for `imax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Signed maximum of two vector registers. 
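As an aside on the `uadd_sat`/`usub_sat` rules earlier in this hunk: neither has a dedicated vector instruction, so both are built from a wrapping add/subtract plus a compare-high-logical mask. A minimal scalar sketch of those identities, per 8-bit lane (wider lanes are analogous); this is illustration only, not part of the patch:

```rust
// Scalar model of the branchless saturating identities used above:
//   uadd_sat: sum | cmphl(x, sum)   -- the add wrapped iff x >u sum
//   usub_sat: diff & cmphl(x, y)    -- the result is zero unless x >u y
fn uadd_sat_lane(x: u8, y: u8) -> u8 {
    let sum = x.wrapping_add(y);
    let overflow_mask = if x > sum { u8::MAX } else { 0 }; // vec_cmphl x, sum
    sum | overflow_mask
}

fn usub_sat_lane(x: u8, y: u8) -> u8 {
    let diff = x.wrapping_sub(y);
    let no_borrow_mask = if x > y { u8::MAX } else { 0 }; // vec_cmphl x, y
    diff & no_borrow_mask
}

fn main() {
    for (x, y) in [(200u8, 100u8), (1, 2), (100, 200), (5, 5)] {
        assert_eq!(uadd_sat_lane(x, y), x.saturating_add(y));
        assert_eq!(usub_sat_lane(x, y), x.saturating_sub(y));
    }
}
```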
+(rule (lower (has_type (ty_vec128 ty) (imax x y))) + (vec_smax ty x y)) + + +;;;; Rules for `imin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Signed minimum of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (imin x y))) + (vec_smin ty x y)) + + +;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned average of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (avg_round x y))) + (vec_uavg ty x y)) + ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -238,6 +342,24 @@ (rule (lower (has_type (fits_in_64 ty) (imul (sinkable_sload32 x) y))) (mul_mem_sext32 ty y (sink_sload32 x))) +;; Multiply two vector registers, using a helper. +(decl vec_mul_impl (Type Reg Reg) Reg) +(rule (lower (has_type (ty_vec128 ty) (imul x y))) + (vec_mul_impl ty x y)) + +;; Multiply two vector registers - byte, halfword, and word. +(rule (vec_mul_impl $I8X16 x y) (vec_mul $I8X16 x y)) +(rule (vec_mul_impl $I16X8 x y) (vec_mul $I16X8 x y)) +(rule (vec_mul_impl $I32X4 x y) (vec_mul $I32X4 x y)) + +;; Multiply two vector registers - doubleword. Has to be scalarized. +(rule (vec_mul_impl $I64X2 x y) + (mov_to_vec128 $I64X2 + (mul_reg $I64 (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg))) + (mul_reg $I64 (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg))))) + ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -260,6 +382,22 @@ (let ((pair RegPair (umul_wide x y))) (copy_reg $I64 (regpair_hi pair)))) +;; Multiply high part unsigned, vector types with 8-, 16-, or 32-bit elements. +(rule (lower (has_type $I8X16 (umulhi x y))) (vec_umulhi $I8X16 x y)) +(rule (lower (has_type $I16X8 (umulhi x y))) (vec_umulhi $I16X8 x y)) +(rule (lower (has_type $I32X4 (umulhi x y))) (vec_umulhi $I32X4 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements. +;; Has to be scalarized. +(rule (lower (has_type $I64X2 (umulhi x y))) + (let ((pair_0 RegPair (umul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg)))) + (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) + (pair_1 RegPair (umul_wide (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg)))) + (res_1 Reg (copy_reg $I64 (regpair_hi pair_1)))) + (mov_to_vec128 $I64X2 res_0 res_1))) + ;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -282,6 +420,55 @@ (let ((pair RegPair (smul_wide x y))) (copy_reg $I64 (regpair_hi pair)))) +;; Multiply high part signed, vector types with 8-, 16-, or 32-bit elements. +(rule (lower (has_type $I8X16 (smulhi x y))) (vec_smulhi $I8X16 x y)) +(rule (lower (has_type $I16X8 (smulhi x y))) (vec_smulhi $I16X8 x y)) +(rule (lower (has_type $I32X4 (smulhi x y))) (vec_smulhi $I32X4 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements. +;; Has to be scalarized. 
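With no doubleword variant of the vector multiply-high, the 64-bit rules here and just below are scalarized: each lane is extracted, run through `umul_wide`/`smul_wide`, and the high halves of the register pairs are recombined with `mov_to_vec128`. A scalar sketch of the per-lane operation (illustration only):

```rust
// Per-lane semantics of the scalarized 64-bit multiply-high: the upper half of
// the full 128-bit product, i.e. the regpair_hi result of {u,s}mul_wide.
fn umulhi64(x: u64, y: u64) -> u64 {
    ((x as u128 * y as u128) >> 64) as u64
}

fn smulhi64(x: i64, y: i64) -> i64 {
    ((x as i128 * y as i128) >> 64) as i64
}

fn main() {
    assert_eq!(umulhi64(u64::MAX, 2), 1);
    assert_eq!(smulhi64(-1, 2), -1);
    assert_eq!(smulhi64(i64::MIN, i64::MIN), i64::MAX / 2 + 1);
}
```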
+(rule (lower (has_type $I64X2 (smulhi x y))) + (let ((pair_0 RegPair (smul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) + (vec_extract_lane $I64X2 y 0 (zero_reg)))) + (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) + (pair_1 RegPair (smul_wide (vec_extract_lane $I64X2 x 1 (zero_reg)) + (vec_extract_lane $I64X2 y 1 (zero_reg)))) + (res_1 Reg (copy_reg $I64 (regpair_hi pair_1)))) + (mov_to_vec128 $I64X2 res_0 res_1))) + + +;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Widening pairwise dot product of two vector registers. +(rule (lower (has_type dst_ty (widening_pairwise_dot_product_s + x @ (value_type src_ty) y))) + (vec_add dst_ty (vec_smul_even src_ty x y) + (vec_smul_odd src_ty x y))) + + +;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Fixed-point multiplication of two vector registers. +(rule (lower (has_type (ty_vec128 ty) (sqmul_round_sat x y))) + (vec_pack_ssat (vec_widen_type ty) + (sqmul_impl (vec_widen_type ty) + (vec_unpacks_high ty x) + (vec_unpacks_high ty y)) + (sqmul_impl (vec_widen_type ty) + (vec_unpacks_low ty x) + (vec_unpacks_low ty y)))) + +;; Helper to perform the rounded multiply in the wider type. +(decl sqmul_impl (Type Reg Reg) Reg) +(rule (sqmul_impl $I32X4 x y) + (vec_ashr_imm $I32X4 (vec_add $I32X4 (vec_mul_impl $I32X4 x y) + (vec_imm_bit_mask $I32X4 17 17)) + 15)) +(rule (sqmul_impl $I64X2 x y) + (vec_ashr_imm $I64X2 (vec_add $I64X2 (vec_mul_impl $I64X2 x y) + (vec_imm_bit_mask $I64X2 33 33)) + 31)) + ;;;; Rules for `udiv` and `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -487,6 +674,15 @@ (let ((masked_amt u8 (mask_amt_imm ty y))) (lshl_imm ty x masked_amt))) +;; Vector shift left, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (ishl x y))) + (vec_lshl_reg ty x y)) + +;; Vector shift left, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (ishl x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_lshl_imm ty x masked_amt))) + ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -504,6 +700,15 @@ (masked_amt u8 (mask_amt_imm ty y))) (lshr_imm (ty_ext32 ty) ext_reg masked_amt))) +;; Vector shift right logical, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (ushr x y))) + (vec_lshr_reg ty x y)) + +;; Vector shift right logical, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (ushr x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_lshr_imm ty x masked_amt))) + ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -521,6 +726,15 @@ (masked_amt u8 (mask_amt_imm ty y))) (ashr_imm (ty_ext32 ty) ext_reg masked_amt))) +;; Vector shift right arithmetic, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (sshr x y))) + (vec_ashr_reg ty x y)) + +;; Vector shift right arithmetic, immediate shift amount. +(rule (lower (has_type (ty_vec128 ty) (sshr x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_ashr_imm ty x masked_amt))) + ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -556,6 +770,15 @@ (or_reg ty (lshl_imm ext_ty ext_reg masked_pos_amt) (lshr_imm ext_ty ext_reg masked_neg_amt)))) +;; Vector rotate left, shift amount in register. +(rule (lower (has_type (ty_vec128 ty) (rotl x y))) + (vec_rot_reg ty x y)) + +;; Vector rotate left, immediate shift amount. 
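The `sqmul_round_sat` lowering above widens each half, multiplies, adds a rounding bit produced by `vec_imm_bit_mask`, shifts right arithmetically, and packs with signed saturation. Assuming the bit-mask helper numbers bits from the most-significant end (so 17..17 on a 32-bit lane is the constant 0x4000), the $I16X8 case is the familiar Q15 rounded multiply; a scalar sketch, for illustration only:

```rust
// Scalar Q15 model of the $I16X8 sqmul_round_sat lowering: widen to i32,
// multiply, add 0x4000 as the rounding term, arithmetic-shift right by 15,
// then pack back to i16 with signed saturation.
fn q15_mulr_sat(x: i16, y: i16) -> i16 {
    let wide = (x as i32) * (y as i32);
    let rounded = (wide + 0x4000) >> 15;
    rounded.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    // i16::MIN * i16::MIN is the only product that needs the saturating pack.
    assert_eq!(q15_mulr_sat(i16::MIN, i16::MIN), i16::MAX);
    assert_eq!(q15_mulr_sat(0x4000, 0x2000), 0x1000);
}
```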
+(rule (lower (has_type (ty_vec128 ty) (rotl x (i64_from_value y)))) + (let ((masked_amt u8 (mask_amt_imm ty y))) + (vec_rot_imm ty x masked_amt))) + ;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -594,6 +817,19 @@ (or_reg ty (lshl_imm ext_ty ext_reg masked_neg_amt) (lshr_imm ext_ty ext_reg masked_pos_amt)))) +;; Vector rotate right, shift amount in register. +;; Implemented as rotate left with negated rotate amount. +(rule (lower (has_type (ty_vec128 ty) (rotr x y))) + (let ((negated_amt Reg (neg_reg $I32 y))) + (vec_rot_reg ty x negated_amt))) + +;; Vector rotate right, immediate shift amount. +;; Implemented as rotate left with negated rotate amount. +(rule (lower (has_type (ty_vec128 ty) (rotr x (i64_from_negated_value y)))) + (let ((negated_amt u8 (mask_amt_imm ty y))) + (vec_rot_imm ty x negated_amt))) + + ;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Always a no-op. @@ -623,6 +859,49 @@ (put_in_reg_sext64 x)) +;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y)) + (vec_pack_ssat ty y x)) + + +;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y)) + (vec_pack_usat ty y x)) + + +;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y)) + (let ((zero Reg (vec_imm ty 0))) + (vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero)))) + + +;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (swiden_low x @ (value_type (ty_vec128 ty)))) + (vec_unpacks_low ty x)) + + +;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (swiden_high x @ (value_type (ty_vec128 ty)))) + (vec_unpacks_high ty x)) + + +;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uwiden_low x @ (value_type (ty_vec128 ty)))) + (vec_unpacku_low ty x)) + + +;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (uwiden_high x @ (value_type (ty_vec128 ty)))) + (vec_unpacku_high ty x)) + + ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; z15 version using a single instruction (NOR). @@ -634,6 +913,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bnot x))) (not_reg ty x)) +;; Vector version using vector NOR. +(rule (lower (has_type (ty_vec128 ty) (bnot x))) + (vec_not ty x)) + ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -657,6 +940,9 @@ (rule (lower (has_type (fits_in_64 ty) (band (sinkable_load_32_64 x) y))) (and_mem ty y (sink_load x))) +;; And two vector registers. +(rule (lower (has_type (ty_vec128 ty) (band x y))) + (vec_and ty x y)) ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -680,6 +966,10 @@ (rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load_32_64 x) y))) (or_mem ty y (sink_load x))) +;; Or two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bor x y))) + (vec_or ty x y)) + ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -699,6 +989,10 @@ (rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load_32_64 x) y))) (xor_mem ty y (sink_load x))) +;; Xor two vector registers. 
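On the `rotr` rules earlier in this hunk: there is no vector rotate-right, so the patch rotates left by the negated amount and lets the rotate instruction reduce that amount modulo the lane width. A scalar sketch of the identity (illustration only):

```rust
// rotr(x, n) == rotl(x, -n mod lane_bits); shown here for a 32-bit lane.
fn rotr_via_rotl(x: u32, n: u32) -> u32 {
    // `& 31` models the hardware taking the rotate amount modulo the lane width.
    x.rotate_left(n.wrapping_neg() & 31)
}

fn main() {
    for n in 0..32 {
        assert_eq!(rotr_via_rotl(0x1234_5678, n), 0x1234_5678u32.rotate_right(n));
    }
}
```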
+(rule (lower (has_type (ty_vec128 ty) (bxor x y))) + (vec_xor ty x y)) + ;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -710,6 +1004,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (band_not x y))) (and_reg ty x (not_reg ty y))) +;; And-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (band_not x y))) + (vec_and_not ty x y)) + ;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -721,6 +1019,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bor_not x y))) (or_reg ty x (not_reg ty y))) +;; Or-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bor_not x y))) + (vec_or_not ty x y)) + ;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -732,6 +1034,10 @@ (rule (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bxor_not x y))) (not_reg ty (xor_reg ty x y))) +;; Xor-not two vector registers. +(rule (lower (has_type (ty_vec128 ty) (bxor_not x y))) + (vec_not_xor ty x y)) + ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -749,6 +1055,17 @@ (if_false Reg (and_reg ty z (not_reg ty rx)))) (or_reg ty if_false if_true))) +;; Bitselect vector registers. +(rule (lower (has_type (ty_vec128 ty) (bitselect x y z))) + (vec_select ty y z x)) + + +;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Vector select. +(rule (lower (has_type (ty_vec128 ty) (vselect x y z))) + (vec_select ty y z x)) + ;;;; Rules for `breduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -917,6 +1234,10 @@ (cnt1 Reg (add_reg $I64 cnt2 (lshl_imm $I64 cnt2 8)))) (lshr_imm $I64 cnt1 56))) +;; Population count for vector types. +(rule (lower (has_type (ty_vec128 ty) (popcnt x))) + (vec_popcnt ty x)) + ;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -960,6 +1281,20 @@ (fmax_reg ty x y)) +;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Minimum of two registers. +(rule (lower (has_type ty (fmin_pseudo x y))) + (fmin_pseudo_reg ty x y)) + + +;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Maximum of two registers. +(rule (lower (has_type ty (fmax_pseudo x y))) + (fmax_pseudo_reg ty x y)) + + ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Copysign of two registers. @@ -967,6 +1302,10 @@ (vec_select $F32 x y (imm $F32 2147483647))) (rule (lower (has_type $F64 (fcopysign x y))) (vec_select $F64 x y (imm $F64 9223372036854775807))) +(rule (lower (has_type $F32X4 (fcopysign x y))) + (vec_select $F32X4 x y (vec_imm_bit_mask $F32X4 1 31))) +(rule (lower (has_type $F64X2 (fcopysign x y))) + (vec_select $F64X2 x y (vec_imm_bit_mask $F64X2 1 63))) ;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1028,35 +1367,73 @@ ;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Promote a register. -(rule (lower (has_type dst_ty (fpromote x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) (fpromote x @ (value_type src_ty)))) (fpromote_reg dst_ty src_ty x)) +;;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Promote a register. 
+(rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4)))) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x))) + + ;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Demote a register. -(rule (lower (has_type dst_ty (fdemote x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) (fdemote x @ (value_type src_ty)))) (fdemote_reg dst_ty src_ty (FpuRoundMode.Current) x)) +;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Demote a register. +(rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2)))) + (let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x))) + (vec_permute $F32X4 dst (vec_imm $F32X4 0) + (vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16 + 0 1 2 3 8 9 10 11))))) + + ;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Convert a 32-bit or smaller unsigned integer to $F32 (z15 instruction). (rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty)))))) (fcvt_from_uint_reg $F32 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr32 (put_in_reg_zext32 x)))) + (put_in_reg_zext32 x))) ;; Convert a 64-bit or smaller unsigned integer to $F32, via an intermediate $F64. (rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_64 ty))))) (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven) (fcvt_from_uint_reg $F64 (FpuRoundMode.ShorterPrecision) - (mov_to_fpr64 (put_in_reg_zext64 x))))) + (put_in_reg_zext64 x)))) ;; Convert a 64-bit or smaller unsigned integer to $F64. (rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_64 ty))))) (fcvt_from_uint_reg $F64 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr64 (put_in_reg_zext64 x)))) + (put_in_reg_zext64 x))) + +;; Convert $I32X4 to $F32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4) + (fcvt_from_uint x @ (value_type $I32X4)))) + (fcvt_from_uint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x)) + +;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4) + (fcvt_from_uint x @ (value_type $I32X4)))) + (vec_permute $F32X4 + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacku_high $I32X4 x))) + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacku_low $I32X4 x))) + (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27)))) + +;; Convert $I64X2 to $F64X2. +(rule (lower (has_type $F64X2 (fcvt_from_uint x @ (value_type $I64X2)))) + (fcvt_from_uint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x)) ;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1065,25 +1442,55 @@ (rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (and (vxrs_ext2_enabled) (fits_in_32 ty)))))) (fcvt_from_sint_reg $F32 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr32 (put_in_reg_sext32 x)))) + (put_in_reg_sext32 x))) ;; Convert a 64-bit or smaller signed integer to $F32, via an intermediate $F64. (rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_64 ty))))) (fdemote_reg $F32 $F64 (FpuRoundMode.ToNearestTiesToEven) (fcvt_from_sint_reg $F64 (FpuRoundMode.ShorterPrecision) - (mov_to_fpr64 (put_in_reg_sext64 x))))) + (put_in_reg_sext64 x)))) ;; Convert a 64-bit or smaller signed integer to $F64. 
(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_64 ty))))) (fcvt_from_sint_reg $F64 (FpuRoundMode.ToNearestTiesToEven) - (mov_to_fpr64 (put_in_reg_sext64 x)))) + (put_in_reg_sext64 x))) + +;; Convert $I32X4 to $F32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $F32X4) + (fcvt_from_sint x @ (value_type $I32X4)))) + (fcvt_from_sint_reg $F32X4 (FpuRoundMode.ToNearestTiesToEven) x)) + +;; Convert $I32X4 to $F32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $F32X4) + (fcvt_from_sint x @ (value_type $I32X4)))) + (vec_permute $F32X4 + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacks_high $I32X4 x))) + (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ShorterPrecision) + (vec_unpacks_low $I32X4 x))) + (vec_imm $I8X16 (imm8x16 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27)))) + +;; Convert $I64X2 to $F64X2. +(rule (lower (has_type $F64X2 (fcvt_from_sint x @ (value_type $I64X2)))) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) x)) + + +;;;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Convert the low half of a $I32X4 to a $F64X2. +(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4)))) + (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) + (vec_unpacks_low $I32X4 x))) ;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to an unsigned integer value. +;; Convert a scalar floating-point value in a register to an unsigned integer. ;; Traps if the input cannot be represented in the output type. -(rule (lower (has_type dst_ty (fcvt_to_uint x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_uint x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; First, check whether the input is a NaN, and trap if so. (_1 Reg (trap_if (fcmp_reg src_ty src src) @@ -1104,9 +1511,10 @@ ;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to a signed integer value. +;; Convert a scalar floating-point value in a register to a signed integer. ;; Traps if the input cannot be represented in the output type. -(rule (lower (has_type dst_ty (fcvt_to_sint x @ (value_type src_ty)))) +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_sint x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; First, check whether the input is a NaN, and trap if so. (_1 Reg (trap_if (fcmp_reg src_ty src src) @@ -1128,8 +1536,9 @@ ;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to an unsigned integer value. -(rule (lower (has_type dst_ty (fcvt_to_uint_sat x @ (value_type src_ty)))) +;; Convert a scalar floating-point value in a register to an unsigned integer. +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_uint_sat x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; Perform the conversion using the larger type size. (flt_ty Type (fcvt_flt_ty dst_ty src_ty)) @@ -1139,11 +1548,30 @@ ;; Clamp the output to the destination type bounds. (uint_sat_reg dst_ty int_ty dst))) +;; Convert $F32X4 to $I32X4 (z15 instruction). 
+(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4) + (fcvt_to_uint_sat x @ (value_type $F32X4)))) + (fcvt_to_uint_reg $F32X4 (FpuRoundMode.ToZero) x)) + +;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4) + (fcvt_to_uint_sat x @ (value_type $F32X4)))) + (vec_pack_usat $I64X2 + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 x x))) + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x))))) + +;; Convert $F64X2 to $I64X2. +(rule (lower (has_type $I64X2 (fcvt_to_uint_sat x @ (value_type $F64X2)))) + (fcvt_to_uint_reg $F64X2 (FpuRoundMode.ToZero) x)) + ;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Convert a floating-point value in a register to a signed integer value. -(rule (lower (has_type dst_ty (fcvt_to_sint_sat x @ (value_type src_ty)))) +;; Convert a scalar floating-point value in a register to a signed integer. +(rule (lower (has_type (fits_in_64 dst_ty) + (fcvt_to_sint_sat x @ (value_type src_ty)))) (let ((src Reg (put_in_reg x)) ;; Perform the conversion using the larger type size. (flt_ty Type (fcvt_flt_ty dst_ty src_ty)) @@ -1160,24 +1588,578 @@ ;; Clamp the output to the destination type bounds. (sint_sat_reg dst_ty int_ty sat))) +;; Convert $F32X4 to $I32X4 (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) $I32X4) + (fcvt_to_sint_sat src @ (value_type $F32X4)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I32X4 + (fcvt_to_sint_reg $F32X4 (FpuRoundMode.ToZero) src) + (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src))) + +;; Convert $F32X4 to $I32X4 (via two $F64X2 on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) $I32X4) + (fcvt_to_sint_sat src @ (value_type $F32X4)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I32X4 + (vec_pack_ssat $I64X2 + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_high $I32X4 src src))) + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 src src)))) + (vec_imm $I32X4 0) (vec_fcmpeq $F32X4 src src))) + +;; Convert $F64X2 to $I64X2. +(rule (lower (has_type $I64X2 (fcvt_to_sint_sat src @ (value_type $F64X2)))) + ;; See above for why we need to handle NaNs specially. + (vec_select $I64X2 + (fcvt_to_sint_reg $F64X2 (FpuRoundMode.ToZero) src) + (vec_imm $I64X2 0) (vec_fcmpeq $F64X2 src src))) + ;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Reinterpret a 64-bit integer value as floating-point. (rule (lower (has_type $F64 (bitcast x @ (value_type $I64)))) - (mov_to_fpr64 x)) + (vec_insert_lane_undef $F64X2 x 0 (zero_reg))) ;; Reinterpret a 64-bit floating-point value as integer. (rule (lower (has_type $I64 (bitcast x @ (value_type $F64)))) - (mov_from_fpr64 x)) + (vec_extract_lane $F64X2 x 0 (zero_reg))) -;; Reinterpret a 32-bit integer value as floating-point (via $I64). +;; Reinterpret a 32-bit integer value as floating-point. (rule (lower (has_type $F32 (bitcast x @ (value_type $I32)))) - (mov_to_fpr32 x)) + (vec_insert_lane_undef $F32X4 x 0 (zero_reg))) -;; Reinterpret a 32-bit floating-point value as integer (via $I64). +;; Reinterpret a 32-bit floating-point value as integer. 
(rule (lower (has_type $I32 (bitcast x @ (value_type $F32)))) - (mov_from_fpr32 x)) + (vec_extract_lane $F32X4 x 0 (zero_reg))) + + +;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Raw bitcast is always a no-op. +(rule (lower (raw_bitcast x)) x) + + +;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Insert vector lane from general-purpose register. +(rule (lower (insertlane x @ (value_type ty) + y @ (value_type (ty_int_bool_ref_scalar_64 _)) + (u8_from_uimm8 idx))) + (vec_insert_lane ty x y (be_lane_idx ty idx) (zero_reg))) + +;; Insert vector lane from floating-point register. +(rule (lower (insertlane x @ (value_type ty) + y @ (value_type (ty_scalar_float _)) + (u8_from_uimm8 idx))) + (vec_move_lane_and_insert ty x (be_lane_idx ty idx) y 0)) + +;; Insert vector lane from another vector lane. +(rule (lower (insertlane x @ (value_type ty) + (extractlane y (u8_from_uimm8 src_idx)) + (u8_from_uimm8 dst_idx))) + (vec_move_lane_and_insert ty x (be_lane_idx ty dst_idx) + y (be_lane_idx ty src_idx))) + +;; Insert vector lane from signed 16-bit immediate. +(rule (lower (insertlane x @ (value_type ty) (i16_from_value y) + (u8_from_uimm8 idx))) + (vec_insert_lane_imm ty x y (be_lane_idx ty idx))) + +;; Insert vector lane from big-endian memory. +(rule (lower (insertlane x @ (value_type ty) (sinkable_load y) + (u8_from_uimm8 idx))) + (vec_load_lane ty x (sink_load y) (be_lane_idx ty idx))) + +;; Insert vector lane from little-endian memory. +(rule (lower (insertlane x @ (value_type ty) (sinkable_load_little y) + (u8_from_uimm8 idx))) + (vec_load_lane_little ty x (sink_load y) (be_lane_idx ty idx))) + + +;; Helper to extract one lane from a vector and insert it into another. +(decl vec_move_lane_and_insert (Type Reg u8 Reg u8) Reg) + +;; For 64-bit elements we always use VPDI. +(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 0 src src_idx) + (vec_permute_dw_imm ty src src_idx dst 1)) +(rule (vec_move_lane_and_insert ty @ (multi_lane 64 _) dst 1 src src_idx) + (vec_permute_dw_imm ty dst 0 src src_idx)) + +;; If source and destination index are the same, use vec_select. +(rule (vec_move_lane_and_insert ty dst idx src idx) + (vec_select ty src + dst (vec_imm_byte_mask ty (lane_byte_mask ty idx)))) + +;; Otherwise replicate source first and then use vec_select. +(rule (vec_move_lane_and_insert ty dst dst_idx src src_idx) + (vec_select ty (vec_replicate_lane ty src src_idx) + dst (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx)))) + + +;; Helper to implement a generic little-endian variant of vec_load_lane. +(decl vec_load_lane_little (Type Reg MemArg u8) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_lane_little ty @ (multi_lane 8 _) dst addr lane_imm) + (vec_load_lane ty dst addr lane_imm)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) +(rule (vec_load_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) dst addr lane_imm) + (vec_load_lane_rev ty dst addr lane_imm)) + +;; On z14, use a little-endian load to GPR followed by vec_insert_lane. 
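The lane-indexed rules in this hunk all go through `be_lane_idx` because CLIF numbers lanes in little-endian order while the s390x vector instructions number elements from the big end. The helper below is hypothetical, a guess at what `be_lane_idx` computes, included only to make the mirroring concrete:

```rust
// Hypothetical model of `be_lane_idx` (not the patch's actual definition):
// reflect a CLIF (little-endian) lane number across the vector to get the
// big-endian element number used by the s390x instructions.
fn be_lane_idx(lane_count: u8, clif_idx: u8) -> u8 {
    assert!(clif_idx < lane_count);
    lane_count - 1 - clif_idx
}

fn main() {
    assert_eq!(be_lane_idx(4, 0), 3); // $I32X4 lane 0 -> element 3
    assert_eq!(be_lane_idx(16, 15), 0); // $I8X16 lane 15 -> element 0
}
```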
+(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev16 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev32 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) dst addr lane_imm) + (vec_insert_lane ty dst (loadrev64 addr) lane_imm (zero_reg))) + +;; Helper to implement a generic little-endian variant of vec_load_lane_undef. +(decl vec_load_lane_little_undef (Type MemArg u8) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_lane_little_undef ty @ (multi_lane 8 _) addr lane_imm) + (vec_load_lane_undef ty addr lane_imm)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) addr lane_imm) + (vec_load_lane_rev_undef ty addr lane_imm)) + +;; On z14, use a little-endian load to GPR followed by vec_insert_lane_undef. +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev16 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev32 addr) lane_imm (zero_reg))) +(rule (vec_load_lane_little_undef (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) addr lane_imm) + (vec_insert_lane_undef ty (loadrev64 addr) lane_imm (zero_reg))) + + +;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Extract vector lane to general-purpose register. +(rule (lower (has_type (ty_int_bool_ref_scalar_64 _) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))) + (vec_extract_lane ty x (be_lane_idx ty idx) (zero_reg))) + +;; Extract vector lane to floating-point register. +(rule (lower (has_type (ty_scalar_float _) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))) + (vec_replicate_lane ty x (be_lane_idx ty idx))) + +;; Extract vector lane and store to big-endian memory. +(rule (lower (store flags @ (bigendian) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)) + addr offset)) + (side_effect (vec_store_lane ty x + (lower_address flags addr offset) (be_lane_idx ty idx)))) + +;; Extract vector lane and store to little-endian memory. +(rule (lower (store flags @ (littleendian) + (extractlane x @ (value_type ty) (u8_from_uimm8 idx)) + addr offset)) + (side_effect (vec_store_lane_little ty x + (lower_address flags addr offset) (be_lane_idx ty idx)))) + + +;; Helper to implement a generic little-endian variant of vec_store_lane. +(decl vec_store_lane_little (Type Reg MemArg u8) SideEffectNoResult) + +;; 8-byte little-endian stores can be performed via a normal store. +(rule (vec_store_lane_little ty @ (multi_lane 8 _) src addr lane_imm) + (vec_store_lane ty src addr lane_imm)) + +;; On z15, we have instructions to perform little-endian stores. 
+(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) +(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) +(rule (vec_store_lane_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) src addr lane_imm) + (vec_store_lane_rev ty src addr lane_imm)) + +;; On z14, use vec_extract_lane followed by a little-endian store from GPR. +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) src addr lane_imm) + (storerev16 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) src addr lane_imm) + (storerev32 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) +(rule (vec_store_lane_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) src addr lane_imm) + (storerev64 (vec_extract_lane ty src lane_imm (zero_reg)) addr)) + + +;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Load replicated value from general-purpose register. +(rule (lower (has_type ty (splat + x @ (value_type (ty_int_bool_ref_scalar_64 _))))) + (vec_replicate_lane ty (vec_insert_lane_undef ty x 0 (zero_reg)) 0)) + +;; Load replicated value from floating-point register. +(rule (lower (has_type ty (splat + x @ (value_type (ty_scalar_float _))))) + (vec_replicate_lane ty x 0)) + +;; Load replicated value from vector lane. +(rule (lower (has_type ty (splat (extractlane x (u8_from_uimm8 idx))))) + (vec_replicate_lane ty x (be_lane_idx ty idx))) + +;; Load replicated 16-bit immediate value. +(rule (lower (has_type ty (splat (i16_from_value x)))) + (vec_imm_replicate ty x)) + +;; Load replicated value from big-endian memory. +(rule (lower (has_type ty (splat (sinkable_load x)))) + (vec_load_replicate ty (sink_load x))) + +;; Load replicated value from little-endian memory. +(rule (lower (has_type ty (splat (sinkable_load_little x)))) + (vec_load_replicate_little ty (sink_load x))) + + +;; Helper to implement a generic little-endian variant of vec_load_replicate +(decl vec_load_replicate_little (Type MemArg) Reg) + +;; 8-byte little-endian loads can be performed via a normal load. +(rule (vec_load_replicate_little ty @ (multi_lane 8 _) addr) + (vec_load_replicate ty addr)) + +;; On z15, we have instructions to perform little-endian loads. +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 16 _)) addr) + (vec_load_replicate_rev ty addr)) +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 32 _)) addr) + (vec_load_replicate_rev ty addr)) +(rule (vec_load_replicate_little (and (vxrs_ext2_enabled) + ty @ (multi_lane 64 _)) addr) + (vec_load_replicate_rev ty addr)) + +;; On z14, use a little-endian load (via GPR) and replicate. +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 16 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 32 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) +(rule (vec_load_replicate_little (and (vxrs_ext2_disabled) + ty @ (multi_lane 64 _)) addr) + (vec_replicate_lane ty (vec_load_lane_little_undef ty addr 0) 0)) + + +;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Load scalar value from general-purpose register. 
+(rule (lower (has_type ty (scalar_to_vector + x @ (value_type (ty_int_bool_ref_scalar_64 _))))) + (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg))) + +;; Load scalar value from floating-point register. +(rule (lower (has_type ty (scalar_to_vector + x @ (value_type (ty_scalar_float _))))) + (vec_move_lane_and_zero ty (be_lane_idx ty 0) x 0)) + +;; Load scalar value from vector lane. +(rule (lower (has_type ty (scalar_to_vector + (extractlane x (u8_from_uimm8 idx))))) + (vec_move_lane_and_zero ty (be_lane_idx ty 0) x (be_lane_idx ty idx))) + +;; Load scalar 16-bit immediate value. +(rule (lower (has_type ty (scalar_to_vector (i16_from_value x)))) + (vec_insert_lane_imm ty (vec_imm ty 0) x (be_lane_idx ty 0))) + +;; Load scalar value from big-endian memory. +(rule (lower (has_type ty (scalar_to_vector (sinkable_load x)))) + (vec_load_lane ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0))) + +;; Load scalar value lane from little-endian memory. +(rule (lower (has_type ty (scalar_to_vector (sinkable_load_little x)))) + (vec_load_lane_little ty (vec_imm ty 0) (sink_load x) (be_lane_idx ty 0))) + + +;; Helper to extract one lane from a vector and insert it into a zero vector. +(decl vec_move_lane_and_zero (Type u8 Reg u8) Reg) + +;; For 64-bit elements we always use VPDI. +(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 0 src src_idx) + (vec_permute_dw_imm ty src src_idx (vec_imm ty 0) 0)) +(rule (vec_move_lane_and_zero ty @ (multi_lane 64 _) 1 src src_idx) + (vec_permute_dw_imm ty (vec_imm ty 0) 0 src src_idx)) + +;; If source and destination index are the same, simply mask to this lane. +(rule (vec_move_lane_and_zero ty idx src idx) + (vec_and ty src + (vec_imm_byte_mask ty (lane_byte_mask ty idx)))) + +;; Otherwise replicate source first and then mask to the lane. +(rule (vec_move_lane_and_zero ty dst_idx src src_idx) + (vec_and ty (vec_replicate_lane ty src src_idx) + (vec_imm_byte_mask ty (lane_byte_mask ty dst_idx)))) + + +;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; General case: use vec_permute and then mask off zero lanes. +(rule (lower (shuffle x y (shuffle_mask permute_mask and_mask))) + (vec_and $I8X16 (vec_imm_byte_mask $I8X16 and_mask) + (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask)))) + +;; If the pattern has no zero lanes, just a vec_permute suffices. +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (vec_permute $I8X16 x y (vec_imm $I8X16 permute_mask))) + +;; Special patterns that can be implemented via MERGE HIGH. 
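+;; (vec_merge_high interleaves the elements of the leftmost halves of its
+;; two operands, starting with the first operand; the permute masks below
+;; encode exactly that pattern for each element size.)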
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23) permute_mask) + (vec_merge_high $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23) permute_mask) + (vec_merge_high $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23) permute_mask) + (vec_merge_high $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23) permute_mask) + (vec_merge_high $I8X16 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7) permute_mask) + (vec_merge_high $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7) permute_mask) + (vec_merge_high $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7) permute_mask) + (vec_merge_high $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7) permute_mask) + (vec_merge_high $I8X16 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7) permute_mask) + (vec_merge_high $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7) permute_mask) + (vec_merge_high $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7) permute_mask) + (vec_merge_high $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7) permute_mask) + (vec_merge_high $I8X16 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23) permute_mask) + (vec_merge_high $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23) permute_mask) + (vec_merge_high $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23) permute_mask) + (vec_merge_high $I16X8 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23) permute_mask) + (vec_merge_high $I8X16 y y)) + +;; Special patterns that can be implemented via MERGE LOW. 
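+;; (vec_merge_low interleaves the elements of the rightmost halves of its
+;; two operands, starting with the first operand.)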
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31) permute_mask) + (vec_merge_low $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31) permute_mask) + (vec_merge_low $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31) permute_mask) + (vec_merge_low $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31) permute_mask) + (vec_merge_low $I8X16 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15) permute_mask) + (vec_merge_low $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15) permute_mask) + (vec_merge_low $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15) permute_mask) + (vec_merge_low $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15) permute_mask) + (vec_merge_low $I8X16 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15) permute_mask) + (vec_merge_low $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15) permute_mask) + (vec_merge_low $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15) permute_mask) + (vec_merge_low $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15) permute_mask) + (vec_merge_low $I8X16 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31) permute_mask) + (vec_merge_low $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31) permute_mask) + (vec_merge_low $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31) permute_mask) + (vec_merge_low $I16X8 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31) permute_mask) + (vec_merge_low $I8X16 y y)) + +;; Special patterns that can be implemented via PACK. 
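+;; (vec_pack truncates each element of its two operands to half its width
+;; and concatenates the results.)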
+(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31) permute_mask) + (vec_pack $I64X2 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31) permute_mask) + (vec_pack $I32X4 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31) permute_mask) + (vec_pack $I16X8 x y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 20 21 22 23 28 29 30 31 4 5 6 7 12 13 14 15) permute_mask) + (vec_pack $I64X2 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 18 19 22 23 26 27 30 31 2 3 6 7 10 11 14 15) permute_mask) + (vec_pack $I32X4 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 17 19 21 23 25 27 29 31 1 3 5 7 9 11 13 15) permute_mask) + (vec_pack $I16X8 y x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15) permute_mask) + (vec_pack $I64X2 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 2 3 6 7 10 11 14 15 2 3 6 7 10 11 14 15) permute_mask) + (vec_pack $I32X4 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 1 3 5 7 9 11 13 15 1 3 5 7 9 11 13 15) permute_mask) + (vec_pack $I16X8 x x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 20 21 22 23 28 29 30 31 20 21 22 23 28 29 30 31) permute_mask) + (vec_pack $I64X2 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 18 19 22 23 26 27 30 31 18 19 22 23 26 27 30 31) permute_mask) + (vec_pack $I32X4 y y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 17 19 21 23 25 27 29 31 17 19 21 23 25 27 29 31) permute_mask) + (vec_pack $I16X8 y y)) + +;; Special patterns that can be implemented via UNPACK HIGH. +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 0 1 2 3 _ _ _ _ 4 5 6 7) permute_mask) + (vec_unpacku_high $I32X4 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 0 1 _ _ 2 3 _ _ 4 5 _ _ 6 7) permute_mask) + (vec_unpacku_high $I16X8 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 0 _ 1 _ 2 _ 3 _ 4 _ 5 _ 6 _ 7) permute_mask) + (vec_unpacku_high $I8X16 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 16 17 18 19 _ _ _ _ 20 21 22 23) permute_mask) + (vec_unpacku_high $I32X4 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 16 17 _ _ 18 19 _ _ 20 21 _ _ 22 23) permute_mask) + (vec_unpacku_high $I16X8 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 16 _ 17 _ 18 _ 19 _ 20 _ 21 _ 22 _ 23) permute_mask) + (vec_unpacku_high $I8X16 y)) + +;; Special patterns that can be implemented via UNPACK LOW. 
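+;; (vec_unpacku_low zero-extends the elements of the rightmost half of its
+;; operand to twice their width.)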
+(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 8 9 10 11 _ _ _ _ 12 13 14 15) permute_mask) + (vec_unpacku_low $I32X4 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 8 9 _ _ 10 11 _ _ 12 13 _ _ 14 15) permute_mask) + (vec_unpacku_low $I16X8 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 8 _ 9 _ 10 _ 11 _ 12 _ 13 _ 14 _ 15) permute_mask) + (vec_unpacku_low $I8X16 x)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 3855))) + (if-let (imm8x16 _ _ _ _ 24 25 26 27 _ _ _ _ 28 29 30 31) permute_mask) + (vec_unpacku_low $I32X4 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 13107))) + (if-let (imm8x16 _ _ 24 25 _ _ 26 27 _ _ 28 29 _ _ 30 31) permute_mask) + (vec_unpacku_low $I16X8 y)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 21845))) + (if-let (imm8x16 _ 24 _ 25 _ 26 _ 27 _ 28 _ 29 _ 30 _ 31) permute_mask) + (vec_unpacku_low $I8X16 y)) + +;; Special patterns that can be implemented via PERMUTE DOUBLEWORD IMMEDIATE. +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 24 25 26 27 28 29 30 31) permute_mask) + (vec_permute_dw_imm $I8X16 x 0 y 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) permute_mask) + (vec_permute_dw_imm $I8X16 x 1 y 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15) permute_mask) + (vec_permute_dw_imm $I8X16 y 0 x 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 0 1 2 3 4 5 6 7) permute_mask) + (vec_permute_dw_imm $I8X16 y 1 x 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) permute_mask) + (vec_permute_dw_imm $I8X16 x 0 x 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7) permute_mask) + (vec_permute_dw_imm $I8X16 x 1 x 0)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31) permute_mask) + (vec_permute_dw_imm $I8X16 y 0 y 1)) +(rule (lower (shuffle x y (shuffle_mask permute_mask 65535))) + (if-let (imm8x16 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23) permute_mask) + (vec_permute_dw_imm $I8X16 y 1 y 0)) + + +;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; We need to modify the lane mask at runtime in two ways: +;; - convert from little-endian to big-endian lane numbering +;; - handle mask elements outside the range 0..15 by zeroing the lane +;; +;; To do so efficiently, we compute: +;; permute-lane-element := umax (239, ~ swizzle-lane-element) +;; which has the following effect: +;; elements 0 .. 15 --> 255 .. 240 (i.e. 31 .. 16 mod 32) +;; everything else --> 239 (i.e. 15 mod 32) +;; +;; Then, we can use a single permute instruction with +;; a zero vector as first operand (covering lane 15) +;; the input vector as second operand (covering lanes 16 .. 31) +;; to implement the required swizzle semantics. 
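+;;
+;; For example, swizzle lane element 5 gives ~5 = 250, umax (239, 250) = 250,
+;; i.e. 26 mod 32, which selects a byte of the input vector (the second
+;; permute operand), while an out-of-range element such as 200 gives
+;; ~200 = 55, umax (239, 55) = 239, i.e. 15 mod 32, which selects the last
+;; byte of the zero vector.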
+ +(rule (lower (has_type (ty_vec128 ty) (swizzle x y))) + (vec_permute ty (vec_imm ty 0) x + (vec_umax $I8X16 (vec_imm_splat $I8X16 239) + (vec_not $I8X16 y)))) ;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1249,35 +2231,39 @@ (rule (lower (has_type $R64 (load flags @ (littleendian) addr offset))) (loadrev64 (lower_address flags addr offset))) -;; Load 32-bit big-endian floating-point values. +;; Load 32-bit big-endian floating-point values (as vector lane). (rule (lower (has_type $F32 (load flags @ (bigendian) addr offset))) - (fpu_load32 (lower_address flags addr offset))) + (vec_load_lane_undef $F32X4 (lower_address flags addr offset) 0)) -;; Load 32-bit little-endian floating-point values (z15 instruction). -(rule (lower (has_type (and (vxrs_ext2_enabled) $F32) - (load flags @ (littleendian) addr offset))) - (fpu_loadrev32 (lower_address flags addr offset))) +;; Load 32-bit little-endian floating-point values (as vector lane). +(rule (lower (has_type $F32 (load flags @ (littleendian) addr offset))) + (vec_load_lane_little_undef $F32X4 (lower_address flags addr offset) 0)) -;; Load 32-bit little-endian floating-point values (via GPR on z14). -(rule (lower (has_type (and (vxrs_ext2_disabled) $F32) - (load flags @ (littleendian) addr offset))) - (let ((gpr Reg (loadrev32 (lower_address flags addr offset)))) - (mov_to_fpr32 gpr))) - -;; Load 64-bit big-endian floating-point values. +;; Load 64-bit big-endian floating-point values (as vector lane). (rule (lower (has_type $F64 (load flags @ (bigendian) addr offset))) - (fpu_load64 (lower_address flags addr offset))) + (vec_load_lane_undef $F64X2 (lower_address flags addr offset) 0)) -;; Load 64-bit little-endian floating-point values (z15 instruction). -(rule (lower (has_type (and (vxrs_ext2_enabled) $F64) - (load flags @ (littleendian) addr offset))) - (fpu_loadrev64 (lower_address flags addr offset))) +;; Load 64-bit little-endian floating-point values (as vector lane). +(rule (lower (has_type $F64 (load flags @ (littleendian) addr offset))) + (vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0)) -;; Load 64-bit little-endian floating-point values (via GPR on z14). -(rule (lower (has_type (and (vxrs_ext2_disabled) $F64) - (load flags @ (littleendian) addr offset))) - (let ((gpr Reg (loadrev64 (lower_address flags addr offset)))) - (mov_to_fpr64 gpr))) +;; Load 128-bit big-endian vector values. +(rule (lower (has_type (ty_vec128 ty) (load flags @ (bigendian) addr offset))) + (vec_load ty (lower_address flags addr offset))) + +;; Load 128-bit little-endian vector values (z15 instruction). +(rule (lower (has_type (and (vxrs_ext2_enabled) (ty_vec128 ty)) + (load flags @ (littleendian) addr offset))) + (vec_loadrev ty (lower_address flags addr offset))) + +;; Load 128-bit little-endian vector values (via GPRs on z14). +(rule (lower (has_type (and (vxrs_ext2_disabled) (ty_vec128 ty)) + (load flags @ (littleendian) addr offset))) + (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) + (hi_addr MemArg (lower_address_bias flags addr offset 8)) + (lo_val Reg (loadrev64 lo_addr)) + (hi_val Reg (loadrev64 hi_addr))) + (mov_to_vec128 ty hi_val lo_val))) ;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1380,6 +2366,69 @@ (sext64_reg $I32 reg32))) +;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Unsigned 8->16 bit extension, big-endian source value. 
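+;; The 8-byte memory operand is loaded into the leftmost doubleword lane of
+;; a vector register and its elements are then widened via unpack-high.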
+(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I8X16 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 8->16 bit extension, little-endian source value. +(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I8X16 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 8->16 bit extension, big-endian source value. +(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I8X16 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 8->16 bit extension, little-endian source value. +(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I8X16 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 16->32 bit extension, big-endian source value. +(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I16X8 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 16->32 bit extension, little-endian source value. +(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I16X8 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 16->32 bit extension, big-endian source value. +(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I16X8 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 16->32 bit extension, little-endian source value. +(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I16X8 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 32->64 bit extension, big-endian source value. +(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset))) + (vec_unpacku_high $I32X4 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Unsigned 32->64 bit extension, little-endian source value. +(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset))) + (vec_unpacku_high $I32X4 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 32->64 bit extension, big-endian source value. +(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset))) + (vec_unpacks_high $I32X4 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) + +;; Signed 32->64 bit extension, little-endian source value. +(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset))) + (vec_unpacks_high $I32X4 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) + + ;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; The actual store logic for integer types is identical for the `store`, @@ -1405,41 +2454,49 @@ (rule (lower (store flags val @ (value_type $R64) addr offset)) (side_effect (istore64_impl flags val addr offset))) -;; Store 32-bit big-endian floating-point type. +;; Store 32-bit big-endian floating-point type (as vector lane). (rule (lower (store flags @ (bigendian) val @ (value_type $F32) addr offset)) - (side_effect (fpu_store32 (put_in_reg val) - (lower_address flags addr offset)))) + (side_effect (vec_store_lane $F32X4 val + (lower_address flags addr offset) 0))) -;; Store 32-bit little-endian floating-point type (z15 instruction). 
+;; Store 32-bit little-endian floating-point type (as vector lane). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F32 (vxrs_ext2_enabled))) addr offset)) - (side_effect (fpu_storerev32 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type $F32) addr offset)) + (side_effect (vec_store_lane_little $F32X4 val + (lower_address flags addr offset) 0))) + +;; Store 64-bit big-endian floating-point type (as vector lane). +(rule (lower (store flags @ (bigendian) + val @ (value_type $F64) addr offset)) + (side_effect (vec_store_lane $F64X2 val + (lower_address flags addr offset) 0))) -;; Store 32-bit little-endian floating-point type (via GPR on z14). +;; Store 64-bit little-endian floating-point type (as vector lane). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F32 (vxrs_ext2_disabled))) addr offset)) - (let ((gpr Reg (mov_from_fpr32 (put_in_reg val)))) - (side_effect (storerev32 gpr (lower_address flags addr offset))))) + val @ (value_type $F64) addr offset)) + (side_effect (vec_store_lane_little $F64X2 val + (lower_address flags addr offset) 0))) -;; Store 64-bit big-endian floating-point type. +;; Store 128-bit big-endian vector type. (rule (lower (store flags @ (bigendian) - val @ (value_type $F64) addr offset)) - (side_effect (fpu_store64 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type (ty_vec128 ty)) addr offset)) + (side_effect (vec_store val (lower_address flags addr offset)))) -;; Store 64-bit little-endian floating-point type (z15 instruction). +;; Store 128-bit little-endian vector type (z15 instruction). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F64 (vxrs_ext2_enabled))) addr offset)) - (side_effect (fpu_storerev64 (put_in_reg val) - (lower_address flags addr offset)))) + val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_enabled))) addr offset)) + (side_effect (vec_storerev val (lower_address flags addr offset)))) -;; Store 64-bit little-endian floating-point type (via GPR on z14). +;; Store 128-bit little-endian vector type (via GPRs on z14). (rule (lower (store flags @ (littleendian) - val @ (value_type (and $F64 (vxrs_ext2_disabled))) addr offset)) - (let ((gpr Reg (mov_from_fpr64 (put_in_reg val)))) - (side_effect (storerev64 gpr (lower_address flags addr offset))))) + val @ (value_type (and (ty_vec128 ty) (vxrs_ext2_disabled))) addr offset)) + (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) + (hi_addr MemArg (lower_address_bias flags addr offset 8)) + (lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg))) + (hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg)))) + (side_effect (side_effect_concat (storerev64 lo_val lo_addr) + (storerev64 hi_val hi_addr))))) ;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1947,7 +3004,7 @@ ;; Main `icmp` entry point. Generate a `ProducesBool` capturing the ;; integer comparison and immediately lower it to a 0/1 integer result. ;; In this case, it is safe to sink memory loads. -(rule (lower (has_type ty (icmp int_cc x y))) +(rule (lower (has_type (fits_in_64 ty) (icmp int_cc x y))) (lower_bool ty (icmp_val $true int_cc x y))) @@ -2033,12 +3090,36 @@ (rule (icmpu_val $true x @ (value_type (fits_in_64 ty)) (sinkable_uload32 y)) (icmpu_mem_zext32 ty x (sink_uload32 y))) +;; Vector `icmp` produces a boolean vector. +;; We need to handle the various IntCC flags separately here. 
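+;; Only the equal, signed greater-than, and unsigned greater-than compares
+;; are used directly; the remaining conditions are obtained by swapping the
+;; operands and/or negating the result.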
+ +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.Equal) x y))) + (vec_cmpeq ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.NotEqual) x y))) + (vec_not ty (vec_cmpeq ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThan) x y))) + (vec_cmph ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThanOrEqual) x y))) + (vec_not ty (vec_cmph ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedLessThan) x y))) + (vec_cmph ty y x)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (vec_not ty (vec_cmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThan) x y))) + (vec_cmphl ty x y)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (vec_not ty (vec_cmphl ty x y))) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedLessThan) x y))) + (vec_cmphl ty y x)) +(rule (lower (has_type (ty_vec128 ty) (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (vec_not ty (vec_cmphl ty y x))) + ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Main `fcmp` entry point. Generate a `ProducesBool` capturing the ;; integer comparison and immediately lower it to a 0/1 integer result. -(rule (lower (has_type ty (fcmp float_cc x y))) +(rule (lower (has_type (fits_in_64 ty) (fcmp float_cc x y))) (lower_bool ty (fcmp_val float_cc x y))) ;; Return a `ProducesBool` to implement any floating-point comparison. @@ -2047,6 +3128,217 @@ (bool (fcmp_reg ty x y) (floatcc_as_cond float_cc))) +;; Vector `fcmp` produces a boolean vector. +;; We need to handle the various FloatCC flags separately here. + +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Equal) x y))) + (vec_fcmpeq ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.NotEqual) x y))) + (vec_not ty (vec_fcmpeq ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThan) x y))) + (vec_fcmph ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (vec_not ty (vec_fcmph ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (vec_fcmphe ty x y)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (vec_not ty (vec_fcmphe ty x y))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThan) x y))) + (vec_fcmph ty y x)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (vec_not ty (vec_fcmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.LessThanOrEqual) x y))) + (vec_fcmphe ty y x)) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (vec_not ty (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Ordered) x y))) + (vec_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.Unordered) x y))) + (vec_not_or ty (vec_fcmphe ty x y) (vec_fcmphe ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.OrderedNotEqual) x y))) + (vec_or ty (vec_fcmph ty x y) (vec_fcmph ty y x))) +(rule (lower (has_type (ty_vec128 ty) (fcmp (FloatCC.UnorderedOrEqual) x y))) + (vec_not_or ty (vec_fcmph ty x y) (vec_fcmph ty y x))) + + +;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Main `vall_true` entry point. Generate a `ProducesBool` capturing the +;; comparison and immediately lower it to a 0/1 integer result. 
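+;; The CC-setting vector compare helpers used below set CC0 when the
+;; comparison is true for all lanes and CC3 when it is true for no lane;
+;; the FloatCC condition masks (Equal for CC0, Unordered for CC3) are
+;; reused to test for the "all lanes" and "no lane" cases.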
+(rule (lower (has_type (fits_in_64 ty) (vall_true x))) + (lower_bool ty (vall_true_val x))) + +;; Return a `ProducesBool` to implement `vall_true`. +(decl vall_true_val (Value) ProducesBool) +(rule (vall_true_val x @ (value_type ty)) + (bool (vec_cmpeqs ty x (vec_imm ty 0)) + (floatcc_as_cond (FloatCC.Unordered)))) + +;; Short-circuit `vall_true` on the result of a `icmp`. +(rule (vall_true_val (has_type ty (icmp (IntCC.Equal) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.NotEqual) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) + +;; Short-circuit `vall_true` on the result of a `fcmp` where possible. +(rule (vall_true_val (has_type ty (fcmp (FloatCC.Equal) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThan) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Equal)))) +(rule (vall_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Unordered)))) + + +;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Main `vany_true` entry point. Generate a `ProducesBool` capturing the +;; comparison and immediately lower it to a 0/1 integer result. 
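+;; As with `vall_true`, the FloatCC condition masks are reused to test the
+;; condition code of a CC-setting vector compare: NotEqual (not CC0) means
+;; the comparison did not hold in all lanes, Ordered (not CC3) means it
+;; held in at least one lane.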
+(rule (lower (has_type (fits_in_64 ty) (vany_true x))) + (lower_bool ty (vany_true_val x))) + +;; Return a `ProducesBool` to implement `vany_true`. +(decl vany_true_val (Value) ProducesBool) +(rule (vany_true_val x @ (value_type ty)) + (bool (vec_cmpeqs ty x (vec_imm ty 0)) + (floatcc_as_cond (FloatCC.NotEqual)))) + +;; Short-circuit `vany_true` on the result of a `icmp`. +(rule (vany_true_val (has_type ty (icmp (IntCC.Equal) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.NotEqual) x y))) + (bool (vec_cmpeqs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThan) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThanOrEqual) x y))) + (bool (vec_cmphs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedLessThan) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.SignedGreaterThanOrEqual) x y))) + (bool (vec_cmphs ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThan) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThanOrEqual) x y))) + (bool (vec_cmphls ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedLessThan) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (icmp (IntCC.UnsignedGreaterThanOrEqual) x y))) + (bool (vec_cmphls ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) + +;; Short-circuit `vany_true` on the result of a `fcmp` where possible. 
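+;; (The four conditions that require combining two compares -- Ordered,
+;; Unordered, OrderedNotEqual, and UnorderedOrEqual -- are not
+;; short-circuited and are handled by the generic rule above.)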
+(rule (vany_true_val (has_type ty (fcmp (FloatCC.Equal) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.NotEqual) x y))) + (bool (vec_fcmpeqs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThan) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThanOrEqual) x y))) + (bool (vec_fcmphs ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.GreaterThanOrEqual) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrLessThan) x y))) + (bool (vec_fcmphes ty x y) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThan) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) x y))) + (bool (vec_fcmphs ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.LessThanOrEqual) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.Ordered)))) +(rule (vany_true_val (has_type ty (fcmp (FloatCC.UnorderedOrGreaterThan) x y))) + (bool (vec_fcmphes ty y x) + (floatcc_as_cond (FloatCC.NotEqual)))) + + +;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56 + 64 72 80 88 96 104 112 120)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 0 16 32 48 64 80 96 112)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 0 32 64 96)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + +(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2)))) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 128 128 0 64)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) + ;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index 2c87621aae32..df93c47023dd 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -43,15 +43,28 @@ impl LowerBackend for S390xBackend { | Opcode::Bconst | Opcode::F32const | Opcode::F64const + | Opcode::Vconst | Opcode::Null | Opcode::Iadd | Opcode::IaddIfcout | Opcode::Isub + | Opcode::UaddSat + | Opcode::SaddSat + | Opcode::UsubSat + | Opcode::SsubSat + | Opcode::IaddPairwise + | Opcode::Imin + | Opcode::Umin + | Opcode::Imax + | Opcode::Umax + | Opcode::AvgRound | Opcode::Iabs | Opcode::Ineg | Opcode::Imul | Opcode::Umulhi | Opcode::Smulhi + | Opcode::WideningPairwiseDotProductS + | Opcode::SqmulRoundSat | Opcode::Udiv | Opcode::Urem | Opcode::Sdiv @@ -64,6 +77,13 @@ impl LowerBackend for S390xBackend { | Opcode::Ireduce | Opcode::Uextend | Opcode::Sextend + | Opcode::Snarrow + | Opcode::Unarrow + | Opcode::Uunarrow + | 
Opcode::SwidenLow + | Opcode::SwidenHigh + | Opcode::UwidenLow + | Opcode::UwidenHigh | Opcode::Bnot | Opcode::Band | Opcode::Bor @@ -72,6 +92,7 @@ impl LowerBackend for S390xBackend { | Opcode::BorNot | Opcode::BxorNot | Opcode::Bitselect + | Opcode::Vselect | Opcode::Breduce | Opcode::Bextend | Opcode::Bmask @@ -86,11 +107,15 @@ impl LowerBackend for S390xBackend { | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax + | Opcode::FminPseudo + | Opcode::FmaxPseudo | Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote + | Opcode::FvpromoteLow + | Opcode::Fvdemote | Opcode::Ceil | Opcode::Floor | Opcode::Trunc @@ -99,11 +124,20 @@ impl LowerBackend for S390xBackend { | Opcode::Fcopysign | Opcode::FcvtFromUint | Opcode::FcvtFromSint + | Opcode::FcvtLowFromSint | Opcode::FcvtToUint | Opcode::FcvtToSint | Opcode::FcvtToUintSat | Opcode::FcvtToSintSat + | Opcode::Splat + | Opcode::Swizzle + | Opcode::Shuffle + | Opcode::Insertlane + | Opcode::Extractlane + | Opcode::ScalarToVector + | Opcode::VhighBits | Opcode::Bitcast + | Opcode::RawBitcast | Opcode::Load | Opcode::Uload8 | Opcode::Sload8 @@ -111,6 +145,12 @@ impl LowerBackend for S390xBackend { | Opcode::Sload16 | Opcode::Uload32 | Opcode::Sload32 + | Opcode::Uload8x8 + | Opcode::Sload8x8 + | Opcode::Uload16x4 + | Opcode::Sload16x4 + | Opcode::Uload32x2 + | Opcode::Sload32x2 | Opcode::Store | Opcode::Istore8 | Opcode::Istore16 @@ -122,6 +162,8 @@ impl LowerBackend for S390xBackend { | Opcode::Fence | Opcode::Icmp | Opcode::Fcmp + | Opcode::VanyTrue + | Opcode::VallTrue | Opcode::IsNull | Opcode::IsInvalid | Opcode::Select @@ -147,57 +189,15 @@ impl LowerBackend for S390xBackend { ) } - Opcode::UaddSat - | Opcode::SaddSat - | Opcode::UsubSat - | Opcode::SsubSat - | Opcode::Bitrev - | Opcode::FcvtLowFromSint + Opcode::Bitrev | Opcode::ConstAddr | Opcode::TlsValue | Opcode::GetPinnedReg | Opcode::SetPinnedReg | Opcode::Isplit | Opcode::Iconcat - | Opcode::RawBitcast - | Opcode::Splat - | Opcode::Swizzle - | Opcode::Insertlane - | Opcode::Extractlane - | Opcode::Imin - | Opcode::Umin - | Opcode::Imax - | Opcode::Umax - | Opcode::AvgRound - | Opcode::FminPseudo - | Opcode::FmaxPseudo - | Opcode::Uload8x8 - | Opcode::Sload8x8 - | Opcode::Uload16x4 - | Opcode::Sload16x4 - | Opcode::Uload32x2 - | Opcode::Sload32x2 - | Opcode::Vconst - | Opcode::Shuffle | Opcode::Vsplit | Opcode::Vconcat - | Opcode::Vselect - | Opcode::VanyTrue - | Opcode::VallTrue - | Opcode::VhighBits - | Opcode::ScalarToVector - | Opcode::Snarrow - | Opcode::Unarrow - | Opcode::Uunarrow - | Opcode::SwidenLow - | Opcode::SwidenHigh - | Opcode::UwidenLow - | Opcode::UwidenHigh - | Opcode::WideningPairwiseDotProductS - | Opcode::SqmulRoundSat - | Opcode::FvpromoteLow - | Opcode::Fvdemote - | Opcode::IaddPairwise | Opcode::DynamicStackLoad | Opcode::DynamicStackStore | Opcode::DynamicStackAddr diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 2d41c6a88adc..4db95d40afb5 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -6,7 +6,7 @@ pub mod generated_code; // Types that the generated ISLE code uses via `use super::*`. 
use crate::isa::s390x::abi::S390xMachineDeps; use crate::isa::s390x::inst::{ - stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg, + stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg, UImm12, UImm16Shifted, UImm32Shifted, }; use crate::isa::s390x::settings::Flags as IsaFlags; @@ -91,6 +91,8 @@ where defs, clobbers, opcode: *opcode, + caller_callconv: self.lower_ctx.abi().call_conv(), + callee_callconv: abi.call_conv(), }) } @@ -102,6 +104,8 @@ where defs, clobbers, opcode: *opcode, + caller_callconv: self.lower_ctx.abi().call_conv(), + callee_callconv: abi.call_conv(), }) } @@ -195,6 +199,46 @@ where } } + #[inline] + fn u64_pair_split(&mut self, n: u128) -> (u64, u64) { + ((n >> 64) as u64, n as u64) + } + + #[inline] + fn u64_pair_concat(&mut self, hi: u64, lo: u64) -> u128 { + (hi as u128) << 64 | (lo as u128) + } + + #[inline] + fn u32_pair_split(&mut self, n: u64) -> (u32, u32) { + ((n >> 32) as u32, n as u32) + } + + #[inline] + fn u32_pair_concat(&mut self, hi: u32, lo: u32) -> u64 { + (hi as u64) << 32 | (lo as u64) + } + + #[inline] + fn u16_pair_split(&mut self, n: u32) -> (u16, u16) { + ((n >> 16) as u16, n as u16) + } + + #[inline] + fn u16_pair_concat(&mut self, hi: u16, lo: u16) -> u32 { + (hi as u32) << 16 | (lo as u32) + } + + #[inline] + fn u8_pair_split(&mut self, n: u16) -> (u8, u8) { + ((n >> 8) as u8, n as u8) + } + + #[inline] + fn u8_pair_concat(&mut self, hi: u8, lo: u8) -> u16 { + (hi as u16) << 8 | (lo as u16) + } + #[inline] fn u8_as_u16(&mut self, n: u8) -> u16 { n as u16 @@ -248,6 +292,15 @@ where } } + #[inline] + fn i16_from_u32(&mut self, n: u32) -> Option { + if let Ok(imm) = i16::try_from(n as i32) { + Some(imm) + } else { + None + } + } + #[inline] fn uimm32shifted_from_u64(&mut self, n: u64) -> Option { UImm32Shifted::maybe_from_u64(n) @@ -258,11 +311,49 @@ where UImm16Shifted::maybe_from_u64(n) } + #[inline] + fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 { + ty.lane_count() as u8 - 1 - idx + } + + #[inline] + fn lane_byte_mask(&mut self, ty: Type, idx: u8) -> u16 { + let lane_bytes = (ty.lane_bits() / 8) as u8; + let lane_mask = (1u16 << lane_bytes) - 1; + lane_mask << (16 - ((idx + 1) * lane_bytes)) + } + + #[inline] + fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) { + let bytes = idx.to_be_bytes(); + let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16); + let bytes = bytes.map(|x| { + if x < 16 { + 15 - x + } else if x < 32 { + 47 - x + } else { + 128 + } + }); + let permute_mask = u128::from_be_bytes(bytes); + (permute_mask, and_mask) + } + #[inline] fn u64_from_value(&mut self, val: Value) -> Option { let inst = self.lower_ctx.dfg().value_def(val).inst()?; let constant = self.lower_ctx.get_constant(inst)?; - Some(constant) + let ty = self.lower_ctx.output_ty(inst, 0); + Some(zero_extend_to_u64(constant, self.ty_bits(ty).unwrap())) + } + + #[inline] + fn u64_from_inverted_value(&mut self, val: Value) -> Option { + let inst = self.lower_ctx.dfg().value_def(val).inst()?; + let constant = self.lower_ctx.get_constant(inst)?; + let ty = self.lower_ctx.output_ty(inst, 0); + Some(zero_extend_to_u64(!constant, self.ty_bits(ty).unwrap())) } #[inline] @@ -349,22 +440,22 @@ where #[inline] fn uimm16shifted_from_inverted_value(&mut self, val: Value) -> Option { - let constant = self.u64_from_value(val)?; - let imm = UImm16Shifted::maybe_from_u64(!constant)?; + let constant = self.u64_from_inverted_value(val)?; + let imm = 
UImm16Shifted::maybe_from_u64(constant)?; Some(imm.negate_bits()) } #[inline] fn uimm32shifted_from_inverted_value(&mut self, val: Value) -> Option { - let constant = self.u64_from_value(val)?; - let imm = UImm32Shifted::maybe_from_u64(!constant)?; + let constant = self.u64_from_inverted_value(val)?; + let imm = UImm32Shifted::maybe_from_u64(constant)?; Some(imm.negate_bits()) } #[inline] fn mask_amt_imm(&mut self, ty: Type, amt: i64) -> u8 { - let mask = self.ty_bits(ty).unwrap() - 1; - (amt as u8) & mask + let mask = ty.lane_bits() - 1; + (amt as u8) & (mask as u8) } #[inline] @@ -498,13 +589,18 @@ where } #[inline] - fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, flags: MemFlags) -> MemArg { - MemArg::reg_plus_reg(x, y, flags) + fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, bias: u8, flags: MemFlags) -> MemArg { + MemArg::BXD12 { + base: x, + index: y, + disp: UImm12::maybe_from_u64(bias as u64).unwrap(), + flags, + } } #[inline] - fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, flags: MemFlags) -> MemArg { - MemArg::reg_plus_off(reg, off, flags) + fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, bias: u8, flags: MemFlags) -> MemArg { + MemArg::reg_plus_off(reg, off + (bias as i64), flags) } #[inline] @@ -586,6 +682,17 @@ where } } +/// Zero-extend the low `from_bits` bits of `value` to a full u64. +#[inline] +fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 { + assert!(from_bits <= 64); + if from_bits >= 64 { + value + } else { + value & ((1u64 << from_bits) - 1) + } +} + /// Sign-extend the low `from_bits` bits of `value` to a full u64. #[inline] fn sign_extend_to_u64(value: u64, from_bits: u8) -> u64 { diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs index 45bf8884b916..718871e82708 100644 --- a/cranelift/codegen/src/machinst/abi_impl.rs +++ b/cranelift/codegen/src/machinst/abi_impl.rs @@ -696,6 +696,11 @@ impl ABISig { let ret_arg = self.stack_ret_arg?; Some(self.args[ret_arg].clone()) } + + /// Get calling convention used. + pub fn call_conv(&self) -> isa::CallConv { + self.call_conv + } } /// ABI object for a function body. diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index c941ead14356..ad9731e6a8e6 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -7,7 +7,8 @@ use std::cell::Cell; pub use super::MachLabel; pub use crate::ir::{ - ArgumentExtension, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, SigRef, StackSlot, + ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate, + SigRef, StackSlot, }; pub use crate::isa::unwind::UnwindInst; pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable}; @@ -547,6 +548,18 @@ macro_rules! 
isle_prelude_methods { } } + #[inline] + fn u128_from_immediate(&mut self, imm: Immediate) -> Option { + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(u128::from_le_bytes(bytes.try_into().ok()?)) + } + + #[inline] + fn u128_from_constant(&mut self, constant: Constant) -> Option { + let bytes = self.lower_ctx.get_constant_data(constant).as_slice(); + Some(u128::from_le_bytes(bytes.try_into().ok()?)) + } + fn nonzero_u64_from_imm64(&mut self, val: Imm64) -> Option { match val.bits() { 0 => None, diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index a496f9a61657..42a4f74c240b 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -12,8 +12,8 @@ use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit}; use crate::ir::{ types::{FFLAGS, IFLAGS}, ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, ExternalName, Function, - GlobalValue, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, - Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart, + GlobalValue, GlobalValueData, Immediate, Inst, InstructionData, MemFlags, Opcode, Signature, + SourceLoc, Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart, }; use crate::machinst::{ non_writable_value_regs, writable_value_regs, ABICallee, BlockIndex, BlockLoweringOrder, @@ -167,6 +167,8 @@ pub trait LowerCtx { /// for the input produced by the sunk instruction), otherwise the /// side-effect will occur twice. fn sink_inst(&mut self, ir_inst: Inst); + /// Retrieve immediate data given a handle. + fn get_immediate_data(&self, imm: Immediate) -> &ConstantData; /// Retrieve constant data given a handle. fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData; /// Indicate that a constant should be emitted. @@ -1448,6 +1450,10 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { self.inst_sunk.insert(ir_inst); } + fn get_immediate_data(&self, imm: Immediate) -> &ConstantData { + self.f.dfg.immediates.get(imm).unwrap() + } + fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData { self.f.dfg.constants.get(constant_handle) } diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 62933cd7a16c..9b1d1e3bb3f4 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -661,6 +661,17 @@ (decl reloc_distance_near () RelocDistance) (extern extractor reloc_distance_near reloc_distance_near) +;; Accessor for `Immediate` as u128. + +(decl u128_from_immediate (u128) Immediate) +(extern extractor u128_from_immediate u128_from_immediate) + +;; Accessor for `Constant` as u128. + +(decl u128_from_constant (u128) Constant) +(extern extractor u128_from_constant u128_from_constant) + + ;;;; Helpers for tail recursion loops ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; A range of integers to loop through. 
diff --git a/cranelift/filetests/filetests/isa/s390x/condops.clif b/cranelift/filetests/filetests/isa/s390x/condops.clif index 5a73e0ae1e31..d84cd49c0500 100644 --- a/cranelift/filetests/filetests/isa/s390x/condops.clif +++ b/cranelift/filetests/filetests/isa/s390x/condops.clif @@ -43,3 +43,18 @@ block0(v0: i32, v1: i8, v2: i8): ; locre %r2, %r3 ; br %r14 +function %i(i32, i8x16, i8x16) -> i8x16 { +block0(v0: i32, v1: i8x16, v2: i8x16): + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 + v5 = select.i8x16 v4, v1, v2 + return v5 +} + +; block0: +; vlr %v20, %v24 +; clfi %r2, 42 +; vlr %v24, %v25 +; jne 10 ; vlr %v24, %v20 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif b/cranelift/filetests/filetests/isa/s390x/floating-point.clif index 4a8a84f37e6c..47e28b87d687 100644 --- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif +++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif @@ -168,6 +168,46 @@ block0(v0: f64, v1: f64): ; wfmaxdb %f0, %f0, %f2, 1 ; br %r14 +function %fmin_pseudo_f32(f32, f32) -> f32 { +block0(v0: f32, v1: f32): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; wfminsb %f0, %f0, %f2, 3 +; br %r14 + +function %fmin_pseudo_f64(f64, f64) -> f64 { +block0(v0: f64, v1: f64): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; wfmindb %f0, %f0, %f2, 3 +; br %r14 + +function %fmax_pseudo_f32(f32, f32) -> f32 { +block0(v0: f32, v1: f32): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; wfmaxsb %f0, %f0, %f2, 3 +; br %r14 + +function %fmax_pseudo_f64(f64, f64) -> f64 { +block0(v0: f64, v1: f64): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; wfmaxdb %f0, %f0, %f2, 3 +; br %r14 + function %sqrt_f32(f32) -> f32 { block0(v0: f32): v1 = sqrt v0 diff --git a/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif b/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif index 76224768bc9f..736d72b7a1da 100644 --- a/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif +++ b/cranelift/filetests/filetests/isa/s390x/fpmem-arch13.clif @@ -8,7 +8,7 @@ block0(v0: i64): } ; block0: -; vlebrg %f0, 0(%r2), 0 +; vlebrg %v0, 0(%r2), 0 ; br %r14 function %load_f32_little(i64) -> f32 { @@ -18,7 +18,7 @@ block0(v0: i64): } ; block0: -; vlebrf %f0, 0(%r2), 0 +; vlebrf %v0, 0(%r2), 0 ; br %r14 function %store_f64_little(f64, i64) { @@ -28,7 +28,7 @@ block0(v0: f64, v1: i64): } ; block0: -; vstebrg %f0, 0(%r2), 0 +; vstebrg %v0, 0(%r2), 0 ; br %r14 function %store_f32_little(f32, i64) { @@ -38,6 +38,6 @@ block0(v0: f32, v1: i64): } ; block0: -; vstebrf %f0, 0(%r2), 0 +; vstebrf %v0, 0(%r2), 0 ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif new file mode 100644 index 000000000000..334c43821b8a --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif @@ -0,0 +1,824 @@ +test compile precise-output +target s390x + +function %iadd_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = iadd.i64x2 v0, v1 + return v2 +} + +; block0: +; vag %v24, %v24, %v25 +; br %r14 + +function %iadd_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd.i32x4 v0, v1 + return v2 +} + +; block0: +; vaf %v24, %v24, %v25 +; br %r14 + +function %iadd_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd.i16x8 v0, v1 + return v2 +} + +; block0: +; vah %v24, %v24, %v25 +; br %r14 + +function %iadd_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = 
iadd.i8x16 v0, v1 + return v2 +} + +; block0: +; vab %v24, %v24, %v25 +; br %r14 + +function %isub_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = isub.i64x2 v0, v1 + return v2 +} + +; block0: +; vsg %v24, %v24, %v25 +; br %r14 + +function %isub_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = isub.i32x4 v0, v1 + return v2 +} + +; block0: +; vsf %v24, %v24, %v25 +; br %r14 + +function %isub_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = isub.i16x8 v0, v1 + return v2 +} + +; block0: +; vsh %v24, %v24, %v25 +; br %r14 + +function %isub_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = isub.i8x16 v0, v1 + return v2 +} + +; block0: +; vsb %v24, %v24, %v25 +; br %r14 + +function %iabs_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iabs.i64x2 v0 + return v1 +} + +; block0: +; vlpg %v24, %v24 +; br %r14 + +function %iabs_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iabs.i32x4 v0 + return v1 +} + +; block0: +; vlpf %v24, %v24 +; br %r14 + +function %iabs_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iabs.i16x8 v0 + return v1 +} + +; block0: +; vlph %v24, %v24 +; br %r14 + +function %iabs_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iabs.i8x16 v0 + return v1 +} + +; block0: +; vlpb %v24, %v24 +; br %r14 + +function %ineg_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = ineg.i64x2 v0 + return v1 +} + +; block0: +; vlcg %v24, %v24 +; br %r14 + +function %ineg_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = ineg.i32x4 v0 + return v1 +} + +; block0: +; vlcf %v24, %v24 +; br %r14 + +function %ineg_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = ineg.i16x8 v0 + return v1 +} + +; block0: +; vlch %v24, %v24 +; br %r14 + +function %ineg_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = ineg.i8x16 v0 + return v1 +} + +; block0: +; vlcb %v24, %v24 +; br %r14 + +function %umax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umax.i64x2 v0, v1 + return v2 +} + +; block0: +; vmxlg %v24, %v24, %v25 +; br %r14 + +function %umax_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umax.i32x4 v0, v1 + return v2 +} + +; block0: +; vmxlf %v24, %v24, %v25 +; br %r14 + +function %umax_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umax.i16x8 v0, v1 + return v2 +} + +; block0: +; vmxlh %v24, %v24, %v25 +; br %r14 + +function %umax_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umax.i8x16 v0, v1 + return v2 +} + +; block0: +; vmxlb %v24, %v24, %v25 +; br %r14 + +function %umin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umin.i64x2 v0, v1 + return v2 +} + +; block0: +; vmnlg %v24, %v24, %v25 +; br %r14 + +function %umin_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umin.i32x4 v0, v1 + return v2 +} + +; block0: +; vmnlf %v24, %v24, %v25 +; br %r14 + +function %umin_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umin.i16x8 v0, v1 + return v2 +} + +; block0: +; vmnlh %v24, %v24, %v25 +; br %r14 + +function %umin_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umin.i8x16 v0, v1 + return v2 +} + +; block0: +; vmnlb %v24, %v24, %v25 +; br %r14 + +function %imax_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imax.i64x2 v0, v1 + return v2 +} + +; block0: +; vmxg %v24, %v24, %v25 +; br %r14 + +function %imax_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imax.i32x4 v0, v1 + return v2 +} + +; block0: +; vmxf 
%v24, %v24, %v25 +; br %r14 + +function %imax_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imax.i16x8 v0, v1 + return v2 +} + +; block0: +; vmxh %v24, %v24, %v25 +; br %r14 + +function %imax_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imax.i8x16 v0, v1 + return v2 +} + +; block0: +; vmxb %v24, %v24, %v25 +; br %r14 + +function %imin_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imin.i64x2 v0, v1 + return v2 +} + +; block0: +; vmng %v24, %v24, %v25 +; br %r14 + +function %imin_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imin.i32x4 v0, v1 + return v2 +} + +; block0: +; vmnf %v24, %v24, %v25 +; br %r14 + +function %imin_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imin.i16x8 v0, v1 + return v2 +} + +; block0: +; vmnh %v24, %v24, %v25 +; br %r14 + +function %imin_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imin.i8x16 v0, v1 + return v2 +} + +; block0: +; vmnb %v24, %v24, %v25 +; br %r14 + +function %avg_round_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = avg_round.i64x2 v0, v1 + return v2 +} + +; block0: +; vavglg %v24, %v24, %v25 +; br %r14 + +function %avg_round_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = avg_round.i32x4 v0, v1 + return v2 +} + +; block0: +; vavglf %v24, %v24, %v25 +; br %r14 + +function %avg_round_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = avg_round.i16x8 v0, v1 + return v2 +} + +; block0: +; vavglh %v24, %v24, %v25 +; br %r14 + +function %avg_round_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = avg_round.i8x16 v0, v1 + return v2 +} + +; block0: +; vavglb %v24, %v24, %v25 +; br %r14 + +function %uadd_sat64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = uadd_sat.i64x2 v0, v1 + return v2 +} + +; block0: +; vag %v5, %v24, %v25 +; vchlg %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = uadd_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vaf %v5, %v24, %v25 +; vchlf %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = uadd_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vah %v5, %v24, %v25 +; vchlh %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %uadd_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = uadd_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vab %v5, %v24, %v25 +; vchlb %v7, %v24, %v5 +; vo %v24, %v5, %v7 +; br %r14 + +function %sadd_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sadd_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; vag %v17, %v5, %v7 +; vuplf %v19, %v24 +; vuplf %v21, %v25 +; vag %v23, %v19, %v21 +; vpksg %v24, %v17, %v23 +; br %r14 + +function %sadd_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sadd_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vaf %v17, %v5, %v7 +; vuplh %v19, %v24 +; vuplh %v21, %v25 +; vaf %v23, %v19, %v21 +; vpksf %v24, %v17, %v23 +; br %r14 + +function %sadd_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = sadd_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vuphb %v5, %v24 +; vuphb %v7, %v25 +; vah %v17, %v5, %v7 +; vuplb %v19, %v24 +; vuplb %v21, %v25 +; vah %v23, %v19, %v21 +; vpksh %v24, %v17, %v23 +; br %r14 + +function 
%iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise.i32x4 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 32 +; vsrlb %v7, %v25, %v5 +; vaf %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vaf %v21, %v24, %v19 +; vpkg %v24, %v17, %v21 +; br %r14 + +function %usub_sat64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = usub_sat.i64x2 v0, v1 + return v2 +} + +; block0: +; vsg %v5, %v24, %v25 +; vchlg %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = usub_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vsf %v5, %v24, %v25 +; vchlf %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = usub_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vsh %v5, %v24, %v25 +; vchlh %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %usub_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = usub_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vsb %v5, %v24, %v25 +; vchlb %v7, %v24, %v25 +; vn %v24, %v5, %v7 +; br %r14 + +function %ssub_sat32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = ssub_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; vsg %v17, %v5, %v7 +; vuplf %v19, %v24 +; vuplf %v21, %v25 +; vsg %v23, %v19, %v21 +; vpksg %v24, %v17, %v23 +; br %r14 + +function %ssub_sat16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = ssub_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vsf %v17, %v5, %v7 +; vuplh %v19, %v24 +; vuplh %v21, %v25 +; vsf %v23, %v19, %v21 +; vpksf %v24, %v17, %v23 +; br %r14 + +function %ssub_sat8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = ssub_sat.i8x16 v0, v1 + return v2 +} + +; block0: +; vuphb %v5, %v24 +; vuphb %v7, %v25 +; vsh %v17, %v5, %v7 +; vuplb %v19, %v24 +; vuplb %v21, %v25 +; vsh %v23, %v19, %v21 +; vpksh %v24, %v17, %v23 +; br %r14 + +function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise.i32x4 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 32 +; vsrlb %v7, %v25, %v5 +; vaf %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vaf %v21, %v24, %v19 +; vpkg %v24, %v17, %v21 +; br %r14 + +function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd_pairwise.i16x8 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 16 +; vsrlb %v7, %v25, %v5 +; vah %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vah %v21, %v24, %v19 +; vpkf %v24, %v17, %v21 +; br %r14 + +function %iadd_pairwise_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = iadd_pairwise.i8x16 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 8 +; vsrlb %v7, %v25, %v5 +; vab %v17, %v25, %v7 +; vsrlb %v19, %v24, %v5 +; vab %v21, %v24, %v19 +; vpkh %v24, %v17, %v21 +; br %r14 + +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r5, %v25, 0 +; msgr %r3, %r5 +; vlgvg %r5, %v24, 1 +; vlgvg %r4, %v25, 1 +; msgr %r5, %r4 +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %imul_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imul.i32x4 v0, v1 + return v2 +} + +; block0: +; vmlf %v24, %v24, %v25 +; br %r14 + +function %imul_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imul.i16x8 v0, v1 + return v2 +} + +; 
block0: +; vmlhw %v24, %v24, %v25 +; br %r14 + +function %imul_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = imul.i8x16 v0, v1 + return v2 +} + +; block0: +; vmlb %v24, %v24, %v25 +; br %r14 + +function %umulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umulhi.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r1, %v25, 0 +; mlgr %r0, %r3 +; lgr %r2, %r0 +; vlgvg %r3, %v24, 1 +; vlgvg %r1, %v25, 1 +; mlgr %r0, %r3 +; vlvgp %v24, %r2, %r0 +; br %r14 + +function %umulhi_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umulhi.i32x4 v0, v1 + return v2 +} + +; block0: +; vmlhf %v24, %v24, %v25 +; br %r14 + +function %umulhi_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umulhi.i16x8 v0, v1 + return v2 +} + +; block0: +; vmlhh %v24, %v24, %v25 +; br %r14 + +function %umulhi_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umulhi.i8x16 v0, v1 + return v2 +} + +; block0: +; vmlhb %v24, %v24, %v25 +; br %r14 + +function %smulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smulhi.i64x2 v0, v1 + return v2 +} + +; block0: +; vlgvg %r3, %v24, 0 +; vlgvg %r5, %v25, 0 +; mgrk %r0, %r3, %r5 +; lgr %r3, %r0 +; vlgvg %r2, %v24, 1 +; vlgvg %r4, %v25, 1 +; mgrk %r0, %r2, %r4 +; lgr %r4, %r3 +; vlvgp %v24, %r4, %r0 +; br %r14 + +function %smulhi_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = smulhi.i32x4 v0, v1 + return v2 +} + +; block0: +; vmhf %v24, %v24, %v25 +; br %r14 + +function %smulhi_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = smulhi.i16x8 v0, v1 + return v2 +} + +; block0: +; vmhh %v24, %v24, %v25 +; br %r14 + +function %smulhi_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = smulhi.i8x16 v0, v1 + return v2 +} + +; block0: +; vmhb %v24, %v24, %v25 +; br %r14 + +function %widening_pairwise_dot_product_s_i16x8(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = widening_pairwise_dot_product_s v0, v1 + return v2 +} + +; block0: +; vmeh %v5, %v24, %v25 +; vmoh %v7, %v24, %v25 +; vaf %v24, %v5, %v7 +; br %r14 + +function %sqmul_round_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sqmul_round_sat.i16x8 v0, v1 + return v2 +} + +; block0: +; vuphh %v5, %v24 +; vuphh %v7, %v25 +; vmlf %v17, %v5, %v7 +; vgmf %v19, 17, 17 +; vaf %v21, %v17, %v19 +; vesraf %v23, %v21, 15 +; vuplh %v26, %v24 +; vuplh %v27, %v25 +; vmlf %v29, %v26, %v27 +; vgmf %v31, 17, 17 +; vaf %v1, %v29, %v31 +; vesraf %v3, %v1, 15 +; vpksf %v24, %v23, %v3 +; br %r14 + +function %sqmul_round_sat(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = sqmul_round_sat.i32x4 v0, v1 + return v2 +} + +; block0: +; vuphf %v5, %v24 +; vuphf %v7, %v25 +; lgdr %r3, %f5 +; lgdr %r5, %f7 +; msgr %r3, %r5 +; vlgvg %r5, %v5, 1 +; vlgvg %r4, %v7, 1 +; msgr %r5, %r4 +; vlvgp %v29, %r3, %r5 +; vgmg %v31, 33, 33 +; vag %v1, %v29, %v31 +; vesrag %v3, %v1, 31 +; vuplf %v5, %v24 +; vuplf %v7, %v25 +; lgdr %r3, %f5 +; lgdr %r5, %f7 +; msgr %r3, %r5 +; vlgvg %r5, %v5, 1 +; vlgvg %r4, %v7, 1 +; msgr %r5, %r4 +; vlvgp %v29, %r3, %r5 +; vgmg %v31, 33, 33 +; vag %v1, %v29, %v31 +; vesrag %v4, %v1, 31 +; vpksg %v24, %v3, %v4 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif new file mode 100644 index 000000000000..a5cff95c475c --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitops.clif @@ -0,0 +1,43 @@ +test 
compile precise-output +target s390x + +function %popcnt_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = popcnt.i64x2 v0 + return v1 +} + +; block0: +; vpopctg %v24, %v24 +; br %r14 + +function %popcnt_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = popcnt.i32x4 v0 + return v1 +} + +; block0: +; vpopctf %v24, %v24 +; br %r14 + +function %popcnt_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = popcnt.i16x8 v0 + return v1 +} + +; block0: +; vpopcth %v24, %v24 +; br %r14 + +function %popcnt_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = popcnt.i8x16 v0 + return v1 +} + +; block0: +; vpopctb %v24, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif new file mode 100644 index 000000000000..8722a78703b3 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif @@ -0,0 +1,364 @@ + +test compile precise-output +target s390x + +function %band_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = band.i64x2 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = band.i32x4 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = band.i16x8 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %band_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = band.i8x16 v0, v1 + return v2 +} + +; block0: +; vn %v24, %v24, %v25 +; br %r14 + +function %bor_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bor.i64x2 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bor.i32x4 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bor.i16x8 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bor_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bor.i8x16 v0, v1 + return v2 +} + +; block0: +; vo %v24, %v24, %v25 +; br %r14 + +function %bxor_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bxor.i64x2 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bxor.i32x4 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bxor.i16x8 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %bxor_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bxor.i8x16 v0, v1 + return v2 +} + +; block0: +; vx %v24, %v24, %v25 +; br %r14 + +function %band_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = band_not.i64x2 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = band_not.i32x4 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = band_not.i16x8 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %band_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: 
i8x16): + v2 = band_not.i8x16 v0, v1 + return v2 +} + +; block0: +; vnc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bor_not.i64x2 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bor_not.i32x4 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bor_not.i16x8 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bor_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bor_not.i8x16 v0, v1 + return v2 +} + +; block0: +; voc %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bxor_not.i64x2 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bxor_not.i32x4 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bxor_not.i16x8 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bxor_not_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = bxor_not.i8x16 v0, v1 + return v2 +} + +; block0: +; vnx %v24, %v24, %v25 +; br %r14 + +function %bnot_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = bnot.i64x2 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = bnot.i32x4 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = bnot.i16x8 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bnot_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = bnot.i8x16 v0 + return v1 +} + +; block0: +; vno %v24, %v24, %v24 +; br %r14 + +function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bitselect.i64x2 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4, v2: i32x4): + v3 = bitselect.i32x4 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8, v2: i16x8): + v3 = bitselect.i16x8 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = bitselect.i8x16 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i64x2(b64x2, i64x2, i64x2) -> i64x2 { +block0(v0: b64x2, v1: i64x2, v2: i64x2): + v3 = vselect.i64x2 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i32x4(b32x4, i32x4, i32x4) -> i32x4 { +block0(v0: b32x4, v1: i32x4, v2: i32x4): + v3 = vselect.i32x4 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 { +block0(v0: b16x8, v1: i16x8, v2: i16x8): + v3 = vselect.i16x8 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + +function %vselect_i8x16(b8x16, 
i8x16, i8x16) -> i8x16 { +block0(v0: b8x16, v1: i8x16, v2: i8x16): + v3 = vselect.i8x16 v0, v1, v2 + return v3 +} + +; block0: +; vsel %v24, %v25, %v26, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-constants.clif b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif new file mode 100644 index 000000000000..b5a6969f2b3e --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif @@ -0,0 +1,213 @@ +test compile precise-output +target s390x + +function %vconst_i64x2_zero() -> i64x2 { +block0: + v1 = vconst.i64x2 [0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i64x2_splat1() -> i64x2 { +block0: + v1 = vconst.i64x2 [32767 32767] + return v1 +} + +; block0: +; vrepig %v24, 32767 +; br %r14 + +function %vconst_i64x2_splat2() -> i64x2 { +block0: + v1 = vconst.i64x2 [-32768 -32768] + return v1 +} + +; block0: +; vrepig %v24, -32768 +; br %r14 + +function %vconst_i64x2_splat3() -> i64x2 { +block0: + v1 = vconst.i64x2 [32768 32768] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000000008000 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_splat4() -> i64x2 { +block0: + v1 = vconst.i64x2 [-32769 -32769] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0xffffffffffff7fff ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_mixed() -> i64x2 { +block0: + v1 = vconst.i64x2 [1 2] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000000000000020000000000000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_zero() -> i32x4 { +block0: + v1 = vconst.i32x4 [0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i32x4_splat1() -> i32x4 { +block0: + v1 = vconst.i32x4 [32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepif %v24, 32767 +; br %r14 + +function %vconst_i32x4_splat2() -> i32x4 { +block0: + v1 = vconst.i32x4 [-32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepif %v24, -32768 +; br %r14 + +function %vconst_i32x4_splat3() -> i32x4 { +block0: + v1 = vconst.i32x4 [32768 32768 32768 32768] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0x00008000 ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat4() -> i32x4 { +block0: + v1 = vconst.i32x4 [-32769 -32769 -32769 -32769] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0xffff7fff ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat_i64() -> i32x4 { +block0: + v1 = vconst.i32x4 [1 2 1 2] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000200000001 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_mixed() -> i32x4 { +block0: + v1 = vconst.i32x4 [1 2 3 4] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000004000000030000000200000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i16x8_zero() -> i16x8 { +block0: + v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i16x8_splat1() -> i16x8 { +block0: + v1 = vconst.i16x8 [32767 32767 32767 32767 32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepih %v24, 32767 +; br %r14 + +function %vconst_i16x8_splat2() -> i16x8 { +block0: + v1 = vconst.i16x8 [-32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepih %v24, -32768 +; br %r14 + +function %vconst_i16x8_mixed() -> i16x8 { +block0: + v1 = vconst.i16x8 [1 2 3 4 5 6 7 8] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00080007000600050004000300020001 ; vl %v24, 0(%r1) +; br %r14 + +function 
%vconst_i8x16_zero() -> i8x16 { +block0: + v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i8x16_splat1() -> i8x16 { +block0: + v1 = vconst.i8x16 [127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127] + return v1 +} + +; block0: +; vrepib %v24, 127 +; br %r14 + +function %vconst_i8x16_splat2() -> i8x16 { +block0: + v1 = vconst.i8x16 [-128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128] + return v1 +} + +; block0: +; vrepib %v24, 128 +; br %r14 + +function %vconst_i8x16_mixed() -> i8x16 { +block0: + v1 = vconst.i8x16 [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x100f0e0d0c0b0a090807060504030201 ; vl %v24, 0(%r1) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif new file mode 100644 index 000000000000..b137c8cd214b --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif @@ -0,0 +1,222 @@ +test compile precise-output +target s390x + +function %snarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = snarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpksg %v24, %v25, %v24 +; br %r14 + +function %snarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = snarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpksf %v24, %v25, %v24 +; br %r14 + +function %snarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = snarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpksh %v24, %v25, %v24 +; br %r14 + +function %unarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = unarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxg %v7, %v25, %v5 +; vmxg %v17, %v24, %v5 +; vpklsg %v24, %v7, %v17 +; br %r14 + +function %unarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = unarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxf %v7, %v25, %v5 +; vmxf %v17, %v24, %v5 +; vpklsf %v24, %v7, %v17 +; br %r14 + +function %unarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = unarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxh %v7, %v25, %v5 +; vmxh %v17, %v24, %v5 +; vpklsh %v24, %v7, %v17 +; br %r14 + +function %uunarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { +block0(v0: i64x2, v1: i64x2): + v2 = uunarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpklsg %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = uunarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpklsf %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = uunarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpklsh %v24, %v25, %v24 +; br %r14 + +function %swiden_low_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_low.i32x4 v0 + return v1 +} + +; block0: +; vuplf %v24, %v24 +; br %r14 + +function %swiden_low_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_low.i16x8 v0 + return v1 +} + +; block0: +; vuplh %v24, %v24 +; br %r14 + +function %swiden_low_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_low.i8x16 v0 + return v1 +} + +; block0: +; vuplb %v24, %v24 +; br %r14 + +function %swiden_high_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = swiden_high.i32x4 v0 + 
return v1 +} + +; block0: +; vuphf %v24, %v24 +; br %r14 + +function %swiden_high_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuphh %v24, %v24 +; br %r14 + +function %swiden_high_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuphb %v24, %v24 +; br %r14 + +function %uwiden_low_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_low.i32x4 v0 + return v1 +} + +; block0: +; vupllf %v24, %v24 +; br %r14 + +function %uwiden_low_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_low.i16x8 v0 + return v1 +} + +; block0: +; vupllh %v24, %v24 +; br %r14 + +function %uwiden_low_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_low.i8x16 v0 + return v1 +} + +; block0: +; vupllb %v24, %v24 +; br %r14 + +function %uwiden_high_i32x4_i64x2(i32x4) -> i64x2 { +block0(v0: i32x4): + v1 = uwiden_high.i32x4 v0 + return v1 +} + +; block0: +; vuplhf %v24, %v24 +; br %r14 + +function %uwiden_high_i16x8_i32x4(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = uwiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuplhh %v24, %v24 +; br %r14 + +function %uwiden_high_i8x16_i16x8(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuplhb %v24, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif new file mode 100644 index 000000000000..32aeab3bd15d --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fcmp.clif @@ -0,0 +1,309 @@ +test compile precise-output +target s390x + +function %fcmp_eq_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 eq v0, v1 + return v2 +} + +; block0: +; vfcedb %v24, %v24, %v25 +; br %r14 + +function %fcmp_ne_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ne v0, v1 + return v2 +} + +; block0: +; vfcedb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_gt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 gt v0, v1 + return v2 +} + +; block0: +; vfchdb %v24, %v24, %v25 +; br %r14 + +function %fcmp_lt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 lt v0, v1 + return v2 +} + +; block0: +; vfchdb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ge_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ge v0, v1 + return v2 +} + +; block0: +; vfchedb %v24, %v24, %v25 +; br %r14 + +function %fcmp_le_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 le v0, v1 + return v2 +} + +; block0: +; vfchedb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ueq_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ueq v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vfchdb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_one_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 one v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vfchdb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_ugt_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ugt v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ult_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ult v0, v1 + return v2 +} + +; block0: +; 
vfchedb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_uge_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 uge v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ule_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ule v0, v1 + return v2 +} + +; block0: +; vfchdb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ord_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 ord v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v24, %v25 +; vfchedb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_uno_f64x2(f64x2, f64x2) -> b64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp.f64x2 uno v0, v1 + return v2 +} + +; block0: +; vfchedb %v5, %v24, %v25 +; vfchedb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_eq_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 eq v0, v1 + return v2 +} + +; block0: +; vfcesb %v24, %v24, %v25 +; br %r14 + +function %fcmp_ne_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ne v0, v1 + return v2 +} + +; block0: +; vfcesb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_gt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 gt v0, v1 + return v2 +} + +; block0: +; vfchsb %v24, %v24, %v25 +; br %r14 + +function %fcmp_lt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 lt v0, v1 + return v2 +} + +; block0: +; vfchsb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ge_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ge v0, v1 + return v2 +} + +; block0: +; vfchesb %v24, %v24, %v25 +; br %r14 + +function %fcmp_le_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 le v0, v1 + return v2 +} + +; block0: +; vfchesb %v24, %v25, %v24 +; br %r14 + +function %fcmp_ueq_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ueq v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vfchsb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + +function %fcmp_one_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 one v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vfchsb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_ugt_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ugt v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ult_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ult v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_uge_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 uge v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ule_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ule v0, v1 + return v2 +} + +; block0: +; vfchsb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %fcmp_ord_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcmp.f32x4 ord v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vfchesb %v7, %v25, %v24 +; vo %v24, %v5, %v7 +; br %r14 + +function %fcmp_uno_f32x4(f32x4, f32x4) -> b32x4 { +block0(v0: f32x4, v1: 
f32x4): + v2 = fcmp.f32x4 uno v0, v1 + return v2 +} + +; block0: +; vfchesb %v5, %v24, %v25 +; vfchesb %v7, %v25, %v24 +; vno %v24, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif new file mode 100644 index 000000000000..4c00c348d458 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp-arch13.clif @@ -0,0 +1,90 @@ +test compile precise-output +target s390x arch13 + +function %fcvt_from_uint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} + +; block0: +; vcelfb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_sint.f32x4 v0 + return v1 +} + +; block0: +; vcefb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_uint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_uint.f64x2 v0 + return v1 +} + +; block0: +; vcdlgb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vcdgb %v24, %v24, 0, 4 +; br %r14 + + +function %fcvt_to_uint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; block0: +; vclfeb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; block0: +; vcfeb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcesb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + +function %fcvt_to_uint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_uint_sat.i64x2 v0 + return v1 +} + +; block0: +; vclgdb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_sint_sat.i64x2 v0 + return v1 +} + +; block0: +; vcgdb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcedb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif new file mode 100644 index 000000000000..fc356d57a762 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif @@ -0,0 +1,533 @@ +test compile precise-output +target s390x + +function %vconst_f32x4_zero() -> f32x4 { +block0: + v1 = vconst.f32x4 [0x0.0 0x0.0 0x0.0 0x0.0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_f64x2_zero() -> f64x2 { +block0: + v1 = vconst.f64x2 [0x0.0 0x0.0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_f32x4_mixed() -> f32x4 { +block0: + v1 = vconst.f32x4 [0x1.0 0x2.0 0x3.0 0x4.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x4080000040400000400000003f800000 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_f64x2_mixed() -> f64x2 { +block0: + v1 = vconst.f64x2 [0x1.0 0x2.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x40000000000000003ff0000000000000 ; vl %v24, 0(%r1) +; br %r14 + +function %fadd_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fadd v0, v1 + return v2 +} + +; block0: +; vfasb %v24, %v24, %v25 +; br %r14 + +function %fadd_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fadd v0, v1 + return v2 +} + +; block0: +; vfadb %v24, %v24, %v25 +; br %r14 + +function %fsub_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fsub v0, v1 + return v2 +} + +; block0: +; vfssb %v24, %v24, %v25 +; 
br %r14 + +function %fsub_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fsub v0, v1 + return v2 +} + +; block0: +; vfsdb %v24, %v24, %v25 +; br %r14 + +function %fmul_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmul v0, v1 + return v2 +} + +; block0: +; vfmsb %v24, %v24, %v25 +; br %r14 + +function %fmul_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmul v0, v1 + return v2 +} + +; block0: +; vfmdb %v24, %v24, %v25 +; br %r14 + +function %fdiv_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fdiv v0, v1 + return v2 +} + +; block0: +; vfdsb %v24, %v24, %v25 +; br %r14 + +function %fdiv_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fdiv v0, v1 + return v2 +} + +; block0: +; vfddb %v24, %v24, %v25 +; br %r14 + +function %fmin_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin v0, v1 + return v2 +} + +; block0: +; vfminsb %v24, %v24, %v25, 1 +; br %r14 + +function %fmin_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin v0, v1 + return v2 +} + +; block0: +; vfmindb %v24, %v24, %v25, 1 +; br %r14 + +function %fmax_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmax v0, v1 + return v2 +} + +; block0: +; vfmaxsb %v24, %v24, %v25, 1 +; br %r14 + +function %fmax_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax v0, v1 + return v2 +} + +; block0: +; vfmaxdb %v24, %v24, %v25, 1 +; br %r14 + +function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; vfminsb %v24, %v24, %v25, 3 +; br %r14 + +function %fmin_pseudo_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin_pseudo v0, v1 + return v2 +} + +; block0: +; vfmindb %v24, %v24, %v25, 3 +; br %r14 + +function %fmax_pseudo_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; vfmaxsb %v24, %v24, %v25, 3 +; br %r14 + +function %fmax_pseudo_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax_pseudo v0, v1 + return v2 +} + +; block0: +; vfmaxdb %v24, %v24, %v25, 3 +; br %r14 + +function %sqrt_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = sqrt v0 + return v1 +} + +; block0: +; vfsqsb %v24, %v24 +; br %r14 + +function %sqrt_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = sqrt v0 + return v1 +} + +; block0: +; vfsqdb %v24, %v24 +; br %r14 + +function %fabs_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fabs v0 + return v1 +} + +; block0: +; vflpsb %v24, %v24 +; br %r14 + +function %fabs_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fabs v0 + return v1 +} + +; block0: +; vflpdb %v24, %v24 +; br %r14 + +function %fneg_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} + +; block0: +; vflcsb %v24, %v24 +; br %r14 + +function %fneg_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + return v1 +} + +; block0: +; vflcdb %v24, %v24 +; br %r14 + +function %fvpromote_low_f32x4(f32x4) -> f64x2 { +block0(v0: f32x4): + v1 = fvpromote_low v0 + return v1 +} + +; block0: +; vmrlf %v3, %v24, %v24 +; vldeb %v24, %v3 +; br %r14 + +function %fvdemote_f64x2(f64x2) -> f32x4 { +block0(v0: f64x2): + v1 = fvdemote v0 + return v1 +} + +; block0: +; vledb %v3, %v24, 0, 0 +; vgbm %v5, 0 +; bras %r1, 20 ; data.u128 0x10101010101010100001020308090a0b ; vl %v7, 0(%r1) +; vperm %v24, %v3, %v5, %v7 +; br %r14 + +function %ceil_f32x4(f32x4) -> f32x4 { 
+block0(v0: f32x4): + v1 = ceil v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 6 +; br %r14 + +function %ceil_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 6 +; br %r14 + +function %floor_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = floor v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 7 +; br %r14 + +function %floor_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = floor v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 7 +; br %r14 + +function %trunc_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = trunc v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 5 +; br %r14 + +function %trunc_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = trunc v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 5 +; br %r14 + +function %nearest_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = nearest v0 + return v1 +} + +; block0: +; vfisb %v24, %v24, 0, 4 +; br %r14 + +function %nearest_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = nearest v0 + return v1 +} + +; block0: +; vfidb %v24, %v24, 0, 4 +; br %r14 + +function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; vfmasb %v24, %v24, %v25, %v26 +; br %r14 + +function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = fma v0, v1, v2 + return v3 +} + +; block0: +; vfmadb %v24, %v24, %v25, %v26 +; br %r14 + +function %fcopysign_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; vgmf %v5, 1, 31 +; vsel %v24, %v24, %v25, %v5 +; br %r14 + +function %fcopysign_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; vgmg %v5, 1, 63 +; vsel %v24, %v24, %v25, %v5 +; br %r14 + +function %fcvt_from_uint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} + +; block0: +; vuplhf %v3, %v24 +; vcdlgb %v5, %v3, 0, 3 +; vledb %v7, %v5, 0, 4 +; vupllf %v17, %v24 +; vcdlgb %v19, %v17, 0, 3 +; vledb %v21, %v19, 0, 4 +; bras %r1, 20 ; data.u128 0x0001020308090a0b1011121318191a1b ; vl %v23, 0(%r1) +; vperm %v24, %v7, %v21, %v23 +; br %r14 + +function %fcvt_from_sint_i32x4_f32x4(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_sint.f32x4 v0 + return v1 +} + +; block0: +; vuphf %v3, %v24 +; vcdgb %v5, %v3, 0, 3 +; vledb %v7, %v5, 0, 4 +; vuplf %v17, %v24 +; vcdgb %v19, %v17, 0, 3 +; vledb %v21, %v19, 0, 4 +; bras %r1, 20 ; data.u128 0x0001020308090a0b1011121318191a1b ; vl %v23, 0(%r1) +; vperm %v24, %v7, %v21, %v23 +; br %r14 + +function %fcvt_from_uint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_uint.f64x2 v0 + return v1 +} + +; block0: +; vcdlgb %v24, %v24, 0, 4 +; br %r14 + +function %fcvt_from_sint_i64x2_f64x2(i64x2) -> f64x2 { +block0(v0: i64x2): + v1 = fcvt_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vcdgb %v24, %v24, 0, 4 +; br %r14 + + +function %fcvt_low_from_sint_i32x4_f64x2(i32x4) -> f64x2 { +block0(v0: i32x4): + v1 = fcvt_low_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vuplf %v3, %v24 +; vcdgb %v24, %v3, 0, 4 +; br %r14 + +function %fcvt_to_uint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; block0: +; vmrhf %v3, %v24, %v24 +; vldeb %v5, %v3 +; vclgdb %v7, %v5, 0, 5 +; vmrlf %v17, %v24, %v24 +; vldeb %v19, %v17 +; vclgdb %v21, %v19, 0, 5 +; vpklsg %v24, %v7, %v21 +; 
br %r14 + +function %fcvt_to_sint_sat_f32x4_i32x4(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; block0: +; vmrhf %v3, %v24, %v24 +; vldeb %v5, %v3 +; vcgdb %v7, %v5, 0, 5 +; vmrlf %v17, %v24, %v24 +; vldeb %v19, %v17 +; vcgdb %v21, %v19, 0, 5 +; vpksg %v23, %v7, %v21 +; vgbm %v25, 0 +; vfcesb %v27, %v24, %v24 +; vsel %v24, %v23, %v25, %v27 +; br %r14 + +function %fcvt_to_uint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_uint_sat.i64x2 v0 + return v1 +} + +; block0: +; vclgdb %v24, %v24, 0, 5 +; br %r14 + +function %fcvt_to_sint_sat_f64x2_i64x2(f64x2) -> i64x2 { +block0(v0: f64x2): + v1 = fcvt_to_sint_sat.i64x2 v0 + return v1 +} + +; block0: +; vcgdb %v3, %v24, 0, 5 +; vgbm %v5, 0 +; vfcedb %v7, %v24, %v24 +; vsel %v24, %v3, %v5, %v7 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif b/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif new file mode 100644 index 000000000000..fe9e6fead830 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-icmp.clif @@ -0,0 +1,423 @@ +test compile precise-output +target s390x + +function %icmp_eq_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 eq v0, v1 + return v2 +} + +; block0: +; vceqg %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ne v0, v1 + return v2 +} + +; block0: +; vceqg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sgt v0, v1 + return v2 +} + +; block0: +; vchg %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 slt v0, v1 + return v2 +} + +; block0: +; vchg %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sge v0, v1 + return v2 +} + +; block0: +; vchg %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 sle v0, v1 + return v2 +} + +; block0: +; vchg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ugt v0, v1 + return v2 +} + +; block0: +; vchlg %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ult v0, v1 + return v2 +} + +; block0: +; vchlg %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 uge v0, v1 + return v2 +} + +; block0: +; vchlg %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i64x2(i64x2, i64x2) -> b64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp.i64x2 ule v0, v1 + return v2 +} + +; block0: +; vchlg %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 eq v0, v1 + return v2 +} + +; block0: +; vceqf %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ne v0, v1 + return v2 +} + +; block0: +; vceqf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sgt v0, v1 + return v2 +} + +; block0: +; vchf %v24, %v24, %v25 
+; br %r14 + +function %icmp_slt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 slt v0, v1 + return v2 +} + +; block0: +; vchf %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sge v0, v1 + return v2 +} + +; block0: +; vchf %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 sle v0, v1 + return v2 +} + +; block0: +; vchf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ugt v0, v1 + return v2 +} + +; block0: +; vchlf %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ult v0, v1 + return v2 +} + +; block0: +; vchlf %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 uge v0, v1 + return v2 +} + +; block0: +; vchlf %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i32x4(i32x4, i32x4) -> b32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp.i32x4 ule v0, v1 + return v2 +} + +; block0: +; vchlf %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 eq v0, v1 + return v2 +} + +; block0: +; vceqh %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ne v0, v1 + return v2 +} + +; block0: +; vceqh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sgt v0, v1 + return v2 +} + +; block0: +; vchh %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 slt v0, v1 + return v2 +} + +; block0: +; vchh %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sge v0, v1 + return v2 +} + +; block0: +; vchh %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 sle v0, v1 + return v2 +} + +; block0: +; vchh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ugt v0, v1 + return v2 +} + +; block0: +; vchlh %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ult v0, v1 + return v2 +} + +; block0: +; vchlh %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 uge v0, v1 + return v2 +} + +; block0: +; vchlh %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i16x8(i16x8, i16x8) -> b16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp.i16x8 ule v0, v1 + return v2 +} + +; block0: +; vchlh %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_eq_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 eq v0, v1 + return v2 +} + +; block0: +; vceqb %v24, %v24, %v25 +; br %r14 + +function %icmp_ne_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ne v0, v1 + return v2 +} + +; block0: +; vceqb %v5, %v24, %v25 
+; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sgt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sgt v0, v1 + return v2 +} + +; block0: +; vchb %v24, %v24, %v25 +; br %r14 + +function %icmp_slt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 slt v0, v1 + return v2 +} + +; block0: +; vchb %v24, %v25, %v24 +; br %r14 + +function %icmp_sge_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sge v0, v1 + return v2 +} + +; block0: +; vchb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_sle_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 sle v0, v1 + return v2 +} + +; block0: +; vchb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ugt_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ugt v0, v1 + return v2 +} + +; block0: +; vchlb %v24, %v24, %v25 +; br %r14 + +function %icmp_ult_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ult v0, v1 + return v2 +} + +; block0: +; vchlb %v24, %v25, %v24 +; br %r14 + +function %icmp_uge_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 uge v0, v1 + return v2 +} + +; block0: +; vchlb %v5, %v25, %v24 +; vno %v24, %v5, %v5 +; br %r14 + +function %icmp_ule_i8x16(i8x16, i8x16) -> b8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp.i8x16 ule v0, v1 + return v2 +} + +; block0: +; vchlb %v5, %v24, %v25 +; vno %v24, %v5, %v5 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif new file mode 100644 index 000000000000..5ee1ef906fa6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif @@ -0,0 +1,807 @@ +test compile precise-output +target s390x arch13 + +function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 
0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, v1 + return +} + +; 
block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br 
%r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_little_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlbrreph %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 { 
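A note on the lane numbers in the expected output above: CLIF lane indices are little-endian, while the s390x vector instructions number elements from the most-significant end, so lane L of an N-lane vector is addressed as hardware element N - 1 - L (lane 0 of an i16x8 becomes vleh element 7, lane 1 of an i64x2 becomes vsteg element 0, and so on). A minimal Rust sketch of that index translation, for illustration only (the helper name be_lane is not part of this patch):

/// Illustrative only: map a little-endian CLIF lane index to the
/// big-endian element number used by the s390x vector instructions.
fn be_lane(lane_count: u8, lane: u8) -> u8 {
    assert!(lane < lane_count, "lane index out of range");
    lane_count - 1 - lane
}

fn main() {
    // insertlane.i16x8 v0, v2, 0   =>  vleh %v24, 0(%r2), 7
    assert_eq!(be_lane(8, 0), 7);
    // extractlane.i64x2 v0, 1      =>  vsteg %v24, 0(%r2), 0
    assert_eq!(be_lane(2, 1), 0);
    // insertlane.i8x16 v0, v2, 15  =>  vleb %v24, 0(%r2), 0
    assert_eq!(be_lane(16, 15), 0);
}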
+block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif new file mode 100644 index 000000000000..7efa4e3b719a --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif @@ -0,0 +1,1964 @@ +test compile precise-output +target s390x + +function %insertlane_i64x2_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 1 +; br %r14 + +function %insertlane_i64x2_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 0 +; br %r14 + +function %insertlane_i64x2_imm_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vleig %v24, 123, 1 +; br %r14 + +function %insertlane_i64x2_imm_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vleig %v24, 123, 0 +; br %r14 + +function %insertlane_i64x2_lane_0_0(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_i64x2_lane_0_1(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi 
%v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_i64x2_lane_1_0(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function %insertlane_i64x2_lane_1_1(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_i32x4_0(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 3 +; br %r14 + +function %insertlane_i32x4_3(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 0 +; br %r14 + +function %insertlane_i32x4_imm_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vleif %v24, 123, 3 +; br %r14 + +function %insertlane_i32x4_imm_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vleif %v24, 123, 0 +; br %r14 + +function %insertlane_i32x4_lane_0_0(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_lane_0_3(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_0(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_3(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function 
%insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %insertlane_i16x8_0(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 7 +; br %r14 + +function %insertlane_i16x8_7(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 0 +; br %r14 + +function %insertlane_i16x8_imm_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vleih %v24, 123, 7 +; br %r14 + +function %insertlane_i16x8_imm_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vleih %v24, 123, 0 +; br %r14 + +function %insertlane_i16x8_lane_0_0(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 3 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_lane_0_7(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vreph %v5, %v25, 7 +; vgbm %v7, 49152 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_0(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vreph %v5, %v25, 0 +; vgbm %v7, 3 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_7(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vgbm %v5, 49152 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 0 +; br %r14 + +function %insertlane_i8x16_0(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 15 +; br %r14 + +function %insertlane_i8x16_15(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 0 +; br %r14 + +function %insertlane_i8x16_imm_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 
123 + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vleib %v24, 123, 15 +; br %r14 + +function %insertlane_i8x16_imm_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 123 + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vleib %v24, 123, 0 +; br %r14 + +function %insertlane_i8x16_lane_0_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 1 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_lane_0_15(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vrepb %v5, %v25, 15 +; vgbm %v7, 32768 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vrepb %v5, %v25, 0 +; vgbm %v7, 1 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_15(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vgbm %v5, 32768 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_0(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 0 + return v2 +} + +; block0: +; vpdi %v24, %v24, %v0, 0 +; br %r14 + +function %insertlane_f64x2_1(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 1 + return v2 +} + +; block0: +; vpdi %v24, %v0, %v24, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_0(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_1(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_f64x2_lane_1_0(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function %insertlane_f64x2_lane_1_1(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 
v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_f32x4_0(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 0 + return v2 +} + +; block0: +; vrepf %v5, %v0, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_3(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 3 + return v2 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v0, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_0(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_3(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_0(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_3(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %extractlane_i64x2_0(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 1 +; br %r14 + +function %extractlane_i64x2_1(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, 
v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_0(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 3 +; br %r14 + +function %extractlane_i32x4_3(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_0(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 7 +; br %r14 + +function %extractlane_i16x8_7(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvh %r3, %v24, 7 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vlgvh %r3, %v24, 0 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i8x16_0(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 15 +; br %r14 + +function %extractlane_i8x16_15(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = 
extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_0(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + return v1 +} + +; block0: +; vrepg %v0, %v24, 1 +; br %r14 + +function %extractlane_f64x2_1(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + return v1 +} + +; block0: +; vrepg %v0, %v24, 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_0(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + return v1 +} + +; block0: +; vrepf %v0, %v24, 3 +; br %r14 + +function %extractlane_f32x4_3(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + return v1 +} + +; block0: +; vrepf %v0, %v24, 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_little_0(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %splat_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; ldgr %f3, %r2 +; vrepg %v24, %v3, 0 +; br %r14 + +function %splat_i64x2_imm() -> i64x2 { +block0: + v0 = iconst.i64 123 + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; vrepig %v24, 123 +; br %r14 + +function %splat_i64x2_lane_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_i64x2_lane_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = 
splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_i32x4(i32) -> i32x4 { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vlvgf %v3, %r2, 0 +; vrepf %v24, %v3, 0 +; br %r14 + +function %splat_i32x4_imm() -> i32x4 { +block0: + v0 = iconst.i32 123 + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vrepif %v24, 123 +; br %r14 + +function %splat_i32x4_lane_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + +function %splat_i16x8(i16) -> i16x8 { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vlvgh %v3, %r2, 0 +; vreph %v24, %v3, 0 +; br %r14 + +function %splat_i16x8_imm() -> i16x8 { +block0: + v0 = iconst.i16 123 + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vrepih %v24, 123 +; br %r14 + +function %splat_i16x8_lane_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, %v24, 7 +; br %r14 + +function %splat_i16x8_lane_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, %v24, 0 +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; lrvh %r5, 0(%r2) +; vlvgh %v5, %r5, 0 +; vreph %v24, %v5, 0 +; br %r14 + +function %splat_i8x16(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vlvgb %v3, %r2, 0 +; vrepb %v24, %v3, 0 +; br %r14 + +function %splat_i8x16_imm() -> i8x16 { +block0: + v0 = iconst.i8 123 + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vrepib %v24, 123 +; br %r14 + +function %splat_i8x16_lane_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 15 +; br %r14 + +function %splat_i8x16_lane_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 0 +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = splat.f64x2 v0 + 
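The little-endian cases above show the extra scalar step this target takes: a load little followed by splat or insertlane lowers to a byte-reversing GPR load (lrv, lrvg, lrvh) followed by vlvg* or a replicate, whereas the earlier listings in this patch, presumably compiled with newer vector facilities enabled, fold the byte swap into a single element load such as vlebrh or vlbrrepg. The two forms are equivalent, as this small illustrative Rust check suggests (the values are editorial, not from the tests):

fn main() {
    // Memory holds the little-endian encoding of 0x1234.
    let bytes = [0x34u8, 0x12];
    // One path: interpret the bytes as little-endian directly,
    // which is what the byte-reversing vector loads achieve.
    let le_load = u16::from_le_bytes(bytes);
    // Other path: a big-endian interpretation followed by a byte
    // reversal in a register, which is what lrvh provides before
    // vlvgh inserts the value into the vector.
    let be_load_then_swap = u16::from_be_bytes(bytes).swap_bytes();
    assert_eq!(le_load, 0x1234);
    assert_eq!(le_load, be_load_then_swap);
}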
return v1 +} + +; block0: +; vrepg %v24, %v0, 0 +; br %r14 + +function %splat_f64x2_lane_0(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_f64x2_lane_1(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v24, %v0, 0 +; br %r14 + +function %splat_f32x4_lane_0(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + +function %scalar_to_vector_i64x2(i64) -> i64x2 { +block0(v0: i64): + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgg %v24, %r2, 1 +; br %r14 + +function %scalar_to_vector_i64x2_imm() -> i64x2 { +block0: + v0 = iconst.i64 123 + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleig %v24, 123, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_0(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_1(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_i32x4(i32) -> i32x4 { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgf %v24, %r2, 3 +; br %r14 + +function %scalar_to_vector_i32x4_imm() -> i32x4 { +block0: + v0 = iconst.i32 123 + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleif %v24, 123, 3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_0(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v3, 15 +; vn %v24, 
%v24, %v3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_3(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %scalar_to_vector_i16x8(i16) -> i16x8 { +block0(v0: i16): + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgh %v24, %r2, 7 +; br %r14 + +function %scalar_to_vector_i16x8_imm() -> i16x8 { +block0: + v0 = iconst.i16 123 + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleih %v24, 123, 7 +; br %r14 + +function %scalar_to_vector_i16x8_lane_0(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v3, 3 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i16x8_lane_7(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vreph %v3, %v24, 0 +; vgbm %v5, 3 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %scalar_to_vector_i8x16(i8) -> i8x16 { +block0(v0: i8): + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgb %v24, %r2, 15 +; br %r14 + +function %scalar_to_vector_i8x16_imm() -> i8x16 { +block0: + v0 = iconst.i8 123 + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleib %v24, 123, 15 +; br %r14 + +function %scalar_to_vector_i8x16_lane_0(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v3, 1 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i8x16_lane_15(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v3, %v24, 0 +; vgbm %v5, 1 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2(f64) -> f64x2 { +block0(v0: f64): + v1 = scalar_to_vector.f64x2 v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v0, 0 +; br %r14 + +function %scalar_to_vector_f64x2_lane_0(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = 
scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_f64x2_lane_1(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_f32x4(f32) -> f32x4 { +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v3, %v0, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_lane_0(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v3, 15 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_f32x4_lane_3(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-logical.clif b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif new file mode 100644 index 000000000000..b0375f81dc10 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif @@ -0,0 +1,675 @@ +test compile precise-output +target s390x + +function %vany_true_i64x2(i64x2) -> b1 { +block0(v0: i64x2): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqgs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i32x4(i32x4) -> b1 { +block0(v0: i32x4): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqfs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i16x8(i16x8) -> b1 { +block0(v0: i16x8): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqhs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_i8x16(i8x16) -> b1 { +block0(v0: i8x16): + v1 = vany_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqbs %v5, %v24, %v3 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vall_true_i64x2(i64x2) -> b1 { +block0(v0: i64x2): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqgs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_i32x4(i32x4) -> b1 { +block0(v0: i32x4): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqfs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_i16x8(i16x8) -> b1 { +block0(v0: i16x8): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqhs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 
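In the vany_true / vall_true listings above, and in the fused icmp/fcmp variants that follow, the comparison carries an S suffix (vceqgs, vchlgs, vfcedbs, ...), meaning it also sets the condition code: CC0 when every lane satisfied the comparison and CC3 when no lane did. The boolean result is then materialized with lhi plus a load-halfword-immediate-on-condition. A small illustrative Rust table of the mnemonics seen in these tests (the names and structure are editorial, not code from the patch):

#[derive(Clone, Copy)]
enum Reduce {
    Any,
    All,
}

/// Which load-on-condition mnemonic the listings use, for the
/// non-inverted comparisons (eq, gt, ge, ...); the inverted forms
/// (ne, ule, ...) simply flip the condition.
fn loc_mnemonic(reduce: Reduce, fused_with_compare: bool) -> &'static str {
    match (reduce, fused_with_compare) {
        // Fused with icmp/fcmp: CC0 = all lanes matched, CC3 = none did.
        (Reduce::Any, true) => "lochino", // 1 unless no lane matched
        (Reduce::All, true) => "lochie",  // 1 only if every lane matched
        // Plain vector input: it is first compared against zero,
        // so the sense is inverted relative to the fused case.
        (Reduce::Any, false) => "lochine", // 1 unless every lane is zero
        (Reduce::All, false) => "lochio",  // 1 only if no lane is zero
    }
}

fn main() {
    assert_eq!(loc_mnemonic(Reduce::All, false), "lochio");
    assert_eq!(loc_mnemonic(Reduce::Any, true), "lochino");
}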
+; br %r14 + +function %vall_true_i8x16(i8x16) -> b1 { +block0(v0: i8x16): + v1 = vall_true v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vceqbs %v5, %v24, %v3 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vany_true_icmp_eq_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp eq v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_ne_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ne v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_sgt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sgt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_sle_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sle v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_slt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp slt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_sge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_ugt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_ule_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_icmp_ult_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_icmp_uge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_eq_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp eq v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ne_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ne v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_gt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp gt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ule_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ule v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_ge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ge v0, v1 + v3 = 
vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ult_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ult v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_lt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp lt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_uge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp uge v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vany_true_fcmp_le_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp le v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochino %r2, 1 +; br %r14 + +function %vany_true_fcmp_ugt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ugt v0, v1 + v3 = vany_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochine %r2, 1 +; br %r14 + +function %vall_true_icmp_eq_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp eq v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_ne_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ne v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vceqgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_sgt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sgt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_sle_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sle v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_slt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp slt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_sge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_ugt_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ugt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_ule_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ule v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_icmp_ult_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp ult v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_icmp_uge_i64x2(i64x2, i64x2) -> b1 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp uge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vchlgs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; 
br %r14 + +function %vall_true_fcmp_eq_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp eq v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ne_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ne v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfcedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_gt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp gt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ule_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ule v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_ge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ult_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ult v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v24, %v25 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_lt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp lt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_uge_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp uge v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchdbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vall_true_fcmp_le_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp le v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %vall_true_fcmp_ugt_f64x2(f64x2, f64x2) -> b1 { +block0(v0: f64x2, v1: f64x2): + v2 = fcmp ugt v0, v1 + v3 = vall_true v2 + return v3 +} + +; block0: +; vfchedbs %v5, %v25, %v24 +; lhi %r2, 0 +; lochio %r2, 1 +; br %r14 + +function %vhigh_bits(i64x2) -> i64 { +block0(v0: i64x2): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808080800040 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i32x4) -> i64 { +block0(v0: i32x4): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808000204060 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i16x8) -> i64 { +block0(v0: i16x8): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080800010203040506070 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits(i8x16) -> i64 { +block0(v0: i8x16): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00081018202830384048505860687078 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-permute.clif b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif new file mode 100644 index 000000000000..4e5f7019c5a4 --- /dev/null +++ 
b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif @@ -0,0 +1,493 @@ +test compile precise-output +target s390x + +function %swizzle(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = swizzle.i8x16 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vrepib %v7, 239 +; vno %v17, %v25, %v25 +; vmxlb %v19, %v7, %v17 +; vperm %v24, %v5, %v24, %v19 +; br %r14 + +function %shuffle_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; block0: +; vrepib %v5, 15 +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_1(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; block0: +; bras %r1, 20 ; data.u128 0x0a1e000d0b1702180403090b15100f0c ; vl %v5, 0(%r1) +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_2(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47] + return v2 +} + +; block0: +; vgbm %v5, 1 +; bras %r1, 20 ; data.u128 0x8080808080808080808080808080800f ; vl %v7, 0(%r1) +; vperm %v17, %v24, %v25, %v7 +; vn %v24, %v5, %v17 +; br %r14 + +function %shuffle_vmrhg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhb_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhb_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v24 +; br %r14 + +function 
%shuffle_vmrhh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhb_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhb_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlb_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlb_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v24 
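The shuffle tests in this file exercise two lowering strategies: masks that match a merge (vmrh*, vmrl*) or pack (vpk*) pattern are emitted as that single instruction, and everything else falls back to vperm with a permute-control vector loaded from the literal pool, as in %shuffle_1 above. Because CLIF numbers the concatenated source bytes little-endian while vperm numbers them big-endian, both the source index and the destination position are mirrored when that constant is built. An illustrative Rust sketch of the translation (vperm_control is an editorial name, not code from this patch):

/// Derive the 16 permute-control bytes for vperm from a CLIF shuffle mask.
/// CLIF byte j of the pair (x, y) corresponds to big-endian source byte
/// 15 - j (from x) or 47 - j (from y), and CLIF output byte i lands in
/// big-endian result element 15 - i.
fn vperm_control(mask: [u8; 16]) -> [u8; 16] {
    let mut ctrl = [0u8; 16];
    for (i, &m) in mask.iter().enumerate() {
        ctrl[15 - i] = if m < 16 { 15 - m } else { 47 - m };
    }
    ctrl
}

fn main() {
    // The %shuffle_1 test above loads the pool constant
    // 0x0a1e000d0b1702180403090b15100f0c as its control vector.
    let mask = [3, 0, 31, 26, 4, 6, 12, 11, 23, 13, 24, 4, 2, 15, 17, 5];
    let expected = 0x0a1e000d0b1702180403090b15100f0c_u128.to_be_bytes();
    assert_eq!(vperm_control(mask), expected);
}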
+; br %r14 + +function %shuffle_vmrlh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlb_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlb_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v25 +; br %r14 + +;; Special patterns that can be implemented via PACK. +function %shuffle_vpkg_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkf_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkh_xy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkg_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkf_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkh_yx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkg_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkf_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkh_xx(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkg_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 16 17 18 19 24 25 
26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkf_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkh_yy(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v25 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif b/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif new file mode 100644 index 000000000000..7713bd0f3340 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-shift-rotate.clif @@ -0,0 +1,427 @@ +test compile precise-output +target s390x + +function %rotr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = rotr.i64x2 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllg %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = rotr.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 47 +; br %r14 + +function %rotr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = rotr.i32x4 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllf %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = rotr.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 15 +; br %r14 + +function %rotr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = rotr.i16x8 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllh %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = rotr.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 6 +; br %r14 + +function %rotr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = rotr.i8x16 v0, v1 + return v2 +} + +; block0: +; lcr %r3, %r2 +; verllb %v24, %v24, 0(%r3) +; br %r14 + +function %rotr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = rotr.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 5 +; br %r14 + +function %rotl_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = rotl.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = rotl.i64x2 v0, v1 + return v2 +} + +; block0: +; verllg %v24, %v24, 17 +; br %r14 + +function %rotl_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = rotl.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = rotl.i32x4 v0, v1 + return v2 +} + +; block0: +; verllf %v24, %v24, 17 +; br %r14 + +function %rotl_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = rotl.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 0(%r2) +; br %r14 + +function %rotl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = rotl.i16x8 v0, v1 + return v2 +} + +; block0: +; verllh %v24, %v24, 10 +; br %r14 + +function %rotl_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = rotl.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 0(%r2) +; br %r14 + +function
%rotl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = rotl.i8x16 v0, v1 + return v2 +} + +; block0: +; verllb %v24, %v24, 3 +; br %r14 + +function %ushr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = ushr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrlg %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = ushr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrlg %v24, %v24, 17 +; br %r14 + +function %ushr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ushr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesrlf %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = ushr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesrlf %v24, %v24, 17 +; br %r14 + +function %ushr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = ushr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrlh %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = ushr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrlh %v24, %v24, 10 +; br %r14 + +function %ushr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = ushr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrlb %v24, %v24, 0(%r2) +; br %r14 + +function %ushr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = ushr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrlb %v24, %v24, 3 +; br %r14 + +function %ishl_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = ishl.i64x2 v0, v1 + return v2 +} + +; block0: +; veslg %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = ishl.i64x2 v0, v1 + return v2 +} + +; block0: +; veslg %v24, %v24, 17 +; br %r14 + +function %ishl_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ishl.i32x4 v0, v1 + return v2 +} + +; block0: +; veslf %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 = ishl.i32x4 v0, v1 + return v2 +} + +; block0: +; veslf %v24, %v24, 17 +; br %r14 + +function %ishl_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = ishl.i16x8 v0, v1 + return v2 +} + +; block0: +; veslh %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = ishl.i16x8 v0, v1 + return v2 +} + +; block0: +; veslh %v24, %v24, 10 +; br %r14 + +function %ishl_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = ishl.i8x16 v0, v1 + return v2 +} + +; block0: +; veslb %v24, %v24, 0(%r2) +; br %r14 + +function %ishl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = ishl.i8x16 v0, v1 + return v2 +} + +; block0: +; veslb %v24, %v24, 3 +; br %r14 + +function %sshr_i64x2_reg(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = sshr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrag %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 17 + v2 = sshr.i64x2 v0, v1 + return v2 +} + +; block0: +; vesrag %v24, %v24, 17 +; br %r14 + +function %sshr_i32x4_reg(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = sshr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesraf %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 17 + v2 =
sshr.i32x4 v0, v1 + return v2 +} + +; block0: +; vesraf %v24, %v24, 17 +; br %r14 + +function %sshr_i16x8_reg(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = sshr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrah %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 10 + v2 = sshr.i16x8 v0, v1 + return v2 +} + +; block0: +; vesrah %v24, %v24, 10 +; br %r14 + +function %sshr_i8x16_reg(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = sshr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrab %v24, %v24, 0(%r2) +; br %r14 + +function %sshr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = sshr.i8x16 v0, v1 + return v2 +} + +; block0: +; vesrab %v24, %v24, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif new file mode 100644 index 000000000000..a60e7b619476 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif @@ -0,0 +1,375 @@ +test compile precise-output +target s390x arch13 + +function %uload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %store_i8x16_big(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br 
%r14 + +function %store_f32x4_big(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %store_i8x16_little(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i64x2_little(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem.clif b/cranelift/filetests/filetests/isa/s390x/vecmem.clif new file mode 100644 index 000000000000..5cb297dde83b --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem.clif @@ -0,0 +1,463 @@ +test compile precise-output +target 
s390x + +function %uload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vl %v24, 0(%r2) +; br %r14 + +function %store_i8x16_big(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f32x4_big(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vst %v24, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhb %v24, %v5 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhh %v24, %v5 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhf %v24, %v5 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphb %v24, %v5 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 { 
+block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphh %v24, %v5 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphf %v24, %v5 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_sum_little(i64, i64) -> f64x2 { +block0(v0: i64, v1: i64): + v2 = iadd.i64 v0, v1 + v3 = load.f64x2 little v2 + return v3 +} + +; block0: +; lrvg %r4, 0(%r3,%r2) +; lrvg %r5, 8(%r3,%r2) +; vlvgp %v24, %r5, %r4 +; br %r14 + +function %load_f64x2_off_little(i64) -> f64x2 { +block0(v0: i64): + v1 = load.f64x2 little v0+128 + return v1 +} + +; block0: +; lrvg %r5, 128(%r2) +; lrvg %r3, 136(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %store_i8x16_little(i8x16, i64) { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i64x2_little(i64x2, i64) { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_sum_little(f64x2, i64, i64) { +block0(v0: f64x2, v1: i64, v2: i64): + v3 = iadd.i64 v1, v2 + store.f64x2 little v0, v3 + return +} + +; block0: +; vlgvg %r5, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r5, 0(%r3,%r2) +; strvg 
%r4, 8(%r3,%r2) +; br %r14 + +function %store_f64x2_off_little(f64x2, i64) { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1+128 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 128(%r2) +; strvg %r4, 136(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif index acd7b290dd8c..f5bf2a002ad1 100644 --- a/cranelift/filetests/filetests/runtests/fmax-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmax-pseudo.clif @@ -2,6 +2,7 @@ test interpret test run target x86_64 target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug function %fmax_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): diff --git a/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif b/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif index 9bbba57559ab..520f3aaff85b 100644 --- a/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif +++ b/cranelift/filetests/filetests/runtests/fmin-max-pseudo-vector.clif @@ -1,7 +1,7 @@ test run -; target s390x TODO: Not yet implemented on s390x set enable_simd target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug target x86_64 skylake function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 { diff --git a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif index fc88e34611f1..cb4857d8daba 100644 --- a/cranelift/filetests/filetests/runtests/fmin-pseudo.clif +++ b/cranelift/filetests/filetests/runtests/fmin-pseudo.clif @@ -2,6 +2,7 @@ test interpret test run target x86_64 target aarch64 +; target s390x FIXME: This currently fails under qemu due to a qemu bug function %fmin_p_f32(f32, f32) -> f32 { block0(v0: f32, v1: f32): diff --git a/cranelift/filetests/filetests/runtests/shifts-small-types.clif b/cranelift/filetests/filetests/runtests/shifts-small-types.clif index eae20a8ef01e..9b2207a3933d 100644 --- a/cranelift/filetests/filetests/runtests/shifts-small-types.clif +++ b/cranelift/filetests/filetests/runtests/shifts-small-types.clif @@ -1,7 +1,8 @@ test run target aarch64 +target s390x -; TODO: Merge this with the main shifts file when x86_64 & s390x passes these. +; TODO: Merge this with the main shifts file when x86_64 passes these. function %ishl_i16_i64(i16, i64) -> i16 { block0(v0: i16, v1: i64): diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif index f657d1e5336e..92ffddeef20f 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic-nondeterministic-aarch64.clif @@ -3,6 +3,7 @@ ; simd-arithmetic-nondeterministic*.clif as well. 
test run target aarch64 +target s390x function %fmax_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif index 1ca8e8fcfa3b..58a0dc1c21f6 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif index a2086b0426b7..4021e89fee42 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set opt_level=speed_and_size set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif index 3d67ff290504..18027373f8c8 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif @@ -1,6 +1,7 @@ test run set enable_simd target aarch64 +target s390x target x86_64 has_sse3 has_ssse3 has_sse41 function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif b/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif index af7b24d5e64a..ce3ffa5321df 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif @@ -1,6 +1,7 @@ test run set enable_simd target aarch64 +; target s390x FIXME: s390x implements modulo semantics for shift counts target x86_64 skylake ; TODO: once available, replace all lane extraction with `icmp + all_ones` diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise.clif b/cranelift/filetests/filetests/runtests/simd-bitwise.clif index 670844db22bf..251f9516c15d 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitwise.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +; target s390x FIXME: s390x implements modulo semantics for shift counts set enable_simd target x86_64 skylake diff --git a/cranelift/filetests/filetests/runtests/simd-comparison.clif b/cranelift/filetests/filetests/runtests/simd-comparison.clif index cd8341127d3f..dd8c6a80b2ef 100644 --- a/cranelift/filetests/filetests/runtests/simd-comparison.clif +++ b/cranelift/filetests/filetests/runtests/simd-comparison.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-conversion.clif b/cranelift/filetests/filetests/runtests/simd-conversion.clif index 2903a34e1680..da8bced238af 100644 --- a/cranelift/filetests/filetests/runtests/simd-conversion.clif +++ b/cranelift/filetests/filetests/runtests/simd-conversion.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git 
a/cranelift/filetests/filetests/runtests/simd-extractlane.clif b/cranelift/filetests/filetests/runtests/simd-extractlane.clif index 6aba1c67ba61..471130f8c252 100644 --- a/cranelift/filetests/filetests/runtests/simd-extractlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-extractlane.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-iabs.clif b/cranelift/filetests/filetests/runtests/simd-iabs.clif index 95b53844c624..b9d6468f9c51 100644 --- a/cranelift/filetests/filetests/runtests/simd-iabs.clif +++ b/cranelift/filetests/filetests/runtests/simd-iabs.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif index 92f5d776fe6d..3deeb6cddd20 100644 --- a/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif +++ b/cranelift/filetests/filetests/runtests/simd-iaddpairwise.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x function %iaddp_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index f3624f1bf19f..e0965d8324e3 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 @@ -30,4 +31,4 @@ block0(v0: i64x2, v1: i64): v2 = insertlane v0, v1, 0 return v2 } -; run: %insertlane_0([1 1], 5000000000) == [5000000000 1] \ No newline at end of file +; run: %insertlane_0([1 1], 5000000000) == [5000000000 1] diff --git a/cranelift/filetests/filetests/runtests/simd-lane-access.clif b/cranelift/filetests/filetests/runtests/simd-lane-access.clif index 0818bdd85b89..d43a0e20cf63 100644 --- a/cranelift/filetests/filetests/runtests/simd-lane-access.clif +++ b/cranelift/filetests/filetests/runtests/simd-lane-access.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-logical.clif b/cranelift/filetests/filetests/runtests/simd-logical.clif index 081d4892de81..406ea9698ddd 100644 --- a/cranelift/filetests/filetests/runtests/simd-logical.clif +++ b/cranelift/filetests/filetests/runtests/simd-logical.clif @@ -1,6 +1,6 @@ test run target aarch64 -; target s390x TODO: Not yet implemented on s390x +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-min-max.clif b/cranelift/filetests/filetests/runtests/simd-min-max.clif index 54653616ffbf..7a4cc0a0784d 100644 --- a/cranelift/filetests/filetests/runtests/simd-min-max.clif +++ b/cranelift/filetests/filetests/runtests/simd-min-max.clif @@ -1,6 +1,7 @@ test run target aarch64 target x86_64 +target s390x function %imin_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-saddsat.clif b/cranelift/filetests/filetests/runtests/simd-saddsat.clif index 515cc83a4a2e..104041204932 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-saddsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-saddsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index b7850a578165..eaabb23768cf 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-snarrow.clif b/cranelift/filetests/filetests/runtests/simd-snarrow.clif index 082e86c179f2..86d3ee2100bc 100644 --- a/cranelift/filetests/filetests/runtests/simd-snarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-snarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-splat.clif b/cranelift/filetests/filetests/runtests/simd-splat.clif index 19892cb29b7c..1cfef52c78b4 100644 --- a/cranelift/filetests/filetests/runtests/simd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-splat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif index f6809ddc5c80..91554360b664 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat-aarch64.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ;; x86_64 hasn't implemented this for `i32x4` function %sqmulrs_i32x4(i32x4, i32x4) -> i32x4 { diff --git a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif index 723696d25a8f..d7d3ffec7b28 100644 --- a/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-sqmulroundsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-ssubsat.clif b/cranelift/filetests/filetests/runtests/simd-ssubsat.clif index 8841f2275f66..1a517b483a8e 100644 --- a/cranelift/filetests/filetests/runtests/simd-ssubsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-ssubsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif index 1d9c15581b76..169c9122e376 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif index bee577072d13..6c014ad4a4d0 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif index 390780879c77..e1c7fba879da 100644 --- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif +++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uaddsat.clif b/cranelift/filetests/filetests/runtests/simd-uaddsat.clif index d0af940abdee..5610d4d36c3a 100644 --- a/cranelift/filetests/filetests/runtests/simd-uaddsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-uaddsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-unarrow.clif b/cranelift/filetests/filetests/runtests/simd-unarrow.clif index e535df5e0778..f15a3217570c 100644 --- a/cranelift/filetests/filetests/runtests/simd-unarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-unarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-usubsat.clif b/cranelift/filetests/filetests/runtests/simd-usubsat.clif index ca8747c3e900..55a85c8c895b 100644 --- a/cranelift/filetests/filetests/runtests/simd-usubsat.clif +++ b/cranelift/filetests/filetests/runtests/simd-usubsat.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uunarrow.clif b/cranelift/filetests/filetests/runtests/simd-uunarrow.clif index b2a68c44802f..11ff104db149 100644 --- a/cranelift/filetests/filetests/runtests/simd-uunarrow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uunarrow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ; x86_64 panics: `Did not match fcvt input! 
; thread 'worker #0' panicked at 'register allocation: Analysis(EntryLiveinValues([v2V]))', cranelift/codegen/src/machinst/compile.rs:96:10` diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif index 959b6acd7336..aaf8d4102f99 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif index fab64406b946..90f14bb1d331 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue.clif b/cranelift/filetests/filetests/runtests/simd-valltrue.clif index c39a2702e6a4..c799893ac8e2 100644 --- a/cranelift/filetests/filetests/runtests/simd-valltrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-valltrue.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %vall_true_b8x16(b8x16) -> b1 { diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif index 74b99d785e4c..28e1c60a7d50 100644 --- a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %vany_true_b8x16(b8x16) -> b1 { diff --git a/cranelift/filetests/filetests/runtests/simd-vconst.clif b/cranelift/filetests/filetests/runtests/simd-vconst.clif index b2398b6ec01a..5aa5386484f4 100644 --- a/cranelift/filetests/filetests/runtests/simd-vconst.clif +++ b/cranelift/filetests/filetests/runtests/simd-vconst.clif @@ -1,5 +1,5 @@ test run -; target s390x TODO: Not yet implemented on s390x +target s390x target aarch64 set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif index e4ed0e42cf7e..d22abb702414 100644 --- a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif +++ b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-vselect.clif b/cranelift/filetests/filetests/runtests/simd-vselect.clif index 53ef6f6353d7..db5f9180433e 100644 --- a/cranelift/filetests/filetests/runtests/simd-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-vselect.clif @@ -1,6 +1,6 @@ test interpret test run -; target s390x TODO: Not yet implemented on s390x +target s390x target aarch64 set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif index dcfaba0294a9..c38099c429f0 100644 --- a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif +++ 
b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif index d3a8c655d171..445ccbcc148b 100644 --- a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif +++ b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif @@ -1,5 +1,6 @@ test run target aarch64 +target s390x ; raw_bitcast is needed to get around issue with "bint" on aarch64