diff --git a/build.rs b/build.rs
index 81b317f736..4507b4a054 100644
--- a/build.rs
+++ b/build.rs
@@ -887,10 +887,14 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "bn_gather5",
         "bn_mul_mont",
         "bn_mul_mont_gather5",
+        "bn_mul_mont_nohw",
+        "bn_mul4x_mont",
+        "bn_mulx4x_mont",
         "bn_neg_inv_mod_r_u64",
         "bn_power5",
         "bn_scatter5",
         "bn_sqr8x_internal",
+        "bn_sqr8x_mont",
         "bn_sqrx8x_internal",
         "bsaes_ctr32_encrypt_blocks",
         "bssl_constant_time_test_conditional_memcpy",
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index ce89b17679..21f66488da 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -65,7 +65,7 @@
 # output, so this isn't useful anyway.
 $addx = 1;
 
-# void bn_mul_mont(
+# void bn_mul_mont_nohw(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
 $bp="%rdx";	# const BN_ULONG *bp,
@@ -87,33 +87,15 @@
 
 .extern	OPENSSL_ia32cap_P
 
-.globl	bn_mul_mont
-.type	bn_mul_mont,\@function,6
+.globl	bn_mul_mont_nohw
+.type	bn_mul_mont_nohw,\@function,6
 .align	16
-bn_mul_mont:
+bn_mul_mont_nohw:
 .cfi_startproc
 	_CET_ENDBR
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-	test	\$3,${num}d
-	jnz	.Lmul_enter
-	cmp	\$8,${num}d
-	jb	.Lmul_enter
-___
-$code.=<<___ if ($addx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	8(%r11),%r11d
-___
-$code.=<<___;
-	cmp	$ap,$bp
-	jne	.Lmul4x_enter
-	test	\$7,${num}d
-	jz	.Lsqr8x_enter
-	jmp	.Lmul4x_enter
-
-.align	16
-.Lmul_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -348,27 +330,21 @@
 .Lmul_epilogue:
 	ret
 .cfi_endproc
-.size	bn_mul_mont,.-bn_mul_mont
+.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
 ___
 {{{
 my @A=("%r10","%r11");
 my @N=("%r13","%rdi");
 $code.=<<___;
+.globl	bn_mul4x_mont
 .type	bn_mul4x_mont,\@function,6
 .align	16
 bn_mul4x_mont:
 .cfi_startproc
+	_CET_ENDBR
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmul4x_enter:
-___
-$code.=<<___ if ($addx);
-	and	\$0x80100,%r11d
-	cmp	\$0x80100,%r11d
-	je	.Lmulx4x_enter
-___
-$code.=<<___;
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -825,13 +801,15 @@ $code.=<<___;
 
 .extern	bn_sqr8x_internal		# see x86_64-mont5 module
 
+.globl	bn_sqr8x_mont
 .type	bn_sqr8x_mont,\@function,6
 .align	32
bn_sqr8x_mont:
 .cfi_startproc
+	_CET_ENDBR
+	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lsqr8x_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -1024,13 +1002,14 @@ my $bp="%rdx";	# original value
 
 $code.=<<___;
+.globl	bn_mulx4x_mont
 .type	bn_mulx4x_mont,\@function,6
 .align	32
 bn_mulx4x_mont:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmulx4x_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -1535,9 +1514,9 @@
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_bn_mul_mont
-	.rva	.LSEH_end_bn_mul_mont
-	.rva	.LSEH_info_bn_mul_mont
+	.rva	.LSEH_begin_bn_mul_mont_nohw
+	.rva	.LSEH_end_bn_mul_mont_nohw
+	.rva	.LSEH_info_bn_mul_mont_nohw
 
 	.rva	.LSEH_begin_bn_mul4x_mont
 	.rva	.LSEH_end_bn_mul4x_mont
@@ -1555,7 +1534,7 @@ $code.=<<___;
 .section	.xdata
 .align	8
-.LSEH_info_bn_mul_mont:
+.LSEH_info_bn_mul_mont_nohw:
 	.byte	9,0,0,0
 	.rva	mul_handler
 	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 3fbb7d7521..d44b120a13 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -166,22 +166,40 @@ typedef crypto_word_t BN_ULONG;
 #endif
 
+// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
+// If neither is fully reduced, the output may not be either.
+//
+// This function allocates |num| words on the stack, so |num| should be at most
+// |BN_MONTGOMERY_MAX_WORDS|.
+//
+// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
+// off upper bits. The aarch64 implementation expects a 64-bit input and does
+// not. |size_t| is the safer option but not strictly correct for x86_64. But
+// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
+//
+// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
+// inputs.
+//
+// |num| must be at least 4, at least on x86.
 //
 // In other forks, |bn_mul_mont| returns an |int| indicating whether it
 // actually did the multiplication. All our implementations always do the
 // multiplication, and forcing callers to deal with the possibility of it
 // failing just leads to further problems.
-//
-// In other forks, |bn_mod_mul|'s `num` argument has type |int| but it is
-// implicitly treated as a |size_t|; when |int| is smaller than |size_t|
-// then the |movq 48(%rsp),%r9| done by x86_64-xlate.pl implicitly does the
-// conversion.
 OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
                           (sizeof(int) == 4 && sizeof(size_t) == 8),
                       "int and size_t ABI mismatch");
+#if !defined(OPENSSL_X86_64)
 void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+#else
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+static inline void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                               const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+  bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
+}
+#endif
 
 static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
                                  BN_ULONG a, BN_ULONG b) {
diff --git a/src/arithmetic.rs b/src/arithmetic.rs
index b240d9df2a..739f2d7822 100644
--- a/src/arithmetic.rs
+++ b/src/arithmetic.rs
@@ -12,6 +12,9 @@
 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
+#[macro_use]
+mod ffi;
+
 mod constant;
 
 #[cfg(feature = "alloc")]
@@ -22,6 +25,8 @@ pub mod montgomery;
 
 mod n0;
 
+pub(crate) use self::ffi::BIGINT_MODULUS_MIN_LIMBS;
+
 #[allow(dead_code)]
 const BIGINT_MODULUS_MAX_LIMBS: usize = 8192 / crate::limb::LIMB_BITS;
 
diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs
index e0be21c8b4..276f2b8c34 100644
--- a/src/arithmetic/bigint.rs
+++ b/src/arithmetic/bigint.rs
@@ -100,8 +100,7 @@ fn from_montgomery_amm(limbs: BoxedLimbs, m: &Modulus) -> Elem(
         let src2 = entry(previous, src2, num_limbs);
         let dst = entry_mut(rest, 0, num_limbs);
         limbs_mul_mont(
-            InOut::Disjoint(dst, src1),
-            src2,
+            InOut::Disjoint(dst, src1, src2),
             m.limbs(),
             m.n0(),
             m.cpu_features(),
diff --git a/src/arithmetic/bigint/modulus.rs b/src/arithmetic/bigint/modulus.rs
index d4ce866745..b79aa40ee1 100644
--- a/src/arithmetic/bigint/modulus.rs
+++ b/src/arithmetic/bigint/modulus.rs
@@ -21,12 +21,7 @@ use crate::{
 };
 use core::marker::PhantomData;
 
-/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
-/// limbs. For a long time we have required 4 limbs for all targets, though
-/// this may be unnecessary. TODO: Replace this with
-/// `n.len() < 256 / LIMB_BITS` so that 32-bit and 64-bit platforms behave the
-/// same.
-pub const MODULUS_MIN_LIMBS: usize = 4;
+pub const MODULUS_MIN_LIMBS: usize = super::super::BIGINT_MODULUS_MIN_LIMBS;
 
 pub const MODULUS_MAX_LIMBS: usize = super::super::BIGINT_MODULUS_MAX_LIMBS;
 
diff --git a/src/arithmetic/ffi.rs b/src/arithmetic/ffi.rs
new file mode 100644
index 0000000000..224af15acd
--- /dev/null
+++ b/src/arithmetic/ffi.rs
@@ -0,0 +1,129 @@
+// Copyright 2024-2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use super::{n0::N0, InOut};
+use crate::{c, limb::Limb};
+
+// See the assertions in `bn_mul_mont_ffi` below.
+// TODO: Replace this with `n.len() < 256 / LIMB_BITS` so that 32-bit and
+// 64-bit platforms behave the same.
+pub(crate) const BIGINT_MODULUS_MIN_LIMBS: usize = 4;
+
+/// `unsafe { ([Limb; chunk_len], min_chunks, T) => f }` means it is safe to
+/// call `f` if `n.len() >= min_chunks * chunk_len`, `n.len() % chunk_len == 0`,
+/// the slice(s) in `in_out` have the same length as `n`, and we have
+/// constructed a value of type `T`.
+macro_rules! bn_mul_mont_ffi {
+    ( $in_out:expr, $n:expr, $n0:expr, $cpu:expr,
+      unsafe { ([Limb; $CHUNK:expr], $MIN_CHUNKS:expr, $Cpu:ty) => $f:ident }) => {{
+        use crate::{c, limb::Limb};
+        prefixed_extern! {
+            // `r` and/or `a` and/or `b` may alias.
+            // XXX: BoringSSL declares these functions to return `int`.
+            fn $f(
+                r: *mut Limb,
+                a: *const Limb,
+                b: *const Limb,
+                n: *const Limb,
+                n0: &N0,
+                len: c::size_t,
+            );
+        }
+        unsafe {
+            crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $CHUNK }, { $CHUNK * $MIN_CHUNKS }>(
+                $in_out, $n, $n0, $cpu, $f,
+            )
+        }
+    }};
+}
+
+#[inline]
+pub(super) unsafe fn bn_mul_mont_ffi<Cpu, const CHUNK: usize, const MIN_LEN: usize>(
+    in_out: InOut<[Limb]>,
+    n: &[Limb],
+    n0: &N0,
+    cpu: Cpu,
+    f: unsafe extern "C" fn(
+        r: *mut Limb,
+        a: *const Limb,
+        b: *const Limb,
+        n: *const Limb,
+        n0: &N0,
+        len: c::size_t,
+    ),
+) {
+    /// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
+    /// limbs. For a long time we have required 4 limbs for all targets, though
+    /// this may be unnecessary.
+    const _BIGINT_MODULUS_MIN_LIMBS_AT_LEAST_4: () = assert!(BIGINT_MODULUS_MIN_LIMBS >= 4);
+    assert!(CHUNK > 0);
+    assert!(n.len() % CHUNK == 0);
+    assert!(MIN_LEN >= BIGINT_MODULUS_MIN_LIMBS);
+    assert!(n.len() >= MIN_LEN);
+
+    let (r, a, b) = match in_out {
+        InOut::SquareInPlace(r) => {
+            assert_eq!(r.len(), n.len());
+            (r.as_mut_ptr(), r.as_ptr(), r.as_ptr())
+        }
+        InOut::InPlace(r, a) => {
+            assert_eq!(r.len(), n.len());
+            assert_eq!(a.len(), n.len());
+            (r.as_mut_ptr(), r.as_ptr(), a.as_ptr())
+        }
+        InOut::Disjoint(r, a, b) => {
+            assert_eq!(r.len(), n.len());
+            assert_eq!(a.len(), n.len());
+            assert_eq!(b.len(), n.len());
+            (r.as_mut_ptr(), a.as_ptr(), b.as_ptr())
+        }
+    };
+    let num_limbs = n.len();
+    let n = n.as_ptr();
+    let _: Cpu = cpu;
+    unsafe { f(r, a, b, n, n0, num_limbs) };
+}
+
+// `bn_sqr8x_mont` has a weird signature so it has to be handled separately.
+// Note that MULX is in BMI2.
+#[cfg(target_arch = "x86_64")]
+pub(super) fn bn_sqr8x_mont(
+    r: &mut [Limb],
+    n: &[[Limb; 8]],
+    n0: &N0,
+    mulx_adx: Option<(crate::cpu::intel::Bmi2, crate::cpu::intel::Adx)>,
+) {
+    use crate::{bssl, polyfill::slice};
+    prefixed_extern! {
+        // `rp` and `ap` may alias.
+        fn bn_sqr8x_mont(
+            rp: *mut Limb,
+            ap: *const Limb,
+            mulx_adx_capable: Limb,
+            np: *const Limb,
+            n0: &N0,
+            num: c::size_t) -> bssl::Result;
+    }
+    assert!(!n.is_empty());
+    let n = slice::flatten(n);
+    assert_eq!(r.len(), n.len());
+
+    let r_out = r.as_mut_ptr();
+    let r_in = r.as_ptr();
+    let mulx_adx_capable = Limb::from(mulx_adx.is_some());
+    let num = n.len();
+    let n = n.as_ptr();
+    let r = unsafe { bn_sqr8x_mont(r_out, r_in, mulx_adx_capable, n, n0, num) };
+    assert!(Result::from(r).is_ok());
+}
diff --git a/src/arithmetic/inout.rs b/src/arithmetic/inout.rs
index daafb4f5a9..a576ec91de 100644
--- a/src/arithmetic/inout.rs
+++ b/src/arithmetic/inout.rs
@@ -13,7 +13,8 @@
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 pub enum InOut<'io, T: ?Sized> {
-    InPlace(&'io mut T),
+    SquareInPlace(&'io mut T),
+    InPlace(&'io mut T, &'io T),
     #[cfg_attr(target_arch = "x86_64", allow(dead_code))]
-    Disjoint(&'io mut T, &'io T),
+    Disjoint(&'io mut T, &'io T, &'io T),
 }
diff --git a/src/arithmetic/montgomery.rs b/src/arithmetic/montgomery.rs
index b58b4ef090..be1c889697 100644
--- a/src/arithmetic/montgomery.rs
+++ b/src/arithmetic/montgomery.rs
@@ -1,4 +1,4 @@
-// Copyright 2017-2023 Brian Smith.
+// Copyright 2017-2025 Brian Smith.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -12,8 +12,10 @@
 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-pub use super::{n0::N0, InOut};
+pub use super::n0::N0;
+use super::{ffi, InOut};
 use crate::cpu;
+use cfg_if::cfg_if;
 
 // Indicates that the element is not encoded; there is no *R* factor
 // that needs to be canceled out.
@@ -113,25 +115,34 @@ impl ProductEncoding for (RRR, RInverse) {
 use crate::{bssl, c, limb::Limb};
 
 #[inline(always)]
-pub(super) fn limbs_mul_mont(ra: InOut<[Limb]>, b: &[Limb], n: &[Limb], n0: &N0, _: cpu::Features) {
-    // XXX/TODO: All the `debug_assert_eq!` length checking needs to be
-    // replaced with enforcement that happens regardless of debug mode.
-    let (r, a) = match ra {
-        InOut::InPlace(r) => {
-            debug_assert_eq!(r.len(), n.len());
-            (r.as_mut_ptr(), r.as_ptr())
+pub(super) fn limbs_mul_mont(in_out: InOut<[Limb]>, n: &[Limb], n0: &N0, cpu: cpu::Features) {
+    cfg_if! {
+        if #[cfg(target_arch = "x86_64")] {
+            use cpu::{GetFeature as _, intel::{Adx, Bmi2}};
+            const CHUNK_4X: usize = 4;
+            const MIN_CHUNKS_4X: usize = 2;
+
+            if n.len() % CHUNK_4X == 0 && n.len() >= CHUNK_4X * MIN_CHUNKS_4X {
+                if let Some(cpu) = cpu.get_feature() {
+                    bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe {
+                        ([Limb; CHUNK_4X], MIN_CHUNKS_4X, (Bmi2, Adx)) => bn_mulx4x_mont
+                    })
+                } else {
+                    bn_mul_mont_ffi!(in_out, n, n0, (), unsafe {
+                        ([Limb; CHUNK_4X], MIN_CHUNKS_4X, ()) => bn_mul4x_mont
+                    })
+                }
+            } else {
+                bn_mul_mont_ffi!(in_out, n, n0, (), unsafe {
+                    ([Limb; 1], ffi::BIGINT_MODULUS_MIN_LIMBS, ()) => bn_mul_mont_nohw
+                })
+            }
+        } else {
+            bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe {
+                ([Limb; 1], ffi::BIGINT_MODULUS_MIN_LIMBS, cpu::Features) => bn_mul_mont
+            });
         }
-        InOut::Disjoint(r, a) => {
-            debug_assert_eq!(r.len(), n.len());
-            debug_assert_eq!(a.len(), n.len());
-            (r.as_mut_ptr(), a.as_ptr())
-        }
-    };
-    debug_assert_eq!(b.len(), n.len());
-    let b = b.as_ptr();
-    let num_limbs = n.len();
-    let n = n.as_ptr();
-    unsafe { bn_mul_mont(r, a, b, n, n0, num_limbs) }
+    }
 }
 
 #[cfg(not(any(
@@ -240,31 +251,19 @@ prefixed_extern! {
     fn limbs_mul_add_limb(r: *mut Limb, a: *const Limb, b: Limb, num_limbs: c::size_t) -> Limb;
 }
 
-#[cfg(any(
-    all(target_arch = "aarch64", target_endian = "little"),
-    all(target_arch = "arm", target_endian = "little"),
-    target_arch = "x86_64",
-    target_arch = "x86"
-))]
-prefixed_extern! {
-    // `r` and/or 'a' and/or 'b' may alias.
-    fn bn_mul_mont(
-        r: *mut Limb,
-        a: *const Limb,
-        b: *const Limb,
-        n: *const Limb,
-        n0: &N0,
-        num_limbs: c::size_t,
-    );
-}
-
 /// r = r**2
-pub(super) fn limbs_square_mont(r: &mut [Limb], n: &[Limb], n0: &N0, _cpu: cpu::Features) {
-    debug_assert_eq!(r.len(), n.len());
-    let r = r.as_mut_ptr();
-    let num_limbs = n.len();
-    let n = n.as_ptr();
-    unsafe { bn_mul_mont(r, r, r, n, n0, num_limbs) }
+pub(super) fn limbs_square_mont(r: &mut [Limb], n: &[Limb], n0: &N0, cpu: cpu::Features) {
+    #[cfg(target_arch = "x86_64")]
+    {
+        use super::ffi;
+        use crate::polyfill::slice;
+        if let (n @ [_, ..], &[]) = slice::as_chunks(n) {
+            use cpu::GetFeature as _;
+            return ffi::bn_sqr8x_mont(r, n, n0, cpu.get_feature());
+        }
+    }
+
+    limbs_mul_mont(InOut::SquareInPlace(r), n, n0, cpu)
 }
 
 #[cfg(test)]
diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs
index 4d625fc5ee..dccf7bfe38 100644
--- a/src/cpu/intel.rs
+++ b/src/cpu/intel.rs
@@ -174,6 +174,8 @@ cfg_if! {
 impl_get_feature!{ MOVBE => Movbe }
 impl_get_feature!{ AVX => Avx }
 impl_get_feature!{ AVX2 => Avx2 }
+impl_get_feature!{ BMI2 => Bmi2 }
+impl_get_feature!{ ADX => Adx }
 impl_get_feature!{ SHA => Sha }
 
 impl Ssse3 {
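
The dispatch that this change moves out of the bn_mul_mont assembly prologue and into limbs_mul_mont/limbs_square_mont is easiest to audit as a pure function of the limb count and the CPU features. The sketch below is a standalone Rust model of that selection logic, written for this review and not taken from the patch; MulBackend, select_mul, and squaring_uses_sqr8x are illustrative names.

    // Model of the new x86_64 dispatch (assumed semantics; compare with the
    // limbs_mul_mont and limbs_square_mont hunks above).
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    enum MulBackend {
        Mulx4x, // bn_mulx4x_mont: requires BMI2 (for MULX) and ADX
        Mul4x,  // bn_mul4x_mont: 4-limb chunks, no special ISA extensions
        Nohw,   // bn_mul_mont_nohw: the generic kernel
    }

    const CHUNK_4X: usize = 4;
    const MIN_CHUNKS_4X: usize = 2;

    fn select_mul(num_limbs: usize, bmi2_adx: bool) -> MulBackend {
        // Mirrors: n.len() % CHUNK_4X == 0 && n.len() >= CHUNK_4X * MIN_CHUNKS_4X
        if num_limbs % CHUNK_4X == 0 && num_limbs >= CHUNK_4X * MIN_CHUNKS_4X {
            if bmi2_adx {
                MulBackend::Mulx4x
            } else {
                MulBackend::Mul4x
            }
        } else {
            MulBackend::Nohw
        }
    }

    // Squaring uses bn_sqr8x_mont exactly when the modulus splits into
    // [Limb; 8] chunks with nothing left over, i.e. when slice::as_chunks
    // yields non-empty chunks and an empty remainder.
    fn squaring_uses_sqr8x(num_limbs: usize) -> bool {
        num_limbs > 0 && num_limbs % 8 == 0
    }

    fn main() {
        // 4 limbs (a 256-bit modulus on 64-bit) is below the 4x kernels'
        // 8-limb minimum, so the generic kernel is used.
        assert_eq!(select_mul(4, true), MulBackend::Nohw);
        // 32 limbs (a 2048-bit modulus on 64-bit) takes the MULX/ADX path
        // when the CPU supports it, else the plain 4x path.
        assert_eq!(select_mul(32, true), MulBackend::Mulx4x);
        assert_eq!(select_mul(32, false), MulBackend::Mul4x);
        // 12 limbs is a multiple of 4 but not of 8: 4x multiplication is
        // available, but squaring falls back to InOut::SquareInPlace.
        assert_eq!(select_mul(12, false), MulBackend::Mul4x);
        assert!(!squaring_uses_sqr8x(12));
    }

Note that bn_sqr8x_mont itself is selected purely by operand shape: BMI2/ADX capability is passed to the assembly as the mulx_adx_capable argument rather than by picking a different entry point.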
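
For reviewers who want to sanity-check what these entry points compute: each performs a Montgomery multiplication, r = a * b * R^-1 (mod n), where R = 2^(LIMB_BITS * num_limbs) and N0 carries -n^-1 mod 2^LIMB_BITS. The following is a textbook, variable-time, 64-bit-only reference model, written for this review and not taken from ring or BoringSSL; the real kernels are constant-time, require at least 4 limbs, and promise full reduction only under the conditions documented in the internal.h comment above.

    /// Textbook word-by-word Montgomery multiplication (variable-time sketch).
    /// Computes a * b * R^-1 mod n for R = 2^(64 * n.len()), assuming n is
    /// odd, a < n, b < n, and n0 = -n^-1 mod 2^64.
    fn mul_mont_ref(a: &[u64], b: &[u64], n: &[u64], n0: u64) -> Vec<u64> {
        let num = n.len();
        assert_eq!(a.len(), num);
        assert_eq!(b.len(), num);
        let mut t = vec![0u64; num + 2]; // accumulator, always num + 2 words

        for i in 0..num {
            // t += a * b[i]
            let mut carry = 0u64;
            for j in 0..num {
                let v = t[j] as u128 + a[j] as u128 * b[i] as u128 + carry as u128;
                t[j] = v as u64;
                carry = (v >> 64) as u64;
            }
            let v = t[num] as u128 + carry as u128;
            t[num] = v as u64;
            t[num + 1] = (v >> 64) as u64;

            // One reduction step: choose m so the low word of t + m*n is
            // zero, add m*n, then divide by 2^64 (a one-word shift).
            let m = t[0].wrapping_mul(n0);
            let mut carry = 0u64;
            for j in 0..num {
                let v = t[j] as u128 + m as u128 * n[j] as u128 + carry as u128;
                t[j] = v as u64;
                carry = (v >> 64) as u64;
            }
            let v = t[num] as u128 + carry as u128;
            t[num] = v as u64;
            t[num + 1] += (v >> 64) as u64;
            debug_assert_eq!(t[0], 0);
            t.remove(0); // the one-word right shift
            t.push(0);
        }

        // Here t < 2n, so one conditional subtraction fully reduces it.
        // ("Almost Montgomery multiplication" variants relax this step.)
        if t[num] != 0 || !lt(&t[..num], n) {
            let mut borrow = 0u64;
            for j in 0..num {
                let (v, b1) = t[j].overflowing_sub(n[j]);
                let (v, b2) = v.overflowing_sub(borrow);
                t[j] = v;
                borrow = u64::from(b1 | b2);
            }
        }
        t.truncate(num);
        t
    }

    fn lt(a: &[u64], b: &[u64]) -> bool {
        for j in (0..a.len()).rev() {
            if a[j] != b[j] {
                return a[j] < b[j];
            }
        }
        false
    }

    fn main() {
        // One-limb demo modulus (the real entry points require >= 4 limbs).
        let n = [0xffff_ffff_ffff_fff1u64]; // odd
        // n0 = -n^-1 mod 2^64 via Newton's method: each step doubles the
        // number of correct low bits, so six steps reach 64 bits from 1.
        let mut inv = 1u64;
        for _ in 0..6 {
            inv = inv.wrapping_mul(2u64.wrapping_sub(n[0].wrapping_mul(inv)));
        }
        let n0 = inv.wrapping_neg();

        let a = [0x1234_5678_9abc_def0 % n[0]];
        let b = [0x0fed_cba9_8765_4321 % n[0]];
        let r = mul_mont_ref(&a, &b, &n, n0);
        // Defining property: r * 2^64 == a * b (mod n).
        let m = n[0] as u128;
        assert_eq!(((r[0] as u128) << 64) % m, (a[0] as u128 * b[0] as u128) % m);
    }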