Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

arithmetic: Dispatch x86_64 bn_mul_mont in Rust (Merge BoringSSL 7cb8df5) #2249

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -887,10 +887,14 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"bn_gather5",
"bn_mul_mont",
"bn_mul_mont_gather5",
"bn_mul_mont_nohw",
"bn_mul4x_mont",
"bn_mulx4x_mont",
"bn_neg_inv_mod_r_u64",
"bn_power5",
"bn_scatter5",
"bn_sqr8x_internal",
"bn_sqr8x_mont",
"bn_sqrx8x_internal",
"bsaes_ctr32_encrypt_blocks",
"bssl_constant_time_test_conditional_memcpy",
Expand Down
53 changes: 16 additions & 37 deletions crypto/fipsmodule/bn/asm/x86_64-mont.pl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
# output, so this isn't useful anyway.
$addx = 1;

# void bn_mul_mont(
# void bn_mul_mont_nohw(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
Expand All @@ -87,33 +87,15 @@

.extern OPENSSL_ia32cap_P

.globl bn_mul_mont
.type bn_mul_mont,\@function,6
.globl bn_mul_mont_nohw
.type bn_mul_mont_nohw,\@function,6
.align 16
bn_mul_mont:
bn_mul_mont_nohw:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
___
$code.=<<___ if ($addx);
leaq OPENSSL_ia32cap_P(%rip),%r11
mov 8(%r11),%r11d
___
$code.=<<___;
cmp $ap,$bp
jne .Lmul4x_enter
test \$7,${num}d
jz .Lsqr8x_enter
jmp .Lmul4x_enter

.align 16
.Lmul_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -348,27 +330,21 @@
.Lmul_epilogue:
ret
.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.globl bn_mul4x_mont
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
and \$0x80100,%r11d
cmp \$0x80100,%r11d
je .Lmulx4x_enter
___
$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -825,13 +801,15 @@
$code.=<<___;
.extern bn_sqr8x_internal # see x86_64-mont5 module

.globl bn_sqr8x_mont
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lsqr8x_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -1024,13 +1002,14 @@
my $bp="%rdx"; # original value

$code.=<<___;
.globl bn_mulx4x_mont
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
.cfi_startproc
_CET_ENDBR
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -1535,9 +1514,9 @@

.section .pdata
.align 4
.rva .LSEH_begin_bn_mul_mont
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
.rva .LSEH_begin_bn_mul_mont_nohw
.rva .LSEH_end_bn_mul_mont_nohw
.rva .LSEH_info_bn_mul_mont_nohw

.rva .LSEH_begin_bn_mul4x_mont
.rva .LSEH_end_bn_mul4x_mont
Expand All @@ -1555,7 +1534,7 @@
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.LSEH_info_bn_mul_mont_nohw:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
Expand Down
28 changes: 23 additions & 5 deletions crypto/fipsmodule/bn/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,22 +166,40 @@ typedef crypto_word_t BN_ULONG;
#endif


// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
//
// |num| must be at least 4, at least on x86.
//
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
// actually did the multiplication. All our implementations always do the
// multiplication, and forcing callers to deal with the possibility of it
// failing just leads to further problems.
//
// In other forks, |bn_mul_mont|'s |num| argument has type |int| but it is
// implicitly treated as a |size_t|; when |int| is smaller than |size_t|
// then the |movq 48(%rsp),%r9| done by x86_64-xlate.pl implicitly does the
// conversion.
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
(sizeof(int) == 4 && sizeof(size_t) == 8),
"int and size_t ABI mismatch");
#if !defined(OPENSSL_X86_64)
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#else
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif

static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
BN_ULONG a, BN_ULONG b) {
Expand Down
5 changes: 5 additions & 0 deletions src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#[macro_use]
mod ffi;

mod constant;

#[cfg(feature = "alloc")]
Expand All @@ -22,6 +25,8 @@ pub mod montgomery;

mod n0;

pub(crate) use self::ffi::BIGINT_MODULUS_MIN_LIMBS;

#[allow(dead_code)]
const BIGINT_MODULUS_MAX_LIMBS: usize = 8192 / crate::limb::LIMB_BITS;

Expand Down
9 changes: 3 additions & 6 deletions src/arithmetic/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,7 @@ fn from_montgomery_amm<M>(limbs: BoxedLimbs<M>, m: &Modulus<M>) -> Elem<M, Unenc
one[0] = 1;
let one = &one[..m.limbs().len()];
limbs_mul_mont(
InOut::InPlace(&mut limbs),
one,
InOut::InPlace(&mut limbs, one),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down Expand Up @@ -151,8 +150,7 @@ where
(AF, BF): ProductEncoding,
{
limbs_mul_mont(
InOut::InPlace(&mut b.limbs),
&a.limbs,
InOut::InPlace(&mut b.limbs, &a.limbs),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down Expand Up @@ -480,8 +478,7 @@ pub fn elem_exp_consttime<M>(
let src2 = entry(previous, src2, num_limbs);
let dst = entry_mut(rest, 0, num_limbs);
limbs_mul_mont(
InOut::Disjoint(dst, src1),
src2,
InOut::Disjoint(dst, src1, src2),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down
7 changes: 1 addition & 6 deletions src/arithmetic/bigint/modulus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,7 @@ use crate::{
};
use core::marker::PhantomData;

/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
/// limbs. For a long time we have required 4 limbs for all targets, though
/// this may be unnecessary. TODO: Replace this with
/// `n.len() < 256 / LIMB_BITS` so that 32-bit and 64-bit platforms behave the
/// same.
pub const MODULUS_MIN_LIMBS: usize = 4;
pub const MODULUS_MIN_LIMBS: usize = super::super::BIGINT_MODULUS_MIN_LIMBS;

pub const MODULUS_MAX_LIMBS: usize = super::super::BIGINT_MODULUS_MAX_LIMBS;

Expand Down
129 changes: 129 additions & 0 deletions src/arithmetic/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Copyright 2024-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use super::{n0::N0, InOut};
use crate::{c, limb::Limb};

/// The minimum number of limbs a modulus may have.
///
/// `bn_mul_mont_ffi` in this file asserts at compile time that this value is
/// at least 4; per the note there, the x86 implementation of `bn_mul_mont`,
/// at least, requires at least 4 limbs.
// TODO: Replace this with `n.len() < 256 / LIMB_BITS` so that 32-bit and
// 64-bit platforms behave the same.
pub(crate) const BIGINT_MODULUS_MIN_LIMBS: usize = 4;

/// Declares the external `bn_mul_mont`-style function `f` and calls it through
/// the checked wrapper `crate::arithmetic::ffi::bn_mul_mont_ffi`.
///
/// `unsafe { ([Limb; chunk_len], min_chunks, T) => f }` means it is safe to
/// call `f` if `n.len() >= (min_chunks * chunk_len) &&
/// n.len() % chunk_len == 0`, the slice(s) in `in_out` have the same length as
/// `n`, and we have constructed a value of type `T`.
///
/// The length conditions are not taken on trust: the wrapper function asserts
/// them at run time before `f` is called. What the caller is vouching for by
/// writing `unsafe { ... }` is that these conditions are *sufficient* for `f`.
// NOTE(review): the expansion names `N0` without importing it, so `N0`
// presumably has to be in scope at every call site -- confirm.
macro_rules! bn_mul_mont_ffi {
( $in_out:expr, $n:expr, $n0:expr, $cpu:expr,
unsafe { ([Limb; $CHUNK:expr], $MIN_CHUNKS:expr, $Cpu:ty) => $f:ident }) => {{
use crate::{c, limb::Limb};
prefixed_extern! {
// `r` and/or `a` and/or `b` may alias.
// XXX: BoringSSL declares these functions to return `int`.
fn $f(
r: *mut Limb,
a: *const Limb,
b: *const Limb,
n: *const Limb,
n0: &N0,
len: c::size_t,
);
}
// The wrapper's `MIN_LEN` const parameter is expressed in limbs, hence
// `$CHUNK * $MIN_CHUNKS`.
unsafe {
crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $CHUNK }, { $CHUNK * $MIN_CHUNKS }>(
$in_out, $n, $n0, $cpu, $f,
)
}
}};
}

/// Checks the operand lengths and then calls the external Montgomery
/// multiplication routine `f`.
///
/// The pointers passed to `f` are chosen according to `in_out`:
///
/// * `InOut::SquareInPlace(r)`: `r` is the output and both inputs.
/// * `InOut::InPlace(r, a)`: `r` is the output and the first input; `a` is the
///   second input.
/// * `InOut::Disjoint(r, a, b)`: `r` is the output; `a` and `b` are the inputs.
///
/// `n` and `n0` are forwarded to `f` unchanged, along with `n.len()` as the
/// limb count. `cpu` is not passed to `f`.
///
/// # Safety
///
/// `f` must be safe to call with an output pointer, two input pointers and a
/// modulus pointer that each reference `n.len()` limbs (where the output may
/// alias the inputs as described above), given that `n.len()` is a multiple of
/// `CHUNK`, `n.len() >= MIN_LEN`, and a value of type `Cpu` has been
/// constructed.
///
/// # Panics
///
/// Panics if `CHUNK == 0`, if `n.len()` is not a multiple of `CHUNK`, if
/// `MIN_LEN < BIGINT_MODULUS_MIN_LIMBS`, if `n.len() < MIN_LEN`, or if any
/// slice in `in_out` has a length different from `n.len()`.
#[inline]
pub(super) unsafe fn bn_mul_mont_ffi<Cpu, const CHUNK: usize, const MIN_LEN: usize>(
in_out: InOut<[Limb]>,
n: &[Limb],
n0: &N0,
cpu: Cpu,
f: unsafe extern "C" fn(
r: *mut Limb,
a: *const Limb,
b: *const Limb,
n: *const Limb,
n0: &N0,
len: c::size_t,
),
) {
/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
/// limbs. For a long time we have required 4 limbs for all targets, though
/// this may be unnecessary.
const _BIGINT_MODULUS_MIN_LIMBS_AT_LEAST_4: () = assert!(BIGINT_MODULUS_MIN_LIMBS >= 4);
// Length preconditions shared by every implementation dispatched through
// this function.
assert!(CHUNK > 0);
assert!(n.len() % CHUNK == 0);
assert!(MIN_LEN >= BIGINT_MODULUS_MIN_LIMBS);
assert!(n.len() >= MIN_LEN);

// Every operand must be exactly as long as the modulus, because `f` is
// given only a single length.
let (r, a, b) = match in_out {
InOut::SquareInPlace(r) => {
assert_eq!(r.len(), n.len());
(r.as_mut_ptr(), r.as_ptr(), r.as_ptr())
}
InOut::InPlace(r, a) => {
assert_eq!(r.len(), n.len());
assert_eq!(a.len(), n.len());
(r.as_mut_ptr(), r.as_ptr(), a.as_ptr())
}
InOut::Disjoint(r, a, b) => {
assert_eq!(r.len(), n.len());
assert_eq!(a.len(), n.len());
assert_eq!(b.len(), n.len());
(r.as_mut_ptr(), a.as_ptr(), b.as_ptr())
}
};
let num_limbs = n.len();
let n = n.as_ptr();
// The `Cpu` value is never read; it only has to exist.
// NOTE(review): presumably it is a witness that the CPU features `f` needs
// were detected -- confirm against the callers.
let _: Cpu = cpu;
// SAFETY: all pointers reference `num_limbs` limbs (asserted above), and
// the caller guarantees `f` is safe to call under the checked conditions.
unsafe { f(r, a, b, n, n0, num_limbs) };
}

// `bn_sqr8x_mont` has a weird signature so it has to be handled separately.
// Note that MULX is in BMI2.
//
// Calls the external `bn_sqr8x_mont` routine with `r` as both the output and
// the input, so `r` is overwritten in place.
//
// `n` is taken as a slice of 8-limb chunks. `mulx_adx` is forwarded to the
// routine as a flag limb built with `Limb::from(mulx_adx.is_some())`.
//
// Panics if `n` is empty, if `r.len()` differs from the length of the
// flattened `n`, or if the routine reports failure.
#[cfg(target_arch = "x86_64")]
pub(super) fn bn_sqr8x_mont(
r: &mut [Limb],
n: &[[Limb; 8]],
n0: &N0,
mulx_adx: Option<(crate::cpu::intel::Bmi2, crate::cpu::intel::Adx)>,
) {
use crate::{bssl, polyfill::slice};
prefixed_extern! {
// `rp` and `ap` may alias.
fn bn_sqr8x_mont(
rp: *mut Limb,
ap: *const Limb,
mulx_adx_capable: Limb,
np: *const Limb,
n0: &N0,
num: c::size_t) -> bssl::Result;
}
// At least one 8-limb chunk is required.
assert!(!n.is_empty());
// NOTE(review): `slice::flatten` presumably views the chunks as one
// contiguous `&[Limb]`; the length comparison below relies on that.
let n = slice::flatten(n);
assert_eq!(r.len(), n.len());

// The same buffer is handed over as both `rp` and `ap`.
let r_out = r.as_mut_ptr();
let r_in = r.as_ptr();
let mulx_adx_capable = Limb::from(mulx_adx.is_some());
let num = n.len();
let n = n.as_ptr();
// SAFETY (as far as this function can establish): `r` and `n` each
// reference `num` limbs and `num` is a non-zero multiple of 8, both checked
// above.
let r = unsafe { bn_sqr8x_mont(r_out, r_in, mulx_adx_capable, n, n0, num) };
// A failure from the routine is treated as a broken invariant, not as a
// recoverable error.
assert!(Result::from(r).is_ok());
}
5 changes: 3 additions & 2 deletions src/arithmetic/inout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

pub enum InOut<'io, T: ?Sized> {
InPlace(&'io mut T),
SquareInPlace(&'io mut T),
InPlace(&'io mut T, &'io T),
#[cfg_attr(target_arch = "x86_64", allow(dead_code))]
Disjoint(&'io mut T, &'io T),
Disjoint(&'io mut T, &'io T, &'io T),
}
Loading