Add bn_add_words and bn_sub_words assembly for aarch64. It is 2023 and compilers *still* cannot use carry flags effectively, particularly GCC. There are some Clang-specific built-ins which help x86_64 (where we have asm anyway) but, on aarch64, the built-ins actually *regress performance* over the current formulation! I suspect Clang is getting confused by Arm and Intel having opposite borrow flags. https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins Just include aarch64 assembly to avoid this. This provides a noticeable perf boost in code that uses these functions (Where bn_mul_mont is available, they're not used much in RSA, but the generic EC implementation does modular additions, and RSA private key checking spends a lot of time in our add/sub-based bn_div_consttime.) The new code is also smaller than the generic one (18 instructions each), probably because it avoids all the flag spills and only tries to unroll by two iterations. Before: Did 7137 RSA 2048 signing operations in 4022094us (1774.4 ops/sec) Did 326000 RSA 2048 verify (same key) operations in 4001828us (81462.8 ops/sec) Did 278000 RSA 2048 verify (fresh key) operations in 4001392us (69475.8 ops/sec) Did 34830 RSA 2048 private key parse operations in 4038893us (8623.7 ops/sec) Did 1196 RSA 4096 signing operations in 4015759us (297.8 ops/sec) Did 90000 RSA 4096 verify (same key) operations in 4041959us (22266.4 ops/sec) Did 79000 RSA 4096 verify (fresh key) operations in 4034561us (19580.8 ops/sec) Did 12222 RSA 4096 private key parse operations in 4004831us (3051.8 ops/sec) Did 10626 ECDSA P-384 signing operations in 4030764us (2636.2 ops/sec) Did 10800 ECDSA P-384 verify operations in 4052718us (2664.9 ops/sec) Did 4182 ECDSA P-521 signing operations in 4076198us (1026.0 ops/sec) Did 4059 ECDSA P-521 verify operations in 4063819us (998.8 ops/sec) After: Did 7189 RSA 2048 signing operations in 4021331us (1787.7 ops/sec) [+0.7%] Did 326000 RSA 2048 verify (same key) 
operations in 4010811us (81280.3 ops/sec) [-0.2%] Did 278000 RSA 2048 verify (fresh key) operations in 4004206us (69427.0 ops/sec) [-0.1%] Did 53040 RSA 2048 private key parse operations in 4050953us (13093.2 ops/sec) [+51.8%] Did 1200 RSA 4096 signing operations in 4035548us (297.4 ops/sec) [-0.2%] Did 90000 RSA 4096 verify (same key) operations in 4035686us (22301.0 ops/sec) [+0.2%] Did 80000 RSA 4096 verify (fresh key) operations in 4020989us (19895.6 ops/sec) [+1.6%] Did 20468 RSA 4096 private key parse operations in 4037474us (5069.5 ops/sec) [+66.1%] Did 11070 ECDSA P-384 signing operations in 4023595us (2751.3 ops/sec) [+4.4%] Did 11232 ECDSA P-384 verify operations in 4063116us (2764.4 ops/sec) [+3.7%] Did 4387 ECDSA P-521 signing operations in 4052728us (1082.5 ops/sec) [+5.5%] Did 4305 ECDSA P-521 verify operations in 4064660us (1059.1 ops/sec) [+6.0%] Change-Id: If2f739373cdd10fa1d4925d5e2725e87d2255fc0 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56966 Reviewed-by: Bob Beck <bbe@google.com> Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 2bfadab..66fd448 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt
@@ -3,6 +3,7 @@ perlasm(BCM_SOURCES aarch64 aesv8-armv8 aes/asm/aesv8-armx.pl) perlasm(BCM_SOURCES aarch64 aesv8-gcm-armv8 modes/asm/aesv8-gcm-armv8.pl) perlasm(BCM_SOURCES aarch64 armv8-mont bn/asm/armv8-mont.pl) +perlasm(BCM_SOURCES aarch64 bn-armv8 bn/asm/bn-armv8.pl) perlasm(BCM_SOURCES aarch64 ghash-neon-armv8 modes/asm/ghash-neon-armv8.pl) perlasm(BCM_SOURCES aarch64 ghashv8-armv8 modes/asm/ghashv8-armx.pl) perlasm(BCM_SOURCES aarch64 p256_beeu-armv8-asm ec/asm/p256_beeu-armv8-asm.pl)
diff --git a/crypto/fipsmodule/bn/asm/bn-armv8.pl b/crypto/fipsmodule/bn/asm/bn-armv8.pl new file mode 100755 index 0000000..5aed8df --- /dev/null +++ b/crypto/fipsmodule/bn/asm/bn-armv8.pl
@@ -0,0 +1,118 @@
+#!/usr/bin/env perl
+# Copyright (c) 2023, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use strict;
+
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT = *OUT;
+
+my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
+my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");
+my $code = <<____;
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type bn_add_words, %function
+.globl bn_add_words
+.align 4
+bn_add_words:
+	AARCH64_VALID_CALL_TARGET
+	# Clear the carry flag.
+	cmn xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr $num_pairs, $num, #1
+	and $num, $num, #1
+
+	cbz $num_pairs, .Ladd_tail
+.Ladd_loop:
+	ldp $a0, $a1, [$ap], #16
+	ldp $b0, $b1, [$bp], #16
+	sub $num_pairs, $num_pairs, #1
+	adcs $a0, $a0, $b0
+	adcs $a1, $a1, $b1
+	stp $a0, $a1, [$rp], #16
+	cbnz $num_pairs, .Ladd_loop
+
+.Ladd_tail:
+	cbz $num, .Ladd_exit
+	ldr $a0, [$ap], #8
+	ldr $b0, [$bp], #8
+	adcs $a0, $a0, $b0
+	str $a0, [$rp], #8
+
+.Ladd_exit:
+	cset x0, cs
+	ret
+.size bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type bn_sub_words, %function
+.globl bn_sub_words
+.align 4
+bn_sub_words:
+	AARCH64_VALID_CALL_TARGET
+	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+	# so we want C = 1 here.
+	cmp xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr $num_pairs, $num, #1
+	and $num, $num, #1
+
+	cbz $num_pairs, .Lsub_tail
+.Lsub_loop:
+	ldp $a0, $a1, [$ap], #16
+	ldp $b0, $b1, [$bp], #16
+	sub $num_pairs, $num_pairs, #1
+	sbcs $a0, $a0, $b0
+	sbcs $a1, $a1, $b1
+	stp $a0, $a1, [$rp], #16
+	cbnz $num_pairs, .Lsub_loop
+
+.Lsub_tail:
+	cbz $num, .Lsub_exit
+	ldr $a0, [$ap], #8
+	ldr $b0, [$bp], #8
+	sbcs $a0, $a0, $b0
+	str $a0, [$rp], #8
+
+.Lsub_exit:
+	cset x0, cc
+	ret
+.size bn_sub_words,.-bn_sub_words
+____
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c index 628cc53..df4a834 100644 --- a/crypto/fipsmodule/bn/generic.c +++ b/crypto/fipsmodule/bn/generic.c
@@ -74,6 +74,11 @@ #define BN_MUL_ASM #endif +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) +// See asm/bn-armv8.pl. +#define BN_ADD_ASM +#endif + #if !defined(BN_MUL_ASM) #ifdef BN_ULLONG