Add bn_add_words and bn_sub_words assembly for aarch64. It is 2023 and compilers *still* cannot use carry flags effectively, particularly GCC. There are some Clang-specific built-ins which help x86_64 (where we have asm anyway) but, on aarch64, the built-ins actually *regress performance* over the current formulation! I suspect Clang is getting confused by Arm and Intel having opposite borrow flags. https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins Just include aarch64 assembly to avoid this. This provides a noticeable perf boost in code that uses these functions (Where bn_mul_mont is available, they're not used much in RSA, but the generic EC implementation does modular additions, and RSA private key checking spends a lot of time in our add/sub-based bn_div_consttime.) The new code is also smaller than the generic one (18 instructions each), probably because it avoids all the flag spills and only tries to unroll by two iterations. Before: Did 7137 RSA 2048 signing operations in 4022094us (1774.4 ops/sec) Did 326000 RSA 2048 verify (same key) operations in 4001828us (81462.8 ops/sec) Did 278000 RSA 2048 verify (fresh key) operations in 4001392us (69475.8 ops/sec) Did 34830 RSA 2048 private key parse operations in 4038893us (8623.7 ops/sec) Did 1196 RSA 4096 signing operations in 4015759us (297.8 ops/sec) Did 90000 RSA 4096 verify (same key) operations in 4041959us (22266.4 ops/sec) Did 79000 RSA 4096 verify (fresh key) operations in 4034561us (19580.8 ops/sec) Did 12222 RSA 4096 private key parse operations in 4004831us (3051.8 ops/sec) Did 10626 ECDSA P-384 signing operations in 4030764us (2636.2 ops/sec) Did 10800 ECDSA P-384 verify operations in 4052718us (2664.9 ops/sec) Did 4182 ECDSA P-521 signing operations in 4076198us (1026.0 ops/sec) Did 4059 ECDSA P-521 verify operations in 4063819us (998.8 ops/sec) After: Did 7189 RSA 2048 signing operations in 4021331us (1787.7 ops/sec) [+0.7%] Did 326000 RSA 2048 verify (same key) 
operations in 4010811us (81280.3 ops/sec) [-0.2%] Did 278000 RSA 2048 verify (fresh key) operations in 4004206us (69427.0 ops/sec) [-0.1%] Did 53040 RSA 2048 private key parse operations in 4050953us (13093.2 ops/sec) [+51.8%] Did 1200 RSA 4096 signing operations in 4035548us (297.4 ops/sec) [-0.2%] Did 90000 RSA 4096 verify (same key) operations in 4035686us (22301.0 ops/sec) [+0.2%] Did 80000 RSA 4096 verify (fresh key) operations in 4020989us (19895.6 ops/sec) [+1.6%] Did 20468 RSA 4096 private key parse operations in 4037474us (5069.5 ops/sec) [+66.1%] Did 11070 ECDSA P-384 signing operations in 4023595us (2751.3 ops/sec) [+4.4%] Did 11232 ECDSA P-384 verify operations in 4063116us (2764.4 ops/sec) [+3.7%] Did 4387 ECDSA P-521 signing operations in 4052728us (1082.5 ops/sec) [+5.5%] Did 4305 ECDSA P-521 verify operations in 4064660us (1059.1 ops/sec) [+6.0%] Change-Id: If2f739373cdd10fa1d4925d5e2725e87d2255fc0 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56966 Reviewed-by: Bob Beck <bbe@google.com> Commit-Queue: David Benjamin <davidben@google.com>

commit: d1b451676eada2f2dcad9a20debf8b76fa17f403 [log] [tgz]
author: David Benjamin <davidben@google.com> Thu Feb 02 14:50:36 2023 -0500
committer: Boringssl LUCI CQ <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com> Mon Feb 06 21:10:41 2023 +0000
tree: 1d80d9a83419b57507e65f985edc5934d3eddb24
parent: 3a16df9aa055b8e330bc1fa2e09e0be8ee404a94 [diff]
diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt
index 2bfadab..66fd448 100644
--- a/crypto/fipsmodule/CMakeLists.txt
+++ b/crypto/fipsmodule/CMakeLists.txt

@@ -3,6 +3,7 @@
 perlasm(BCM_SOURCES aarch64 aesv8-armv8 aes/asm/aesv8-armx.pl)
 perlasm(BCM_SOURCES aarch64 aesv8-gcm-armv8 modes/asm/aesv8-gcm-armv8.pl)
 perlasm(BCM_SOURCES aarch64 armv8-mont bn/asm/armv8-mont.pl)
+perlasm(BCM_SOURCES aarch64 bn-armv8 bn/asm/bn-armv8.pl)
 perlasm(BCM_SOURCES aarch64 ghash-neon-armv8 modes/asm/ghash-neon-armv8.pl)
 perlasm(BCM_SOURCES aarch64 ghashv8-armv8 modes/asm/ghashv8-armx.pl)
 perlasm(BCM_SOURCES aarch64 p256_beeu-armv8-asm ec/asm/p256_beeu-armv8-asm.pl)

diff --git a/crypto/fipsmodule/bn/asm/bn-armv8.pl b/crypto/fipsmodule/bn/asm/bn-armv8.pl
new file mode 100755
index 0000000..5aed8df
--- /dev/null
+++ b/crypto/fipsmodule/bn/asm/bn-armv8.pl

@@ -0,0 +1,118 @@
+#!/usr/bin/env perl
+# Copyright (c) 2023, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use strict;
+
+my $flavour = shift;  # Assembler flavour forwarded to arm-xlate.pl.
+my $output  = shift;  # Output file name forwarded to arm-xlate.pl.
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }  # A first argument containing "." is the output file.
+# Look for arm-xlate.pl beside this script, then in ../../../perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+my $dir = $1;
+my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+# Send everything printed to STDOUT through arm-xlate.pl; fail if the pipe cannot be started.
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT = *OUT;
+# Register names: the four function arguments, then scratch registers used by the loops.
+my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
+my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");
+my $code = <<____;
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type	bn_add_words, %function
+.globl	bn_add_words
+.align	4
+bn_add_words:
+	AARCH64_VALID_CALL_TARGET
+	# Clear the carry flag: 0 + 0 cannot carry, so the first ADCS has no carry-in.
+	cmn	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	$num_pairs, $num, #1
+	and	$num, $num, #1
+
+	cbz	$num_pairs, .Ladd_tail
+.Ladd_loop:
+	ldp	$a0, $a1, [$ap], #16
+	ldp	$b0, $b1, [$bp], #16
+	sub	$num_pairs, $num_pairs, #1
+	adcs	$a0, $a0, $b0
+	adcs	$a1, $a1, $b1
+	stp	$a0, $a1, [$rp], #16
+	cbnz	$num_pairs, .Ladd_loop
+
+.Ladd_tail:
+	cbz	$num, .Ladd_exit
+	ldr	$a0, [$ap], #8
+	ldr	$b0, [$bp], #8
+	adcs	$a0, $a0, $b0
+	str	$a0, [$rp], #8
+
+.Ladd_exit:
+	cset	x0, cs
+	ret
+.size	bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+//                       size_t num);
+.type	bn_sub_words, %function
+.globl	bn_sub_words
+.align	4
+bn_sub_words:
+	AARCH64_VALID_CALL_TARGET
+	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+	# so we want C = 1 here (0 - 0 does not borrow).
+	cmp	xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+	# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
+	# operations to use CBNZ without clobbering the carry flag.
+	lsr	$num_pairs, $num, #1
+	and	$num, $num, #1
+
+	cbz	$num_pairs, .Lsub_tail
+.Lsub_loop:
+	ldp	$a0, $a1, [$ap], #16
+	ldp	$b0, $b1, [$bp], #16
+	sub	$num_pairs, $num_pairs, #1
+	sbcs	$a0, $a0, $b0
+	sbcs	$a1, $a1, $b1
+	stp	$a0, $a1, [$rp], #16
+	cbnz	$num_pairs, .Lsub_loop
+
+.Lsub_tail:
+	cbz	$num, .Lsub_exit
+	ldr	$a0, [$ap], #8
+	ldr	$b0, [$bp], #8
+	sbcs	$a0, $a0, $b0
+	str	$a0, [$rp], #8
+
+.Lsub_exit:
+	cset	x0, cc
+	ret
+.size	bn_sub_words,.-bn_sub_words
+____
+
+print $code;  # STDOUT was aliased to the arm-xlate.pl pipe, so this feeds the translator.
+close STDOUT or die "error closing STDOUT: $!";  # Closing the pipe surfaces any failure to deliver the output.

diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index 628cc53..df4a834 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c

@@ -74,6 +74,11 @@
 #define BN_MUL_ASM
 #endif
 
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+// See asm/bn-armv8.pl.
+#define BN_ADD_ASM
+#endif
+
 #if !defined(BN_MUL_ASM)
 
 #ifdef BN_ULLONG
commit	d1b451676eada2f2dcad9a20debf8b76fa17f403	[log] [tgz]
author	David Benjamin <davidben@google.com>	Thu Feb 02 14:50:36 2023 -0500
committer	Boringssl LUCI CQ <boringssl-scoped@luci-project-accounts.iam.gserviceaccount.com>	Mon Feb 06 21:10:41 2023 +0000
tree	1d80d9a83419b57507e65f985edc5934d3eddb24
parent	3a16df9aa055b8e330bc1fa2e09e0be8ee404a94 [diff]