Add CRYPTO_{addc,subc}_* functions to crypto/internal.h
I'm getting tired of having to rederive the best way to convince the
compiler to emit addc and subb functions. Do it once and use the Clang
builtins when available, because compilers seem to generally be terrible
at this. (See https://github.com/llvm/llvm-project/issues/73847.)
The immediate trigger was the FIPS 186-2 PRF, which completely doesn't
matter, but reminded me of this mess.
As far as naming and calling conventions go, I just mimicked the Clang
ones. In doing so, also use the Clang builtins when available, which
helps Clang x86_64 no-asm builds a bit:
Before:
Did 704 ECDH P-384 operations in 1018920us (690.9 ops/sec)
Did 1353 ECDSA P-384 signing operations in 1077927us (1255.2 ops/sec)
Did 1190 ECDSA P-384 verify operations in 1020788us (1165.8 ops/sec)
Did 784 RSA 2048 signing operations in 1058644us (740.6 ops/sec)
Did 34000 RSA 2048 verify (same key) operations in 1011854us (33601.7 ops/sec)
Did 30000 RSA 2048 verify (fresh key) operations in 1005974us (29821.8 ops/sec)
Did 7799 RSA 2048 private key parse operations in 1061203us (7349.2 ops/sec)
Did 130 RSA 4096 signing operations in 1082617us (120.1 ops/sec)
Did 10472 RSA 4096 verify (same key) operations in 1082857us (9670.7 ops/sec)
Did 9086 RSA 4096 verify (fresh key) operations in 1039164us (8743.6 ops/sec)
Did 2574 RSA 4096 private key parse operations in 1078946us (2385.7 ops/sec)
After:
Did 775 ECDH P-384 operations in 1008465us (768.5 ops/sec)
Did 1474 ECDSA P-384 signing operations in 1062096us (1387.8 ops/sec)
Did 1485 ECDSA P-384 verify operations in 1086574us (1366.7 ops/sec)
Did 812 RSA 2048 signing operations in 1043705us (778.0 ops/sec)
Did 36000 RSA 2048 verify (same key) operations in 1005643us (35798.0 ops/sec)
Did 33000 RSA 2048 verify (fresh key) operations in 1028256us (32093.2 ops/sec)
Did 10087 RSA 2048 private key parse operations in 1018067us (9908.0 ops/sec)
Did 132 RSA 4096 signing operations in 1033049us (127.8 ops/sec)
Did 11000 RSA 4096 verify (same key) operations in 1070502us (10275.6 ops/sec)
Did 9812 RSA 4096 verify (fresh key) operations in 1047618us (9366.0 ops/sec)
Did 3245 RSA 4096 private key parse operations in 1083247us (2995.6 ops/sec)
But this is also a no-asm build, so we don't really care. Builds with
assembly, broadly, do not use these codepaths. The exception is the
generic ECC code on 32-bit Arm, which has a few mod-add functions, and
we don't have 32-bit Arm bn_add_words assembly:
Before:
Did 168 ECDH P-384 operations in 1003229us (167.5 ops/sec)
Did 330 ECDSA P-384 signing operations in 1076600us (306.5 ops/sec)
Did 319 ECDSA P-384 verify operations in 1080750us (295.2 ops/sec)
After:
Did 195 ECDH P-384 operations in 1026458us (190.0 ops/sec)
Did 350 ECDSA P-384 signing operations in 1005392us (348.1 ops/sec)
Did 341 ECDSA P-384 verify operations in 1008486us (338.1 ops/sec)
Change-Id: Ia3fa51e59398224b9c39180e1d856bb412aa1246
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64309
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/add.c b/crypto/fipsmodule/bn/add.c
index 38a8450..1a8785e 100644
--- a/crypto/fipsmodule/bn/add.c
+++ b/crypto/fipsmodule/bn/add.c
@@ -117,10 +117,7 @@
BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min);
for (int i = min; i < max; i++) {
- // |r| and |a| may alias, so use a temporary.
- BN_ULONG tmp = carry + a->d[i];
- carry = tmp < a->d[i];
- r->d[i] = tmp;
+ r->d[i] = CRYPTO_addc_w(a->d[i], 0, carry, &carry);
}
r->d[max] = carry;
@@ -241,10 +238,7 @@
BN_ULONG borrow = bn_sub_words(r->d, a->d, b->d, b_width);
for (int i = b_width; i < a->width; i++) {
- // |r| and |a| may alias, so use a temporary.
- BN_ULONG tmp = a->d[i];
- r->d[i] = a->d[i] - borrow;
- borrow = tmp < r->d[i];
+ r->d[i] = CRYPTO_subc_w(a->d[i], 0, borrow, &borrow);
}
if (borrow) {
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index e4f4518..247398f 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c
@@ -567,37 +567,6 @@
#if !defined(BN_ADD_ASM)
-// bn_add_with_carry returns |x + y + carry|, and sets |*out_carry| to the
-// carry bit. |carry| must be zero or one.
-static inline BN_ULONG bn_add_with_carry(BN_ULONG x, BN_ULONG y, BN_ULONG carry,
- BN_ULONG *out_carry) {
- assert(carry == 0 || carry == 1);
-#if defined(BN_ULLONG)
- BN_ULLONG ret = carry;
- ret += (BN_ULLONG)x + y;
- *out_carry = (BN_ULONG)(ret >> BN_BITS2);
- return (BN_ULONG)ret;
-#else
- x += carry;
- carry = x < carry;
- BN_ULONG ret = x + y;
- carry += ret < x;
- *out_carry = carry;
- return ret;
-#endif
-}
-
-// bn_sub_with_borrow returns |x - y - borrow|, and sets |*out_borrow| to the
-// borrow bit. |borrow| must be zero or one.
-static inline BN_ULONG bn_sub_with_borrow(BN_ULONG x, BN_ULONG y,
- BN_ULONG borrow,
- BN_ULONG *out_borrow) {
- assert(borrow == 0 || borrow == 1);
- BN_ULONG ret = x - y - borrow;
- *out_borrow = (x < y) | ((x == y) & borrow);
- return ret;
-}
-
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
size_t n) {
if (n == 0) {
@@ -606,17 +575,17 @@
BN_ULONG carry = 0;
while (n & ~3) {
- r[0] = bn_add_with_carry(a[0], b[0], carry, &carry);
- r[1] = bn_add_with_carry(a[1], b[1], carry, &carry);
- r[2] = bn_add_with_carry(a[2], b[2], carry, &carry);
- r[3] = bn_add_with_carry(a[3], b[3], carry, &carry);
+ r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
+ r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry);
+ r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry);
+ r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry);
a += 4;
b += 4;
r += 4;
n -= 4;
}
while (n) {
- r[0] = bn_add_with_carry(a[0], b[0], carry, &carry);
+ r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
a++;
b++;
r++;
@@ -633,17 +602,17 @@
BN_ULONG borrow = 0;
while (n & ~3) {
- r[0] = bn_sub_with_borrow(a[0], b[0], borrow, &borrow);
- r[1] = bn_sub_with_borrow(a[1], b[1], borrow, &borrow);
- r[2] = bn_sub_with_borrow(a[2], b[2], borrow, &borrow);
- r[3] = bn_sub_with_borrow(a[3], b[3], borrow, &borrow);
+ r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
+ r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow);
+ r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow);
+ r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow);
a += 4;
b += 4;
r += 4;
n -= 4;
}
while (n) {
- r[0] = bn_sub_with_borrow(a[0], b[0], borrow, &borrow);
+ r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
a++;
b++;
r++;
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index fe4e4d7..7537899 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -143,17 +143,13 @@
// in |a| were zeros.
dl = -dl;
for (int i = 0; i < dl; i++) {
- r[i] = 0u - b[i] - borrow;
- borrow |= r[i] != 0;
+ r[i] = CRYPTO_subc_w(0, b[i], borrow, &borrow);
}
} else {
// |b| is shorter than |a|. Complete the subtraction as if the excess words
// in |b| were zeros.
for (int i = 0; i < dl; i++) {
- // |r| and |a| may alias, so use a temporary.
- BN_ULONG tmp = a[i];
- r[i] = a[i] - borrow;
- borrow = tmp < r[i];
+ r[i] = CRYPTO_subc_w(a[i], 0, borrow, &borrow);
}
}
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 35dda2f..4baeed6 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -134,12 +134,11 @@
SHA1_Transform(&ctx, block);
// XKEY = (1 + XKEY + w_i) mod 2^b
- uint64_t carry = 1;
+ uint32_t carry = 1;
for (int i = 4; i >= 0; i--) {
- carry += CRYPTO_load_u32_be(block + i * 4);
- carry += ctx.h[i];
- CRYPTO_store_u32_be(block + i * 4, (uint32_t)carry);
- carry >>= 32;
+ uint32_t tmp = CRYPTO_load_u32_be(block + i * 4);
+ tmp = CRYPTO_addc_u32(tmp, ctx.h[i], carry, &carry);
+ CRYPTO_store_u32_be(block + i * 4, tmp);
}
// Output w_i.
diff --git a/crypto/internal.h b/crypto/internal.h
index 4de4597..f2db41c 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -259,6 +259,12 @@
OPENSSL_INLINE void OPENSSL_reset_malloc_counter_for_testing(void) {}
#endif
+#if defined(__has_builtin)
+#define OPENSSL_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define OPENSSL_HAS_BUILTIN(x) 0
+#endif
+
// Pointer utility functions.
@@ -1134,6 +1140,110 @@
}
+// Arithmetic functions.
+
+// CRYPTO_addc_* returns |x + y + carry|, and sets |*out_carry| to the carry
+// bit. |carry| must be zero or one.
+#if OPENSSL_HAS_BUILTIN(__builtin_addc)
+
+#define CRYPTO_GENERIC_ADDC(x, y, carry, out_carry) \
+ (_Generic((x), \
+ unsigned: __builtin_addc, \
+ unsigned long: __builtin_addcl, \
+ unsigned long long: __builtin_addcll))((x), (y), (carry), (out_carry))
+
+static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry,
+ uint32_t *out_carry) {
+ assert(carry <= 1);
+ return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry);
+}
+
+static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry,
+ uint64_t *out_carry) {
+ assert(carry <= 1);
+ return CRYPTO_GENERIC_ADDC(x, y, carry, out_carry);
+}
+
+#else
+
+static inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry,
+ uint32_t *out_carry) {
+ assert(carry <= 1);
+ uint64_t ret = carry;
+ ret += (uint64_t)x + y;
+ *out_carry = (uint32_t)(ret >> 32);
+ return (uint32_t)ret;
+}
+
+static inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry,
+ uint64_t *out_carry) {
+ assert(carry <= 1);
+#if defined(BORINGSSL_HAS_UINT128)
+ uint128_t ret = carry;
+ ret += (uint128_t)x + y;
+ *out_carry = (uint64_t)(ret >> 64);
+ return (uint64_t)ret;
+#else
+ x += carry;
+ carry = x < carry;
+ uint64_t ret = x + y;
+ carry += ret < x;
+ *out_carry = carry;
+ return ret;
+#endif
+}
+#endif
+
+// CRYPTO_subc_* returns |x - y - borrow|, and sets |*out_borrow| to the borrow
+// bit. |borrow| must be zero or one.
+#if OPENSSL_HAS_BUILTIN(__builtin_subc)
+
+#define CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow) \
+ (_Generic((x), \
+ unsigned: __builtin_subc, \
+ unsigned long: __builtin_subcl, \
+ unsigned long long: __builtin_subcll))((x), (y), (borrow), (out_borrow))
+
+static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow,
+ uint32_t *out_borrow) {
+ assert(borrow <= 1);
+ return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow);
+}
+
+static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow,
+ uint64_t *out_borrow) {
+ assert(borrow <= 1);
+ return CRYPTO_GENERIC_SUBC(x, y, borrow, out_borrow);
+}
+
+#else
+
+static inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow,
+ uint32_t *out_borrow) {
+ assert(borrow <= 1);
+ uint32_t ret = x - y - borrow;
+ *out_borrow = (x < y) | ((x == y) & borrow);
+ return ret;
+}
+
+static inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow,
+ uint64_t *out_borrow) {
+ assert(borrow <= 1);
+ uint64_t ret = x - y - borrow;
+ *out_borrow = (x < y) | ((x == y) & borrow);
+ return ret;
+}
+#endif
+
+#if defined(OPENSSL_64_BIT)
+#define CRYPTO_addc_w CRYPTO_addc_u64
+#define CRYPTO_subc_w CRYPTO_subc_u64
+#else
+#define CRYPTO_addc_w CRYPTO_addc_u32
+#define CRYPTO_subc_w CRYPTO_subc_u32
+#endif
+
+
// FIPS functions.
#if defined(BORINGSSL_FIPS)