Unify the two copies of bn_add_words and bn_sub_words

Compilers are fine at inlining functions nowadays. We can hide the
BN_ULLONG vs. manual carry extraction inside an inline function. I've
patterned the type signatures intentionally after Clang's builtins, in
case we want to use them in the future.

(Previously I wrote in
https://boringssl-review.googlesource.com/c/boringssl/+/56966 that the
builtins weren't good on aarch64. This wasn't quite right. Rather, they
were bad on both x86_64 and aarch64 in LLVM 13, but they're fine on both
in LLVM 14. My machine's Xcode was just a little old.)

Change-Id: I666466dce7a146d5e49e94ff372ea018b610ef34
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/57245
Commit-Queue: David Benjamin <davidben@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index bf4971f..e4f4518 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c
@@ -567,145 +567,89 @@
 
 #if !defined(BN_ADD_ASM)
 
-#ifdef BN_ULLONG
+// bn_add_with_carry returns |x + y + carry|, and sets |*out_carry| to the
+// carry bit. |carry| must be zero or one.
+static inline BN_ULONG bn_add_with_carry(BN_ULONG x, BN_ULONG y, BN_ULONG carry,
+                                         BN_ULONG *out_carry) {
+  assert(carry == 0 || carry == 1);
+#if defined(BN_ULLONG)
+  BN_ULLONG ret = carry;
+  ret += (BN_ULLONG)x + y;
+  *out_carry = (BN_ULONG)(ret >> BN_BITS2);
+  return (BN_ULONG)ret;
+#else
+  x += carry;
+  carry = x < carry;
+  BN_ULONG ret = x + y;
+  carry += ret < x;
+  *out_carry = carry;
+  return ret;
+#endif
+}
+
+// bn_sub_with_borrow returns |x - y - borrow|, and sets |*out_borrow| to the
+// borrow bit. |borrow| must be zero or one.
+static inline BN_ULONG bn_sub_with_borrow(BN_ULONG x, BN_ULONG y,
+                                          BN_ULONG borrow,
+                                          BN_ULONG *out_borrow) {
+  assert(borrow == 0 || borrow == 1);
+  BN_ULONG ret = x - y - borrow;
+  *out_borrow = (x < y) | ((x == y) & borrow);
+  return ret;
+}
+
 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                       size_t n) {
-  BN_ULLONG ll = 0;
-
   if (n == 0) {
     return 0;
   }
 
+  BN_ULONG carry = 0;
   while (n & ~3) {
-    ll += (BN_ULLONG)a[0] + b[0];
-    r[0] = (BN_ULONG)ll;
-    ll >>= BN_BITS2;
-    ll += (BN_ULLONG)a[1] + b[1];
-    r[1] = (BN_ULONG)ll;
-    ll >>= BN_BITS2;
-    ll += (BN_ULLONG)a[2] + b[2];
-    r[2] = (BN_ULONG)ll;
-    ll >>= BN_BITS2;
-    ll += (BN_ULLONG)a[3] + b[3];
-    r[3] = (BN_ULONG)ll;
-    ll >>= BN_BITS2;
+    r[0] = bn_add_with_carry(a[0], b[0], carry, &carry);
+    r[1] = bn_add_with_carry(a[1], b[1], carry, &carry);
+    r[2] = bn_add_with_carry(a[2], b[2], carry, &carry);
+    r[3] = bn_add_with_carry(a[3], b[3], carry, &carry);
     a += 4;
     b += 4;
     r += 4;
     n -= 4;
   }
   while (n) {
-    ll += (BN_ULLONG)a[0] + b[0];
-    r[0] = (BN_ULONG)ll;
-    ll >>= BN_BITS2;
+    r[0] = bn_add_with_carry(a[0], b[0], carry, &carry);
     a++;
     b++;
     r++;
     n--;
   }
-  return (BN_ULONG)ll;
+  return carry;
 }
 
-#else  // !BN_ULLONG
-
-BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
-                      size_t n) {
-  BN_ULONG c, l, t;
-
-  if (n == 0) {
-    return (BN_ULONG)0;
-  }
-
-  c = 0;
-  while (n & ~3) {
-    t = a[0];
-    t += c;
-    c = (t < c);
-    l = t + b[0];
-    c += (l < t);
-    r[0] = l;
-    t = a[1];
-    t += c;
-    c = (t < c);
-    l = t + b[1];
-    c += (l < t);
-    r[1] = l;
-    t = a[2];
-    t += c;
-    c = (t < c);
-    l = t + b[2];
-    c += (l < t);
-    r[2] = l;
-    t = a[3];
-    t += c;
-    c = (t < c);
-    l = t + b[3];
-    c += (l < t);
-    r[3] = l;
-    a += 4;
-    b += 4;
-    r += 4;
-    n -= 4;
-  }
-  while (n) {
-    t = a[0];
-    t += c;
-    c = (t < c);
-    l = t + b[0];
-    c += (l < t);
-    r[0] = l;
-    a++;
-    b++;
-    r++;
-    n--;
-  }
-  return (BN_ULONG)c;
-}
-
-#endif  // !BN_ULLONG
-
 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                       size_t n) {
-  BN_ULONG t1, t2;
-  BN_ULONG c = 0;
-
   if (n == 0) {
     return (BN_ULONG)0;
   }
 
+  BN_ULONG borrow = 0;
   while (n & ~3) {
-    t1 = a[0];
-    t2 = b[0];
-    r[0] = t1 - t2 - c;
-    c = (t1 < t2) | ((t1 == t2) & c);
-    t1 = a[1];
-    t2 = b[1];
-    r[1] = t1 - t2 - c;
-    c = (t1 < t2) | ((t1 == t2) & c);
-    t1 = a[2];
-    t2 = b[2];
-    r[2] = t1 - t2 - c;
-    c = (t1 < t2) | ((t1 == t2) & c);
-    t1 = a[3];
-    t2 = b[3];
-    r[3] = t1 - t2 - c;
-    c = (t1 < t2) | ((t1 == t2) & c);
+    r[0] = bn_sub_with_borrow(a[0], b[0], borrow, &borrow);
+    r[1] = bn_sub_with_borrow(a[1], b[1], borrow, &borrow);
+    r[2] = bn_sub_with_borrow(a[2], b[2], borrow, &borrow);
+    r[3] = bn_sub_with_borrow(a[3], b[3], borrow, &borrow);
     a += 4;
     b += 4;
     r += 4;
     n -= 4;
   }
   while (n) {
-    t1 = a[0];
-    t2 = b[0];
-    r[0] = t1 - t2 - c;
-    c = (t1 < t2) | ((t1 == t2) & c);
+    r[0] = bn_sub_with_borrow(a[0], b[0], borrow, &borrow);
     a++;
     b++;
     r++;
     n--;
   }
-  return c;
+  return borrow;
 }
 
 #endif  // !BN_ADD_ASM