Avoid branches in GCC in bn/generic.c.

bn/generic.c is used for functions like bn_add_words, when there is no
assembly implementation available. They're meant to be constant-time,
but are particularly dependent on the compiler in this. I ran our
valgrind-based tooling and found a couple issues in GCC:

First, the various mul_add and sqr_add macros end up branching in GCC.
Replacing the conditionals with expressions like c += (a < b) seems to
mitigate this.

Second, bn_sub_words produces branches in GCC. Replacing the expressions
with bit expressions involving the boolean comparisons seems to work for
now. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79173 discusses
problems with GCC here, which seem to be as yet unresolved.

Clang already reliably avoided branches in all of these. (Though it
still spills the carry flag far more than would be ideal.)

I also checked in godbolt that the new versions didn't generate branches
in MSVC, but we don't have tooling to validate this as rigorously.

Change-Id: I739758a396fb5ee27fb88bee71bd13ae9cb92bd0
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/56967
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index df4a834..bf4971f 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c
@@ -232,9 +232,7 @@
     (c0) = (BN_ULONG)Lw(t);             \
     hi = (BN_ULONG)Hw(t);               \
     (c1) += (hi);                       \
-    if ((c1) < hi) {                    \
-      (c2)++;                           \
-    }                                   \
+    (c2) += (c1) < hi;                  \
   } while (0)
 
 #define mul_add_c2(a, b, c0, c1, c2)        \
@@ -245,16 +243,12 @@
     (c0) = (BN_ULONG)Lw(tt);                \
     hi = (BN_ULONG)Hw(tt);                  \
     (c1) += hi;                             \
-    if ((c1) < hi) {                        \
-      (c2)++;                               \
-    }                                       \
+    (c2) += (c1) < hi;                      \
     t += (c0); /* no carry */               \
     (c0) = (BN_ULONG)Lw(t);                 \
     hi = (BN_ULONG)Hw(t);                   \
     (c1) += hi;                             \
-    if ((c1) < hi) {                        \
-      (c2)++;                               \
-    }                                       \
+    (c2) += (c1) < hi;                      \
   } while (0)
 
 #define sqr_add_c(a, i, c0, c1, c2)           \
@@ -265,9 +259,7 @@
     (c0) = (BN_ULONG)Lw(t);                   \
     hi = (BN_ULONG)Hw(t);                     \
     (c1) += hi;                               \
-    if ((c1) < hi) {                          \
-      (c2)++;                                 \
-    }                                         \
+    (c2) += (c1) < hi;                        \
   } while (0)
 
 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
@@ -675,7 +667,7 @@
 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                       size_t n) {
   BN_ULONG t1, t2;
-  int c = 0;
+  BN_ULONG c = 0;
 
   if (n == 0) {
     return (BN_ULONG)0;
@@ -685,27 +677,19 @@
     t1 = a[0];
     t2 = b[0];
     r[0] = t1 - t2 - c;
-    if (t1 != t2) {
-      c = (t1 < t2);
-    }
+    c = (t1 < t2) | ((t1 == t2) & c);
     t1 = a[1];
     t2 = b[1];
     r[1] = t1 - t2 - c;
-    if (t1 != t2) {
-      c = (t1 < t2);
-    }
+    c = (t1 < t2) | ((t1 == t2) & c);
     t1 = a[2];
     t2 = b[2];
     r[2] = t1 - t2 - c;
-    if (t1 != t2) {
-      c = (t1 < t2);
-    }
+    c = (t1 < t2) | ((t1 == t2) & c);
     t1 = a[3];
     t2 = b[3];
     r[3] = t1 - t2 - c;
-    if (t1 != t2) {
-      c = (t1 < t2);
-    }
+    c = (t1 < t2) | ((t1 == t2) & c);
     a += 4;
     b += 4;
     r += 4;
@@ -715,9 +699,7 @@
     t1 = a[0];
     t2 = b[0];
     r[0] = t1 - t2 - c;
-    if (t1 != t2) {
-      c = (t1 < t2);
-    }
+    c = (t1 < t2) | ((t1 == t2) & c);
     a++;
     b++;
     r++;