Simplify bn_sub_part_words. This function does not need to be so complex. The bulk of the work is done by bn_sub_words. The unrolled loop is only used when bn_sub_part_words is called in different-length inputs. bn_sub_part_words is itself only called from bn_abs_sub_consttime and bn_mul_part_recursive. bn_abs_sub_consttime is used to compute p - q in RSA key generation so it never sees different-width inputs. bn_mul_part_recursive is called from bn_mul_impl if all of the following are true: - Both inputs are at least 16 words long (1024 bits on 64-bit platforms and 512 bits on 32-bit). - The two inputs are within one word from each other. - The length of the larger input is one more than a power of two. The first condition rules out all EC configurations except P-521 on 32-bit platforms. The EC code uses bn_mul_mont assembly if available, so this only affects non-x86 and non-ARM 32-bit architectures. Building for 32-bit x86 without assembly shows no significant change: Before: Did 88 ECDH P-521 operations in 1014591us (86.7 ops/sec) Did 165 ECDSA P-521 signing operations in 1066428us (154.7 ops/sec) Did 150 ECDSA P-521 verify operations in 1001749us (149.7 ops/sec) After: Did 90 ECDH P-521 operations in 1045905us (86.0 ops/sec) Did 165 ECDSA P-521 signing operations in 1071189us (154.0 ops/sec) Did 154 ECDSA P-521 verify operations in 1050509us (146.6 ops/sec) RSA does meet the first condition, but the third condition rules out all common RSA sizes, with one quirk: RSA_check_key uses the non-normalized BIGNUMs, but RSA_check_key is not called as part of private key operations. (https://crbug.com/boringssl/316 discusses what to do about RSA longer term. It's kind of a mess right now.) Bug: 314 Change-Id: I0bd604e2cd6a75c266f64476c23a730ca1721ea6 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40145 Commit-Queue: Adam Langley <agl@google.com> Reviewed-by: Adam Langley <agl@google.com>

commit: ce9b8737cb3e113c9f986d6a94aeedb09e8207b2 [log] [tgz]
author: David Benjamin <davidben@google.com> Fri Feb 28 12:31:25 2020 -0500
committer: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Tue Mar 31 22:56:39 2020 +0000
tree: ee482fd0980974282dca678c4e0513acd97ad92e
parent: 8b4fa1b6b5c8235e51e25bd1615e079bc6f2718b [diff]
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index dd31580..6e1c3ca 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c

@@ -119,23 +119,20 @@
   }
 }
 
-// Here follows specialised variants of bn_add_words() and bn_sub_words(). They
-// have the property performing operations on arrays of different sizes. The
-// sizes of those arrays is expressed through cl, which is the common length (
-// basicall, min(len(a),len(b)) ), and dl, which is the delta between the two
-// lengths, calculated as len(a)-len(b). All lengths are the number of
-// BN_ULONGs...  For the operations that require a result array as parameter,
-// it must have the length cl+abs(dl).
-
+// bn_sub_part_words sets |r| to |a| - |b|. It returns the borrow bit, which is
+// one if the operation underflowed and zero otherwise. |cl| is the common
+// length, that is, the shorter of len(a) or len(b). |dl| is the delta length,
+// that is, len(a) - len(b). |r|'s length matches the larger of |a| and |b|, or
+// cl + abs(dl).
+//
+// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
+// is confusing.
 static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                   const BN_ULONG *b, int cl, int dl) {
-  BN_ULONG c, t;
-
   assert(cl >= 0);
-  c = bn_sub_words(r, a, b, cl);
-
+  BN_ULONG borrow = bn_sub_words(r, a, b, cl);
   if (dl == 0) {
-    return c;
+    return borrow;
   }
 
   r += cl;
@@ -143,141 +140,25 @@
   b += cl;
 
   if (dl < 0) {
-    for (;;) {
-      t = b[0];
-      r[0] = 0 - t - c;
-      if (t != 0) {
-        c = 1;
-      }
-      if (++dl >= 0) {
-        break;
-      }
-
-      t = b[1];
-      r[1] = 0 - t - c;
-      if (t != 0) {
-        c = 1;
-      }
-      if (++dl >= 0) {
-        break;
-      }
-
-      t = b[2];
-      r[2] = 0 - t - c;
-      if (t != 0) {
-        c = 1;
-      }
-      if (++dl >= 0) {
-        break;
-      }
-
-      t = b[3];
-      r[3] = 0 - t - c;
-      if (t != 0) {
-        c = 1;
-      }
-      if (++dl >= 0) {
-        break;
-      }
-
-      b += 4;
-      r += 4;
+    // |a| is shorter than |b|. Complete the subtraction as if the excess words
+    // in |a| were zeros.
+    dl = -dl;
+    for (int i = 0; i < dl; i++) {
+      r[i] = 0u - b[i] - borrow;
+      borrow |= r[i] != 0;
     }
   } else {
-    int save_dl = dl;
-    while (c) {
-      t = a[0];
-      r[0] = t - c;
-      if (t != 0) {
-        c = 0;
-      }
-      if (--dl <= 0) {
-        break;
-      }
-
-      t = a[1];
-      r[1] = t - c;
-      if (t != 0) {
-        c = 0;
-      }
-      if (--dl <= 0) {
-        break;
-      }
-
-      t = a[2];
-      r[2] = t - c;
-      if (t != 0) {
-        c = 0;
-      }
-      if (--dl <= 0) {
-        break;
-      }
-
-      t = a[3];
-      r[3] = t - c;
-      if (t != 0) {
-        c = 0;
-      }
-      if (--dl <= 0) {
-        break;
-      }
-
-      save_dl = dl;
-      a += 4;
-      r += 4;
-    }
-    if (dl > 0) {
-      if (save_dl > dl) {
-        switch (save_dl - dl) {
-          case 1:
-            r[1] = a[1];
-            if (--dl <= 0) {
-              break;
-            }
-            OPENSSL_FALLTHROUGH;
-          case 2:
-            r[2] = a[2];
-            if (--dl <= 0) {
-              break;
-            }
-            OPENSSL_FALLTHROUGH;
-          case 3:
-            r[3] = a[3];
-            if (--dl <= 0) {
-              break;
-            }
-        }
-        a += 4;
-        r += 4;
-      }
-    }
-
-    if (dl > 0) {
-      for (;;) {
-        r[0] = a[0];
-        if (--dl <= 0) {
-          break;
-        }
-        r[1] = a[1];
-        if (--dl <= 0) {
-          break;
-        }
-        r[2] = a[2];
-        if (--dl <= 0) {
-          break;
-        }
-        r[3] = a[3];
-        if (--dl <= 0) {
-          break;
-        }
-
-        a += 4;
-        r += 4;
-      }
+    // |b| is shorter than |a|. Complete the subtraction as if the excess words
+    // in |b| were zeros.
+    for (int i = 0; i < dl; i++) {
+      // |r| and |a| may alias, so use a temporary.
+      BN_ULONG tmp = a[i];
+      r[i] = a[i] - borrow;
+      borrow = tmp < r[i];
     }
   }
 
-  return c;
+  return borrow;
 }
 
 // bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value
@@ -574,7 +455,7 @@
   static const int kMulNormalSize = 16;
   if (al >= kMulNormalSize && bl >= kMulNormalSize) {
     if (-1 <= i && i <= 1) {
-      // Find the larger power of two less than or equal to the larger length.
+      // Find the largest power of two less than or equal to the larger length.
       int j;
       if (i >= 0) {
         j = BN_num_bits_word((BN_ULONG)al);
@@ -590,6 +471,10 @@
       if (al > j || bl > j) {
         // We know |al| and |bl| are at most one from each other, so if al > j,
         // bl >= j, and vice versa. Thus we can use |bn_mul_part_recursive|.
+        //
+        // TODO(davidben): This codepath is almost unused in standard
+        // algorithms. Is this optimization necessary? See notes in
+        // https://boringssl-review.googlesource.com/q/I0bd604e2cd6a75c266f64476c23a730ca1721ea6
         assert(al >= j && bl >= j);
         if (!bn_wexpand(t, j * 8) ||
             !bn_wexpand(rr, j * 4)) {
commit	ce9b8737cb3e113c9f986d6a94aeedb09e8207b2	[log] [tgz]
author	David Benjamin <davidben@google.com>	Fri Feb 28 12:31:25 2020 -0500
committer	CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>	Tue Mar 31 22:56:39 2020 +0000
tree	ee482fd0980974282dca678c4e0513acd97ad92e
parent	8b4fa1b6b5c8235e51e25bd1615e079bc6f2718b [diff]