Simplify bn_sub_part_words.
This function does not need to be so complex. The bulk of
the work is done by bn_sub_words. The unrolled loop is only used when
bn_sub_part_words is called in different-length inputs.
bn_sub_part_words is itself only called from bn_abs_sub_consttime and
bn_mul_part_recursive. bn_abs_sub_consttime is used to compute p - q in
RSA key generation so it never sees different-width inputs.
bn_mul_part_recursive is called from bn_mul_impl if all of the following
are true:
- Both inputs are at least 16 words long (1024 bits on 64-bit platforms
and 512 bits on 32-bit).
- The two inputs are within one word from each other.
- The length of the larger input is one more than a power of two.
The first condition rules out all EC configurations except P-521 on
32-bit platforms. The EC code uses bn_mul_mont assembly if available, so
this only affects non-x86 and non-ARM 32-bit architectures. Building for
32-bit x86 without assembly shows no significant change:
Before:
Did 88 ECDH P-521 operations in 1014591us (86.7 ops/sec)
Did 165 ECDSA P-521 signing operations in 1066428us (154.7 ops/sec)
Did 150 ECDSA P-521 verify operations in 1001749us (149.7 ops/sec)
After:
Did 90 ECDH P-521 operations in 1045905us (86.0 ops/sec)
Did 165 ECDSA P-521 signing operations in 1071189us (154.0 ops/sec)
Did 154 ECDSA P-521 verify operations in 1050509us (146.6 ops/sec)
RSA does meet the first condition, but the third condition rules out all
common RSA sizes, with one quirk: RSA_check_key uses the non-normalized
BIGNUMs, but RSA_check_key is not called as part of private key
operations. (https://crbug.com/boringssl/316 discusses what to do about
RSA longer term. It's kind of a mess right now.)
Bug: 314
Change-Id: I0bd604e2cd6a75c266f64476c23a730ca1721ea6
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40145
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index dd31580..6e1c3ca 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -119,23 +119,20 @@
}
}
-// Here follows specialised variants of bn_add_words() and bn_sub_words(). They
-// have the property performing operations on arrays of different sizes. The
-// sizes of those arrays is expressed through cl, which is the common length (
-// basicall, min(len(a),len(b)) ), and dl, which is the delta between the two
-// lengths, calculated as len(a)-len(b). All lengths are the number of
-// BN_ULONGs... For the operations that require a result array as parameter,
-// it must have the length cl+abs(dl).
-
+// bn_sub_part_words sets |r| to |a| - |b|. It returns the borrow bit, which is
+// one if the operation underflowed and zero otherwise. |cl| is the common
+// length, that is, the shorter of len(a) or len(b). |dl| is the delta length,
+// that is, len(a) - len(b). |r|'s length matches the larger of |a| and |b|, or
+// cl + abs(dl).
+//
+// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
+// is confusing.
static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, int cl, int dl) {
- BN_ULONG c, t;
-
assert(cl >= 0);
- c = bn_sub_words(r, a, b, cl);
-
+ BN_ULONG borrow = bn_sub_words(r, a, b, cl);
if (dl == 0) {
- return c;
+ return borrow;
}
r += cl;
@@ -143,141 +140,25 @@
b += cl;
if (dl < 0) {
- for (;;) {
- t = b[0];
- r[0] = 0 - t - c;
- if (t != 0) {
- c = 1;
- }
- if (++dl >= 0) {
- break;
- }
-
- t = b[1];
- r[1] = 0 - t - c;
- if (t != 0) {
- c = 1;
- }
- if (++dl >= 0) {
- break;
- }
-
- t = b[2];
- r[2] = 0 - t - c;
- if (t != 0) {
- c = 1;
- }
- if (++dl >= 0) {
- break;
- }
-
- t = b[3];
- r[3] = 0 - t - c;
- if (t != 0) {
- c = 1;
- }
- if (++dl >= 0) {
- break;
- }
-
- b += 4;
- r += 4;
+ // |a| is shorter than |b|. Complete the subtraction as if the excess words
+ // in |a| were zeros.
+ dl = -dl;
+ for (int i = 0; i < dl; i++) {
+ r[i] = 0u - b[i] - borrow;
+ borrow |= r[i] != 0;
}
} else {
- int save_dl = dl;
- while (c) {
- t = a[0];
- r[0] = t - c;
- if (t != 0) {
- c = 0;
- }
- if (--dl <= 0) {
- break;
- }
-
- t = a[1];
- r[1] = t - c;
- if (t != 0) {
- c = 0;
- }
- if (--dl <= 0) {
- break;
- }
-
- t = a[2];
- r[2] = t - c;
- if (t != 0) {
- c = 0;
- }
- if (--dl <= 0) {
- break;
- }
-
- t = a[3];
- r[3] = t - c;
- if (t != 0) {
- c = 0;
- }
- if (--dl <= 0) {
- break;
- }
-
- save_dl = dl;
- a += 4;
- r += 4;
- }
- if (dl > 0) {
- if (save_dl > dl) {
- switch (save_dl - dl) {
- case 1:
- r[1] = a[1];
- if (--dl <= 0) {
- break;
- }
- OPENSSL_FALLTHROUGH;
- case 2:
- r[2] = a[2];
- if (--dl <= 0) {
- break;
- }
- OPENSSL_FALLTHROUGH;
- case 3:
- r[3] = a[3];
- if (--dl <= 0) {
- break;
- }
- }
- a += 4;
- r += 4;
- }
- }
-
- if (dl > 0) {
- for (;;) {
- r[0] = a[0];
- if (--dl <= 0) {
- break;
- }
- r[1] = a[1];
- if (--dl <= 0) {
- break;
- }
- r[2] = a[2];
- if (--dl <= 0) {
- break;
- }
- r[3] = a[3];
- if (--dl <= 0) {
- break;
- }
-
- a += 4;
- r += 4;
- }
+ // |b| is shorter than |a|. Complete the subtraction as if the excess words
+ // in |b| were zeros.
+ for (int i = 0; i < dl; i++) {
+ // |r| and |a| may alias, so use a temporary.
+ BN_ULONG tmp = a[i];
+ r[i] = a[i] - borrow;
+ borrow = tmp < r[i];
}
}
- return c;
+ return borrow;
}
// bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value
@@ -574,7 +455,7 @@
static const int kMulNormalSize = 16;
if (al >= kMulNormalSize && bl >= kMulNormalSize) {
if (-1 <= i && i <= 1) {
- // Find the larger power of two less than or equal to the larger length.
+ // Find the largest power of two less than or equal to the larger length.
int j;
if (i >= 0) {
j = BN_num_bits_word((BN_ULONG)al);
@@ -590,6 +471,10 @@
if (al > j || bl > j) {
// We know |al| and |bl| are at most one from each other, so if al > j,
// bl >= j, and vice versa. Thus we can use |bn_mul_part_recursive|.
+ //
+ // TODO(davidben): This codepath is almost unused in standard
+ // algorithms. Is this optimization necessary? See notes in
+ // https://boringssl-review.googlesource.com/q/I0bd604e2cd6a75c266f64476c23a730ca1721ea6
assert(al >= j && bl >= j);
if (!bn_wexpand(t, j * 8) ||
!bn_wexpand(rr, j * 4)) {