Speed up variable windowed exponentation a bit.
The first non-zero window (which we can condition on for public
exponents) always multiplies by one. This means we can cut out one
Montgomery multiplication. It also means we never actually need to
initialize r to one, saving another Montgomery multiplication for P-521.
This, in turn, means we don't need the bn_one_to_montgomery optimization
for the public-exponent exponentations, so we can delete
bn_one_to_montgomery_small. (The function does currently promise to
handle p = 0, but this is not actually reachable, so it can just do a
reduction on RR.)
For RSA, where we're not doing many multiplications to begin with,
saving one is noticeable.
Before:
Did 92000 RSA 2048 verify (same key) operations in 3002557us (30640.6 ops/sec)
Did 25165 RSA 4096 verify (same key) operations in 3045046us (8264.2 ops/sec)
After:
Did 100000 RSA 2048 verify (same key) operations in 3002483us (33305.8 ops/sec)
Did 26603 RSA 4096 verify (same key) operations in 3010942us (8835.4 ops/sec)
(Not looking at the fresh key number yet as that still needs to be
fixed.)
Change-Id: I81a025a68d9b0f8eb0f9c6c04ec4eedf0995a345
Reviewed-on: https://boringssl-review.googlesource.com/27286
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index db73b0c..ffeccec 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -660,10 +660,8 @@
}
}
- if (!bn_one_to_montgomery(r, mont, ctx)) {
- goto err;
- }
-
+ // |p| is non-zero, so at least one window is non-zero. To save some
+ // multiplications, defer initializing |r| until then.
int r_is_one = 1;
int wstart = bits - 1; // The top bit of the window.
for (;;) {
@@ -700,7 +698,11 @@
assert(wvalue & 1);
assert(wvalue < (1 << window));
- if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
+ if (r_is_one) {
+ if (!BN_copy(r, val[wvalue >> 1])) {
+ goto err;
+ }
+ } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
goto err;
}
@@ -711,6 +713,9 @@
wstart -= wsize + 1;
}
+ // |p| is non-zero, so |r_is_one| must be cleared at some point.
+ assert(!r_is_one);
+
if (!BN_from_montgomery(rr, r, mont, ctx)) {
goto err;
}
@@ -730,17 +735,17 @@
OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
return 0;
}
- if (!BN_is_odd(&mont->N)) {
- OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
- return 0;
+ assert(BN_is_odd(&mont->N));
+
+ // Count the number of bits in |p|. Note this function treats |p| as public.
+ while (num_p != 0 && p[num_p - 1] == 0) {
+ num_p--;
}
- unsigned bits = 0;
- if (num_p != 0) {
- bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+ if (num_p == 0) {
+ return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.width, mont);
}
- if (bits == 0) {
- return bn_one_to_montgomery_small(r, num_r, mont);
- }
+ unsigned bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+ assert(bits != 0);
// We exponentiate by looking at sliding windows of the exponent and
// precomputing powers of |a|. Windows may be shifted so they always end on a
@@ -767,10 +772,8 @@
}
}
- if (!bn_one_to_montgomery_small(r, num_r, mont)) {
- goto err;
- }
-
+ // |p| is non-zero, so at least one window is non-zero. To save some
+ // multiplications, defer initializing |r| until then.
int r_is_one = 1;
unsigned wstart = bits - 1; // The top bit of the window.
for (;;) {
@@ -808,8 +811,10 @@
assert(wvalue & 1);
assert(wvalue < (1u << window));
- if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r, val[wvalue >> 1],
- num_n, mont)) {
+ if (r_is_one) {
+ OPENSSL_memcpy(r, val[wvalue >> 1], num_r * sizeof(BN_ULONG));
+ } else if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r,
+ val[wvalue >> 1], num_n, mont)) {
goto err;
}
@@ -820,6 +825,8 @@
wstart -= wsize + 1;
}
+ // |p| is non-zero, so |r_is_one| must be cleared at some point.
+ assert(!r_is_one);
ret = 1;
err:
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 668d8dd..1d720fc 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -547,13 +547,6 @@
int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
size_t num_a, const BN_MONT_CTX *mont);
-// bn_one_to_montgomery_small sets |r| to one in Montgomery form. It returns one
-// on success and zero on error. |num_r| must be the length of the modulus,
-// which is |mont->N.top|. This function treats the bit width of the modulus as
-// public.
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
- const BN_MONT_CTX *mont);
-
// bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
// and outputs are in the Montgomery domain. |num_r| must be the length of the
// modulus, which is |mont->N.top|. This function returns one on success and
diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c
index 7ce8c4c..5c72e1a 100644
--- a/crypto/fipsmodule/bn/montgomery.c
+++ b/crypto/fipsmodule/bn/montgomery.c
@@ -449,28 +449,6 @@
return ret;
}
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
- const BN_MONT_CTX *mont) {
- const BN_ULONG *n = mont->N.d;
- size_t num_n = mont->N.width;
- if (num_n == 0 || num_r != num_n) {
- OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
-
- // If the high bit of |n| is set, R = 2^(num_n*BN_BITS2) < 2 * |n|, so we
- // compute R - |n| rather than perform Montgomery reduction.
- if (num_n > 0 && (n[num_n - 1] >> (BN_BITS2 - 1)) != 0) {
- r[0] = 0 - n[0];
- for (size_t i = 1; i < num_n; i++) {
- r[i] = ~n[i];
- }
- return 1;
- }
-
- return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.width, mont);
-}
-
int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
size_t num_a, const BN_ULONG *b, size_t num_b,
const BN_MONT_CTX *mont) {