Calculate Montgomery RR without division.

Get one step closer to removing the dependency on |BN_div| from most
programs. Also get one step closer to a constant-time implementation of
|BN_MONT_CTX_set|; we now "just" need to create a constant-time variant
of |BN_mod_lshift1_quick|.

Note that this version might actually increase the side channel signal,
since the variance in timing in |BN_div| is probably less than the variance
from the many conditional reductions in the new method.

On one Windows x64 machine, the speed of RSA verification using the new
version is not too different from the speed of the old code. However,
|BN_div| is generally slow on Windows x64 so I expect this isn't faster
on all platforms. Regardless, we generally consider ECDSA/EdDSA
signature verification performance to be adaquate and RSA signature
verification is much, much faster even with this change.

For RSA signing the performance is not a significant factor since
performance-sensitive applications will cache the |RSA| structure and
the |RSA| structure will cache the Montgomery contexts.

Change-Id: Ib14f1a35c99b8da435e190342657f6a839381a1a
Reviewed-on: https://boringssl-review.googlesource.com/10520
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/bn/internal.h b/crypto/bn/internal.h
index aeed88f..f2141d7 100644
--- a/crypto/bn/internal.h
+++ b/crypto/bn/internal.h
@@ -228,6 +228,7 @@
                 const BN_ULONG *np, const BN_ULONG *n0, int num);
 
 uint64_t bn_mont_n0(const BIGNUM *n);
+int bn_mod_exp_base_2_vartime(BIGNUM *r, unsigned p, const BIGNUM *n);
 
 #if defined(OPENSSL_X86_64) && defined(_MSC_VER)
 #define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
diff --git a/crypto/bn/montgomery.c b/crypto/bn/montgomery.c
index 91251e5..9aa04d6 100644
--- a/crypto/bn/montgomery.c
+++ b/crypto/bn/montgomery.c
@@ -108,6 +108,7 @@
 
 #include <openssl/bn.h>
 
+#include <assert.h>
 #include <string.h>
 
 #include <openssl/err.h>
@@ -207,12 +208,13 @@
   /* Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS such that R
    * > mod. Even though the assembly on some 32-bit platforms works with 64-bit
    * values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS *
-   * BN_BITS2|, is correct because because R^2 will still be a multiple of the
-   * latter as |BN_MONT_CTX_N0_LIMBS| is either one or two. */
+   * BN_BITS2|, is correct because R**2 will still be a multiple of the latter
+   * as |BN_MONT_CTX_N0_LIMBS| is either one or two.
+   *
+   * XXX: This is not constant time with respect to |mont->N|, but it should
+   * be. */
   unsigned lgBigR = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
-  BN_zero(&mont->RR);
-  if (!BN_set_bit(&mont->RR, lgBigR * 2) ||
-      !BN_mod(&mont->RR, &mont->RR, &mont->N, ctx)) {
+  if (!bn_mod_exp_base_2_vartime(&mont->RR, lgBigR * 2, &mont->N)) {
     return 0;
   }
 
diff --git a/crypto/bn/montgomery_inv.c b/crypto/bn/montgomery_inv.c
index 28db62b..9264adb 100644
--- a/crypto/bn/montgomery_inv.c
+++ b/crypto/bn/montgomery_inv.c
@@ -158,3 +158,50 @@
 
   return v;
 }
+
+/* bn_mod_exp_base_2_vartime calculates r = 2**p (mod n). |p| must be larger
+ * than log_2(n); i.e. 2**p must be larger than |n|. |n| must be positive and
+ * odd. */
+int bn_mod_exp_base_2_vartime(BIGNUM *r, unsigned p, const BIGNUM *n) {
+  assert(!BN_is_zero(n));
+  assert(!BN_is_negative(n));
+  assert(BN_is_odd(n));
+
+  BN_zero(r);
+
+  unsigned n_bits = BN_num_bits(n);
+  assert(n_bits != 0);
+  if (n_bits == 1) {
+    return 1;
+  }
+
+  /* Set |r| to the smallest power of two larger than |n|. */
+  assert(p > n_bits);
+  if (!BN_set_bit(r, n_bits)) {
+    return 0;
+  }
+
+  /* Unconditionally reduce |r|. */
+  assert(BN_cmp(r, n) > 0);
+  if (!BN_usub(r, r, n)) {
+    return 0;
+  }
+  assert(BN_cmp(r, n) < 0);
+
+  for (unsigned i = n_bits; i < p; ++i) {
+    /* This is like |BN_mod_lshift1_quick| except using |BN_usub|.
+     *
+     * TODO: Replace this with the use of a constant-time variant of
+     * |BN_mod_lshift1_quick|. */
+    if (!BN_lshift1(r, r)) {
+      return 0;
+    }
+    if (BN_cmp(r, n) >= 0) {
+      if (!BN_usub(r, r, n)) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}