Restore the BN_mod codepath for public Montgomery moduli.

https://boringssl-review.googlesource.com/10520 and then later
https://boringssl-review.googlesource.com/25285 made BN_MONT_CTX_set
constant-time, which is necessary for RSA's mont_p and mont_q. However,
due to a typo in the benchmark, they did not correctly measure.

Split BN_MONT_CTX creation into a constant-time and variable-time one.
The constant-time one uses our current algorithm and the latter restores
the original BN_mod codepath.

Should we wish to avoid BN_mod, I have an alternate version lying
around:

First, BN_set_bit + bn_mod_lshift1_consttime as now to count up to 2*R.
Next, observe that 2*R = BN_to_montgomery(2) and R*R =
BN_to_montgomery(R) = BN_to_montgomery(2^r_bits) Also observe that
BN_mod_mul_montgomery only needs n0, not RR. Split the core of
BN_mod_exp_mont into its own function so the caller handles conversion.
Raise 2*R to the r_bits power to get 2^r_bits*R = R*R.

The advantage of that algorithm is that it is still constant-time, so we
only need one BN_MONT_CTX_new. Additionally, it avoids BN_mod which is
otherwise (almost, but the remaining links should be easy to cut) out of
the critical path for correctness. One less operation to worry about.

The disadvantage is that it is gives a 25% (RSA-2048) or 32% (RSA-4096)
slower RSA verification speed. I went with the BN_mod one for the time
being.

Before:
Did 9204 RSA 2048 signing operations in 10052053us (915.6 ops/sec)
Did 326000 RSA 2048 verify (same key) operations in 10028823us (32506.3 ops/sec)
Did 50830 RSA 2048 verify (fresh key) operations in 10033794us (5065.9 ops/sec)
Did 1269 RSA 4096 signing operations in 10019204us (126.7 ops/sec)
Did 88435 RSA 4096 verify (same key) operations in 10031129us (8816.1 ops/sec)
Did 14552 RSA 4096 verify (fresh key) operations in 10053411us (1447.5 ops/sec)

After:
Did 9150 RSA 2048 signing operations in 10022831us (912.9 ops/sec)
Did 322000 RSA 2048 verify (same key) operations in 10028604us (32108.2 ops/sec)
Did 289000 RSA 2048 verify (fresh key) operations in 10017205us (28850.4 ops/sec)
Did 1270 RSA 4096 signing operations in 10072950us (126.1 ops/sec)
Did 87480 RSA 4096 verify (same key) operations in 10036328us (8716.3 ops/sec)
Did 80730 RSA 4096 verify (fresh key) operations in 10073614us (8014.0 ops/sec)

Change-Id: Ie8916d1634ccf8513ceda458fa302f09f3e93c07
Reviewed-on: https://boringssl-review.googlesource.com/27287
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index 93d6d0f..6230be1 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -630,8 +630,17 @@
     // Reduce |a| and |b| and test the Montgomery version.
     bssl::UniquePtr<BN_MONT_CTX> mont(
         BN_MONT_CTX_new_for_modulus(m.get(), ctx));
-    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(mont);
+
+    // Sanity-check that the constant-time version computes the same n0 and RR.
+    bssl::UniquePtr<BN_MONT_CTX> mont2(
+        BN_MONT_CTX_new_consttime(m.get(), ctx));
+    ASSERT_TRUE(mont2);
+    EXPECT_BIGNUMS_EQUAL("RR (mod M) (constant-time)", &mont->RR, &mont2->RR);
+    EXPECT_EQ(mont->n0[0], mont2->n0[0]);
+    EXPECT_EQ(mont->n0[1], mont2->n0[1]);
+
+    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(a_tmp);
     ASSERT_TRUE(b_tmp);
     ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
@@ -1530,6 +1539,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   // Some operations also may not be used with an even modulus.
   ASSERT_TRUE(BN_set_word(b.get(), 16));
 
@@ -1537,6 +1550,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   EXPECT_FALSE(BN_mod_exp_mont(a.get(), BN_value_one(), BN_value_one(), b.get(),
                                ctx(), NULL));
   ERR_clear_error();
@@ -2271,17 +2288,29 @@
   bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
   ASSERT_TRUE(p);
 
+  // Test both the constant-time and variable-time functions at both minimal and
+  // non-minimal |p|.
   bssl::UniquePtr<BN_MONT_CTX> mont(
       BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
   ASSERT_TRUE(mont);
-
-  ASSERT_TRUE(bn_resize_words(p.get(), 32));
   bssl::UniquePtr<BN_MONT_CTX> mont2(
-      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
   ASSERT_TRUE(mont2);
 
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));
+  bssl::UniquePtr<BN_MONT_CTX> mont3(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont3);
+  bssl::UniquePtr<BN_MONT_CTX> mont4(
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
+  ASSERT_TRUE(mont4);
+
   EXPECT_EQ(mont->N.width, mont2->N.width);
+  EXPECT_EQ(mont->N.width, mont3->N.width);
+  EXPECT_EQ(mont->N.width, mont4->N.width);
   EXPECT_EQ(0, BN_cmp(&mont->RR, &mont2->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont3->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont4->RR));
 }
 
 TEST_F(BNTest, CountLowZeroBits) {
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index ffeccec..87bd929 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -622,7 +622,7 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
@@ -1007,7 +1007,7 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c
index 5c72e1a..9e7cbc4 100644
--- a/crypto/fipsmodule/bn/montgomery.c
+++ b/crypto/fipsmodule/bn/montgomery.c
@@ -170,7 +170,7 @@
 OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS ==
                        sizeof(uint64_t), BN_MONT_CTX_set_64_bit_mismatch);
 
-int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) {
   if (BN_is_zero(mod)) {
     OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
     return 0;
@@ -207,6 +207,13 @@
 #else
   mont->n0[1] = 0;
 #endif
+  return 1;
+}
+
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+  if (!bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    return 0;
+  }
 
   BN_CTX *new_ctx = NULL;
   if (ctx == NULL) {
@@ -223,7 +230,9 @@
   // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
   // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
   unsigned lgBigR = mont->N.width * BN_BITS2;
-  int ok = bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx);
+  BN_zero(&mont->RR);
+  int ok = BN_set_bit(&mont->RR, lgBigR * 2) &&
+           BN_mod(&mont->RR, &mont->RR, &mont->N, ctx);
   BN_CTX_free(new_ctx);
   return ok;
 }
@@ -238,6 +247,23 @@
   return mont;
 }
 
+BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) {
+  BN_MONT_CTX *mont = BN_MONT_CTX_new();
+  if (mont == NULL ||
+      !bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    goto err;
+  }
+  unsigned lgBigR = mont->N.width * BN_BITS2;
+  if (!bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx)) {
+    goto err;
+  }
+  return mont;
+
+err:
+  BN_MONT_CTX_free(mont);
+  return NULL;
+}
+
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                            const BIGNUM *mod, BN_CTX *bn_ctx) {
   CRYPTO_MUTEX_lock_read(lock);
diff --git a/crypto/fipsmodule/bn/montgomery_inv.c b/crypto/fipsmodule/bn/montgomery_inv.c
index a920ca4..94d99e8 100644
--- a/crypto/fipsmodule/bn/montgomery_inv.c
+++ b/crypto/fipsmodule/bn/montgomery_inv.c
@@ -32,7 +32,8 @@
 #define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
 
 uint64_t bn_mont_n0(const BIGNUM *n) {
-  // These conditions are checked by the caller, |BN_MONT_CTX_set|.
+  // These conditions are checked by the caller, |BN_MONT_CTX_set| or
+  // |BN_MONT_CTX_new_consttime|.
   assert(!BN_is_zero(n));
   assert(!BN_is_negative(n));
   assert(BN_is_odd(n));
diff --git a/crypto/fipsmodule/bn/prime.c b/crypto/fipsmodule/bn/prime.c
index a18d377..80b33c2 100644
--- a/crypto/fipsmodule/bn/prime.c
+++ b/crypto/fipsmodule/bn/prime.c
@@ -690,7 +690,7 @@
   BIGNUM *z = BN_CTX_get(ctx);
   BIGNUM *one_mont = BN_CTX_get(ctx);
   BIGNUM *w1_mont = BN_CTX_get(ctx);
-  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
+  mont = BN_MONT_CTX_new_consttime(w, ctx);
   if (b == NULL || z == NULL || one_mont == NULL || w1_mont == NULL ||
       mont == NULL ||
       !bn_one_to_montgomery(one_mont, mont, ctx) ||
diff --git a/crypto/fipsmodule/rsa/rsa_impl.c b/crypto/fipsmodule/rsa/rsa_impl.c
index 9aaa5eb..86249ec 100644
--- a/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/crypto/fipsmodule/rsa/rsa_impl.c
@@ -167,7 +167,7 @@
 
   if (rsa->p != NULL && rsa->q != NULL) {
     if (rsa->mont_p == NULL) {
-      rsa->mont_p = BN_MONT_CTX_new_for_modulus(rsa->p, ctx);
+      rsa->mont_p = BN_MONT_CTX_new_consttime(rsa->p, ctx);
       if (rsa->mont_p == NULL) {
         goto err;
       }
@@ -175,7 +175,7 @@
     const BIGNUM *p_fixed = &rsa->mont_p->N;
 
     if (rsa->mont_q == NULL) {
-      rsa->mont_q = BN_MONT_CTX_new_for_modulus(rsa->q, ctx);
+      rsa->mont_q = BN_MONT_CTX_new_consttime(rsa->q, ctx);
       if (rsa->mont_q == NULL) {
         goto err;
       }
diff --git a/include/openssl/bn.h b/include/openssl/bn.h
index 1618022..1ed9864 100644
--- a/include/openssl/bn.h
+++ b/include/openssl/bn.h
@@ -812,10 +812,15 @@
 // Montgomery domain.
 
 // BN_MONT_CTX_new_for_modulus returns a fresh |BN_MONT_CTX| given the modulus,
-// |mod| or NULL on error.
+// |mod| or NULL on error. Note this function assumes |mod| is public.
 OPENSSL_EXPORT BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod,
                                                         BN_CTX *ctx);
 
+// BN_MONT_CTX_new_consttime behaves like |BN_MONT_CTX_new_for_modulus| but
+// treats |mod| as secret.
+OPENSSL_EXPORT BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod,
+                                                      BN_CTX *ctx);
+
 // BN_MONT_CTX_free frees memory associated with |mont|.
 OPENSSL_EXPORT void BN_MONT_CTX_free(BN_MONT_CTX *mont);
 
@@ -826,7 +831,8 @@
 
 // BN_MONT_CTX_set_locked takes |lock| and checks whether |*pmont| is NULL. If
 // so, it creates a new |BN_MONT_CTX| and sets the modulus for it to |mod|. It
-// then stores it as |*pmont|. It returns one on success and zero on error.
+// then stores it as |*pmont|. It returns one on success and zero on error. Note
+// this function assumes |mod| is public.
 //
 // If |*pmont| is already non-NULL then it does nothing and returns one.
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,