Make BN_mod_*_quick constant-time.

As the EC code will ultimately want to use these in "words" form by way
of EC_FELEM, and because it's much easier, I've implemented these as
low-level words-based functions that require all inputs have the same
width. The BIGNUM versions which RSA and, for now, EC calls are
implemented on top of that.

Unfortunately, doing such things in constant-time and accounting for
undersized inputs requires some scratch space, and these functions don't
take BN_CTX. So I've added internal bn_mod_*_quick_ctx functions that
take a BN_CTX and the old functions now allocate a bit unnecessarily.
RSA only needs lshift (for BN_MONT_CTX) and sub (for CRT), but the
generic EC code wants add as well.

The generic EC code isn't even remotely constant-time, and I hope to
ultimately use stack-allocated EC_FELEMs, so I've written the actual
implementations here in "words", which is much simpler
anyway due to not having to take care of widths.

I've also gone ahead and switched the EC code to these functions,
largely as a test of their performance (an earlier iteration made the EC
code noticeably slower). These operations are otherwise not
performance-critical in RSA.

The conversion from BIGNUM to BIGNUM+BN_CTX should be dropped by the
static linker already, and the unused BIGNUM+BN_CTX functions will fall
off when EC_FELEM happens.

Update-Note: BN_mod_*_quick bounce on malloc a bit now, but they're not
    really used externally. The one caller I found was wpa_supplicant
    which bounces on malloc already. They appear to be implementing
    compressed coordinates by hand? We may be able to convince them to
    call EC_POINT_set_compressed_coordinates_GFp.

Bug: 233, 236
Change-Id: I2bf361e9c089e0211b97d95523dbc06f1168e12b
Reviewed-on: https://boringssl-review.googlesource.com/25261
Commit-Queue: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/div.c b/crypto/fipsmodule/bn/div.c
index 453bd0f..85b7421 100644
--- a/crypto/fipsmodule/bn/div.c
+++ b/crypto/fipsmodule/bn/div.c
@@ -414,6 +414,89 @@
   return (d->neg ? BN_sub : BN_add)(r, r, d);
 }
 
+// bn_select_words sets |r| to |a| if |mask| is all ones or |b| if |mask| is
+// all zeros.
+static void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
+                            const BN_ULONG *b, size_t num) {
+  for (size_t i = 0; i < num; i++) {
+    OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
+                           crypto_word_t_too_small);
+    r[i] = constant_time_select_w(mask, a[i], b[i]);
+  }
+}
+
+// bn_mod_sub_words sets |r| to |a| - |b| (mod |m|), using |tmp| as scratch
+// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
+// |r|, |a|, and |b| may alias.
+static void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
+  // r = a - b
+  BN_ULONG borrow = bn_sub_words(r, a, b, num);
+  // tmp = a - b + m
+  bn_add_words(tmp, r, m, num);
+  bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num);
+}
+
+// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
+// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
+// |r|, |a|, and |b| may alias.
+static void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                             const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
+  // tmp = a + b. Note the result fits in |num|+1 words. We store the extra word
+  // in |carry|.
+  BN_ULONG carry = bn_add_words(tmp, a, b, num);
+  // r = a + b - m. We use |bn_sub_words| to perform the bulk of the
+  // subtraction, and then apply the borrow to |carry|.
+  carry -= bn_sub_words(r, tmp, m, num);
+  // |a| and |b| were both fully-reduced, so we know:
+  //
+  //   0 + 0 - m <= r < m + m - m
+  //          -m <= r < m
+  //
+  // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then
+  // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to
+  // return |r| + |m|, or |tmp|. |carry| must then be -1 or all ones. In both
+  // cases, |carry| is a suitable input to |bn_select_words|.
+  //
+  // Although |carry| may be one if |bn_add_words| returns one and
+  // |bn_sub_words| returns zero, this would give |r| > |m|, which violates our
+  // input assumptions.
+  assert(carry == 0 || carry == (BN_ULONG)-1);
+  bn_select_words(r, carry, tmp /* r < 0 */, r /* r >= 0 */, num);
+}
+
+static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) {
+  BIGNUM *ret = BN_CTX_get(ctx);
+  if (ret == NULL ||
+      !bn_wexpand(ret, width)) {
+    return NULL;
+  }
+  ret->neg = 0;
+  ret->width = width;
+  return ret;
+}
+
+// bn_resized_from_ctx returns |bn| with width at least |width| or NULL on
+// error. This is so it may be used with low-level "words" functions. If
+// necessary, it allocates a new |BIGNUM| with a lifetime of the current scope
+// in |ctx|, so the caller does not need to explicitly free it. |bn| must fit in
+// |width| words.
+static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width,
+                                         BN_CTX *ctx) {
+  if ((size_t)bn->width >= width) {
+    // Any excess words must be zero.
+    assert(bn_fits_in_words(bn, width));
+    return bn;
+  }
+  BIGNUM *ret = bn_scratch_space_from_ctx(width, ctx);
+  if (ret == NULL ||
+      !BN_copy(ret, bn) ||
+      !bn_resize_words(ret, width)) {
+    return NULL;
+  }
+  return ret;
+}
+
 int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
                BN_CTX *ctx) {
   if (!BN_add(r, a, b)) {
@@ -424,13 +507,27 @@
 
 int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                      const BIGNUM *m) {
-  if (!BN_uadd(r, a, b)) {
-    return 0;
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_add_quick_ctx(r, a, b, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
+}
+
+int bn_mod_add_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx) {
+  BN_CTX_start(ctx);
+  a = bn_resized_from_ctx(a, m->width, ctx);
+  b = bn_resized_from_ctx(b, m->width, ctx);
+  BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx);
+  int ok = a != NULL && b != NULL && tmp != NULL &&
+           bn_wexpand(r, m->width);
+  if (ok) {
+    bn_mod_add_words(r->d, a->d, b->d, m->d, tmp->d, m->width);
+    r->width = m->width;
   }
-  if (BN_ucmp(r, m) >= 0) {
-    return BN_usub(r, r, m);
-  }
-  return 1;
+  BN_CTX_end(ctx);
+  return ok;
 }
 
 int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
@@ -441,17 +538,29 @@
   return BN_nnmod(r, r, m, ctx);
 }
 
-// BN_mod_sub variant that may be used if both  a  and  b  are non-negative
-// and less than  m
+int bn_mod_sub_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx) {
+  BN_CTX_start(ctx);
+  a = bn_resized_from_ctx(a, m->width, ctx);
+  b = bn_resized_from_ctx(b, m->width, ctx);
+  BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx);
+  int ok = a != NULL && b != NULL && tmp != NULL &&
+           bn_wexpand(r, m->width);
+  if (ok) {
+    bn_mod_sub_words(r->d, a->d, b->d, m->d, tmp->d, m->width);
+    r->width = m->width;
+  }
+  BN_CTX_end(ctx);
+  return ok;
+}
+
 int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                      const BIGNUM *m) {
-  if (!BN_sub(r, a, b)) {
-    return 0;
-  }
-  if (r->neg) {
-    return BN_add(r, r, m);
-  }
-  return 1;
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_sub_quick_ctx(r, a, b, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
 }
 
 int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
@@ -512,58 +621,33 @@
     abs_m->neg = 0;
   }
 
-  ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
+  ret = bn_mod_lshift_quick_ctx(r, r, n, (abs_m ? abs_m : m), ctx);
 
   BN_free(abs_m);
   return ret;
 }
 
-int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
-  if (r != a) {
-    if (BN_copy(r, a) == NULL) {
+int bn_mod_lshift_quick_ctx(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                            BN_CTX *ctx) {
+  if (!BN_copy(r, a)) {
+    return 0;
+  }
+  for (int i = 0; i < n; i++) {
+    if (!bn_mod_lshift1_quick_ctx(r, r, m, ctx)) {
       return 0;
     }
   }
-
-  while (n > 0) {
-    int max_shift;
-
-    // 0 < r < m
-    max_shift = BN_num_bits(m) - BN_num_bits(r);
-    // max_shift >= 0
-
-    if (max_shift < 0) {
-      OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
-      return 0;
-    }
-
-    if (max_shift > n) {
-      max_shift = n;
-    }
-
-    if (max_shift) {
-      if (!BN_lshift(r, r, max_shift)) {
-        return 0;
-      }
-      n -= max_shift;
-    } else {
-      if (!BN_lshift1(r, r)) {
-        return 0;
-      }
-      --n;
-    }
-
-    // BN_num_bits(r) <= BN_num_bits(m)
-    if (BN_cmp(r, m) >= 0) {
-      if (!BN_sub(r, r, m)) {
-        return 0;
-      }
-    }
-  }
-
   return 1;
 }
 
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_lshift_quick_ctx(r, a, n, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
+}
+
 int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
   if (!BN_lshift1(r, a)) {
     return 0;
@@ -572,15 +656,17 @@
   return BN_nnmod(r, r, m, ctx);
 }
 
-int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
-  if (!BN_lshift1(r, a)) {
-    return 0;
-  }
-  if (BN_cmp(r, m) >= 0) {
-    return BN_sub(r, r, m);
-  }
+int bn_mod_lshift1_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
+                             BN_CTX *ctx) {
+  return bn_mod_add_quick_ctx(r, a, a, m, ctx);
+}
 
-  return 1;
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
+  BN_CTX *ctx = BN_CTX_new();
+  int ok = ctx != NULL &&
+           bn_mod_lshift1_quick_ctx(r, a, m, ctx);
+  BN_CTX_free(ctx);
+  return ok;
 }
 
 BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 22dc3a1..addc4bb 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -371,6 +371,29 @@
 int bn_sqr_fixed(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
 
 
+// Constant-time modular arithmetic.
+//
+// The following functions implement basic constant-time modular arithmetic on
+// word arrays.
+
+// bn_mod_add_quick_ctx acts like |BN_mod_add_quick| but takes a |BN_CTX|.
+int bn_mod_add_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx);
+
+// bn_mod_sub_quick_ctx acts like |BN_mod_sub_quick| but takes a |BN_CTX|.
+int bn_mod_sub_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                         const BIGNUM *m, BN_CTX *ctx);
+
+// bn_mod_lshift1_quick_ctx acts like |BN_mod_lshift1_quick| but takes a
+// |BN_CTX|.
+int bn_mod_lshift1_quick_ctx(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
+                             BN_CTX *ctx);
+
+// bn_mod_lshift_quick_ctx acts like |BN_mod_lshift_quick| but takes a |BN_CTX|.
+int bn_mod_lshift_quick_ctx(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                            BN_CTX *ctx);
+
+
 // Low-level operations for small numbers.
 //
 // The following functions implement algorithms suitable for use with scalars
diff --git a/crypto/fipsmodule/bn/sqrt.c b/crypto/fipsmodule/bn/sqrt.c
index 68ccb91..852512c 100644
--- a/crypto/fipsmodule/bn/sqrt.c
+++ b/crypto/fipsmodule/bn/sqrt.c
@@ -184,7 +184,7 @@
     // November 1992.)
 
     // t := 2*a
-    if (!BN_mod_lshift1_quick(t, A, p)) {
+    if (!bn_mod_lshift1_quick_ctx(t, A, p, ctx)) {
       goto end;
     }
 
diff --git a/crypto/fipsmodule/ec/oct.c b/crypto/fipsmodule/ec/oct.c
index 3a6b4dd..38a3342 100644
--- a/crypto/fipsmodule/ec/oct.c
+++ b/crypto/fipsmodule/ec/oct.c
@@ -316,20 +316,20 @@
 
   // tmp1 := tmp1 + a*x
   if (group->a_is_minus3) {
-    if (!BN_mod_lshift1_quick(tmp2, x, &group->field) ||
-        !BN_mod_add_quick(tmp2, tmp2, x, &group->field) ||
-        !BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) {
+    if (!bn_mod_lshift1_quick_ctx(tmp2, x, &group->field, ctx) ||
+        !bn_mod_add_quick_ctx(tmp2, tmp2, x, &group->field, ctx) ||
+        !bn_mod_sub_quick_ctx(tmp1, tmp1, tmp2, &group->field, ctx)) {
       goto err;
     }
   } else {
     if (!BN_mod_mul(tmp2, a, x, &group->field, ctx) ||
-        !BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) {
+        !bn_mod_add_quick_ctx(tmp1, tmp1, tmp2, &group->field, ctx)) {
       goto err;
     }
   }
 
   // tmp1 := tmp1 + b
-  if (!BN_mod_add_quick(tmp1, tmp1, b, &group->field)) {
+  if (!bn_mod_add_quick_ctx(tmp1, tmp1, b, &group->field, ctx)) {
     goto err;
   }
 
diff --git a/crypto/fipsmodule/ec/simple.c b/crypto/fipsmodule/ec/simple.c
index 08954ea..57a9099 100644
--- a/crypto/fipsmodule/ec/simple.c
+++ b/crypto/fipsmodule/ec/simple.c
@@ -395,8 +395,8 @@
   }
 
   // n5, n6
-  if (!BN_mod_sub_quick(n5, n1, n3, p) ||
-      !BN_mod_sub_quick(n6, n2, n4, p)) {
+  if (!bn_mod_sub_quick_ctx(n5, n1, n3, p, ctx) ||
+      !bn_mod_sub_quick_ctx(n6, n2, n4, p, ctx)) {
     goto end;
   }
   // n5 = n1 - n3
@@ -418,8 +418,8 @@
   }
 
   // 'n7', 'n8'
-  if (!BN_mod_add_quick(n1, n1, n3, p) ||
-      !BN_mod_add_quick(n2, n2, n4, p)) {
+  if (!bn_mod_add_quick_ctx(n1, n1, n3, p, ctx) ||
+      !bn_mod_add_quick_ctx(n2, n2, n4, p, ctx)) {
     goto end;
   }
   // 'n7' = n1 + n3
@@ -453,14 +453,14 @@
   if (!field_sqr(group, n0, n6, ctx) ||
       !field_sqr(group, n4, n5, ctx) ||
       !field_mul(group, n3, n1, n4, ctx) ||
-      !BN_mod_sub_quick(&r->X, n0, n3, p)) {
+      !bn_mod_sub_quick_ctx(&r->X, n0, n3, p, ctx)) {
     goto end;
   }
   // X_r = n6^2 - n5^2 * 'n7'
 
   // 'n9'
-  if (!BN_mod_lshift1_quick(n0, &r->X, p) ||
-      !BN_mod_sub_quick(n0, n3, n0, p)) {
+  if (!bn_mod_lshift1_quick_ctx(n0, &r->X, p, ctx) ||
+      !bn_mod_sub_quick_ctx(n0, n3, n0, p, ctx)) {
     goto end;
   }
   // n9 = n5^2 * 'n7' - 2 * X_r
@@ -471,7 +471,7 @@
     goto end;  // now n5 is n5^3
   }
   if (!field_mul(group, n1, n2, n5, ctx) ||
-      !BN_mod_sub_quick(n0, n0, n1, p)) {
+      !bn_mod_sub_quick_ctx(n0, n0, n1, p, ctx)) {
     goto end;
   }
   if (BN_is_odd(n0) && !BN_add(n0, n0, p)) {
@@ -536,31 +536,31 @@
   // n1
   if (BN_cmp(&a->Z, &group->one) == 0) {
     if (!field_sqr(group, n0, &a->X, ctx) ||
-        !BN_mod_lshift1_quick(n1, n0, p) ||
-        !BN_mod_add_quick(n0, n0, n1, p) ||
-        !BN_mod_add_quick(n1, n0, &group->a, p)) {
+        !bn_mod_lshift1_quick_ctx(n1, n0, p, ctx) ||
+        !bn_mod_add_quick_ctx(n0, n0, n1, p, ctx) ||
+        !bn_mod_add_quick_ctx(n1, n0, &group->a, p, ctx)) {
       goto err;
     }
     // n1 = 3 * X_a^2 + a_curve
   } else if (group->a_is_minus3) {
     if (!field_sqr(group, n1, &a->Z, ctx) ||
-        !BN_mod_add_quick(n0, &a->X, n1, p) ||
-        !BN_mod_sub_quick(n2, &a->X, n1, p) ||
+        !bn_mod_add_quick_ctx(n0, &a->X, n1, p, ctx) ||
+        !bn_mod_sub_quick_ctx(n2, &a->X, n1, p, ctx) ||
         !field_mul(group, n1, n0, n2, ctx) ||
-        !BN_mod_lshift1_quick(n0, n1, p) ||
-        !BN_mod_add_quick(n1, n0, n1, p)) {
+        !bn_mod_lshift1_quick_ctx(n0, n1, p, ctx) ||
+        !bn_mod_add_quick_ctx(n1, n0, n1, p, ctx)) {
       goto err;
     }
     // n1 = 3 * (X_a + Z_a^2) * (X_a - Z_a^2)
     //    = 3 * X_a^2 - 3 * Z_a^4
   } else {
     if (!field_sqr(group, n0, &a->X, ctx) ||
-        !BN_mod_lshift1_quick(n1, n0, p) ||
-        !BN_mod_add_quick(n0, n0, n1, p) ||
+        !bn_mod_lshift1_quick_ctx(n1, n0, p, ctx) ||
+        !bn_mod_add_quick_ctx(n0, n0, n1, p, ctx) ||
         !field_sqr(group, n1, &a->Z, ctx) ||
         !field_sqr(group, n1, n1, ctx) ||
         !field_mul(group, n1, n1, &group->a, ctx) ||
-        !BN_mod_add_quick(n1, n1, n0, p)) {
+        !bn_mod_add_quick_ctx(n1, n1, n0, p, ctx)) {
       goto err;
     }
     // n1 = 3 * X_a^2 + a_curve * Z_a^4
@@ -574,7 +574,7 @@
   } else if (!field_mul(group, n0, &a->Y, &a->Z, ctx)) {
     goto err;
   }
-  if (!BN_mod_lshift1_quick(&r->Z, n0, p)) {
+  if (!bn_mod_lshift1_quick_ctx(&r->Z, n0, p, ctx)) {
     goto err;
   }
   // Z_r = 2 * Y_a * Z_a
@@ -582,30 +582,30 @@
   // n2
   if (!field_sqr(group, n3, &a->Y, ctx) ||
       !field_mul(group, n2, &a->X, n3, ctx) ||
-      !BN_mod_lshift_quick(n2, n2, 2, p)) {
+      !bn_mod_lshift_quick_ctx(n2, n2, 2, p, ctx)) {
     goto err;
   }
   // n2 = 4 * X_a * Y_a^2
 
   // X_r
-  if (!BN_mod_lshift1_quick(n0, n2, p) ||
+  if (!bn_mod_lshift1_quick_ctx(n0, n2, p, ctx) ||
       !field_sqr(group, &r->X, n1, ctx) ||
-      !BN_mod_sub_quick(&r->X, &r->X, n0, p)) {
+      !bn_mod_sub_quick_ctx(&r->X, &r->X, n0, p, ctx)) {
     goto err;
   }
   // X_r = n1^2 - 2 * n2
 
   // n3
   if (!field_sqr(group, n0, n3, ctx) ||
-      !BN_mod_lshift_quick(n3, n0, 3, p)) {
+      !bn_mod_lshift_quick_ctx(n3, n0, 3, p, ctx)) {
     goto err;
   }
   // n3 = 8 * Y_a^4
 
   // Y_r
-  if (!BN_mod_sub_quick(n0, n2, &r->X, p) ||
+  if (!bn_mod_sub_quick_ctx(n0, n2, &r->X, p, ctx) ||
       !field_mul(group, n0, n1, n0, ctx) ||
-      !BN_mod_sub_quick(&r->Y, n0, n3, p)) {
+      !bn_mod_sub_quick_ctx(&r->Y, n0, n3, p, ctx)) {
     goto err;
   }
   // Y_r = n1 * (n2 - X_r) - n3
@@ -688,15 +688,15 @@
 
     // rh := (rh + a*Z^4)*X
     if (group->a_is_minus3) {
-      if (!BN_mod_lshift1_quick(tmp, Z4, p) ||
-          !BN_mod_add_quick(tmp, tmp, Z4, p) ||
-          !BN_mod_sub_quick(rh, rh, tmp, p) ||
+      if (!bn_mod_lshift1_quick_ctx(tmp, Z4, p, ctx) ||
+          !bn_mod_add_quick_ctx(tmp, tmp, Z4, p, ctx) ||
+          !bn_mod_sub_quick_ctx(rh, rh, tmp, p, ctx) ||
           !field_mul(group, rh, rh, &point->X, ctx)) {
         goto err;
       }
     } else {
       if (!field_mul(group, tmp, Z4, &group->a, ctx) ||
-          !BN_mod_add_quick(rh, rh, tmp, p) ||
+          !bn_mod_add_quick_ctx(rh, rh, tmp, p, ctx) ||
           !field_mul(group, rh, rh, &point->X, ctx)) {
         goto err;
       }
@@ -704,17 +704,17 @@
 
     // rh := rh + b*Z^6
     if (!field_mul(group, tmp, &group->b, Z6, ctx) ||
-        !BN_mod_add_quick(rh, rh, tmp, p)) {
+        !bn_mod_add_quick_ctx(rh, rh, tmp, p, ctx)) {
       goto err;
     }
   } else {
     // rh := (rh + a)*X
-    if (!BN_mod_add_quick(rh, rh, &group->a, p) ||
+    if (!bn_mod_add_quick_ctx(rh, rh, &group->a, p, ctx) ||
         !field_mul(group, rh, rh, &point->X, ctx)) {
       goto err;
     }
     // rh := rh + b
-    if (!BN_mod_add_quick(rh, rh, &group->b, p)) {
+    if (!bn_mod_add_quick_ctx(rh, rh, &group->b, p, ctx)) {
       goto err;
     }
   }
diff --git a/crypto/fipsmodule/rsa/rsa_impl.c b/crypto/fipsmodule/rsa/rsa_impl.c
index 772288d..3f1c3b2 100644
--- a/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/crypto/fipsmodule/rsa/rsa_impl.c
@@ -805,8 +805,7 @@
   // 2. Canonicalize keys on p > q in |freeze_private_key|. (p > q for keys we
   //    generate, but not ones we import.) This removes the p < q case below.
   //
-  // 3. Make |BN_mod_sub_quick| constant-time (use |bn_sub_words| and select on
-  //    the borrow bit) and compute r0 - m1 (mod p) with it.
+  // 3. Use |bn_mod_sub_quick_ctx| to compute r0 - m1 (mod p).
   //
   // 4. When computing mont_*, additionally compute iqmp_mont, iqmp in
   //    Montgomery form. The |BN_mul| and |BN_mod| pair can then be replaced