Move some RSA keygen support code into separate files.

This was all new code. There was a request to make this available under
ISC.

Change-Id: Ibabbe6fbf593c2a781aac47a4de7ac378604dbcf
Reviewed-on: https://boringssl-review.googlesource.com/28267
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
index 51ac7d5..e456d3a 100644
--- a/crypto/fipsmodule/bcm.c
+++ b/crypto/fipsmodule/bcm.c
@@ -36,8 +36,10 @@
 #include "bn/cmp.c"
 #include "bn/ctx.c"
 #include "bn/div.c"
+#include "bn/div_extra.c"
 #include "bn/exponentiation.c"
 #include "bn/gcd.c"
+#include "bn/gcd_extra.c"
 #include "bn/generic.c"
 #include "bn/jacobi.c"
 #include "bn/montgomery.c"
diff --git a/crypto/fipsmodule/bn/div_extra.c b/crypto/fipsmodule/bn/div_extra.c
new file mode 100644
index 0000000..7f03f28
--- /dev/null
+++ b/crypto/fipsmodule/bn/div_extra.c
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+
+#include "internal.h"
+
+
+// The following functions use a Barrett reduction variant to avoid leaking the
+// numerator. See http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+//
+// We use 32-bit numerator and 16-bit divisor for simplicity. This allows
+// computing |m| and |q| without architecture-specific code.
+
+// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see
+// reference). For proof of correctness in Coq, see
+// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v
+// Note the Coq version of |mod_u16| additionally includes the computation of
+// |p| and |m| from |bn_mod_u16_consttime| below.
+static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) {
+  // Compute floor(n/d) per steps 3 through 5.
+  uint32_t q = ((uint64_t)m * n) >> 32;
+  // Note there is a typo in the reference. We right-shift by one, not two.
+  uint32_t t = ((n - q) >> 1) + q;
+  t = t >> (p - 1);
+
+  // Multiply and subtract to get the remainder.
+  n -= d * t;
+  assert(n < d);
+  return n;
+}
+
+// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
+// "magic numbers" for |d| (see reference).
+static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d,
+                                      uint32_t p, uint32_t m) {
+  // Incorporate |a| in two 16-bit chunks.
+  uint32_t t = r;
+  t <<= 16;
+  t |= a >> 16;
+  t = mod_u16(t, d, p, m);
+
+  t <<= 16;
+  t |= a & 0xffff;
+  t = mod_u16(t, d, p, m);
+  return t;
+}
+
+uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) {
+  if (d <= 1) {
+    return 0;
+  }
+
+  // Compute the "magic numbers" for |d|. See steps 1 and 2.
+  // This computes p = ceil(log_2(d)).
+  uint32_t p = BN_num_bits_word(d - 1);
+  // This operation is not constant-time, but |p| and |d| are public values.
+  // Note that |p| is at most 16, so the computation fits in |uint64_t|.
+  assert(p <= 16);
+  uint32_t m = ((UINT64_C(1) << (32 + p)) + d - 1) / d;
+
+  uint16_t ret = 0;
+  for (int i = bn->width - 1; i >= 0; i--) {
+#if BN_BITS2 == 32
+    ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m);
+#elif BN_BITS2 == 64
+    ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m);
+    ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m);
+#else
+#error "Unknown BN_ULONG size"
+#endif
+  }
+  return ret;
+}
diff --git a/crypto/fipsmodule/bn/gcd.c b/crypto/fipsmodule/bn/gcd.c
index 7868b40..bd0fa6f 100644
--- a/crypto/fipsmodule/bn/gcd.c
+++ b/crypto/fipsmodule/bn/gcd.c
@@ -108,316 +108,11 @@
 
 #include <openssl/bn.h>
 
-#include <assert.h>
-
 #include <openssl/err.h>
 
 #include "internal.h"
 
 
-static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); }
-
-static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp,
-                                size_t num) {
-  bn_rshift1_words(tmp, a, num);
-  bn_select_words(a, mask, tmp, a, num);
-}
-
-static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry,
-                                      BN_ULONG mask, BN_ULONG *tmp,
-                                      size_t num) {
-  maybe_rshift1_words(a, mask, tmp, num);
-  if (num != 0) {
-    carry &= mask;
-    a[num - 1] |= carry << (BN_BITS2-1);
-  }
-}
-
-static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b,
-                                BN_ULONG *tmp, size_t num) {
-  BN_ULONG carry = bn_add_words(tmp, a, b, num);
-  bn_select_words(a, mask, tmp, a, num);
-  return carry & mask;
-}
-
-static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x,
-                            const BIGNUM *y, BN_CTX *ctx) {
-  size_t width = x->width > y->width ? x->width : y->width;
-  if (width == 0) {
-    *out_shift = 0;
-    BN_zero(r);
-    return 1;
-  }
-
-  // This is a constant-time implementation of Stein's algorithm (binary GCD).
-  int ret = 0;
-  BN_CTX_start(ctx);
-  BIGNUM *u = BN_CTX_get(ctx);
-  BIGNUM *v = BN_CTX_get(ctx);
-  BIGNUM *tmp = BN_CTX_get(ctx);
-  if (u == NULL || v == NULL || tmp == NULL ||
-      !BN_copy(u, x) ||
-      !BN_copy(v, y) ||
-      !bn_resize_words(u, width) ||
-      !bn_resize_words(v, width) ||
-      !bn_resize_words(tmp, width)) {
-    goto err;
-  }
-
-  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
-  // most the combined bit width of inputs for at least one value to be zero.
-  unsigned x_bits = x->width * BN_BITS2, y_bits = y->width * BN_BITS2;
-  unsigned num_iters = x_bits + y_bits;
-  if (num_iters < x_bits) {
-    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
-    goto err;
-  }
-
-  unsigned shift = 0;
-  for (unsigned i = 0; i < num_iters; i++) {
-    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
-
-    // If both |u| and |v| are odd, subtract the smaller from the larger.
-    BN_ULONG u_less_than_v =
-        (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width);
-    bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width);
-    bn_sub_words(tmp->d, v->d, u->d, width);
-    bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width);
-
-    // At least one of |u| and |v| is now even.
-    BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]);
-    BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]);
-    assert(!(u_is_odd & v_is_odd));
-
-    // If both are even, the final GCD gains a factor of two.
-    shift += 1 & (~u_is_odd & ~v_is_odd);
-
-    // Halve any which are even.
-    maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width);
-    maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width);
-  }
-
-  // One of |u| or |v| is zero at this point. The algorithm usually makes |u|
-  // zero, unless |y| was already zero on input. Fix this by combining the
-  // values.
-  assert(BN_is_zero(u) || BN_is_zero(v));
-  for (size_t i = 0; i < width; i++) {
-    v->d[i] |= u->d[i];
-  }
-
-  *out_shift = shift;
-  ret = bn_set_words(r, v->d, width);
-
-err:
-  BN_CTX_end(ctx);
-  return ret;
-}
-
-int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) {
-  unsigned shift;
-  return bn_gcd_consttime(r, &shift, x, y, ctx) &&
-         BN_lshift(r, r, shift);
-}
-
-int bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x,
-                           const BIGNUM *y, BN_CTX *ctx) {
-  int ret = 0;
-  BN_CTX_start(ctx);
-  unsigned shift;
-  BIGNUM *gcd = BN_CTX_get(ctx);
-  if (gcd == NULL ||
-      !bn_gcd_consttime(gcd, &shift, x, y, ctx)) {
-    goto err;
-  }
-
-  // Check that 2^|shift| * |gcd| is one.
-  if (gcd->width == 0) {
-    *out_relatively_prime = 0;
-  } else {
-    BN_ULONG mask = shift | (gcd->d[0] ^ 1);
-    for (int i = 1; i < gcd->width; i++) {
-      mask |= gcd->d[i];
-    }
-    *out_relatively_prime = mask == 0;
-  }
-  ret = 1;
-
-err:
-  BN_CTX_end(ctx);
-  return ret;
-}
-
-int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
-  BN_CTX_start(ctx);
-  unsigned shift;
-  BIGNUM *gcd = BN_CTX_get(ctx);
-  int ret = gcd != NULL &&
-            bn_mul_consttime(r, a, b, ctx) &&
-            bn_gcd_consttime(gcd, &shift, a, b, ctx) &&
-            bn_div_consttime(r, NULL, r, gcd, ctx) &&
-            bn_rshift_secret_shift(r, r, shift, ctx);
-  BN_CTX_end(ctx);
-  return ret;
-}
-
-int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a,
-                             const BIGNUM *n, BN_CTX *ctx) {
-  *out_no_inverse = 0;
-  if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) {
-    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
-    return 0;
-  }
-  if (BN_is_zero(a)) {
-    if (BN_is_one(n)) {
-      BN_zero(r);
-      return 1;
-    }
-    *out_no_inverse = 1;
-    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
-    return 0;
-  }
-
-  // This is a constant-time implementation of the extended binary GCD
-  // algorithm. It is adapted from the Handbook of Applied Cryptography, section
-  // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid
-  // negative numbers.
-  //
-  // For more details and proof of correctness, see
-  // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step|
-  // and |mod_inverse_consttime| for the algorithm in Gallina and see
-  // |mod_inverse_consttime_spec| for the correctness result.
-
-  if (!BN_is_odd(a) && !BN_is_odd(n)) {
-    *out_no_inverse = 1;
-    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
-    return 0;
-  }
-
-  // This function exists to compute the RSA private exponent, where |a| is one
-  // word. We'll thus use |a_width| when available.
-  size_t n_width = n->width, a_width = a->width;
-  if (a_width > n_width) {
-    a_width = n_width;
-  }
-
-  int ret = 0;
-  BN_CTX_start(ctx);
-  BIGNUM *u = BN_CTX_get(ctx);
-  BIGNUM *v = BN_CTX_get(ctx);
-  BIGNUM *A = BN_CTX_get(ctx);
-  BIGNUM *B = BN_CTX_get(ctx);
-  BIGNUM *C = BN_CTX_get(ctx);
-  BIGNUM *D = BN_CTX_get(ctx);
-  BIGNUM *tmp = BN_CTX_get(ctx);
-  BIGNUM *tmp2 = BN_CTX_get(ctx);
-  if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL ||
-      D == NULL || tmp == NULL || tmp2 == NULL ||
-      !BN_copy(u, a) ||
-      !BN_copy(v, n) ||
-      !BN_one(A) ||
-      !BN_one(D) ||
-      // For convenience, size |u| and |v| equivalently.
-      !bn_resize_words(u, n_width) ||
-      !bn_resize_words(v, n_width) ||
-      // |A| and |C| are bounded by |m|.
-      !bn_resize_words(A, n_width) ||
-      !bn_resize_words(C, n_width) ||
-      // |B| and |D| are bounded by |a|.
-      !bn_resize_words(B, a_width) ||
-      !bn_resize_words(D, a_width) ||
-      // |tmp| and |tmp2| may be used at either size.
-      !bn_resize_words(tmp, n_width) ||
-      !bn_resize_words(tmp2, n_width)) {
-    goto err;
-  }
-
-  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
-  // most the combined bit width of inputs for at least one value to be zero.
-  unsigned a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2;
-  unsigned num_iters = a_bits + n_bits;
-  if (num_iters < a_bits) {
-    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
-    goto err;
-  }
-
-  // Before and after each loop iteration, the following hold:
-  //
-  //   u = A*a - B*n
-  //   v = D*n - C*a
-  //   0 < u <= a
-  //   0 <= v <= n
-  //   0 <= A < n
-  //   0 <= B <= a
-  //   0 <= C < n
-  //   0 <= D <= a
-  //
-  // After each loop iteration, u and v only get smaller, and at least one of
-  // them shrinks by at least a factor of two.
-  for (unsigned i = 0; i < num_iters; i++) {
-    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
-
-    // If both |u| and |v| are odd, subtract the smaller from the larger.
-    BN_ULONG v_less_than_u =
-        (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width);
-    bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width);
-    bn_sub_words(tmp->d, u->d, v->d, n_width);
-    bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width);
-
-    // If we updated one of the values, update the corresponding coefficient.
-    BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width);
-    carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width);
-    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width);
-    bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width);
-    bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width);
-
-    bn_add_words(tmp->d, B->d, D->d, a_width);
-    bn_sub_words(tmp2->d, tmp->d, a->d, a_width);
-    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width);
-    bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width);
-    bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width);
-
-    // Our loop invariants hold at this point. Additionally, exactly one of |u|
-    // and |v| is now even.
-    BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]);
-    BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]);
-    assert(u_is_even != v_is_even);
-
-    // Halve the even one and adjust the corresponding coefficient.
-    maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width);
-    BN_ULONG A_or_B_is_odd =
-        word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]);
-    BN_ULONG A_carry =
-        maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width);
-    BN_ULONG B_carry =
-        maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width);
-    maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width);
-    maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width);
-
-    maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width);
-    BN_ULONG C_or_D_is_odd =
-        word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]);
-    BN_ULONG C_carry =
-        maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width);
-    BN_ULONG D_carry =
-        maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width);
-    maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width);
-    maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width);
-  }
-
-  assert(BN_is_zero(v));
-  if (!BN_is_one(u)) {
-    *out_no_inverse = 1;
-    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
-    goto err;
-  }
-
-  ret = BN_copy(r, A) != NULL;
-
-err:
-  BN_CTX_end(ctx);
-  return ret;
-}
-
 int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                        const BIGNUM *n, BN_CTX *ctx) {
   *out_no_inverse = 0;
diff --git a/crypto/fipsmodule/bn/gcd_extra.c b/crypto/fipsmodule/bn/gcd_extra.c
new file mode 100644
index 0000000..30540e3
--- /dev/null
+++ b/crypto/fipsmodule/bn/gcd_extra.c
@@ -0,0 +1,325 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+
+#include <openssl/err.h>
+
+#include "internal.h"
+
+
+static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); }
+
+static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp,
+                                size_t num) {
+  bn_rshift1_words(tmp, a, num);
+  bn_select_words(a, mask, tmp, a, num);
+}
+
+static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry,
+                                      BN_ULONG mask, BN_ULONG *tmp,
+                                      size_t num) {
+  maybe_rshift1_words(a, mask, tmp, num);
+  if (num != 0) {
+    carry &= mask;
+    a[num - 1] |= carry << (BN_BITS2-1);
+  }
+}
+
+static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b,
+                                BN_ULONG *tmp, size_t num) {
+  BN_ULONG carry = bn_add_words(tmp, a, b, num);
+  bn_select_words(a, mask, tmp, a, num);
+  return carry & mask;
+}
+
+static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x,
+                            const BIGNUM *y, BN_CTX *ctx) {
+  size_t width = x->width > y->width ? x->width : y->width;
+  if (width == 0) {
+    *out_shift = 0;
+    BN_zero(r);
+    return 1;
+  }
+
+  // This is a constant-time implementation of Stein's algorithm (binary GCD).
+  int ret = 0;
+  BN_CTX_start(ctx);
+  BIGNUM *u = BN_CTX_get(ctx);
+  BIGNUM *v = BN_CTX_get(ctx);
+  BIGNUM *tmp = BN_CTX_get(ctx);
+  if (u == NULL || v == NULL || tmp == NULL ||
+      !BN_copy(u, x) ||
+      !BN_copy(v, y) ||
+      !bn_resize_words(u, width) ||
+      !bn_resize_words(v, width) ||
+      !bn_resize_words(tmp, width)) {
+    goto err;
+  }
+
+  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
+  // most the combined bit width of inputs for at least one value to be zero.
+  unsigned x_bits = x->width * BN_BITS2, y_bits = y->width * BN_BITS2;
+  unsigned num_iters = x_bits + y_bits;
+  if (num_iters < x_bits) {
+    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+    goto err;
+  }
+
+  unsigned shift = 0;
+  for (unsigned i = 0; i < num_iters; i++) {
+    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
+
+    // If both |u| and |v| are odd, subtract the smaller from the larger.
+    BN_ULONG u_less_than_v =
+        (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width);
+    bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width);
+    bn_sub_words(tmp->d, v->d, u->d, width);
+    bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width);
+
+    // At least one of |u| and |v| is now even.
+    BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]);
+    BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]);
+    assert(!(u_is_odd & v_is_odd));
+
+    // If both are even, the final GCD gains a factor of two.
+    shift += 1 & (~u_is_odd & ~v_is_odd);
+
+    // Halve any which are even.
+    maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width);
+    maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width);
+  }
+
+  // One of |u| or |v| is zero at this point. The algorithm usually makes |u|
+  // zero, unless |y| was already zero on input. Fix this by combining the
+  // values.
+  assert(BN_is_zero(u) || BN_is_zero(v));
+  for (size_t i = 0; i < width; i++) {
+    v->d[i] |= u->d[i];
+  }
+
+  *out_shift = shift;
+  ret = bn_set_words(r, v->d, width);
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) {
+  unsigned shift;
+  return bn_gcd_consttime(r, &shift, x, y, ctx) &&
+         BN_lshift(r, r, shift);
+}
+
+int bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x,
+                           const BIGNUM *y, BN_CTX *ctx) {
+  int ret = 0;
+  BN_CTX_start(ctx);
+  unsigned shift;
+  BIGNUM *gcd = BN_CTX_get(ctx);
+  if (gcd == NULL ||
+      !bn_gcd_consttime(gcd, &shift, x, y, ctx)) {
+    goto err;
+  }
+
+  // Check that 2^|shift| * |gcd| is one.
+  if (gcd->width == 0) {
+    *out_relatively_prime = 0;
+  } else {
+    BN_ULONG mask = shift | (gcd->d[0] ^ 1);
+    for (int i = 1; i < gcd->width; i++) {
+      mask |= gcd->d[i];
+    }
+    *out_relatively_prime = mask == 0;
+  }
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+  BN_CTX_start(ctx);
+  unsigned shift;
+  BIGNUM *gcd = BN_CTX_get(ctx);
+  int ret = gcd != NULL &&
+            bn_mul_consttime(r, a, b, ctx) &&
+            bn_gcd_consttime(gcd, &shift, a, b, ctx) &&
+            bn_div_consttime(r, NULL, r, gcd, ctx) &&
+            bn_rshift_secret_shift(r, r, shift, ctx);
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a,
+                             const BIGNUM *n, BN_CTX *ctx) {
+  *out_no_inverse = 0;
+  if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
+  if (BN_is_zero(a)) {
+    if (BN_is_one(n)) {
+      BN_zero(r);
+      return 1;
+    }
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    return 0;
+  }
+
+  // This is a constant-time implementation of the extended binary GCD
+  // algorithm. It is adapted from the Handbook of Applied Cryptography, section
+  // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid
+  // negative numbers.
+  //
+  // For more details and proof of correctness, see
+  // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step|
+  // and |mod_inverse_consttime| for the algorithm in Gallina and see
+  // |mod_inverse_consttime_spec| for the correctness result.
+
+  if (!BN_is_odd(a) && !BN_is_odd(n)) {
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    return 0;
+  }
+
+  // This function exists to compute the RSA private exponent, where |a| is one
+  // word. We'll thus use |a_width| when available.
+  size_t n_width = n->width, a_width = a->width;
+  if (a_width > n_width) {
+    a_width = n_width;
+  }
+
+  int ret = 0;
+  BN_CTX_start(ctx);
+  BIGNUM *u = BN_CTX_get(ctx);
+  BIGNUM *v = BN_CTX_get(ctx);
+  BIGNUM *A = BN_CTX_get(ctx);
+  BIGNUM *B = BN_CTX_get(ctx);
+  BIGNUM *C = BN_CTX_get(ctx);
+  BIGNUM *D = BN_CTX_get(ctx);
+  BIGNUM *tmp = BN_CTX_get(ctx);
+  BIGNUM *tmp2 = BN_CTX_get(ctx);
+  if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL ||
+      D == NULL || tmp == NULL || tmp2 == NULL ||
+      !BN_copy(u, a) ||
+      !BN_copy(v, n) ||
+      !BN_one(A) ||
+      !BN_one(D) ||
+      // For convenience, size |u| and |v| equivalently.
+      !bn_resize_words(u, n_width) ||
+      !bn_resize_words(v, n_width) ||
+      // |A| and |C| are bounded by |m|.
+      !bn_resize_words(A, n_width) ||
+      !bn_resize_words(C, n_width) ||
+      // |B| and |D| are bounded by |a|.
+      !bn_resize_words(B, a_width) ||
+      !bn_resize_words(D, a_width) ||
+      // |tmp| and |tmp2| may be used at either size.
+      !bn_resize_words(tmp, n_width) ||
+      !bn_resize_words(tmp2, n_width)) {
+    goto err;
+  }
+
+  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
+  // most the combined bit width of inputs for at least one value to be zero.
+  unsigned a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2;
+  unsigned num_iters = a_bits + n_bits;
+  if (num_iters < a_bits) {
+    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+    goto err;
+  }
+
+  // Before and after each loop iteration, the following hold:
+  //
+  //   u = A*a - B*n
+  //   v = D*n - C*a
+  //   0 < u <= a
+  //   0 <= v <= n
+  //   0 <= A < n
+  //   0 <= B <= a
+  //   0 <= C < n
+  //   0 <= D <= a
+  //
+  // After each loop iteration, u and v only get smaller, and at least one of
+  // them shrinks by at least a factor of two.
+  for (unsigned i = 0; i < num_iters; i++) {
+    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
+
+    // If both |u| and |v| are odd, subtract the smaller from the larger.
+    BN_ULONG v_less_than_u =
+        (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width);
+    bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width);
+    bn_sub_words(tmp->d, u->d, v->d, n_width);
+    bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width);
+
+    // If we updated one of the values, update the corresponding coefficient.
+    BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width);
+    carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width);
+    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width);
+    bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width);
+    bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width);
+
+    bn_add_words(tmp->d, B->d, D->d, a_width);
+    bn_sub_words(tmp2->d, tmp->d, a->d, a_width);
+    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width);
+    bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width);
+    bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width);
+
+    // Our loop invariants hold at this point. Additionally, exactly one of |u|
+    // and |v| is now even.
+    BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]);
+    BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]);
+    assert(u_is_even != v_is_even);
+
+    // Halve the even one and adjust the corresponding coefficient.
+    maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width);
+    BN_ULONG A_or_B_is_odd =
+        word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]);
+    BN_ULONG A_carry =
+        maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width);
+    BN_ULONG B_carry =
+        maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width);
+    maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width);
+    maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width);
+
+    maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width);
+    BN_ULONG C_or_D_is_odd =
+        word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]);
+    BN_ULONG C_carry =
+        maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width);
+    BN_ULONG D_carry =
+        maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width);
+    maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width);
+    maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width);
+  }
+
+  assert(BN_is_zero(v));
+  if (!BN_is_one(u)) {
+    *out_no_inverse = 1;
+    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
+    goto err;
+  }
+
+  ret = BN_copy(r, A) != NULL;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/fipsmodule/bn/prime.c b/crypto/fipsmodule/bn/prime.c
index 80b33c2..903d6b1 100644
--- a/crypto/fipsmodule/bn/prime.c
+++ b/crypto/fipsmodule/bn/prime.c
@@ -532,73 +532,6 @@
   return found;
 }
 
-// The following functions use a Barrett reduction variant to avoid leaking the
-// numerator. See http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
-//
-// We use 32-bit numerator and 16-bit divisor for simplicity. This allows
-// computing |m| and |q| without architecture-specific code.
-
-// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see
-// reference). For proof of correctness in Coq, see
-// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v
-// Note the Coq version of |mod_u16| additionally includes the computation of
-// |p| and |m| from |bn_mod_u16_consttime| below.
-static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) {
-  // Compute floor(n/d) per steps 3 through 5.
-  uint32_t q = ((uint64_t)m * n) >> 32;
-  // Note there is a typo in the reference. We right-shift by one, not two.
-  uint32_t t = ((n - q) >> 1) + q;
-  t = t >> (p - 1);
-
-  // Multiply and subtract to get the remainder.
-  n -= d * t;
-  assert(n < d);
-  return n;
-}
-
-// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
-// "magic numbers" for |d| (see reference).
-static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d,
-                                      uint32_t p, uint32_t m) {
-  // Incorporate |a| in two 16-bit chunks.
-  uint32_t t = r;
-  t <<= 16;
-  t |= a >> 16;
-  t = mod_u16(t, d, p, m);
-
-  t <<= 16;
-  t |= a & 0xffff;
-  t = mod_u16(t, d, p, m);
-  return t;
-}
-
-uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) {
-  if (d <= 1) {
-    return 0;
-  }
-
-  // Compute the "magic numbers" for |d|. See steps 1 and 2.
-  // This computes p = ceil(log_2(d)).
-  uint32_t p = BN_num_bits_word(d - 1);
-  // This operation is not constant-time, but |p| and |d| are public values.
-  // Note that |p| is at most 16, so the computation fits in |uint64_t|.
-  assert(p <= 16);
-  uint32_t m = ((UINT64_C(1) << (32 + p)) + d - 1) / d;
-
-  uint16_t ret = 0;
-  for (int i = bn->width - 1; i >= 0; i--) {
-#if BN_BITS2 == 32
-    ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m);
-#elif BN_BITS2 == 64
-    ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m);
-    ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m);
-#else
-#error "Unknown BN_ULONG size"
-#endif
-  }
-  return ret;
-}
-
 static int bn_trial_division(uint16_t *out, const BIGNUM *bn) {
   const size_t num_primes = num_trial_division_primes(bn);
   for (size_t i = 1; i < num_primes; i++) {